|
20 | 20 | from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
|
21 | 21 | from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
|
22 | 22 | from .deep_crawling import DeepCrawlStrategy
|
| 23 | +from .table_extraction import TableExtractionStrategy, DefaultTableExtraction |
23 | 24 |
|
24 | 25 | from .cache_context import CacheMode
|
25 | 26 | from .proxy_strategy import ProxyRotationStrategy
|
@@ -448,6 +449,10 @@ def __init__(
|
448 | 449 | self.chrome_channel = ""
|
449 | 450 | self.proxy = proxy
|
450 | 451 | self.proxy_config = proxy_config
|
| 452 | + if isinstance(self.proxy_config, dict): |
| 453 | + self.proxy_config = ProxyConfig.from_dict(self.proxy_config) |
| 454 | + if isinstance(self.proxy_config, str): |
| 455 | + self.proxy_config = ProxyConfig.from_string(self.proxy_config) |
451 | 456 |
|
452 | 457 |
|
453 | 458 | self.viewport_width = viewport_width
|
@@ -978,6 +983,8 @@ class CrawlerRunConfig():
|
978 | 983 | Default: False.
|
979 | 984 | table_score_threshold (int): Minimum score threshold for processing a table.
|
980 | 985 | Default: 7.
|
| 986 | + table_extraction (TableExtractionStrategy or None): Strategy to use for table extraction. |
| 987 | + Default: None, in which case a DefaultTableExtraction is created with table_score_threshold. |
981 | 988 |
|
982 | 989 | # Virtual Scroll Parameters
|
983 | 990 | virtual_scroll_config (VirtualScrollConfig or dict or None): Configuration for handling virtual scroll containers.
|
@@ -1104,6 +1111,7 @@ def __init__(
|
1104 | 1111 | image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
1105 | 1112 | image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
|
1106 | 1113 | table_score_threshold: int = 7,
|
| 1114 | + table_extraction: TableExtractionStrategy = None, |
1107 | 1115 | exclude_external_images: bool = False,
|
1108 | 1116 | exclude_all_images: bool = False,
|
1109 | 1117 | # Link and Domain Handling Parameters
|
@@ -1159,6 +1167,11 @@ def __init__(
|
1159 | 1167 | self.parser_type = parser_type
|
1160 | 1168 | self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
|
1161 | 1169 | self.proxy_config = proxy_config
|
| 1170 | + if isinstance(proxy_config, dict): |
| 1171 | + self.proxy_config = ProxyConfig.from_dict(proxy_config) |
| 1172 | + if isinstance(proxy_config, str): |
| 1173 | + self.proxy_config = ProxyConfig.from_string(proxy_config) |
| 1174 | + |
1162 | 1175 | self.proxy_rotation_strategy = proxy_rotation_strategy
|
1163 | 1176 |
|
1164 | 1177 | # Browser Location and Identity Parameters
|
@@ -1215,6 +1228,12 @@ def __init__(
|
1215 | 1228 | self.exclude_external_images = exclude_external_images
|
1216 | 1229 | self.exclude_all_images = exclude_all_images
|
1217 | 1230 | self.table_score_threshold = table_score_threshold
|
| 1231 | + |
| 1232 | + # Table extraction strategy (default to DefaultTableExtraction if not specified) |
| 1233 | + if table_extraction is None: |
| 1234 | + self.table_extraction = DefaultTableExtraction(table_score_threshold=table_score_threshold) |
| 1235 | + else: |
| 1236 | + self.table_extraction = table_extraction |
1218 | 1237 |
|
1219 | 1238 | # Link and Domain Handling Parameters
|
1220 | 1239 | self.exclude_social_media_domains = (
|
@@ -1486,6 +1505,7 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
|
1486 | 1505 | "image_score_threshold", IMAGE_SCORE_THRESHOLD
|
1487 | 1506 | ),
|
1488 | 1507 | table_score_threshold=kwargs.get("table_score_threshold", 7),
|
| 1508 | + table_extraction=kwargs.get("table_extraction", None), |
1489 | 1509 | exclude_all_images=kwargs.get("exclude_all_images", False),
|
1490 | 1510 | exclude_external_images=kwargs.get("exclude_external_images", False),
|
1491 | 1511 | # Link and Domain Handling Parameters
|
@@ -1594,6 +1614,7 @@ def to_dict(self):
|
1594 | 1614 | "image_description_min_word_threshold": self.image_description_min_word_threshold,
|
1595 | 1615 | "image_score_threshold": self.image_score_threshold,
|
1596 | 1616 | "table_score_threshold": self.table_score_threshold,
|
| 1617 | + "table_extraction": self.table_extraction, |
1597 | 1618 | "exclude_all_images": self.exclude_all_images,
|
1598 | 1619 | "exclude_external_images": self.exclude_external_images,
|
1599 | 1620 | "exclude_social_media_domains": self.exclude_social_media_domains,
|
|
0 commit comments