Skip to content

Commit e651e04

Browse files
committed
Release v0.7.4: Merge release branch
- Merge release/v0.7.4 into main - Version: 0.7.4 - Ready for tag and publication
2 parents f0ce7b2 + 5398acc commit e651e04

32 files changed

+5553
-728
lines changed

README.md

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,11 @@
2727

2828
Crawl4AI turns the web into clean, LLM-ready Markdown for RAG, agents, and data pipelines. Fast, controllable, and battle-tested by a 50k+ star community.
2929

30-
[✨ Check out latest update v0.7.3](#-recent-updates)
30+
[✨ Check out latest update v0.7.4](#-recent-updates)
3131

32-
✨ New in v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
32+
✨ New in v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
33+
34+
✨ Recent v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
3335

3436
<details>
3537
<summary>🤓 <strong>My Personal Story</strong></summary>
@@ -542,6 +544,40 @@ async def test_news_crawl():
542544

543545
## ✨ Recent Updates
544546

547+
<details>
548+
<summary><strong>Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update</strong></summary>
549+
550+
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables:
551+
```python
552+
from crawl4ai import LLMTableExtraction, LLMConfig
553+
554+
# Configure intelligent table extraction
555+
table_strategy = LLMTableExtraction(
556+
llm_config=LLMConfig(provider="openai/gpt-4.1-mini"),
557+
enable_chunking=True, # Handle massive tables
558+
chunk_token_threshold=5000, # Smart chunking threshold
559+
overlap_threshold=100, # Maintain context between chunks
560+
extraction_type="structured" # Get structured data output
561+
)
562+
563+
config = CrawlerRunConfig(table_extraction_strategy=table_strategy)
564+
result = await crawler.arun("https://complex-tables-site.com", config=config)
565+
566+
# Tables are automatically chunked, processed, and merged
567+
for table in result.tables:
568+
print(f"Extracted table: {len(table['data'])} rows")
569+
```
570+
571+
- **⚡ Dispatcher Bug Fix**: Fixed sequential processing bottleneck in arun_many for fast-completing tasks
572+
- **🧹 Memory Management Refactor**: Consolidated memory utilities into main utils module for cleaner architecture
573+
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation with thread-safe locking
574+
- **🔗 Advanced URL Processing**: Better handling of raw:// URLs and base tag link resolution
575+
- **🛡️ Enhanced Proxy Support**: Flexible proxy configuration supporting both dict and string formats
576+
577+
[Full v0.7.4 Release Notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
578+
579+
</details>
580+
545581
<details>
546582
<summary><strong>Version 0.7.3 Release Highlights - The Multi-Config Intelligence Update</strong></summary>
547583

crawl4ai/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@
2929
)
3030
from .chunking_strategy import ChunkingStrategy, RegexChunking
3131
from .markdown_generation_strategy import DefaultMarkdownGenerator
32+
from .table_extraction import (
33+
TableExtractionStrategy,
34+
DefaultTableExtraction,
35+
NoTableExtraction,
36+
LLMTableExtraction,
37+
)
3238
from .content_filter_strategy import (
3339
PruningContentFilter,
3440
BM25ContentFilter,
@@ -156,6 +162,9 @@
156162
"ChunkingStrategy",
157163
"RegexChunking",
158164
"DefaultMarkdownGenerator",
165+
"TableExtractionStrategy",
166+
"DefaultTableExtraction",
167+
"NoTableExtraction",
159168
"RelevantContentFilter",
160169
"PruningContentFilter",
161170
"BM25ContentFilter",

crawl4ai/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# crawl4ai/__version__.py
22

33
# This is the version that will be used for stable releases
4-
__version__ = "0.7.3"
4+
__version__ = "0.7.4"
55

66
# For nightly builds, this gets set during build process
77
__nightly_version__ = None

crawl4ai/async_configs.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
2121
from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
2222
from .deep_crawling import DeepCrawlStrategy
23+
from .table_extraction import TableExtractionStrategy, DefaultTableExtraction
2324

2425
from .cache_context import CacheMode
2526
from .proxy_strategy import ProxyRotationStrategy
@@ -448,6 +449,10 @@ def __init__(
448449
self.chrome_channel = ""
449450
self.proxy = proxy
450451
self.proxy_config = proxy_config
452+
if isinstance(self.proxy_config, dict):
453+
self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
454+
if isinstance(self.proxy_config, str):
455+
self.proxy_config = ProxyConfig.from_string(self.proxy_config)
451456

452457

453458
self.viewport_width = viewport_width
@@ -978,6 +983,8 @@ class CrawlerRunConfig():
978983
Default: False.
979984
table_score_threshold (int): Minimum score threshold for processing a table.
980985
Default: 7.
986+
table_extraction (TableExtractionStrategy): Strategy to use for table extraction.
987+
Default: DefaultTableExtraction with table_score_threshold.
981988
982989
# Virtual Scroll Parameters
983990
virtual_scroll_config (VirtualScrollConfig or dict or None): Configuration for handling virtual scroll containers.
@@ -1104,6 +1111,7 @@ def __init__(
11041111
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
11051112
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
11061113
table_score_threshold: int = 7,
1114+
table_extraction: TableExtractionStrategy = None,
11071115
exclude_external_images: bool = False,
11081116
exclude_all_images: bool = False,
11091117
# Link and Domain Handling Parameters
@@ -1159,6 +1167,11 @@ def __init__(
11591167
self.parser_type = parser_type
11601168
self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
11611169
self.proxy_config = proxy_config
1170+
if isinstance(proxy_config, dict):
1171+
self.proxy_config = ProxyConfig.from_dict(proxy_config)
1172+
if isinstance(proxy_config, str):
1173+
self.proxy_config = ProxyConfig.from_string(proxy_config)
1174+
11621175
self.proxy_rotation_strategy = proxy_rotation_strategy
11631176

11641177
# Browser Location and Identity Parameters
@@ -1215,6 +1228,12 @@ def __init__(
12151228
self.exclude_external_images = exclude_external_images
12161229
self.exclude_all_images = exclude_all_images
12171230
self.table_score_threshold = table_score_threshold
1231+
1232+
# Table extraction strategy (default to DefaultTableExtraction if not specified)
1233+
if table_extraction is None:
1234+
self.table_extraction = DefaultTableExtraction(table_score_threshold=table_score_threshold)
1235+
else:
1236+
self.table_extraction = table_extraction
12181237

12191238
# Link and Domain Handling Parameters
12201239
self.exclude_social_media_domains = (
@@ -1486,6 +1505,7 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
14861505
"image_score_threshold", IMAGE_SCORE_THRESHOLD
14871506
),
14881507
table_score_threshold=kwargs.get("table_score_threshold", 7),
1508+
table_extraction=kwargs.get("table_extraction", None),
14891509
exclude_all_images=kwargs.get("exclude_all_images", False),
14901510
exclude_external_images=kwargs.get("exclude_external_images", False),
14911511
# Link and Domain Handling Parameters
@@ -1594,6 +1614,7 @@ def to_dict(self):
15941614
"image_description_min_word_threshold": self.image_description_min_word_threshold,
15951615
"image_score_threshold": self.image_score_threshold,
15961616
"table_score_threshold": self.table_score_threshold,
1617+
"table_extraction": self.table_extraction,
15971618
"exclude_all_images": self.exclude_all_images,
15981619
"exclude_external_images": self.exclude_external_images,
15991620
"exclude_social_media_domains": self.exclude_social_media_domains,

0 commit comments

Comments (0)