Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions crawl4ai/model_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ def get_home_folder():
def load_bert_base_uncased():
    """Load the ``bert-base-uncased`` tokenizer and model.

    The model is switched to evaluation mode with ``model.eval()`` and then
    passed through ``set_model_device``.

    Returns:
        tuple: ``(tokenizer, model)`` where ``model`` is the object returned
        by ``set_model_device``.
    """
    # Function-scope import, as in the original block.
    from transformers import BertTokenizer, BertModel

    # Load each artifact exactly once, using the plain ``from_pretrained``
    # call without the ``resume_download`` keyword.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")
    model.eval()
    model, device = set_model_device(model)
    return tokenizer, model
Expand All @@ -94,8 +94,8 @@ def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
"""
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None)
model = AutoModel.from_pretrained(model_name, resume_download=None)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()
model, device = set_model_device(model)
return tokenizer, model
Expand Down Expand Up @@ -134,10 +134,8 @@ def load_text_multilabel_classifier():
# # return load_spacy_model(), torch.device("cpu")

MODEL = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
model = AutoModelForSequenceClassification.from_pretrained(
MODEL, resume_download=None
)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.eval()
model, device = set_model_device(model)
class_mapping = model.config.id2label
Expand Down
92 changes: 46 additions & 46 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools>=64.0.0", "wheel"]
requires = ["setuptools>=81.0.0", "wheel>=0.45.1"]
build-backend = "setuptools.build_meta"

[project]
Expand All @@ -13,40 +13,40 @@ authors = [
{name = "Unclecode", email = "unclecode@kidocode.com"}
]
dependencies = [
"aiofiles>=24.1.0",
"aiohttp>=3.11.11",
"aiosqlite~=0.20",
"anyio>=4.0.0",
"lxml~=5.3",
"aiofiles>=25.1.0",
"aiohttp>=3.14.1",
"aiosqlite>=0.22.1",
"anyio>=4.14.1",
"lxml>=6.1.1",
"unclecode-litellm==1.81.13",
"numpy>=1.26.0,<3",
"pillow>=10.4",
"playwright>=1.49.0",
"patchright>=1.49.0",
"python-dotenv~=1.0",
"requests~=2.26",
"beautifulsoup4~=4.12",
"playwright-stealth>=2.0.0",
"xxhash~=3.4",
"rank-bm25~=0.2",
"snowballstemmer~=2.2",
"pydantic>=2.10",
"pyOpenSSL>=25.3.0",
"psutil>=6.1.1",
"PyYAML>=6.0",
"nltk>=3.9.1",
"rich>=13.9.4",
"cssselect>=1.2.0",
"httpx>=0.27.2",
"httpx[http2]>=0.27.2",
"fake-useragent>=2.0.3",
"click>=8.1.7",
"chardet>=5.2.0",
"brotli>=1.1.0",
"humanize>=4.10.0",
"lark>=1.2.2",
"numpy>=2.2.6,<3",
"pillow>=12.3.0",
"playwright>=1.61.0",
"patchright>=1.61.1",
"python-dotenv>=1.2.2",
"requests>=2.34.2",
"beautifulsoup4>=4.15.0",
"playwright-stealth>=2.0.3",
"xxhash>=3.8.0",
"rank-bm25>=0.2.2",
"snowballstemmer>=3.1.1",
"pydantic>=2.13.4",
"pyOpenSSL>=26.3.0",
"psutil>=7.2.2",
"PyYAML>=6.0.3",
"nltk>=3.9.4",
"rich>=15.0.0",
"cssselect>=1.4.0",
"httpx[http2]>=0.28.1",
"fake-useragent>=2.2.0",
"click>=8.4.2",
"chardet>=7.4.3",
"brotli>=1.2.0",
"humanize>=4.16.0",
"lark>=1.3.1",
"alphashape>=1.3.1",
"shapely>=2.0.0"
"shapely>=2.1.2",
"pdf2image>=1.17.0"
]
classifiers = [
"Development Status :: 4 - Beta",
Expand All @@ -59,20 +59,20 @@ classifiers = [
]

[project.optional-dependencies]
pdf = ["pypdf"]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers", "sentence-transformers"]
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
sync = ["selenium"]
pdf = ["pypdf>=6.14.2"]
torch = ["torch>=2.12.1", "nltk>=3.9.4", "scikit-learn>=1.7.2"]
transformer = ["transformers>=5.12.1", "tokenizers>=0.22.2", "sentence-transformers>=5.6.0"]
cosine = ["torch>=2.12.1", "transformers>=5.12.1", "nltk>=3.9.4", "sentence-transformers>=5.6.0"]
sync = ["selenium>=4.45.0"]
all = [
"pypdf",
"torch",
"nltk",
"scikit-learn",
"transformers",
"tokenizers",
"sentence-transformers",
"selenium"
"pypdf>=6.14.2",
"torch>=2.12.1",
"nltk>=3.9.4",
"scikit-learn>=1.7.2",
"transformers>=5.12.1",
"tokenizers>=0.22.2",
"sentence-transformers>=5.6.0",
"selenium>=4.45.0"
]

[project.scripts]
Expand Down
62 changes: 32 additions & 30 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,36 +1,38 @@
# Note: These requirements are also specified in pyproject.toml
# This file is kept for development environment setup and compatibility
aiofiles>=24.1.0
aiohttp>=3.11.11
aiosqlite~=0.20
anyio>=4.0.0
lxml~=5.3
aiofiles>=25.1.0
aiohttp>=3.14.1
aiosqlite>=0.22.1
anyio>=4.14.1
lxml>=6.1.1
unclecode-litellm==1.81.13
numpy>=1.26.0,<3
pillow>=10.4
playwright>=1.49.0
patchright>=1.49.0
python-dotenv~=1.0
requests~=2.26
beautifulsoup4~=4.12
playwright-stealth>=2.0.0
xxhash~=3.4
rank-bm25~=0.2
colorama~=0.4
snowballstemmer~=2.2
pydantic>=2.10
pyOpenSSL>=25.3.0
psutil>=6.1.1
PyYAML>=6.0
nltk>=3.9.1
rich>=13.9.4
cssselect>=1.2.0
chardet>=5.2.0
brotli>=1.1.0
httpx[http2]>=0.27.2
numpy>=2.2.6,<3
pillow>=12.3.0
playwright>=1.61.0
patchright>=1.61.1
python-dotenv>=1.2.2
requests>=2.34.2
beautifulsoup4>=4.15.0
playwright-stealth>=2.0.3
xxhash>=3.8.0
rank-bm25>=0.2.2
snowballstemmer>=3.1.1
pydantic>=2.13.4
pyOpenSSL>=26.3.0
psutil>=7.2.2
PyYAML>=6.0.3
nltk>=3.9.4
rich>=15.0.0
cssselect>=1.4.0
chardet>=7.4.3
brotli>=1.2.0
httpx[http2]>=0.28.1
alphashape>=1.3.1
shapely>=2.0.0

shapely>=2.1.2
fake-useragent>=2.2.0
click>=8.4.2
humanize>=4.16.0
lark>=1.3.1

pdf2image>=1.17.0
pypdf>=6.0.0
pypdf>=6.14.2
87 changes: 87 additions & 0 deletions tests/unit/test_model_loader_transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import sys
from types import SimpleNamespace
from unittest.mock import Mock

import pytest

from crawl4ai import model_loader


class _FakePretrained:
from_pretrained = Mock()


@pytest.fixture(autouse=True)
def reset_pretrained_mock():
    """Reset the shared ``from_pretrained`` mock before every test.

    A bare ``reset_mock()`` clears only the recorded calls. The configured
    ``side_effect`` and ``return_value`` are cleared explicitly as well, so a
    leftover (possibly exhausted) ``side_effect`` from one test cannot leak
    into the next.
    """
    _FakePretrained.from_pretrained.reset_mock(return_value=True, side_effect=True)


def _fake_model():
model = Mock()
model.config.id2label = {}
return model


def test_bert_loader_uses_current_from_pretrained_api(monkeypatch):
    """``load_bert_base_uncased`` calls ``from_pretrained`` with the model name only."""
    fake_tokenizer = object()
    fake_model = _fake_model()
    # First call yields the tokenizer, second call yields the model.
    _FakePretrained.from_pretrained.side_effect = [fake_tokenizer, fake_model]

    # Register a stub ``transformers`` module in ``sys.modules``.
    stub_transformers = SimpleNamespace(
        BertTokenizer=_FakePretrained,
        BertModel=_FakePretrained,
    )
    monkeypatch.setitem(sys.modules, "transformers", stub_transformers)
    # Replace device placement with a pass-through that reports "cpu".
    monkeypatch.setattr(model_loader, "set_model_device", lambda m: (m, "cpu"))

    loaded_tokenizer, loaded_model = model_loader.load_bert_base_uncased()

    assert (loaded_tokenizer, loaded_model) == (fake_tokenizer, fake_model)
    # Positional model name only, no keyword arguments.
    expected_call = (("bert-base-uncased",), {})
    assert _FakePretrained.from_pretrained.call_args_list == [
        expected_call,
        expected_call,
    ]


def test_embedding_loader_uses_current_from_pretrained_api(monkeypatch):
    """``load_HF_embedding_model`` calls ``from_pretrained`` with the model name only."""
    fake_tokenizer = object()
    fake_model = _fake_model()
    # First call yields the tokenizer, second call yields the model.
    _FakePretrained.from_pretrained.side_effect = [fake_tokenizer, fake_model]

    # Register a stub ``transformers`` module in ``sys.modules``.
    stub_transformers = SimpleNamespace(
        AutoTokenizer=_FakePretrained,
        AutoModel=_FakePretrained,
    )
    monkeypatch.setitem(sys.modules, "transformers", stub_transformers)
    # Replace device placement with a pass-through that reports "cpu".
    monkeypatch.setattr(model_loader, "set_model_device", lambda m: (m, "cpu"))

    loaded_tokenizer, loaded_model = model_loader.load_HF_embedding_model("example/model")

    assert (loaded_tokenizer, loaded_model) == (fake_tokenizer, fake_model)
    # Positional model name only, no keyword arguments.
    expected_call = (("example/model",), {})
    assert _FakePretrained.from_pretrained.call_args_list == [
        expected_call,
        expected_call,
    ]


def test_multilabel_loader_uses_current_from_pretrained_api(monkeypatch):
    """``load_text_multilabel_classifier`` calls ``from_pretrained`` with the model name only."""
    fake_tokenizer = object()
    fake_model = _fake_model()
    # First call yields the tokenizer, second call yields the model.
    _FakePretrained.from_pretrained.side_effect = [fake_tokenizer, fake_model]

    # Register a stub ``transformers`` module in ``sys.modules``.
    stub_transformers = SimpleNamespace(
        AutoTokenizer=_FakePretrained,
        AutoModelForSequenceClassification=_FakePretrained,
    )
    monkeypatch.setitem(sys.modules, "transformers", stub_transformers)
    # Empty placeholder for ``torch``; presumably the loader imports it
    # (that import is not visible from this file -- confirm in model_loader).
    monkeypatch.setitem(sys.modules, "torch", SimpleNamespace())
    # Replace device placement with a pass-through that reports "cpu".
    monkeypatch.setattr(model_loader, "set_model_device", lambda m: (m, "cpu"))

    classifier, device = model_loader.load_text_multilabel_classifier()

    assert callable(classifier)
    assert device == "cpu"
    # Positional model name only, no keyword arguments.
    expected_call = (("cardiffnlp/tweet-topic-21-multi",), {})
    assert _FakePretrained.from_pretrained.call_args_list == [
        expected_call,
        expected_call,
    ]
Loading