-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from zytedata/add-code
Initial functionality.
- Loading branch information
Showing
16 changed files
with
694 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
[run] | ||
branch = true | ||
|
||
[report] | ||
# https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185 | ||
exclude_lines = | ||
pragma: no cover | ||
if TYPE_CHECKING: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
__version__ = "0.1.0" | ||
|
||
from .fingerprinter import Fingerprinter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from __future__ import annotations | ||
|
||
import logging | ||
import os | ||
from typing import TYPE_CHECKING, List, Union | ||
|
||
from scrapy import Request | ||
from scrapy.crawler import Crawler | ||
from scrapy.settings.default_settings import ( | ||
REQUEST_FINGERPRINTER_CLASS as ScrapyRequestFingerprinter, | ||
) | ||
from scrapy.utils.misc import create_instance, load_object | ||
from scrapy.utils.request import RequestFingerprinterProtocol | ||
|
||
from .url_canonicalizer import UrlCanonicalizer | ||
|
||
if TYPE_CHECKING: | ||
# typing.Self requires Python 3.11 | ||
from typing_extensions import Self | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Fingerprinter:
    """Request fingerprinter that canonicalizes URLs before fingerprinting.

    The fingerprint itself is always computed by a fallback fingerprinter;
    this class only decides which URL that fallback gets to see.
    """

    def __init__(self, crawler: Crawler):
        """Configure the fingerprinter from the settings of *crawler*.

        Settings read:
            DUD_LOAD_RULE_PATHS: list of rule paths handed to
                ``UrlCanonicalizer``. A warning is logged when it is empty.
            DUD_FALLBACK_REQUEST_FINGERPRINTER_CLASS: fingerprinter to
                delegate to. Defaults to the default value of Scrapy's
                ``REQUEST_FINGERPRINTER_CLASS`` setting.
        """
        self.crawler: Crawler = crawler
        settings = crawler.settings

        rule_paths: List[Union[str, os.PathLike]] = settings.getlist(
            "DUD_LOAD_RULE_PATHS"
        )
        if not rule_paths:
            logger.warning("DUD_LOAD_RULE_PATHS is not set or is empty.")

        # Resolve and instantiate the fingerprinter that does the real hashing.
        fallback_path = settings.get(
            "DUD_FALLBACK_REQUEST_FINGERPRINTER_CLASS",
            ScrapyRequestFingerprinter,
        )
        fallback_cls = load_object(fallback_path)
        self._fallback_request_fingerprinter: RequestFingerprinterProtocol = (
            create_instance(fallback_cls, settings=settings, crawler=crawler)
        )

        self.url_canonicalizer = UrlCanonicalizer(rule_paths)

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """Alternate constructor: build an instance for *crawler*."""
        return cls(crawler)

    def fingerprint(self, request: Request) -> bytes:
        """Return the fallback fingerprint of *request*.

        Unless ``request.meta["dud"]`` is present and falsy, the request URL
        is canonicalized first and the fallback fingerprinter receives a copy
        of the request carrying the canonical URL. Each call increments
        either the ``processed`` or the ``skipped`` stat.
        """
        stats = self.crawler.stats
        if request.meta.get("dud", True):
            canonical_url = self.url_canonicalizer.process_url(request.url)
            stats.inc_value("duplicate_url_discarder/request/processed")
            target = request.replace(url=canonical_url)
        else:
            # Canonicalization was explicitly disabled for this request.
            stats.inc_value("duplicate_url_discarder/request/skipped")
            target = request
        return self._fallback_request_fingerprinter.fingerprint(target)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from typing import Dict, Type | ||
|
||
from ..rule import UrlRule | ||
from .base import UrlProcessorBase | ||
from .query_removal import QueryRemovalProcessor | ||
|
||
_PROCESSOR_CLASSES: Dict[str, Type[UrlProcessorBase]] = { | ||
"queryRemoval": QueryRemovalProcessor, | ||
} | ||
|
||
|
||
def get_processor(rule: UrlRule) -> UrlProcessorBase:
    """Instantiate the URL processor that *rule* names.

    The class registered under ``rule.processor`` is constructed with
    ``rule.args``.

    Raises:
        ValueError: If no processor is registered under ``rule.processor``.
    """
    processor_cls = _PROCESSOR_CLASSES.get(rule.processor)
    if processor_cls is None:
        raise ValueError(f"No URL processor named {rule.processor}")
    return processor_cls(rule.args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from abc import ABC, abstractmethod | ||
from typing import Any, Tuple | ||
|
||
|
||
class UrlProcessorBase(ABC):
    """Abstract base class for URL processors.

    Subclasses must implement :meth:`process` and may override
    :meth:`validate_args` to reject unsuitable arguments.
    """

    def __init__(self, args: Tuple[Any, ...]):
        """Store *args* on the instance and validate them immediately."""
        self.args: Tuple[Any, ...] = args
        self.validate_args()

    def validate_args(self) -> None:  # noqa: B027
        """Raise an exception if ``self.args`` is not valid for this processor.

        This default implementation accepts any arguments.
        """
        return None

    @abstractmethod
    def process(self, input_url: str) -> str:
        """Return *input_url* rewritten according to this processor."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from w3lib.url import url_query_cleaner | ||
|
||
from .base import UrlProcessorBase | ||
|
||
|
||
class QueryRemovalProcessor(UrlProcessorBase):
    """Processor that strips the query parameters named in ``self.args``."""

    def validate_args(self) -> None:
        """Ensure that every argument is a string.

        Raises:
            TypeError: On the first argument that is not a ``str``.
        """
        for arg in self.args:
            if isinstance(arg, str):
                continue
            raise TypeError(
                f"queryRemoval args must be strings, not {type(arg)}: {arg}"
            )

    def process(self, input_url: str) -> str:
        """Return *input_url* without the configured query parameters.

        ``url_query_cleaner`` is called in removal mode with ``unique=False``
        and ``keep_fragments=True``.
        """
        cleaned_url = url_query_cleaner(
            input_url,
            self.args,
            remove=True,
            unique=False,
            keep_fragments=True,
        )
        return cleaned_url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from __future__ import annotations | ||
|
||
import json | ||
from dataclasses import dataclass | ||
from typing import TYPE_CHECKING, Any, Dict, List, Tuple | ||
|
||
from url_matcher import Patterns | ||
|
||
if TYPE_CHECKING: | ||
# typing.Self requires Python 3.11 | ||
from typing_extensions import Self | ||
|
||
|
||
@dataclass(frozen=True)
class UrlRule:
    """Immutable description of one URL processing rule."""

    # Value that rules are sorted by.
    order: int
    # URL patterns that select the URLs this rule applies to.
    url_pattern: Patterns
    # Name of the processor to run for matching URLs.
    processor: str
    # Arguments handed to the processor.
    args: Tuple[Any, ...]

    @classmethod
    def from_dict(cls, rule_dict: Dict[str, Any]) -> Self:
        """Build a rule from its dict form.

        ``order``, ``urlPattern`` and ``processor`` are required keys;
        a missing or empty ``args`` becomes an empty tuple.
        """
        order = rule_dict["order"]
        url_pattern = Patterns(**rule_dict["urlPattern"])
        processor = rule_dict["processor"]
        raw_args = rule_dict.get("args")
        args = tuple(raw_args) if raw_args else ()
        return cls(
            order=order,
            url_pattern=url_pattern,
            processor=processor,
            args=args,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Return the dict form of this rule.

        ``exclude`` (inside ``urlPattern``) and ``args`` are written only
        when they are non-empty.
        """
        patterns = self.url_pattern
        serialized_pattern: Dict[str, Any] = {"include": list(patterns.include)}
        if patterns.exclude:
            serialized_pattern["exclude"] = list(patterns.exclude)

        serialized: Dict[str, Any] = {
            "order": self.order,
            "urlPattern": serialized_pattern,
            "processor": self.processor,
        }
        if self.args:
            serialized["args"] = list(self.args)
        return serialized
|
||
|
||
def load_rules(data: str) -> List[UrlRule]:
    """Parse the JSON text *data* and return one ``UrlRule`` per array item."""
    rules: List[UrlRule] = []
    for entry in json.loads(data):
        rules.append(UrlRule.from_dict(entry))
    return rules
|
||
|
||
def save_rules(rules: List[UrlRule]) -> str:
    """Serialize *rules* to JSON text.

    The output is indented by two spaces, has its keys sorted, and keeps
    non-ASCII characters unescaped.
    """
    serializable = [rule.to_dict() for rule in rules]
    return json.dumps(serializable, indent=2, sort_keys=True, ensure_ascii=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import logging | ||
import operator | ||
import os | ||
from pathlib import Path | ||
from typing import Dict, Iterable, Set, Union | ||
|
||
from url_matcher import URLMatcher | ||
|
||
from .processors import UrlProcessorBase, get_processor | ||
from .rule import UrlRule, load_rules | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class UrlCanonicalizer:
    """Rewrite URLs by running the processors of the rules that match them.

    Rules are loaded from JSON files, de-duplicated, and registered in a
    ``URLMatcher`` under integer ids assigned in ascending ``order``.
    """

    def __init__(self, rule_paths: Iterable[Union[str, os.PathLike]]) -> None:
        """Load the rules stored at *rule_paths* and build the matcher.

        Args:
            rule_paths: Paths of JSON rule files, each in the format that
                ``load_rules`` accepts.

        Raises:
            ValueError: Propagated from ``get_processor`` when a rule names
                an unknown processor.
        """
        rules: Set[UrlRule] = set()
        full_rule_count = 0
        for rule_path in rule_paths:
            # Rule files are JSON: decode them as UTF-8 instead of relying on
            # the platform's locale encoding.
            data = Path(rule_path).read_text(encoding="utf-8")
            loaded_rules = load_rules(data)
            full_rule_count += len(loaded_rules)
            rules.update(loaded_rules)
        rule_count = len(rules)
        logger.info(
            "Loaded %d rules, skipped %d duplicates.",
            rule_count,
            full_rule_count - rule_count,
        )

        self.url_matcher = URLMatcher()
        self.processors: Dict[int, UrlProcessorBase] = {}
        # NOTE: rules that share the same ``order`` keep the set's arbitrary
        # iteration order, since ``sorted`` is stable.
        ordered_rules = sorted(rules, key=operator.attrgetter("order"))
        for rule_id, rule in enumerate(ordered_rules):
            self.processors[rule_id] = get_processor(rule)
            self.url_matcher.add_or_update(rule_id, rule.url_pattern)

    def process_url(self, url: str) -> str:
        """Return *url* after applying every applicable processor in turn.

        Non-universal rules that match *url* are applied in the order the
        matcher yields them. The matcher's universal rules are applied only
        as a fallback, when no non-universal rule matched.
        """
        use_universal = True
        for rule_id in self.url_matcher.match_all(url, include_universal=False):
            use_universal = False
            url = self.processors[rule_id].process(url)
        if use_universal:
            for rule_id in self.url_matcher.match_universal():
                url = self.processors[rule_id].process(url)
        return url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.