Merge pull request #4 from zytedata/add-code
Initial functionality.
wRAR committed May 9, 2024
2 parents 541dd97 + ac6421a commit 810eb02
Showing 16 changed files with 694 additions and 4 deletions.
8 changes: 8 additions & 0 deletions .coveragerc
@@ -0,0 +1,8 @@
[run]
branch = true

[report]
# https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185
exclude_lines =
    pragma: no cover
    if TYPE_CHECKING:
6 changes: 6 additions & 0 deletions .flake8
@@ -4,6 +4,10 @@ ignore =
    E203,
    # line too long
    E501,
    # multiple statements on one line
    E704,
    # line break before binary operator
    W503,

    # Missing docstring in public module
    D100,
@@ -21,6 +25,8 @@ ignore =
    D107,
    # One-line docstring should fit on one line with quotes
    D200,
    # No blank lines allowed after function docstring
    D202,
    # 1 blank line required between summary line and description
    D205,
    # Multi-line docstring closing quotes should be on a separate line
89 changes: 86 additions & 3 deletions README.rst
@@ -22,9 +22,8 @@ duplicate-url-discarder
:target: https://duplicate-url-discarder.readthedocs.io/en/stable/?badge=stable
:alt: Documentation Status

``duplicate-url-discarder`` contains Scrapy components that allow discarding
requests with duplicate URLs, using customizable policies to configure which
URLs are considered duplicate.
``duplicate-url-discarder`` contains a Scrapy fingerprinter that uses
customizable URL processors to canonicalize URLs before fingerprinting.

Quick Start
***********
@@ -37,3 +36,87 @@ Installation
    pip install duplicate-url-discarder

Requires **Python 3.8+**.

Using
=====

Enable the Scrapy fingerprinter:

.. code-block:: python

    REQUEST_FINGERPRINTER_CLASS = "duplicate_url_discarder.Fingerprinter"

It will compute fingerprints using canonical forms of the request URLs. Requests
with the ``"dud"`` meta value set to ``False`` are processed with a fallback
fingerprinter, which is the default Scrapy one unless a different one is
configured in the ``DUD_FALLBACK_REQUEST_FINGERPRINTER_CLASS`` setting.
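
For example, a spider can opt a specific request out of canonical-URL
fingerprinting via the ``"dud"`` meta key (a minimal sketch; the spider name
and URLs are illustrative):

.. code-block:: python

    import scrapy


    class ExampleSpider(scrapy.Spider):
        name = "example"

        def start_requests(self):
            # Fingerprinted using the canonical form of the URL (the default).
            yield scrapy.Request("https://toscrape.com/?page=1&PHPSESSIONID=123")
            # Handled by the fallback fingerprinter instead.
            yield scrapy.Request(
                "https://toscrape.com/?page=1&PHPSESSIONID=123",
                meta={"dud": False},
            )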

URL Processors
==============

``duplicate-url-discarder`` utilizes *URL processors* to make canonical
versions of URLs. The processors are configured with *URL rules*. Each URL rule
specifies a URL pattern that the processor applies to, plus the specific
processor arguments to use.

The following URL processors are currently available:

* ``queryRemoval``: removes query string parameters *(i.e. key=value pairs)*
  whose keys are specified in the processor arguments. If a given key appears
  multiple times with different values in the URL, all of its occurrences are
  removed.
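
For instance, a ``queryRemoval`` processor configured with the keys
``("utm_source", "ref")`` would strip those parameters and leave the rest of
the query string intact (a sketch; the keys and URLs are illustrative):

.. code-block:: python

    from duplicate_url_discarder.processors import QueryRemovalProcessor

    processor = QueryRemovalProcessor(("utm_source", "ref"))

    processor.process("https://toscrape.com/p?id=1&utm_source=a&ref=b")
    # -> "https://toscrape.com/p?id=1"

    processor.process("https://toscrape.com/p?utm_source=a&id=1&utm_source=b")
    # -> "https://toscrape.com/p?id=1"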

URL Rules
=========

A URL rule is a dictionary specifying the ``url-matcher`` URL pattern(s), the
URL processor name, the URL processor args, and the order that is used to sort
the rules. Rules are loaded from JSON files that contain arrays of serialized
rules:

.. code-block:: json

    [
        {
            "args": [
                "foo",
                "bar"
            ],
            "order": 100,
            "processor": "queryRemoval",
            "urlPattern": {
                "include": [
                    "foo.example"
                ]
            }
        },
        {
            "args": [
                "PHPSESSIONID"
            ],
            "order": 100,
            "processor": "queryRemoval",
            "urlPattern": {
                "include": []
            }
        }
    ]

All non-universal rules (ones that have a non-empty ``include`` pattern) that
match a request URL are applied, in the order given by their ``order`` field.
If no non-universal rule matches the URL, the universal ones are applied
instead.
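
With the example rules above, canonicalization would behave roughly as follows
(a sketch; the rule file path and URLs are illustrative, and
``UrlCanonicalizer`` is the internal helper that the fingerprinter uses):

.. code-block:: python

    from duplicate_url_discarder.url_canonicalizer import UrlCanonicalizer

    canonicalizer = UrlCanonicalizer(["/home/user/project/custom_rules1.json"])

    # foo.example matches a non-universal rule, so only "foo" and "bar" are removed.
    canonicalizer.process_url("https://foo.example/?foo=1&PHPSESSIONID=2")
    # -> "https://foo.example/?PHPSESSIONID=2"

    # No non-universal rule matches, so the universal rule removes PHPSESSIONID.
    canonicalizer.process_url("https://toscrape.com/?foo=1&PHPSESSIONID=2")
    # -> "https://toscrape.com/?foo=1"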

Configuration
=============

``duplicate-url-discarder`` uses the following Scrapy settings:

``DUD_LOAD_RULE_PATHS``: it should be a list of file paths (``str`` or
``pathlib.Path``) pointing to JSON files with the URL rules to apply:

.. code-block:: python

    DUD_LOAD_RULE_PATHS = [
        "/home/user/project/custom_rules1.json",
    ]

The default value of this setting is empty.
2 changes: 2 additions & 0 deletions duplicate_url_discarder/__init__.py
@@ -1 +1,3 @@
__version__ = "0.1.0"

from .fingerprinter import Fingerprinter
58 changes: 58 additions & 0 deletions duplicate_url_discarder/fingerprinter.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING, List, Union

from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.settings.default_settings import (
    REQUEST_FINGERPRINTER_CLASS as ScrapyRequestFingerprinter,
)
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.request import RequestFingerprinterProtocol

from .url_canonicalizer import UrlCanonicalizer

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self

logger = logging.getLogger(__name__)


class Fingerprinter:
    def __init__(self, crawler: Crawler):
        self.crawler: Crawler = crawler
        rule_paths: List[Union[str, os.PathLike]] = self.crawler.settings.getlist(
            "DUD_LOAD_RULE_PATHS"
        )
        if not rule_paths:
            logger.warning("DUD_LOAD_RULE_PATHS is not set or is empty.")
        self._fallback_request_fingerprinter: RequestFingerprinterProtocol = (
            create_instance(
                load_object(
                    crawler.settings.get(
                        "DUD_FALLBACK_REQUEST_FINGERPRINTER_CLASS",
                        ScrapyRequestFingerprinter,
                    )
                ),
                settings=crawler.settings,
                crawler=crawler,
            )
        )
        self.url_canonicalizer = UrlCanonicalizer(rule_paths)

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        return cls(crawler)

    def fingerprint(self, request: Request) -> bytes:
        if not request.meta.get("dud", True):
            self.crawler.stats.inc_value("duplicate_url_discarder/request/skipped")
            return self._fallback_request_fingerprinter.fingerprint(request)
        canonical_url = self.url_canonicalizer.process_url(request.url)
        self.crawler.stats.inc_value("duplicate_url_discarder/request/processed")
        return self._fallback_request_fingerprinter.fingerprint(
            request.replace(url=canonical_url)
        )
17 changes: 17 additions & 0 deletions duplicate_url_discarder/processors/__init__.py
@@ -0,0 +1,17 @@
from typing import Dict, Type

from ..rule import UrlRule
from .base import UrlProcessorBase
from .query_removal import QueryRemovalProcessor

_PROCESSOR_CLASSES: Dict[str, Type[UrlProcessorBase]] = {
    "queryRemoval": QueryRemovalProcessor,
}


def get_processor(rule: UrlRule) -> UrlProcessorBase:
    processor_cls: Type[UrlProcessorBase]
    if rule.processor not in _PROCESSOR_CLASSES:
        raise ValueError(f"No URL processor named {rule.processor}")
    processor_cls = _PROCESSOR_CLASSES[rule.processor]
    return processor_cls(rule.args)
16 changes: 16 additions & 0 deletions duplicate_url_discarder/processors/base.py
@@ -0,0 +1,16 @@
from abc import ABC, abstractmethod
from typing import Any, Tuple


class UrlProcessorBase(ABC):
    def __init__(self, args: Tuple[Any, ...]):
        self.args: Tuple[Any, ...] = args
        self.validate_args()

    def validate_args(self) -> None:  # noqa: B027
        """Check that the processor arguments are valid, raise an exception if not."""
        pass

    @abstractmethod
    def process(self, input_url: str) -> str:
        """Return the input URL, modified according to the rules."""
17 changes: 17 additions & 0 deletions duplicate_url_discarder/processors/query_removal.py
@@ -0,0 +1,17 @@
from w3lib.url import url_query_cleaner

from .base import UrlProcessorBase


class QueryRemovalProcessor(UrlProcessorBase):
    def validate_args(self) -> None:
        for arg in self.args:
            if not isinstance(arg, str):
                raise TypeError(
                    f"queryRemoval args must be strings, not {type(arg)}: {arg}"
                )

    def process(self, input_url: str) -> str:
        return url_query_cleaner(
            input_url, self.args, remove=True, unique=False, keep_fragments=True
        )
58 changes: 58 additions & 0 deletions duplicate_url_discarder/rule.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Tuple

from url_matcher import Patterns

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self


@dataclass(frozen=True)
class UrlRule:
    order: int
    url_pattern: Patterns
    processor: str
    args: Tuple[Any, ...]

    @classmethod
    def from_dict(cls, rule_dict: Dict[str, Any]) -> Self:
        """Load a rule from a dict"""
        return cls(
            order=rule_dict["order"],
            url_pattern=Patterns(**rule_dict["urlPattern"]),
            processor=rule_dict["processor"],
            args=tuple(rule_dict.get("args") or ()),
        )

    def to_dict(self) -> Dict[str, Any]:
        """Save a rule to a dict"""
        pattern = {"include": list(self.url_pattern.include)}
        if self.url_pattern.exclude:
            pattern["exclude"] = list(self.url_pattern.exclude)
        result = {
            "order": self.order,
            "urlPattern": pattern,
            "processor": self.processor,
        }
        if self.args:
            result["args"] = list(self.args)
        return result


def load_rules(data: str) -> List[UrlRule]:
    """Load a list of rules from a JSON text."""
    return [UrlRule.from_dict(item) for item in json.loads(data)]


def save_rules(rules: List[UrlRule]) -> str:
    """Save a list of rules to a JSON text."""
    return json.dumps(
        [r.to_dict() for r in rules],
        ensure_ascii=False,
        sort_keys=True,
        indent=2,
    )
48 changes: 48 additions & 0 deletions duplicate_url_discarder/url_canonicalizer.py
@@ -0,0 +1,48 @@
import logging
import operator
import os
from pathlib import Path
from typing import Dict, Iterable, Set, Union

from url_matcher import URLMatcher

from .processors import UrlProcessorBase, get_processor
from .rule import UrlRule, load_rules

logger = logging.getLogger(__name__)


class UrlCanonicalizer:
    def __init__(self, rule_paths: Iterable[Union[str, os.PathLike]]) -> None:
        rules: Set[UrlRule] = set()
        full_rule_count = 0
        for rule_path in rule_paths:
            data = Path(rule_path).read_text()
            loaded_rules = load_rules(data)
            full_rule_count += len(loaded_rules)
            rules.update(loaded_rules)
        rule_count = len(rules)
        logger.info(
            f"Loaded {rule_count} rules, skipped {full_rule_count - rule_count} duplicates."
        )

        self.url_matcher = URLMatcher()
        self.processors: Dict[int, UrlProcessorBase] = {}
        rule_id = 0
        for rule in sorted(rules, key=operator.attrgetter("order")):
            processor = get_processor(rule)
            self.processors[rule_id] = processor
            self.url_matcher.add_or_update(rule_id, rule.url_pattern)
            rule_id += 1

    def process_url(self, url: str) -> str:
        use_universal = True
        for rule_id in self.url_matcher.match_all(url, include_universal=False):
            use_universal = False
            processor = self.processors[rule_id]
            url = processor.process(url)
        if use_universal:
            for rule_id in self.url_matcher.match_universal():
                processor = self.processors[rule_id]
                url = processor.process(url)
        return url
12 changes: 11 additions & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "duplicate-url-discarder"
description = "Discarding duplicate URLs based on policies."
description = "Discarding duplicate URLs based on rules."
authors = [{name = "Zyte Group Ltd", email = "info@zyte.com"}]
readme = "README.rst"
license = {file = "LICENSE"}
@@ -24,6 +24,9 @@ classifiers = [
]
requires-python = ">=3.8"
dependencies = [
"Scrapy >= 2.7.0",
"url-matcher >= 0.5.0",
"w3lib >= 1.22.0",
]
dynamic = ["version"]

@@ -40,6 +43,13 @@ duplicate_url_discarder = ["py.typed"]
profile = "black"
multi_line_output = 3

[[tool.mypy.overrides]]
module = [
"scrapy.*",
"url_matcher.*",
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"tests.*",