Initial functionality. #4

Merged: 34 commits, May 9, 2024

Changes from 14 commits

Commits (34)
40b6d51
Initial functionality.
wRAR Mar 25, 2024
1f6cc87
Rename PolicyRule to UrlRule.
wRAR Mar 26, 2024
de962c6
Move to/from_dict into the UrlRule class.
wRAR Mar 26, 2024
ffb68cf
Improve QueryRemovalPolicy.
wRAR Mar 26, 2024
ab98bf2
Tests for Processor.
wRAR Mar 26, 2024
e932c1f
Basic readme.
wRAR Mar 27, 2024
44e8e66
Rename a method.
wRAR Mar 27, 2024
de00aea
Add .coveragerc.
wRAR Mar 27, 2024
dac0b16
Update the URL matching logic.
wRAR Apr 2, 2024
20bc9d8
Set the min url-matcher version.
wRAR Apr 3, 2024
6fd9b91
Small improvements.
wRAR Apr 8, 2024
233a62f
Use match_universal().
wRAR Apr 11, 2024
5fcd156
Skip duplicate rules.
wRAR Apr 12, 2024
7f442a7
Update the component.
wRAR Apr 12, 2024
e335437
Update the url-matcher version.
wRAR Apr 15, 2024
044c5a7
Rename the Scrapy setting.
wRAR Apr 15, 2024
3254a7b
Rename the processor.
wRAR Apr 18, 2024
7daf848
Add stats to the middleware.
wRAR Apr 18, 2024
d843cfd
Rename policies to processors, other name and type fixes.
wRAR Apr 18, 2024
1f5a56b
Fix a typo.
wRAR Apr 19, 2024
34b9cc2
Change the mw logic.
wRAR Apr 19, 2024
d2d831f
Add a test for skipping universal rules.
wRAR Apr 19, 2024
2238ac5
Use the Scrapy fingerprinter.
wRAR Apr 19, 2024
d39eb3d
Honor request.dont_filter.
wRAR Apr 22, 2024
5567a10
Rephrase the README.
wRAR Apr 22, 2024
5d7840b
Flip the meta var default.
wRAR Apr 24, 2024
2d74b1a
Remove support for loading processors by import path.
wRAR Apr 24, 2024
a27d14d
Tests for the middleware.
wRAR Apr 25, 2024
3953115
Add the settings example.
wRAR Apr 25, 2024
9367bfe
README fixes.
wRAR Apr 26, 2024
570eafc
Replace the middleware with the fingerprinter.
wRAR Apr 26, 2024
ac5a9b0
Fixes.
wRAR May 6, 2024
14f63c8
Fixes.
wRAR May 8, 2024
ac6421a
Rename the fingerprinter.
wRAR May 9, 2024
8 changes: 8 additions & 0 deletions .coveragerc
@@ -0,0 +1,8 @@
[run]
branch = true

[report]
# https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185
exclude_lines =
pragma: no cover
if TYPE_CHECKING:
6 changes: 6 additions & 0 deletions .flake8
@@ -4,6 +4,10 @@ ignore =
E203,
# line too long
E501,
# multiple statements on one line
E704,
# line break before binary operator
W503,

# Missing docstring in public module
D100,
@@ -21,6 +25,8 @@ ignore =
D107,
# One-line docstring should fit on one line with quotes
D200,
# No blank lines allowed after function docstring
D202,
# 1 blank line required between summary line and description
D205,
# Multi-line docstring closing quotes should be on a separate line
34 changes: 34 additions & 0 deletions README.rst
@@ -37,3 +37,37 @@ Installation
pip install duplicate-url-discarder

Requires **Python 3.8+**.

Using
=====

Enable the Scrapy component:

.. code-block:: python

...

It will process request URLs, computing a canonical form for each of them and
discarding requests whose canonical form matches that of an earlier request.
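
For illustration only (the settings snippet above is left elided in this
revision of the PR), enabling the downloader middleware added here might look
roughly like the following; the priority value ``600`` and the rules file path
are assumptions, not something this diff prescribes:

.. code-block:: python

    # settings.py (illustrative sketch, not part of this PR)
    DOWNLOADER_MIDDLEWARES = {
        "duplicate_url_discarder.DuplicateUrlDiscarderDownloaderMiddleware": 600,
    }
    # Required, otherwise the middleware raises NotConfigured:
    DUD_LOAD_POLICY_PATH = ["/path/to/url-rules.json"]  # hypothetical path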

Policies
========

``duplicate-url-discarder`` uses *policies* to build canonical versions of
URLs. The policies are configured with *URL rules*. Each URL rule specifies
a URL pattern that a policy applies to and the policy arguments to use.

The following policies are currently available:

* ``queryRemoval``: removes the query string parameters (i.e. ``key=value``
  pairs) whose keys are listed in the rule arguments. If a given key appears
  multiple times in the URL, all of its occurrences are removed.
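
As a rough illustration of the rule format defined in ``_rule.py`` (the
domain, order value and query keys below are made up), a rules file for
``queryRemoval`` could look like:

.. code-block:: json

    [
        {
            "order": 100,
            "urlPattern": {"include": ["example.com"]},
            "policy": "queryRemoval",
            "args": ["utm_source", "utm_medium"]
        }
    ]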

Configuration
=============

``duplicate-url-discarder`` uses the following Scrapy settings:

``DUD_LOAD_POLICY_PATH``: a list of file paths (``str`` or ``pathlib.Path``)
pointing to files with the URL rules to apply. The default value of this
setting points to the default rules file.
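
For example, pointing the setting at two hypothetical rule files (the
``Processor`` skips duplicate rules, so listing overlapping files is
harmless):

.. code-block:: python

    from pathlib import Path

    DUD_LOAD_POLICY_PATH = [
        "/data/default-url-rules.json",
        Path("/data/project-url-rules.json"),
    ]
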
4 changes: 4 additions & 0 deletions duplicate_url_discarder/__init__.py
@@ -1 +1,5 @@
__version__ = "0.1.0"

from ._rule import UrlRule, load_rules, save_rules
from .middlewares import DuplicateUrlDiscarderDownloaderMiddleware
from .processor import Processor
62 changes: 62 additions & 0 deletions duplicate_url_discarder/_rule.py
@@ -0,0 +1,62 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Tuple

from url_matcher import Patterns

if TYPE_CHECKING:
# typing.Self requires Python 3.11
from typing_extensions import Self


@dataclass(frozen=True)
class UrlRule:
order: int
url_pattern: Patterns
policy: str
args: Tuple[Any, ...]

@classmethod
def from_dict(cls, policy_dict: Dict[str, Any]) -> Self:
"""Load a rule from a dict"""
return cls(
order=policy_dict["order"],
url_pattern=Patterns(**policy_dict["urlPattern"]),
policy=policy_dict["policy"],
args=tuple(policy_dict.get("args") or ()),
)

def to_dict(self) -> Dict[str, Any]:
"""Save a rule to a dict"""
pattern = {"include": list(self.url_pattern.include)}
if self.url_pattern.exclude:
pattern["exclude"] = list(self.url_pattern.exclude)
result = {
"order": self.order,
"urlPattern": pattern,
"policy": self.policy,
}
if self.args:
result["args"] = list(self.args)
return result


def load_rules(data: str) -> List[UrlRule]:
"""Load a list of rules from a JSON text."""
results: List[UrlRule] = []
j = json.loads(data)
for item in j:
results.append(UrlRule.from_dict(item))
return results


def save_rules(policies: List[UrlRule]) -> str:
"""Save a list of rules to a JSON text."""
return json.dumps(
[p.to_dict() for p in policies],
ensure_ascii=False,
sort_keys=True,
indent=2,
)
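
A quick sketch of the round trip these helpers provide; the pattern and policy
arguments are made up for illustration:

from url_matcher import Patterns

from duplicate_url_discarder import UrlRule, load_rules, save_rules

# Hypothetical rule: strip the "ref" query parameter on example.com URLs.
rule = UrlRule(
    order=100,
    url_pattern=Patterns(include=["example.com"]),
    policy="queryRemoval",
    args=("ref",),
)
serialized = save_rules([rule])  # JSON text, suitable for a rules file
loaded = load_rules(serialized)  # -> a list with an equivalent UrlRule
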
30 changes: 30 additions & 0 deletions duplicate_url_discarder/middlewares.py
@@ -0,0 +1,30 @@
import os
from typing import List, Set, Union

from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Response

from duplicate_url_discarder.processor import Processor


class DuplicateUrlDiscarderDownloaderMiddleware:
def __init__(self, crawler: Crawler):
self.crawler: Crawler = crawler
policy_path: List[Union[str, os.PathLike]] = self.crawler.settings.getlist(
"DUD_LOAD_POLICY_PATH"
)
if not policy_path:
raise NotConfigured("No DUD_LOAD_POLICY_PATH set")
self.processor = Processor(policy_path)
self.canonical_urls: Set[str] = set()

Review comment:

I wonder if we can estimate the memory usage of this approach somehow. If it's a lot, it may even make sense to have a separate code path when we know the fingerprints are not going to be updated (e.g. learning is not enabled, or learning is finished).

kmike (Apr 15, 2024):

My main worry is that with this implementation there is a non-zero chance zyte-spider-templates RAM usage may blow up above SC free unit limits (or 1 SC unit limit) in reasonably common cases.

wRAR (PR author):

This skips requests without the meta key, but where will we use that key? Probably for all normal requests?

As for memory usage I wanted to say it's comparable to the fingerprinter one but then I realized that URLs are often longer than fingerprints.

Reply:

> This skips requests without the meta key

By the way, I still think it shouldn't :) There was a thread in the original proposal about this. cc @BurnzZ

Reply (member):

One way we can save on RAM is to store this on disk, like using https://docs.python.org/3/library/shelve.html. Although this uses a dict-like interface, we can simply use the keys for uniqueness and leave the values empty.

Moreover, as a side note, there's a caveat to using shelve's .get() method: it runs in O(n) (I learned this the hard way). Using something like this is faster:

try:
    return data_on_disk[key]
except KeyError:
    return None

> This skips requests without the meta key

> By the way, I still think it shouldn't :) There was a thread in the original proposal about this

Having an opt-in approach is indeed tedious for the user to set up, but it allows narrowing down which types of URLs will be stored here, and thus reduces the storage needed.

What do you think about having an approach similar to scrapy-zyte-api's TRANSPARENT_MODE, while still allowing "zyte_api" to be set in the meta? Users can use a setting that turns everything on, but for zyte-spider-templates we can manually set DUD via the meta for optimal usage.

Reply:

I'd prefer a solution where the overhead is minimal and it's opt-out :) It seems this is achievable. URL matching looks optimized enough, and I'm optimistic RAM can also be optimized.

Disk storage will trade speed for RAM; it may not be a good trade here, but I'm not sure.

For URL storage there are also data structures like tries, which can save a lot of memory.

But at first sight, it looks like if we can assume that the fingerprints don't change, and have a separate code path for the case where they can change, it all can be pretty optimal. In this case we may even think about whether the storage can be shared with the dupefilter, but that's a separate idea.

When the learning is enabled, it looks like optimization can be significantly harder. So, maybe we can have learning opt-in (maybe per-request), not the whole thing opt-in?

By the way, I don't think we must have a final optimized implementation in this PR to have it merged.

Addressing it (i.e. evaluating how big the issue is, and making the necessary optimizations) is a blocker for making use of DUD in zyte-spider-templates by default.

kmike (Apr 16, 2024):

Also, it seems we need to understand whether an optimized version is possible, to make a decision between having the component enabled by default for all requests (plus a way to opt out per request?) and having it opt-in per request.


def process_request(self, request: Request) -> Union[Request, Response, None]:
if not request.meta.get("dud", False):
return None
canonical_url = self.processor.process_url(request.url)
if canonical_url in self.canonical_urls:
raise IgnoreRequest(f"Duplicate URL discarded: {canonical_url}")
self.canonical_urls.add(canonical_url)
return None
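
As the review thread above discusses, in this revision the middleware only
acts on requests that opt in through the "dud" meta key; a minimal sketch (the
URL is made up):

from scrapy import Request

# Only requests carrying dud=True in meta are canonicalized and checked for
# duplicates by DuplicateUrlDiscarderDownloaderMiddleware; all others pass
# through untouched.
request = Request("https://example.com/?page=1", meta={"dud": True})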
23 changes: 23 additions & 0 deletions duplicate_url_discarder/policies/__init__.py
@@ -0,0 +1,23 @@
from typing import Dict, Type

from scrapy.utils.misc import load_object

from duplicate_url_discarder._rule import UrlRule

from .base import PolicyBase
from .query_removal import QueryRemovalPolicy

_POLICY_CLASSES: Dict[str, Type[PolicyBase]] = {
"queryRemoval": QueryRemovalPolicy,
}


def get_policy(rule: UrlRule) -> PolicyBase:
policy_cls: Type[PolicyBase]
if "." not in rule.policy:
if rule.policy not in _POLICY_CLASSES:
raise ValueError(f"No policy named {rule.policy}")
policy_cls = _POLICY_CLASSES[rule.policy]
else:
policy_cls = load_object(rule.policy)
return policy_cls(rule.args)
16 changes: 16 additions & 0 deletions duplicate_url_discarder/policies/base.py
@@ -0,0 +1,16 @@
from abc import ABC, abstractmethod
from typing import Any, Tuple


class PolicyBase(ABC):
def __init__(self, args: Tuple[Any, ...]):
self.args: Tuple[Any, ...] = args
self.validate_args()

def validate_args(self) -> None: # noqa: B027
"""Check that the policy arguments are valid, raise an exception if not."""
pass

@abstractmethod
def process(self, input_url: str) -> str:
"""Return the input URL, modified according to the rules."""
18 changes: 18 additions & 0 deletions duplicate_url_discarder/policies/query_removal.py
@@ -0,0 +1,18 @@
from w3lib.url import url_query_cleaner

from .base import PolicyBase


class QueryRemovalPolicy(PolicyBase):
def validate_args(self) -> None:
for arg in self.args:
if not isinstance(arg, str):
raise TypeError(
f"queryRemoval args must be strings, not {type(arg)}: {arg}"
)

def process(self, input_url: str) -> str:
args_to_remove = self.args
return url_query_cleaner(
input_url, args_to_remove, remove=True, unique=False, keep_fragments=True
)
48 changes: 48 additions & 0 deletions duplicate_url_discarder/processor.py
@@ -0,0 +1,48 @@
import logging
import operator
import os
from pathlib import Path
from typing import Iterable, List, Set, Union

from url_matcher import URLMatcher

from duplicate_url_discarder._rule import UrlRule, load_rules
from duplicate_url_discarder.policies import PolicyBase, get_policy

logger = logging.getLogger(__name__)


class Processor:
def __init__(self, policy_paths: Iterable[Union[str, os.PathLike]]) -> None:
rules: Set[UrlRule] = set()
full_rule_count = 0
for policy_path in policy_paths:
data = Path(policy_path).read_text()
loaded_rules = load_rules(data)
full_rule_count += len(loaded_rules)
rules.update(loaded_rules)
rule_count = len(rules)
logger.info(
f"Loaded {rule_count} rules, skipped {full_rule_count - rule_count} duplicates."
)

self.url_matcher = URLMatcher()
self.policies: List[PolicyBase] = []
policy_id = 0
for rule in sorted(rules, key=operator.attrgetter("order")):
policy = get_policy(rule)
self.policies.append(policy)
self.url_matcher.add_or_update(policy_id, rule.url_pattern)
policy_id += 1

def process_url(self, url: str) -> str:
use_universal = True
for policy_id in self.url_matcher.match_all(url, include_universal=False):
use_universal = False
policy = self.policies[policy_id]
url = policy.process(url)
if use_universal:
for policy_id in self.url_matcher.match_universal():
policy = self.policies[policy_id]
url = policy.process(url)
return url
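
A small usage sketch of the processor on its own; the rules file path and URL
below are made up:

from duplicate_url_discarder import Processor

# Assumes /tmp/url-rules.json holds rules such as the queryRemoval example
# shown in the README section above.
processor = Processor(["/tmp/url-rules.json"])
canonical = processor.process_url("https://example.com/?id=1&utm_source=news")
# Rules whose pattern matches the URL are applied in "order"; when none match,
# only the universal rules (those whose pattern does not limit the domain)
# are applied.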
10 changes: 10 additions & 0 deletions pyproject.toml
@@ -24,6 +24,9 @@ classifiers = [
]
requires-python = ">=3.8"
dependencies = [
"Scrapy >= 2.0.1",
"url-matcher @ git+https://github.com/zytedata/url-matcher.git@skip-domainless",
"w3lib >= 1.22.0",
]
dynamic = ["version"]

@@ -40,6 +43,13 @@ duplicate_url_discarder = ["py.typed"]
profile = "black"
multi_line_output = 3

[[tool.mypy.overrides]]
module = [
"scrapy.*",
"url_matcher.*",
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"tests.*",
64 changes: 64 additions & 0 deletions tests/test_policies.py
@@ -0,0 +1,64 @@
import pytest
from url_matcher import Patterns

from duplicate_url_discarder import UrlRule
from duplicate_url_discarder.policies import PolicyBase, QueryRemovalPolicy, get_policy


class HardcodedPolicy(PolicyBase):
def process(self, url: str) -> str:
return "http://hardcoded.example"


def test_get_policy():
pattern = Patterns([])
args = ["foo", "bar"]

rule = UrlRule(0, pattern, "queryRemoval", args)
policy = get_policy(rule)
assert type(policy) is QueryRemovalPolicy
assert policy.args == args

rule = UrlRule(0, pattern, "tests.test_policies.HardcodedPolicy", args)
policy = get_policy(rule)
assert type(policy) is HardcodedPolicy
assert policy.args == args

rule = UrlRule(0, pattern, "unknown", args)
with pytest.raises(ValueError, match="No policy named unknown"):
get_policy(rule)


@pytest.mark.parametrize(
["args", "url", "expected"],
[
([], "http://foo.example?foo=1&bar", "http://foo.example?foo=1&bar"),
(["a"], "http://foo.example?foo=1&bar", "http://foo.example?foo=1&bar"),
(["foo"], "http://foo.example?foo=1&bar", "http://foo.example?bar"),
(["bar"], "http://foo.example?foo=1&bar", "http://foo.example?foo=1"),
(
["bar"],
"http://foo.example?foo=1&foo=2&bar&bar=1",
"http://foo.example?foo=1&foo=2",
),
(
["bar"],
"http://foo.example?foo=1&bar#bar=frag",
"http://foo.example?foo=1#bar=frag",
),
(["foo", "baz"], "http://foo.example?foo=1&bar", "http://foo.example?bar"),
(["foo", "bar"], "http://foo.example?foo=1&bar", "http://foo.example"),
],
)
def test_query_removal(args, url, expected):
policy = QueryRemovalPolicy(args)
assert policy.process(url) == expected


def test_query_removal_validate_args():
with pytest.raises(TypeError, match="strings, not <class 'bytes'>: b''"):
QueryRemovalPolicy([b""])
with pytest.raises(TypeError, match="strings, not <class 'NoneType'>: None"):
QueryRemovalPolicy(["a", None, ""])
QueryRemovalPolicy([""])
QueryRemovalPolicy([])