Merge pull request #4 from zytedata/add-code
Initial functionality.
wRAR committed May 9, 2024
2 parents 541dd97 + ac6421a commit 810eb02
Showing 16 changed files with 694 additions and 4 deletions.
8 changes: 8 additions & 0 deletions .coveragerc
@@ -0,0 +1,8 @@
[run]
branch = true

[report]
# https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185
exclude_lines =
    pragma: no cover
    if TYPE_CHECKING:
6 changes: 6 additions & 0 deletions .flake8
@@ -4,6 +4,10 @@ ignore =
    E203,
    # line too long
    E501,
    # multiple statements on one line
    E704,
    # line break before binary operator
    W503,

    # Missing docstring in public module
    D100,
@@ -21,6 +25,8 @@ ignore =
    D107,
    # One-line docstring should fit on one line with quotes
    D200,
    # No blank lines allowed after function docstring
    D202,
    # 1 blank line required between summary line and description
    D205,
    # Multi-line docstring closing quotes should be on a separate line
89 changes: 86 additions & 3 deletions README.rst
@@ -22,9 +22,8 @@ duplicate-url-discarder
:target: https://duplicate-url-discarder.readthedocs.io/en/stable/?badge=stable
:alt: Documentation Status

``duplicate-url-discarder`` contains Scrapy components that allow discarding
requests with duplicate URLs, using customizable policies to configure which
URLs are considered duplicate.
``duplicate-url-discarder`` contains a Scrapy fingerprinter that uses
customizable URL processors to canonicalize URLs before fingerprinting.

Quick Start
***********
@@ -37,3 +36,87 @@ Installation
    pip install duplicate-url-discarder

Requires **Python 3.8+**.

Using
=====

Enable the Scrapy fingerprinter:

.. code-block:: python

    REQUEST_FINGERPRINTER_CLASS = "duplicate_url_discarder.Fingerprinter"

It will compute fingerprints using canonical forms of the request URLs. Requests
with the ``"dud"`` meta value set to ``False`` are processed with a fallback
fingerprinter, which is the default Scrapy one unless a different one is
configured in the ``DUD_FALLBACK_REQUEST_FINGERPRINTER_CLASS`` setting.
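
For example, a spider can opt a specific request out of canonical-URL
fingerprinting via the ``"dud"`` meta key (a minimal sketch; the spider name
and URLs are illustrative):

.. code-block:: python

    import scrapy


    class ExampleSpider(scrapy.Spider):
        name = "example"

        def start_requests(self):
            # Fingerprinted using the canonical form of the URL (the default).
            yield scrapy.Request("https://toscrape.com/?page=1&PHPSESSIONID=123")
            # Handled by the fallback fingerprinter instead.
            yield scrapy.Request(
                "https://toscrape.com/?page=1&PHPSESSIONID=123",
                meta={"dud": False},
            )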

URL Processors
==============

``duplicate-url-discarder`` utilizes *URL processors* to make canonical
versions of URLs. The processors are configured with *URL rules*. Each URL rule
specifies a URL pattern that the processor applies to, plus the specific
processor arguments to use.

The following URL processors are currently available:

* ``queryRemoval``: removes query string parameters *(i.e. key=value pairs)*
  whose keys are specified in the processor arguments. If a given key appears
  multiple times with different values in the URL, all of its occurrences are
  removed.
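
For instance, a ``queryRemoval`` processor configured with the keys
``("utm_source", "ref")`` would strip those parameters and leave the rest of
the query string intact (a sketch; the keys and URLs are illustrative):

.. code-block:: python

    from duplicate_url_discarder.processors import QueryRemovalProcessor

    processor = QueryRemovalProcessor(("utm_source", "ref"))

    processor.process("https://toscrape.com/p?id=1&utm_source=a&ref=b")
    # -> "https://toscrape.com/p?id=1"

    processor.process("https://toscrape.com/p?utm_source=a&id=1&utm_source=b")
    # -> "https://toscrape.com/p?id=1"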

URL Rules
=========

A URL rule is a dictionary specifying the ``url-matcher`` URL pattern(s), the
URL processor name, the URL processor args, and the order that is used to sort
the rules. Rules are loaded from JSON files that contain arrays of serialized
rules:

.. code-block:: json

    [
        {
            "args": [
                "foo",
                "bar"
            ],
            "order": 100,
            "processor": "queryRemoval",
            "urlPattern": {
                "include": [
                    "foo.example"
                ]
            }
        },
        {
            "args": [
                "PHPSESSIONID"
            ],
            "order": 100,
            "processor": "queryRemoval",
            "urlPattern": {
                "include": []
            }
        }
    ]

All non-universal rules (ones that have a non-empty ``include`` pattern) that
match a request URL are applied, in the order given by their ``order`` field.
If no non-universal rule matches the URL, the universal ones are applied
instead.
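
With the example rules above, canonicalization would behave roughly as follows
(a sketch; the rule file path and URLs are illustrative, and
``UrlCanonicalizer`` is the internal helper that the fingerprinter uses):

.. code-block:: python

    from duplicate_url_discarder.url_canonicalizer import UrlCanonicalizer

    canonicalizer = UrlCanonicalizer(["/home/user/project/custom_rules1.json"])

    # foo.example matches a non-universal rule, so only "foo" and "bar" are removed.
    canonicalizer.process_url("https://foo.example/?foo=1&PHPSESSIONID=2")
    # -> "https://foo.example/?PHPSESSIONID=2"

    # No non-universal rule matches, so the universal rule removes PHPSESSIONID.
    canonicalizer.process_url("https://toscrape.com/?foo=1&PHPSESSIONID=2")
    # -> "https://toscrape.com/?foo=1"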

Configuration
=============

``duplicate-url-discarder`` uses the following Scrapy settings:

``DUD_LOAD_RULE_PATHS``: it should be a list of file paths (``str`` or
``pathlib.Path``) pointing to JSON files with the URL rules to apply:

.. code-block:: python

    DUD_LOAD_RULE_PATHS = [
        "/home/user/project/custom_rules1.json",
    ]

The default value of this setting is empty.
2 changes: 2 additions & 0 deletions duplicate_url_discarder/__init__.py
@@ -1 +1,3 @@
__version__ = "0.1.0"

from .fingerprinter import Fingerprinter
58 changes: 58 additions & 0 deletions duplicate_url_discarder/fingerprinter.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING, List, Union

from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.settings.default_settings import (
    REQUEST_FINGERPRINTER_CLASS as ScrapyRequestFingerprinter,
)
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.request import RequestFingerprinterProtocol

from .url_canonicalizer import UrlCanonicalizer

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self

logger = logging.getLogger(__name__)


class Fingerprinter:
    def __init__(self, crawler: Crawler):
        self.crawler: Crawler = crawler
        rule_paths: List[Union[str, os.PathLike]] = self.crawler.settings.getlist(
            "DUD_LOAD_RULE_PATHS"
        )
        if not rule_paths:
            logger.warning("DUD_LOAD_RULE_PATHS is not set or is empty.")
        self._fallback_request_fingerprinter: RequestFingerprinterProtocol = (
            create_instance(
                load_object(
                    crawler.settings.get(
                        "DUD_FALLBACK_REQUEST_FINGERPRINTER_CLASS",
                        ScrapyRequestFingerprinter,
                    )
                ),
                settings=crawler.settings,
                crawler=crawler,
            )
        )
        self.url_canonicalizer = UrlCanonicalizer(rule_paths)

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        return cls(crawler)

    def fingerprint(self, request: Request) -> bytes:
        if not request.meta.get("dud", True):
            self.crawler.stats.inc_value("duplicate_url_discarder/request/skipped")
            return self._fallback_request_fingerprinter.fingerprint(request)
        canonical_url = self.url_canonicalizer.process_url(request.url)
        self.crawler.stats.inc_value("duplicate_url_discarder/request/processed")
        return self._fallback_request_fingerprinter.fingerprint(
            request.replace(url=canonical_url)
        )
17 changes: 17 additions & 0 deletions duplicate_url_discarder/processors/__init__.py
@@ -0,0 +1,17 @@
from typing import Dict, Type

from ..rule import UrlRule
from .base import UrlProcessorBase
from .query_removal import QueryRemovalProcessor

_PROCESSOR_CLASSES: Dict[str, Type[UrlProcessorBase]] = {
    "queryRemoval": QueryRemovalProcessor,
}


def get_processor(rule: UrlRule) -> UrlProcessorBase:
    processor_cls: Type[UrlProcessorBase]
    if rule.processor not in _PROCESSOR_CLASSES:
        raise ValueError(f"No URL processor named {rule.processor}")
    processor_cls = _PROCESSOR_CLASSES[rule.processor]
    return processor_cls(rule.args)
16 changes: 16 additions & 0 deletions duplicate_url_discarder/processors/base.py
@@ -0,0 +1,16 @@
from abc import ABC, abstractmethod
from typing import Any, Tuple


class UrlProcessorBase(ABC):
    def __init__(self, args: Tuple[Any, ...]):
        self.args: Tuple[Any, ...] = args
        self.validate_args()

    def validate_args(self) -> None:  # noqa: B027
        """Check that the processor arguments are valid, raise an exception if not."""
        pass

    @abstractmethod
    def process(self, input_url: str) -> str:
        """Return the input URL, modified according to the rules."""
17 changes: 17 additions & 0 deletions duplicate_url_discarder/processors/query_removal.py
@@ -0,0 +1,17 @@
from w3lib.url import url_query_cleaner

from .base import UrlProcessorBase


class QueryRemovalProcessor(UrlProcessorBase):
    def validate_args(self) -> None:
        for arg in self.args:
            if not isinstance(arg, str):
                raise TypeError(
                    f"queryRemoval args must be strings, not {type(arg)}: {arg}"
                )

    def process(self, input_url: str) -> str:
        return url_query_cleaner(
            input_url, self.args, remove=True, unique=False, keep_fragments=True
        )
58 changes: 58 additions & 0 deletions duplicate_url_discarder/rule.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Tuple

from url_matcher import Patterns

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self


@dataclass(frozen=True)
class UrlRule:
    order: int
    url_pattern: Patterns
    processor: str
    args: Tuple[Any, ...]

    @classmethod
    def from_dict(cls, rule_dict: Dict[str, Any]) -> Self:
        """Load a rule from a dict"""
        return cls(
            order=rule_dict["order"],
            url_pattern=Patterns(**rule_dict["urlPattern"]),
            processor=rule_dict["processor"],
            args=tuple(rule_dict.get("args") or ()),
        )

    def to_dict(self) -> Dict[str, Any]:
        """Save a rule to a dict"""
        pattern = {"include": list(self.url_pattern.include)}
        if self.url_pattern.exclude:
            pattern["exclude"] = list(self.url_pattern.exclude)
        result = {
            "order": self.order,
            "urlPattern": pattern,
            "processor": self.processor,
        }
        if self.args:
            result["args"] = list(self.args)
        return result


def load_rules(data: str) -> List[UrlRule]:
    """Load a list of rules from a JSON text."""
    return [UrlRule.from_dict(item) for item in json.loads(data)]


def save_rules(rules: List[UrlRule]) -> str:
    """Save a list of rules to a JSON text."""
    return json.dumps(
        [r.to_dict() for r in rules],
        ensure_ascii=False,
        sort_keys=True,
        indent=2,
    )
48 changes: 48 additions & 0 deletions duplicate_url_discarder/url_canonicalizer.py
@@ -0,0 +1,48 @@
import logging
import operator
import os
from pathlib import Path
from typing import Dict, Iterable, Set, Union

from url_matcher import URLMatcher

from .processors import UrlProcessorBase, get_processor
from .rule import UrlRule, load_rules

logger = logging.getLogger(__name__)


class UrlCanonicalizer:
    def __init__(self, rule_paths: Iterable[Union[str, os.PathLike]]) -> None:
        rules: Set[UrlRule] = set()
        full_rule_count = 0
        for rule_path in rule_paths:
            data = Path(rule_path).read_text()
            loaded_rules = load_rules(data)
            full_rule_count += len(loaded_rules)
            rules.update(loaded_rules)
        rule_count = len(rules)
        logger.info(
            f"Loaded {rule_count} rules, skipped {full_rule_count - rule_count} duplicates."
        )

        self.url_matcher = URLMatcher()
        self.processors: Dict[int, UrlProcessorBase] = {}
        rule_id = 0
        for rule in sorted(rules, key=operator.attrgetter("order")):
            processor = get_processor(rule)
            self.processors[rule_id] = processor
            self.url_matcher.add_or_update(rule_id, rule.url_pattern)
            rule_id += 1

    def process_url(self, url: str) -> str:
        use_universal = True
        for rule_id in self.url_matcher.match_all(url, include_universal=False):
            use_universal = False
            processor = self.processors[rule_id]
            url = processor.process(url)
        if use_universal:
            for rule_id in self.url_matcher.match_universal():
                processor = self.processors[rule_id]
                url = processor.process(url)
        return url
12 changes: 11 additions & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "duplicate-url-discarder"
description = "Discarding duplicate URLs based on policies."
description = "Discarding duplicate URLs based on rules."
authors = [{name = "Zyte Group Ltd", email = "info@zyte.com"}]
readme = "README.rst"
license = {file = "LICENSE"}
@@ -24,6 +24,9 @@ classifiers = [
]
requires-python = ">=3.8"
dependencies = [
"Scrapy >= 2.7.0",
"url-matcher >= 0.5.0",
"w3lib >= 1.22.0",
]
dynamic = ["version"]

@@ -40,6 +43,13 @@ duplicate_url_discarder = ["py.typed"]
profile = "black"
multi_line_output = 3

[[tool.mypy.overrides]]
module = [
"scrapy.*",
"url_matcher.*",
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"tests.*",