-
Notifications
You must be signed in to change notification settings - Fork 3
/
url.py
127 lines (103 loc) · 4.4 KB
/
url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""URL parsing and processing functions."""
import json
import logging
import re
from urllib.parse import parse_qsl, unquote, urlencode, urlparse, urlunparse
from urllib.request import Request, urlopen
# Module-level logger, named after this module, used by the functions below.
log = logging.getLogger(__name__)
def extract_url(contents):
    """Find and return the first HTTP(S) URL contained in `contents`.

    The URL is taken to be ``http://`` or ``https://`` followed by every
    character up to the next whitespace character (or the end of the
    text).

    :param contents: Text to search; may be ``None`` or empty.
    :returns: The first URL found, or ``None`` when `contents` is
        ``None``, empty, or contains no URL.
    """
    # Nothing to search: avoid handing None to the regex engine, which
    # would raise TypeError.
    if not contents:
        return None

    # Based on https://stackoverflow.com/a/840110
    match = re.search(r'(?P<url>https?://\S+)', contents)
    # The pattern requires at least one character after the scheme, so
    # a match always carries a non-empty URL.
    return match.group('url') if match else None
# Location the URL cleaning rules are downloaded from by
# `download_rules_data`.
URL_CLEARURLS_DATA = 'https://rules2.clearurls.xyz/data.minify.json'
# Value sent as the `User-Agent` header of the rules download request.
USER_AGENT = 'uroute URLCleaner (python urllib)'
def download_rules_data(save_path=None):
    """Download the URL cleaning rules and write them to `save_path`.

    The rules are fetched from `URL_CLEARURLS_DATA` with `USER_AGENT`
    sent as the ``User-Agent`` header. The response is downloaded and
    decoded completely *before* the destination file is opened, so a
    failed download never truncates an existing rules file.

    :param save_path: Path of the file to write the rules to. Required
        despite the ``None`` default.
    :raises TypeError: If `save_path` is ``None``.
    :raises OSError: If the download fails or the file cannot be
        written (``urllib.error.URLError`` is an ``OSError`` subclass).
    :raises UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    if save_path is None:
        # Fail fast instead of downloading data that cannot be stored;
        # `open(None, ...)` would raise the same exception type.
        raise TypeError('save_path must be a file path, not None')

    log.debug('Downloading rules data to %r', save_path)
    request = Request(URL_CLEARURLS_DATA,
                      headers={'User-Agent': USER_AGENT})
    with urlopen(request) as resp:
        rules_data = resp.read().decode()

    # Only touch the destination once the complete payload is in hand.
    with open(save_path, 'w', encoding='UTF-8') as rules_file:
        rules_file.write(rules_data)
def _read_rules_file(rules_path):
    """Parse and return the JSON document stored at `rules_path`."""
    with open(rules_path, encoding='UTF-8') as rules_file:
        return json.load(rules_file)


def load_cleaning_rules(rules_path):
    """Load URL cleaning rules from the JSON file at `rules_path`.

    If the file cannot be read or parsed for any reason, the rules are
    downloaded to `rules_path` with `download_rules_data` and loading is
    attempted once more.

    :param rules_path: Path of the JSON rules file.
    :returns: The parsed JSON content of the rules file.
    :raises Exception: Whatever the download or the second load attempt
        raises; only the first load failure is handled here.
    """
    try:
        rules = _read_rules_file(rules_path)
    except Exception:  # pylint: disable=broad-except
        # Best effort: whatever went wrong reading the rules file
        # (missing, unreadable, invalid JSON, ...), redownload it. Keep
        # the reason in the debug log instead of discarding it.
        log.debug(
            'Could not load URL cleaning rules from %r; downloading them',
            rules_path, exc_info=True,
        )
        download_rules_data(rules_path)
        rules = _read_rules_file(rules_path)

    log.debug('URL cleaning rules loaded from %r', rules_path)
    return rules
def clean_url(rules, url, recurse_redir=True):
    """Clean the given URL with the loaded rules data.

    The format of `rules` is the parsed JSON found in ClearURLs's
    [`data.min.json`](https://gitlab.com/ClearURLs/rules/-/blob/master/data.min.json)
    file: a mapping whose ``providers`` entry maps provider names to
    provider definitions.

    Every provider whose ``urlPattern`` matches the URL, and none of
    whose ``exceptions`` match it, is applied in turn:

    1. If one of the provider's ``redirections`` patterns matches, the
       URL is replaced with the unquoted first group of that match and
       returned immediately (after being cleaned itself, unless
       `recurse_redir` is false).
    2. Query parameters whose *whole name* matches one of the
       provider's ``rules`` or ``referralMarketing`` patterns are
       removed. Parameters with blank values are kept, and the URL is
       only rebuilt when a parameter was actually removed, so URLs
       without tracking parameters are not re-encoded.
    3. Every match of the provider's ``rawRules`` patterns is deleted
       from the URL.

    All patterns are applied case-insensitively.

    Another Python implementation to download and apply the rules to a
    URL, written by the ClearURLs author, can be found
    [here](https://gitlab.com/KevinRoebert/ClearUrls/snippets/1834899).

    :param rules: Parsed rules data; a missing ``providers`` entry
        means no cleaning is done.
    :param url: The URL to clean.
    :param recurse_redir: Set to ``False`` to prevent cleaning redirect
        targets recursively.
    :returns: The cleaned URL.
    :raises KeyError: If a provider definition has no ``urlPattern``.
    """
    for provider in rules.get('providers', {}).values():
        if not re.match(provider['urlPattern'], url, re.IGNORECASE):
            continue

        # If any exceptions are matched, this provider is skipped
        if any(
                re.match(exc, url, re.IGNORECASE)
                for exc in provider.get('exceptions', [])
        ):
            continue

        for redir in provider.get('redirections', []):
            match = re.match(redir, url, re.IGNORECASE)
            if not match:
                continue
            try:
                target = match.group(1)
            except IndexError:
                # We got a redirection match, but the pattern has no
                # group to take the target from. The redirection rule
                # is probably faulty.
                continue
            if target and target != url:
                url = unquote(target)
                # If redirect found, recurse on target
                if recurse_redir:
                    url = clean_url(rules, url, recurse_redir=True)
                return url

        # Explode query parameters to be checked against rules. Blank
        # values are kept so that parameters such as `q=` survive.
        parsed_url = urlparse(url)
        query_params = parse_qsl(parsed_url.query, keep_blank_values=True)
        param_rules = (
            *provider.get('rules', []),
            *provider.get('referralMarketing', []),
        )
        # A rule has to match the complete parameter name; a prefix
        # match would also strip e.g. `refresh` for the rule `ref`.
        kept_params = [
            (name, value) for name, value in query_params
            if not any(
                re.fullmatch(rule, name, re.IGNORECASE)
                for rule in param_rules
            )
        ]
        # Only rebuild the URL if something was removed, to avoid
        # needlessly re-encoding an already clean query string.
        if len(kept_params) != len(query_params):
            url = urlunparse(
                parsed_url._replace(query=urlencode(kept_params)))

        for raw_rule in provider.get('rawRules', []):
            url = re.sub(raw_rule, '', url, flags=re.IGNORECASE)

    return url