-
Notifications
You must be signed in to change notification settings - Fork 92
/
Copy pathvalidate_links.py
50 lines (40 loc) · 1.31 KB
/
validate_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import re
from urllib.parse import unquote
# Link checker for a static site tree: walks a directory, extracts every
# double-quoted href/src attribute from the files it finds, lists external
# URLs and reports local targets that do not exist on disk.

# Group 2 of each match is the raw (still percent-encoded) link target.
pat = re.compile(r'(href|src)="([^"]+)"')
# Files with these extensions are never scanned for links.
ignore_exts = ".py", ".png", ".jpg", ".jpeg", ".gif", ".zip", ".ico", ".js", ".woff"
# Schemes that never correspond to a file inside the tree.
_skipped_schemes = ("mailto:", "tel:", "javascript:", "data:")
# "//host/path" is a protocol-relative (network-path) URL, i.e. external.
_external_prefixes = ("http://", "https://", "//")


def _is_ignored(name):
    """Return True if the file called *name* should not be scanned."""
    lowered = name.lower()
    # The "ext?" test keeps skipping names that carry a query string after
    # the extension (e.g. "app.js?ver=3") -- presumably files saved by a
    # site mirroring tool.
    return lowered.endswith(ignore_exts) or any(
        ext + "?" in lowered for ext in ignore_exts
    )


def _read_text(path):
    """Return the contents of *path* as text, closing the file afterwards.

    Undecodable bytes are replaced instead of raising, so a stray binary
    file that is not covered by ``ignore_exts`` cannot abort the whole run.
    """
    with open(path, encoding="utf-8", errors="replace") as handle:
        return handle.read()


def scan(root="."):
    """Collect external URLs and broken local links found under *root*.

    Returns a tuple ``(external, non_existing, where)``:

    * ``external`` -- set of http(s) and protocol-relative URLs,
    * ``non_existing`` -- set of local paths that links point to but that
      do not exist,
    * ``where`` -- maps each missing path to one file that references it
      (the last one visited).

    Raises ``OSError`` if a scanned file cannot be opened.
    """
    external = set()
    non_existing = set()
    where = {}
    for current, dirs, files in os.walk(root):
        # Prune in place so os.walk never descends into git metadata; an
        # exact name match avoids also skipping directories like ".github".
        dirs[:] = [d for d in dirs if d != ".git"]
        for nm in files:
            if _is_ignored(nm):
                continue
            fn = os.path.join(current, nm)
            for _, raw in pat.findall(_read_text(fn)):
                # Drop the fragment BEFORE decoding: an encoded "%23" is
                # part of the file name, not a fragment delimiter.
                pth = unquote(raw.partition("#")[0])
                if not pth:
                    # Pure same-page anchor ("#" or "#section").
                    continue
                if pth.startswith(_skipped_schemes):
                    continue
                if pth.startswith(_external_prefixes):
                    external.add(pth)
                    continue
                if pth.startswith("/"):
                    # Site-absolute link: resolve against the scanned root.
                    absolute = os.path.join(root, pth.lstrip("/"))
                else:
                    absolute = os.path.join(current, pth)
                if not os.path.exists(absolute):
                    non_existing.add(absolute)
                    where[absolute] = fn
    return external, non_existing, where


def main():
    """Scan the current directory and print the report to stdout."""
    external, non_existing, where = scan(".")
    print("External")
    for url in sorted(external):
        print(url)
    print("404")
    for target in sorted(non_existing):
        print(where[target], target)


if __name__ == "__main__":
    main()