-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path09_basic_link_web_crawler.py
54 lines (39 loc) · 1.51 KB
/
09_basic_link_web_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import requests
import re
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
# Captures the value of an href attribute. Accepts single- or double-quoted
# values, any letter case for the attribute name, and optional whitespace
# around "=", all of which are valid HTML. The single capture group means
# findall() returns a flat list of strings.
link_re = re.compile(r'href\s*=\s*["\']([^"\']*)["\']', re.IGNORECASE)
def crawl(url):
    """Fetch *url*, print every link found on the page, and return them.

    Args:
        url: Address of the page to download. Relative hrefs found in the
            page are resolved against this value.

    Returns:
        A list of absolute URLs in the order they appear in the page
        (duplicates are kept), or an empty list when the response status
        code is not 200.

    Raises:
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, invalid URL, ...).
    """
    # Without a timeout, requests waits forever on an unresponsive host.
    response = requests.get(url, timeout=10)

    # Check if successful
    if response.status_code != 200:
        return []

    # Find links and turn each one into an absolute URL
    links = [urljoin(url, href) for href in link_re.findall(response.text)]
    print("\nFound {} links".format(len(links)))

    for link in links:
        print(link)

    # Return the same type on success as on failure so callers can iterate
    # over the result unconditionally.
    return links
# Script entry point: crawl the demo site only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    start_url = "https://lambda-static-server.netlify.app/"
    crawl(start_url)
# Example run and its output:
#
#   $ python3 09_basic_link_web_crawler.py
#
# Found 11 links
# https://lambda-static-server.netlify.app/assets/prism.css
# https://lambda-static-server.netlify.app/assets/style.css
# https://lambda-static-server.netlify.app/directory.html
# https://lambda-static-server.netlify.app/1-projects/directory.html
# https://lambda-static-server.netlify.app/2-content/directory.html
# https://lambda-static-server.netlify.app/1-projects/directory.html
# https://lambda-static-server.netlify.app/3-misc/directory.html
# https://lambda-static-server.netlify.app/4-assets/directory.html
# https://lambda-static-server.netlify.app/5-websites/directory.html
# https://lambda-static-server.netlify.app/13-web-tools/directory.html
# https://lambda-static-server.netlify.app/Interview/directory.html
# |05:08:38|bryan@LAPTOP-9LGJ3JGS:[scripts] scripts_exitstatus:0[╗__________________________________________________________o>