-
Notifications
You must be signed in to change notification settings - Fork 0
/
My1stWebCrawler.py
executable file
·54 lines (41 loc) · 1.32 KB
/
My1stWebCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import datetime
import re
import sys
import urllib
import urllib.parse  # needed explicitly: `import urllib` alone does not guarantee the submodule

import requests
# Basic e-mail regexp:
# word chars / dots on either side of "@", ending in ".tld".
# NOTE: the original class also allowed commas ([\w\.,]+), which are not
# valid in unquoted e-mail addresses and made strings like "a,b@c.com"
# match in full; commas have been removed from both sides.
email_re = re.compile(r'([\w.]+@[\w.]+\.\w+)')
# HTML <a> regexp
# Matches the contents of a double-quoted href="" attribute (non-greedy,
# so each href is captured separately).
link_re = re.compile(r'href="(.*?)"')
def crawl(url, maxlevel):
    """Recursively crawl *url* and collect e-mail addresses from page text.

    Parameters:
        url: absolute URL of the page to fetch.
        maxlevel: remaining recursion depth; 0 means fetch nothing.

    Returns:
        A list of e-mail address strings (may contain duplicates, since
        pages are re-visited if linked from several places).
    """
    # Limit the recursion -- we're not downloading the whole Internet.
    if maxlevel == 0:
        return []
    # Fetch the page; the timeout keeps us from hanging forever on a
    # dead or very slow server (the original call had no timeout).
    try:
        req = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        # Any network-level failure (DNS, refused connection, timeout...)
        # is treated as "no e-mails here" rather than aborting the crawl.
        return []
    # Only scrape successful responses.
    if req.status_code != 200:
        return []
    result = []
    # Find and follow all the links on this page.
    for link in link_re.findall(req.text):
        # Resolve relative links against the current page's URL.
        absolute = urllib.parse.urljoin(url, link)
        result += crawl(absolute, maxlevel - 1)
    # Finally, harvest every e-mail address on the current page.
    result += email_re.findall(req.text)
    return result
# Crawl the seed page two levels deep and persist any addresses found.
emails = crawl('http://www.icao.int/safety/pbn/Lists/PBNImplementation/', 2)
# NOTE(review): `now` is computed but never used -- kept to preserve the
# original script's behavior; consider timestamping the output with it.
now = datetime.datetime.now()
filename = "gathered_emails.txt"
# Open in append mode so repeated runs accumulate results; the context
# manager guarantees the file is closed even if a write fails.
with open(filename, "a", encoding='utf-8') as emailsfile:
    emailsfile.writelines("%s\n" % e for e in emails)