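"""Scrape Project Euler problems into a single JSON file.

For each problem this fetches https://projecteuler.net/problem=N and
records its title, publication info (date, solve count, difficulty), and
content (plain text, inner HTML, images, and linked resource files).
Images and files are downloaded to ./data/images/ and ./data/files/, and
the combined results are written to ./data/<start>_<stop>.json.
"""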
import requests
import re
import json
import shutil
from bs4 import BeautifulSoup


def get_title(soup):
    return soup.h2.text

def get_info(soup):
    # get raw info section
    raw_info = soup.find('span', {'style': 'left:-400px;width:450px;font-size:80%;'})
    # get published, solved, and difficulty information using regex
    published = re.search(r'Published on (.+);', str(raw_info))
    solved = re.search(r'Solved by (\d+)', str(raw_info))
    difficulty = re.search(r'Difficulty rating: (\d+)%', str(raw_info))
    return {
        'published': published.group(1),
        'solved': int(solved.group(1)),
        # newly published problems don't have a difficulty rating yet, so use None
        'difficulty': int(difficulty.group(1)) if difficulty is not None else None
    }

def get_images(problem_content):
    # download every image referenced by the problem (skipping the layout
    # spacer) and return the list of image paths
    images = []
    for tag in problem_content.descendants:
        if tag.name == 'img' and tag['src'] != 'images/spacer.gif':
            path = tag['src']
            images.append(path)
            r = requests.get('https://projecteuler.net/' + path, stream=True)
            # path[15:] strips the leading 'project/images/' prefix from the filename
            with open('./data/images/{}'.format(path[15:]), 'wb+') as f:
                shutil.copyfileobj(r.raw, f)
    return images

def get_files(problem_content):
    # download every linked resource file (hrefs under project/) and
    # return the list of file paths
    files = []
    for tag in problem_content.descendants:
        if tag.name == 'a' and 'project' in tag['href']:
            path = tag['href']
            files.append(path)
            r = requests.get('https://projecteuler.net/' + path)
            # path[18:] strips the leading 'project/resources/' prefix from the filename
            with open('./data/files/{}'.format(path[18:]), 'w+') as f:
                f.write(r.text)
    return files

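# Note: both download helpers above assume ./data/images/ and ./data/files/
# already exist; the script itself never creates them. A minimal sketch to
# create them up front (an addition, not part of the original script):
#
#     import os
#     os.makedirs('./data/images', exist_ok=True)
#     os.makedirs('./data/files', exist_ok=True)
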
def get_content(problem_content):
    # unwrapping the <div> tag would be cleaner, but I couldn't get it to
    # work, so use a regex to grab everything inside the tag instead
    html = re.search(r'<div class="problem_content" role="problem">(.*)</div>',
                     str(problem_content), re.S).group(1)
    # get just the text
    text = problem_content.get_text()
    return {
        'text': text,
        'html': html,
        'images': get_images(problem_content),
        'files': get_files(problem_content)
    }

def scrape(problem):
    url = 'https://projecteuler.net/problem=' + str(problem)
    r = requests.get(url)
    raw_html = r.text
    soup = BeautifulSoup(raw_html, 'html.parser')
    # get raw problem content
    problem_content = soup.find('div', {'class': 'problem_content'})
    return {
        'number': problem,
        'url': url,
        'title': get_title(soup),
        'info': get_info(soup),
        'content': get_content(problem_content)
    }

def get_num_problems():
    # the first row of the recent-problems table holds the newest problem,
    # so its number doubles as the total problem count
    r = requests.get('https://projecteuler.net/recent')
    raw_html = r.text
    soup = BeautifulSoup(raw_html, 'html.parser')
    table_rows = soup.find('table', {'id': 'problems_table'}).find_all('tr')
    most_recent = table_rows[1]  # table_rows[0] is the header row
    most_recent_number = most_recent.find('td').text
    return int(most_recent_number)

##########################################################################################################

start = 1
# stop = 10
stop = get_num_problems()
filename = './data/{}_{}.json'.format(start, stop)

output = {}
for num in range(start, stop + 1):
    print('Scraping problem {} / {}'.format(num, stop), end='\r')
    output[num] = scrape(num)

with open(filename, 'w+') as f:
    f.write(json.dumps(output, indent=2))

print('\nProblems {} - {} successfully scraped and saved to {}'.format(start, stop, filename))
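
# Example: scraping a single problem instead of the full range (a quick
# sketch using the functions above; the printed values are illustrative):
#
#     result = scrape(1)
#     print(result['title'])           # e.g. 'Multiples of 3 and 5'
#     print(result['info']['solved'])  # solver count at scrape time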