-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
67 lines (57 loc) · 1.78 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from bs4 import BeautifulSoup
from urllib2 import urlopen
from datetime import date
import csv
# ESPN stat categories to scrape. Each key is the slug placed in the
# statistics URL and in the output file name by get_tables().
# NOTE(review): the values look like the matching per-game column
# abbreviations, but nothing in this part of the file reads them — confirm.
categories = {
    'assists': 'APG',
    'blocks': 'BLKPG',
    'rebounds': 'RPG',
    'scoring-per-game': 'PTS',
    'steals': 'STPG',
}

# Years to scrape: the previous and the current calendar year, as strings.
# The date is read once so both entries always come from the same "today".
_current_year = date.today().year
years = [str(_current_year - 1), str(_current_year)]

# Parenthesised form prints the same text on Python 2 and also parses on
# Python 3.
print(years)
def _fetch_stat_table(url):
    """Download *url* and return the result of ``soup.find('table')``."""
    response = urlopen(url)
    try:
        soup = BeautifulSoup(response)
    finally:
        # Release the connection even when parsing fails.
        response.close()
    return soup.find('table')


def _write_stat_csv(category, year):
    """Scrape one category/year and write ``stats/<category>_<year>.csv``.

    Pages are requested at offsets 1, 41, ..., 281. The header row is
    written once, taken from the first page. Every other row that has at
    least two ``<td>`` cells and is not a repeat of the header row is
    written with its first cell replaced by a running rank and its second
    cell cut at the first comma.
    """
    suffix = '_' + category + '_' + str(year)
    filename = 'stats/' + category + '_' + str(year) + '.csv'
    rank_count = 1
    raw_headers = []

    # This module imports urllib2, i.e. it targets Python 2, where the csv
    # module expects a binary-mode file and byte-string cells.
    # `with` guarantees the file is closed even if a request or parse fails.
    with open(filename, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        for row_start in range(1, 300, 40):
            url = ('http://espn.go.com/nba/statistics/player/_/stat/'
                   + str(category) + '/year/' + str(year)
                   + '/count/' + str(row_start))
            table = _fetch_stat_table(url)

            if row_start == 1:
                header_row = table.find('tr', attrs={"class": "colhead"})
                # Keep the untouched header text: it is what repeated
                # header rows inside the table body are compared against.
                raw_headers = [cell.text for cell in header_row]
                headers = list(raw_headers)
                headers[0] = 'RANK' + suffix
                # Suffix every column after the player column. The previous
                # loop only rebound its loop variable, so the suffix never
                # reached the list.
                headers[2:] = [name + suffix for name in headers[2:]]
                writer.writerow(headers)

            player_rows = []
            for tr in table.find_all('tr'):
                row = [val.text.encode('utf-8') for val in tr.find_all('td')]
                # Rows without a rank and a player cell cannot be processed.
                if len(row) < 2:
                    continue
                # Skip header rows repeated inside the table.
                if row[1:] == raw_headers[1:]:
                    continue
                row[0] = rank_count
                rank_count += 1
                # Remove player position (text after the first comma).
                row[1] = row[1].split(',')[0]
                player_rows.append(row)

            writer.writerows(player_rows)


def get_tables():
    """Write one CSV file per (category, year) pair.

    Iterates the module-level ``categories`` and ``years`` and, for each
    combination, scrapes the ESPN player statistics pages into
    ``stats/<category>_<year>.csv``. The ``stats`` directory is not created
    here.
    """
    for category in categories:
        for year in years:
            _write_stat_csv(category, year)
# Scrape only when this file is executed as a script, not when imported
if __name__ == '__main__':
    get_tables()