Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit b5ed4b0
Showing
6 changed files
with
146 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
html | ||
ysports |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# ----------------------
# Format HTML files
# ----------------------
# Run HTML Tidy in-place (-m) over every scraped page, using the
# options in tidy.conf.
html:
	tidy -config tidy.conf -m html/*.html

# "html" is also a directory name, so mark the target phony to keep
# make from treating the directory as an up-to-date product.
.PHONY: html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
golf | ||
==== | ||
|
||
Small scripts to scrape golf data from ESPN and Yahoo. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#!/usr/bin/env python | ||
|
||
""" | ||
Scrape each season of the PGA. | ||
""" | ||
|
||
import os | ||
import re | ||
|
||
import lxml.html as lh | ||
import requests as req | ||
|
||
|
||
# HTTP headers sent with every request: a browser-like User-Agent plus a
# Referer so ESPN serves the normal (non-bot) pages.
headers = {
    "User-Agent": "Mozilla/5.0 (ESPN) AppleWebKit/535.30 (KHTML, like Gecko)",
    "Referer": "http://espn.go.com/golf"
}
|
||
|
||
def season(year):
    """Save an ESPN PGA season's schedule HTML to ``html/<year>.html``.

    Fetches the schedule page for *year*, extracts the schedule table
    (the ``.mod-table`` div), and writes its markup to disk.
    """
    # Single-argument parenthesized print behaves identically on
    # Python 2 and 3.
    print(year)
    url = "http://espn.go.com/golf/schedule/_/year/{0}".format(year)
    text = req.get(url, headers=headers).text
    html = lh.fromstring(text)
    # Extract just the content <div>; fail loudly if the page layout
    # changed rather than crashing on an empty-list index.
    content = html.cssselect(".mod-table")
    if not content:
        raise ValueError("No .mod-table found for {0}".format(year))
    markup = lh.tostring(content[0])
    # Save it to disk.  "wb" because lxml's tostring returns bytes
    # (identical to "w" on Python 2, required on Python 3).
    file_name = "html/{0}.html".format(year)
    with open(file_name, "wb") as f:
        f.write(markup)
        print("\t Saved.")  # report success only after the write
|
||
|
||
def find_tournaments(year=None):
    """
    Parse the saved season HTML files and scrape every tournament they
    link to.  With no *year* given, each regular file under ``html/``
    is processed; otherwise only ``html/<year>.html``.
    """
    if year:
        paths = ["html/{0}.html".format(year)]
    else:
        # Every regular file in the html/ directory (skip subdirs).
        paths = [
            "html/{0}".format(name)
            for name in os.listdir('html')
            if os.path.isfile("html/{0}".format(name))
        ]
    for path in paths:
        with open(path) as handle:
            tournament_links(lh.fromstring(handle.read()))
|
||
|
||
def tournament_links(html):
    """Find links for tournaments in a season's HTML."""
    for anchor in html.cssselect("tr td > a"):
        target = anchor.attrib['href']
        # Only site-relative links point at tournament pages.
        if target.startswith('/'):
            scrape_tournament(target)
|
||
|
||
|
||
def scrape_tournament(endpoint):
    """Scrape one tournament page and save its HTML.

    ``endpoint`` is a site-relative path whose first run of digits is
    taken as the tournament ID; the page's ``#content`` element is
    written to ``html/tournaments/<id>.html``.
    """
    # Find the tournament ID; fail with a clear message instead of an
    # AttributeError on a None match.
    match = re.search(r'\d+', endpoint)
    if match is None:
        raise ValueError("No tournament ID in endpoint: {0}".format(endpoint))
    tournament = match.group(0)
    url = "http://espn.go.com{0}".format(endpoint)
    print(url)
    # Scrape the tournament
    text = req.get(url, headers=headers).text
    html = lh.fromstring(text)
    content = html.cssselect("#content")
    markup = lh.tostring(content[0])
    # Save it to disk.  "wb" because lxml's tostring returns bytes
    # (identical to "w" on Python 2, required on Python 3).
    file_name = "html/tournaments/{0}.html".format(tournament)
    with open(file_name, "wb") as f:
        f.write(markup)
        print("\t Saved.")  # report success only after the write
|
||
|
||
if __name__ == '__main__':
    # Uncomment the loop below to (re)download the raw season pages
    # before parsing them for tournaments.
    #for year in range(2001, 2013):
    #season(year)
    find_tournaments()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
indent: auto | ||
indent-spaces: 2 | ||
quiet: yes | ||
input-xml: yes | ||
force-output: yes | ||
wrap: 300 | ||
write-back: yes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/usr/bin/env python | ||
|
||
""" | ||
Scrape each season of the PGA. | ||
""" | ||
|
||
import os | ||
import re | ||
|
||
import lxml.html as lh | ||
import requests as req | ||
|
||
|
||
# HTTP headers sent with every request: a browser-like User-Agent plus a
# Referer so Yahoo serves the normal (non-bot) schedule pages.
headers = {
    "User-Agent": "Mozilla/5.0 (ESPN) AppleWebKit/535.30 (KHTML, like Gecko)",
    "Referer": "http://sports.yahoo.com/golf/pga/schedule"
}
|
||
|
||
def season(year):
    """Save a Yahoo PGA season's schedule HTML to ``ysports/<year>.html``.

    Fetches the schedule page for *year*, extracts the ``#schedule``
    element, and writes its markup to disk.
    """
    # Single-argument parenthesized print behaves identically on
    # Python 2 and 3.
    print(year)
    url = "http://sports.yahoo.com/golf/pga/schedule?season={0}".format(year)
    text = req.get(url, headers=headers).text
    html = lh.fromstring(text)
    # Grab the schedule; fail loudly if the page layout changed rather
    # than crashing on an empty-list index.
    schedule = html.cssselect("#schedule")
    if not schedule:
        raise ValueError("No #schedule found for {0}".format(year))
    markup = lh.tostring(schedule[0])
    # "wb" because lxml's tostring returns bytes (identical to "w" on
    # Python 2, required on Python 3).
    file_name = "ysports/{0}.html".format(year)
    with open(file_name, "wb") as f:
        f.write(markup)
        print("\t Saved.")  # report success only after the write
|
||
|
||
if __name__ == '__main__':
    # Yahoo's archive covers the seasons before ESPN's coverage begins.
    for season_year in range(1977, 2001):
        season(season_year)