Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
zachwill committed May 12, 2013
0 parents commit b5ed4b0
Show file tree
Hide file tree
Showing 6 changed files with 146 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
html
ysports
7 changes: 7 additions & 0 deletions Makefile
@@ -0,0 +1,7 @@
# ----------------------
# Format HTML files
# ----------------------
# Run tidy in place over every saved page (settings come from tidy.conf).
html:
	tidy -config tidy.conf -m html/*.html

# "html" is a directory name too, so mark the target phony.
.PHONY: html
4 changes: 4 additions & 0 deletions README.md
@@ -0,0 +1,4 @@
golf
====

Small scripts to scrape golf data from ESPN and Yahoo.
89 changes: 89 additions & 0 deletions espn.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python

"""
Scrape each season of the PGA.
"""

import os
import re

import lxml.html as lh
import requests as req


# HTTP headers sent with every ESPN request: a browser-like User-Agent
# plus a Referer so requests look like normal site navigation.
headers = {
    "User-Agent": "Mozilla/5.0 (ESPN) AppleWebKit/535.30 (KHTML, like Gecko)",
    "Referer": "http://espn.go.com/golf"
}


def season(year):
    """Fetch an ESPN PGA season schedule page and save its HTML to disk.

    Downloads http://espn.go.com/golf/schedule/_/year/<year>, extracts
    the schedule table (the ".mod-table" element) and writes its markup
    to html/<year>.html.
    """
    # print(x) with a single argument works on both Python 2 and 3.
    print(year)
    url = "http://espn.go.com/golf/schedule/_/year/{0}".format(year)
    text = req.get(url, headers=headers).text
    html = lh.fromstring(text)
    # Extract just the schedule table rather than the whole page.
    content = html.cssselect(".mod-table")
    if not content:
        # Layout changed or the request was blocked -- fail with a clear
        # message instead of a bare IndexError on content[0].
        raise ValueError("No .mod-table found for year {0}".format(year))
    markup = lh.tostring(content[0])
    # Save it to disk; report success only after the write actually ran.
    file_name = "html/{0}.html".format(year)
    with open(file_name, "w") as f:
        f.write(markup)
        print("\t Saved.")


def find_tournaments(year=None):
    """Parse saved season HTML files and hand each to the tourney scraper.

    With no *year*, every regular file under html/ is processed;
    otherwise only html/<year>.html is read.
    """
    if year:
        paths = ["html/{0}.html".format(year)]
    else:
        # Keep regular files only -- subdirectories are skipped.
        paths = [
            "html/{0}".format(name)
            for name in os.listdir('html')
            if os.path.isfile("html/{0}".format(name))
        ]
    for path in paths:
        with open(path) as fp:
            document = lh.fromstring(fp.read())
            tournament_links(document)


def tournament_links(html):
    """Find links for tournaments in a season's HTML and scrape each one."""
    for anchor in html.cssselect("tr td > a"):
        href = anchor.attrib['href']
        # Only site-relative hrefs point at tournament pages.
        if href.startswith('/'):
            scrape_tournament(href)



def scrape_tournament(endpoint):
    """Fetch a single tournament page from ESPN and save it to disk.

    *endpoint* is a site-relative URL; the first run of digits in it is
    used as the tournament ID for the output file name
    html/tournaments/<id>.html.
    """
    # Find the tournament ID.
    match = re.search(r'\d+', endpoint)
    if match is None:
        # No numeric ID in the link -- nothing to name the file after,
        # so skip instead of crashing with an AttributeError.
        print("Skipping (no tournament id): {0}".format(endpoint))
        return
    tournament = match.group(0)
    url = "http://espn.go.com{0}".format(endpoint)
    print(url)
    # Scrape the tournament page and keep just the main content element.
    text = req.get(url, headers=headers).text
    html = lh.fromstring(text)
    content = html.cssselect("#content")
    markup = lh.tostring(content[0])
    # Save it to disk; make sure the output directory exists first.
    if not os.path.isdir("html/tournaments"):
        os.makedirs("html/tournaments")
    file_name = "html/tournaments/{0}.html".format(tournament)
    with open(file_name, "w") as f:
        f.write(markup)
        print("\t Saved.")


if __name__ == '__main__':
    # Season pages were scraped once with the loop below; re-enable it
    # to refresh the saved schedules before scraping tournaments again.
    #for year in range(2001, 2013):
    #season(year)
    find_tournaments()
7 changes: 7 additions & 0 deletions tidy.conf
@@ -0,0 +1,7 @@
indent: auto
indent-spaces: 2
quiet: yes
input-xml: yes
force-output: yes
wrap: 300
write-back: yes
37 changes: 37 additions & 0 deletions yahoo.py
@@ -0,0 +1,37 @@
#!/usr/bin/env python

"""
Scrape each season of the PGA.
"""

import os
import re

import lxml.html as lh
import requests as req


# HTTP headers sent with every Yahoo request.
# NOTE(review): the User-Agent still says "(ESPN)" -- presumably copied
# from espn.py; confirm whether Yahoo cares before changing it.
headers = {
    "User-Agent": "Mozilla/5.0 (ESPN) AppleWebKit/535.30 (KHTML, like Gecko)",
    "Referer": "http://sports.yahoo.com/golf/pga/schedule"
}


def season(year):
    """Fetch a Yahoo PGA season schedule page and save its HTML to disk.

    Downloads the schedule for *year* and writes the "#schedule"
    element's markup to ysports/<year>.html.
    """
    # print(x) with a single argument works on both Python 2 and 3.
    print(year)
    url = "http://sports.yahoo.com/golf/pga/schedule?season={0}".format(year)
    text = req.get(url, headers=headers).text
    html = lh.fromstring(text)
    # Grab just the schedule element rather than the whole page.
    schedule = html.cssselect("#schedule")
    if not schedule:
        # Layout changed or the request was blocked -- fail with a clear
        # message instead of a bare IndexError on schedule[0].
        raise ValueError("No #schedule found for year {0}".format(year))
    markup = lh.tostring(schedule[0])
    file_name = "ysports/{0}.html".format(year)
    with open(file_name, "w") as f:
        f.write(markup)
        print("\t Saved.")


if __name__ == '__main__':
    # 1977-2000: the companion espn.py script covers 2001 onward.
    for year in range(1977, 2001):
        season(year)

0 comments on commit b5ed4b0

Please sign in to comment.