Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
zachwill committed May 12, 2013
0 parents commit b5ed4b0
Show file tree
Hide file tree
Showing 6 changed files with 146 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
html
ysports
7 changes: 7 additions & 0 deletions Makefile
@@ -0,0 +1,7 @@
# ----------------------
# Format HTML files
# ----------------------
# Run tidy in place over every saved page (settings come from tidy.conf).
html:
	tidy -config tidy.conf -m html/*.html

# "html" is a directory name too, so mark the target phony.
.PHONY: html
4 changes: 4 additions & 0 deletions README.md
@@ -0,0 +1,4 @@
golf
====

Small scripts to scrape golf data from ESPN and Yahoo.
89 changes: 89 additions & 0 deletions espn.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python

"""
Scrape each season of the PGA.
"""

import os
import re

import lxml.html as lh
import requests as req


# HTTP headers sent with every ESPN request: a browser-like User-Agent
# plus a Referer so requests look like normal site navigation.
headers = {
    "User-Agent": "Mozilla/5.0 (ESPN) AppleWebKit/535.30 (KHTML, like Gecko)",
    "Referer": "http://espn.go.com/golf"
}


def season(year):
    """Fetch an ESPN PGA season schedule page and save its HTML to disk.

    Downloads http://espn.go.com/golf/schedule/_/year/<year>, extracts
    the schedule table (the ".mod-table" element) and writes its markup
    to html/<year>.html.
    """
    # print(x) with a single argument works on both Python 2 and 3.
    print(year)
    url = "http://espn.go.com/golf/schedule/_/year/{0}".format(year)
    text = req.get(url, headers=headers).text
    html = lh.fromstring(text)
    # Extract just the schedule table rather than the whole page.
    content = html.cssselect(".mod-table")
    if not content:
        # Layout changed or the request was blocked -- fail with a clear
        # message instead of a bare IndexError on content[0].
        raise ValueError("No .mod-table found for year {0}".format(year))
    markup = lh.tostring(content[0])
    # Save it to disk; report success only after the write actually ran.
    file_name = "html/{0}.html".format(year)
    with open(file_name, "w") as f:
        f.write(markup)
        print("\t Saved.")


def find_tournaments(year=None):
    """Parse saved season HTML files and hand each to the tourney scraper.

    With no *year*, every regular file under html/ is processed;
    otherwise only html/<year>.html is read.
    """
    if year:
        paths = ["html/{0}.html".format(year)]
    else:
        # Keep regular files only -- subdirectories are skipped.
        paths = [
            "html/{0}".format(name)
            for name in os.listdir('html')
            if os.path.isfile("html/{0}".format(name))
        ]
    for path in paths:
        with open(path) as fp:
            document = lh.fromstring(fp.read())
            tournament_links(document)


def tournament_links(html):
    """Find links for tournaments in a season's HTML and scrape each one."""
    for anchor in html.cssselect("tr td > a"):
        href = anchor.attrib['href']
        # Only site-relative hrefs point at tournament pages.
        if href.startswith('/'):
            scrape_tournament(href)



def scrape_tournament(endpoint):
    """Fetch a single tournament page from ESPN and save it to disk.

    *endpoint* is a site-relative URL; the first run of digits in it is
    used as the tournament ID for the output file name
    html/tournaments/<id>.html.
    """
    # Find the tournament ID.
    match = re.search(r'\d+', endpoint)
    if match is None:
        # No numeric ID in the link -- nothing to name the file after,
        # so skip instead of crashing with an AttributeError.
        print("Skipping (no tournament id): {0}".format(endpoint))
        return
    tournament = match.group(0)
    url = "http://espn.go.com{0}".format(endpoint)
    print(url)
    # Scrape the tournament page and keep just the main content element.
    text = req.get(url, headers=headers).text
    html = lh.fromstring(text)
    content = html.cssselect("#content")
    markup = lh.tostring(content[0])
    # Save it to disk; make sure the output directory exists first.
    if not os.path.isdir("html/tournaments"):
        os.makedirs("html/tournaments")
    file_name = "html/tournaments/{0}.html".format(tournament)
    with open(file_name, "w") as f:
        f.write(markup)
        print("\t Saved.")


if __name__ == '__main__':
    # Season pages were scraped once with the loop below; re-enable it
    # to refresh the saved schedules before scraping tournaments again.
    #for year in range(2001, 2013):
    #season(year)
    find_tournaments()
7 changes: 7 additions & 0 deletions tidy.conf
@@ -0,0 +1,7 @@
indent: auto
indent-spaces: 2
quiet: yes
input-xml: yes
force-output: yes
wrap: 300
write-back: yes
37 changes: 37 additions & 0 deletions yahoo.py
@@ -0,0 +1,37 @@
#!/usr/bin/env python

"""
Scrape each season of the PGA.
"""

import os
import re

import lxml.html as lh
import requests as req


# HTTP headers sent with every Yahoo request.
# NOTE(review): the User-Agent still says "(ESPN)" -- presumably copied
# from espn.py; confirm whether Yahoo cares before changing it.
headers = {
    "User-Agent": "Mozilla/5.0 (ESPN) AppleWebKit/535.30 (KHTML, like Gecko)",
    "Referer": "http://sports.yahoo.com/golf/pga/schedule"
}


def season(year):
    """Fetch a Yahoo PGA season schedule page and save its HTML to disk.

    Downloads the schedule for *year* and writes the "#schedule"
    element's markup to ysports/<year>.html.
    """
    # print(x) with a single argument works on both Python 2 and 3.
    print(year)
    url = "http://sports.yahoo.com/golf/pga/schedule?season={0}".format(year)
    text = req.get(url, headers=headers).text
    html = lh.fromstring(text)
    # Grab just the schedule element rather than the whole page.
    schedule = html.cssselect("#schedule")
    if not schedule:
        # Layout changed or the request was blocked -- fail with a clear
        # message instead of a bare IndexError on schedule[0].
        raise ValueError("No #schedule found for year {0}".format(year))
    markup = lh.tostring(schedule[0])
    file_name = "ysports/{0}.html".format(year)
    with open(file_name, "w") as f:
        f.write(markup)
        print("\t Saved.")


if __name__ == '__main__':
    # 1977-2000: the companion espn.py script covers 2001 onward.
    for year in range(1977, 2001):
        season(year)

0 comments on commit b5ed4b0

Please sign in to comment.