# Chicago Marathon data scraper

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
from bs4 import BeautifulSoup as bs
import requests
import time
import datetime
import json
import string
import os 
import re

#Small helpers

## Converts 01:30:00 -> 5400
## Discards decimals ...
def timestring_to_sec(ts):
    """Convert a clock string such as "01:30:00" into whole seconds (5400).

    Everything after the first "." is discarded. Null values (as judged by
    ``pd.notnull``) are handed back unchanged.
    """
    if not pd.notnull(ts):
        return ts

    whole_part = ts.split(".")[0]
    total = 0
    weight = 1
    # Walk the fields from the least significant one (seconds) upwards;
    # every step to the left is worth 60 times more.
    for field in reversed(whole_part.split(":")):
        total += int(field) * weight
        weight *= 60
    return total

## Converts 01:30:00 -> 90

def timestring_to_min(ts):
    """Convert a clock string such as "01:30:00" into minutes (90.0).

    Null values (as judged by ``pd.notnull``) are returned as they are.
    """
    if not pd.notnull(ts):
        return ts
    # Float divisor so fractional minutes survive the division.
    return timestring_to_sec(ts) / 60.

## Converts 5400 -> 1:30:00 (hours are not zero-padded)
def sec_to_timestring(seconds):
    """Format a number of seconds as "H:MM:SS", e.g. 5400 -> "1:30:00".

    Minutes and seconds are zero-padded to two digits; the hour field is
    printed without padding.
    """
    total_minutes, secs = divmod(seconds, 60)
    hours, mins = divmod(total_minutes, 60)
    return "%d:%02d:%02d" % (hours, mins, secs)

## Converts 90 -> 1:30:00
def min_to_timestring(min):
    """Format a number of minutes via sec_to_timestring (minutes * 60 seconds)."""
    # The parameter keeps its original name; it shadows the builtin ``min``
    # inside this function only.
    return sec_to_timestring(min * 60)

First we download the result-list pages (1000 results per page), so we can gather the links to each individual runner's detail page.

In [None]:
# Basic scraper for Chicago Marathon results

# URL format


year = "2015"  # set to "2014" to scrape the 2014 race instead
url_start = "http://results.chicagomarathon.com/" + year + "/?page="
url_mid = "&event=MAR&lang=EN_CAP&num_results=1000&pid=list&search[sex]="

# The URL needs to end in M for men and W for women.

# Number of 1000-result list pages, known from manual browsing:
# 21 pages for men and 18 for women in 2015,
# 23 pages for men and 19 for women in 2014.
pages_per_year = {
    "2014": {"M": 23, "W": 19},
    "2015": {"M": 21, "W": 18},
}
if year not in pages_per_year:
    raise ValueError("No page counts recorded for year " + year)

pages_text_m = []
pages_text_w = []
# Look the counts up from ``year`` so they cannot drift out of sync with it
# (previously the 2014 counts were hard-coded while year was "2015").
count_m = pages_per_year[year]["M"]
count_w = pages_per_year[year]["W"]

def get_pages(pages_text_list, count, gender):
    for p in range(1, count + 1):
        # so we know the progress
        print p,
        pages_text_list.append(bs(requests.get(url_start + str(p) + url_mid + gender).text,
                                  "html"))
        time.sleep(1)
        
# Fetch the men's list pages first, then the women's.
for page_store, page_count, sex_code in ((pages_text_m, count_m, "M"),
                                         (pages_text_w, count_w, "W")):
    get_pages(page_store, page_count, sex_code)


1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 

In [331]:
# Scrape the split times for an individual runner
def get_splits(url):
    """Scrape one runner's detail page into a single pandas Series.

    url -- address of the runner's detail page

    Returns the runner details (taken from the first "list-table names"
    table) followed by the 'Time' column of the fifth such table,
    concatenated into one Series.

    Raises requests.exceptions.HTTPError on an HTTP error status and
    IndexError when the page holds fewer than five matching tables.
    """
    time.sleep(.01)
    response = requests.get(url)
    # Surface HTTP failures directly rather than as a missing-table error.
    response.raise_for_status()
    soup = bs(response.text, "html")
    tables = soup.findAll("table", {"class": "list-table names"})
    if len(tables) < 5:
        # Same exception type as the bare index lookup, but naming the page.
        raise IndexError("expected at least 5 'list-table names' tables at " + url)

    # With index_col=0 the labels become the index and the values sit in column 1.
    runner = pd.read_html(tables[0].prettify(), index_col=0)[0][1]
    # The fifth table carries the split readings; only the 'Time' column is kept.
    splits = pd.read_html(tables[4].prettify(), index_col=0)[0]['Time']
    return pd.concat([runner, splits], axis=0)


# Try the scraper on a single runner's detail page.
url = ("http://results.chicagomarathon.com/2015/"
       "?content=detail&fpid=list&pid=list"
       "&idp=999999107FA30900001756C8"
       "&lang=EN_CAP&event=MAR&lang=EN_CAP"
       "&num_results=1000&search%5Bsex%5D=M&search_event=MAR")
get_splits(url)

Name (CTZ)     Chumba, Dickson (KEN)
Age Group                      25-29
Bib Number                         3
Age                               28
City, State                      NaN
05K                         00:15:31
10K                         00:30:46
15K                         00:46:00
20K                         01:01:46
HALF                        01:05:13
25K                         01:17:24
30K                         01:33:16
35K                         01:47:52
40K                         02:02:43
Finish                      02:09:25
dtype: object

## Actually do the scraping -- may take up to four hours to run!

In [346]:
def get_split_table(soup):
    print ".",
    base = "http://results.chicagomarathon.com/" + year + "/"
    links = soup.findAll("table", { "class" : "list-table" })[0].findAll('a',href=re.compile('^\?content=detail')) 
    results = [get_splits((base + l['href'])) for l in links]
    return pd.concat(results,axis=1).transpose()

# Scrape every runner linked from each downloaded list page. `men` and `women`
# end up as lists holding one DataFrame per list page; %time reports how long
# each pass takes.
print "Scraping men"
%time men = [get_split_table(p) for p in pages_text_m]
print "Scraping women"
%time women = [get_split_table(p) for p in pages_text_w]


In [123]:
# Stack the per-page tables row-wise into one frame of male runners.
# (A stray column-wise concat whose result was thrown away has been removed.)
men_df = pd.concat(men, axis=0)
men_df['Gender1F2M'] = 2  # gender code: 1 = female, 2 = male
men_df.head()

Unnamed: 0,Name (CTZ),Age Group,Bib Number,Age,"City, State",05K,10K,15K,20K,HALF,25K,30K,35K,40K,Finish,Gender1F2M
0,"Kipchoge, Eliud (KEN)",25-29,2,29,,00:14:44,00:29:30,00:44:16,00:59:02,01:02:11,01:13:42,01:28:46,01:43:22,01:57:53,02:04:11,2
1,"Kitwara, Sammy (KEN)",25-29,6,27,,00:14:43,00:29:30,00:44:17,00:59:02,01:02:12,01:13:42,01:28:46,01:43:21,01:57:59,02:04:28,2
2,"Chumba, Dickson (KEN)",25-29,7,27,,00:14:44,00:29:31,00:44:17,00:59:03,01:02:12,01:13:43,01:28:49,01:43:22,01:58:01,02:04:32,2
3,"Bekele, Kenenisa (ETH)",30-34,1,32,,00:14:44,00:29:31,00:44:17,00:59:03,01:02:12,01:13:43,01:28:47,01:43:34,01:58:58,02:05:51,2
4,"Koech, Bernard (KEN)",25-29,5,26,,00:14:43,00:29:30,00:44:18,00:59:03,01:02:12,01:13:42,01:28:47,01:43:32,02:00:11,02:08:30,2


In [124]:
# Stack the per-page tables row-wise into one frame of female runners.
# The scraped list is named `women` (it was misspelled here, raising NameError).
women_df = pd.concat(women, axis=0)
women_df['Gender1F2M'] = 1  # gender code: 1 = female, 2 = male
women_df.head()

Unnamed: 0,Name (CTZ),Age Group,Bib Number,Age,"City, State",05K,10K,15K,20K,HALF,25K,30K,35K,40K,Finish,Gender1F2M
0,"Jeptoo, Rita (KEN)",30-34,101,33,,00:17:16,00:34:31,00:51:42,01:08:58,01:12:36,01:25:54,01:43:28,02:00:45,02:17:22,02:24:35,1
1,"Dibaba, Mare (ETH)",20-24,103,24,,00:17:16,00:34:32,00:51:43,01:08:57,01:12:36,01:25:52,01:43:28,02:00:46,02:17:56,02:25:37,1
2,"Kiplagat, Florence (KEN)",25-29,102,27,,00:17:15,00:34:31,00:51:42,01:08:57,01:12:35,01:25:52,01:43:27,02:00:45,02:17:56,02:25:57,1
3,"Dibaba, Birhane (ETH)",20-24,106,21,,00:17:15,00:34:32,00:51:43,01:08:58,01:12:35,01:25:53,01:43:28,02:00:47,02:18:25,02:27:02,1
4,"Hastings, Amy (USA)",30-34,107,30,,00:17:12,00:34:22,00:51:43,01:09:06,01:12:46,01:26:11,01:43:30,02:01:03,02:19:09,02:27:03,1


In [127]:
# Women first, then men, stacked row-wise (concat's default axis);
# every row is tagged with the year being scraped.
both_genders = [women_df, men_df]
Chicago_df = pd.concat(both_genders)
Chicago_df['Year'] = year
Chicago_df.tail(5)

Unnamed: 0,Name (CTZ),Age Group,Bib Number,Age,"City, State",05K,10K,15K,20K,HALF,25K,30K,35K,40K,Finish,Gender1F2M,Year
194,"Dollete, Joe Allain (USA)",25-29,48264,28,Prince George,01:31:21,02:12:21,02:56:55,03:48:30,03:59:32,04:43:32,05:41:36,-,-,08:05:37,2,2014
195,"Nachel, James (USA)",60-64,51519,63,Western Springs,-,-,-,03:18:49,03:29:09,04:13:24,-,05:43:06,06:26:55,08:12:29,2,2014
196,"Gregory, Mark (USA)",55-59,51515,59,Dekalb,-,-,-,-,-,-,-,-,03:30:29,08:12:30,2,2014
197,"Schefer, Francis (USA)",55-59,30656,55,Alpharetta,-,-,-,-,01:51:30,03:57:42,-,04:49:37,05:50:45,08:12:31,2,2014
198,"Cahill, Gary (USA)",45-49,48260,45,Chicago,00:48:23,01:37:02,02:29:55,-,-,04:30:51,-,-,-,08:30:00,2,2014


In [233]:
#Save file to disk
# The raw scrape goes to a per-year CSV; the row index is left out.
fn = "Chicago%sOriginal.csv" % year
Chicago_df.to_csv(fn, index=False, header=True, encoding='utf-8')

#Chicago_df2 = pd.read_csv(fn, header=0, encoding='utf-8')  # header=0: the first row holds the column names

The function below does the following:
-	Converts dashes to NaNs
-	Renames the split headers in accordance with our data format
-	Converts cumulative split times into per-segment split differences


In [343]:
def dash_to_nan(s):
    """Map the placeholder '-' to NaN; return any other value unchanged."""
    if s == '-':
        return np.nan
    return s
# Scraped column header -> column name used in our data format.
mappings = {
    '05K': 'K0-5',
    '10K': 'K5-10',
    '15K': 'K10-15',
    '20K': 'K15-20',
    '25K': 'K20-25',
    '30K': 'K25-30',
    '35K': 'K30-35',
    '40K': 'K35-40',
    'Finish': 'K40-Fin',
    'HALF': 'HalfMar',
}

def reformat_df(df):
    """Turn the scraped results table into per-segment split times in minutes.

    df -- DataFrame holding the scraped columns 'Bib Number', 'Year', 'Age',
          'Gender1F2M' and one cumulative clock-time column per key of
          ``mappings``, with '-' marking a missing reading.

    Returns a new DataFrame indexed by bib number (index name 'BibNum') with
    Year, Age, Gender1F2M, empty StartHr/StartMin columns, the minutes spent
    on each segment (K0-5 ... K40-Fin), the cumulative half-marathon time in
    minutes (HalfMar) and Age2014. ``df`` itself is not modified.

    Age2014 is the runner's age shifted back to 2014 using the row's own
    'Year' value (Age - (Year - 2014)), so the function no longer depends on
    the notebook-level ``year`` variable.
    """
    df_c = df.copy()

    # Dashes -> NaN, clock strings -> minutes, stored under the new names.
    for k, v in mappings.items():
        df_c[v] = df_c[k].apply(dash_to_nan).apply(timestring_to_min)

    df_c.index = df_c['Bib Number']
    df_c.index.names = ['BibNum']

    df_meta = df_c[['Year', 'Age', 'Gender1F2M']].copy()
    df_meta['StartHr'] = np.nan
    df_meta['StartMin'] = np.nan

    # Cumulative times ordered from the finish back to the start line, where
    # the clock reads 0. Missing checkpoints are filled by interpolating
    # along each row between the neighbouring readings.
    df_sub = df_c[['K40-Fin', 'K35-40', 'K30-35', 'K25-30', 'K20-25',
                   'K15-20', 'K10-15', 'K5-10', 'K0-5']].copy()
    df_sub['K0'] = 0
    df_filled = df_sub.interpolate(axis=1)

    # Segment time = cumulative time at a checkpoint minus the cumulative
    # time at the checkpoint before it (the next column in this ordering).
    cumulative = df_filled.values
    df_diff = pd.DataFrame(cumulative[:, :-1] - cumulative[:, 1:],
                           index=df_filled.index,
                           columns=df_filled.columns[:-1])

    # Back to race order: K0-5 first, K40-Fin last.
    splits = df_diff[df_diff.columns[::-1]]
    fdf = pd.concat([df_meta, splits, df_c['HalfMar']], axis=1)
    # A runner is (Year - 2014) years younger in 2014 than on race day, so
    # the offset is subtracted (it was previously added).
    fdf['Age2014'] = fdf.Age.apply(int) - (fdf.Year.apply(int) - 2014)
    return fdf
     
# Reformat the combined table and preview the first five rows.
diffed = reformat_df(df=Chicago_df)
diffed.head(5)

Unnamed: 0_level_0,Year,Age,Gender1F2M,StartHr,StartMin,K0-5,K5-10,K10-15,K15-20,K20-25,K25-30,K30-35,K35-40,K40-Fin,HalfMar,Age2014
BibNum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
101,2014,33,1,,,17.266667,17.25,17.183333,17.266667,16.933333,17.566667,17.283333,16.616667,7.216667,72.6,33
103,2014,24,1,,,17.266667,17.266667,17.183333,17.233333,16.916667,17.6,17.3,17.166667,7.683333,72.6,24
102,2014,27,1,,,17.25,17.266667,17.183333,17.25,16.916667,17.583333,17.3,17.183333,8.016667,72.583333,27
106,2014,21,1,,,17.25,17.283333,17.183333,17.25,16.916667,17.583333,17.316667,17.633333,8.616667,72.583333,21
107,2014,30,1,,,17.2,17.166667,17.35,17.383333,17.083333,17.316667,17.55,18.1,7.9,72.766667,30





In [344]:
#Save to disk
# The reformatted table keeps its bib-number index in the CSV.
fn = "Chicago%sFormated.csv" % year
diffed.to_csv(fn, index=True, header=True, encoding='utf-8')