<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Athlete-data" data-toc-modified-id="Athlete-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Athlete data</a></span></li><li><span><a href="#Wetsuit-data" data-toc-modified-id="Wetsuit-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Wetsuit data</a></span></li><li><span><a href="#Export-html" data-toc-modified-id="Export-html-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Export html</a></span></li></ul></div>

This code scrapes Ironman finisher data at the athlete level.

In [None]:
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
pd.set_option('display.max_columns', 100)

## Athlete data

In [None]:
# Column labels for the athlete results frame, listed in the order the
# fields appear on the webpage.
dfcolumns = [
    # athlete / division
    'athletename', 'athleteid', 'athleteurl', 'bib_n', 'gender', 'division',
    'divurl', 'konadiv',
    # series / race
    'seriesid', 'seriesurl', 'raceid', 'raceurl', 'racename', 'raceyear',
    'racedate', 'racetype', 'konayear', 'konaslots',
    # swim
    'swimtime', 'swimrank', 'swimrankdiv', 'swimrankgender',
    # bike
    'biketime', 'bikerank', 'bikerankdiv', 'bikerankgender',
    # run
    'runtime', 'runrank', 'runrankdiv', 'runrankgender',
    # overall
    'overalltime', 'overallrank', 'overallrankdiv', 'overallrankgender',
    'konarankdiv',
    # transition 1
    'trans1time', 'trans1rank', 'trans1rankdiv', 'trans1rankgender',
    # transition 2
    'trans2time', 'trans2rank', 'trans2rankdiv', 'trans2rankgender',
    # outcome
    'finish', 'autokonaqual',
]


# define scraping function

def get_ath_df(userid):
    """Returns athlete-race level dataframe

    Downloads the coachcox.co.uk imstats page for ``userid``, locates the
    inline <script> that mentions ``var imathleteresultstable``, parses the
    JSON value assigned to ``var imathleteresultsdata`` and labels the
    resulting columns with the module-level ``dfcolumns`` list.

    Parameters
    ----------
    userid : int
        Athlete id appended to the athlete URL.

    Returns
    -------
    pandas.DataFrame or None
        One row per entry of the results array, with ``athletename`` as the
        first column.  None when the page has no results script, the data
        assignment is missing, or the results array is empty.

    Raises
    ------
    json.JSONDecodeError
        If the data assignment is present but is not valid JSON.
    requests.RequestException
        On network failure or timeout.
    """
    print("Aquiring data for user: %d       " % userid, end='\r')

    # get web response (timeout so a stalled connection cannot hang a batch run)
    urlbase = "https://www.coachcox.co.uk/imstats/athlete/"
    url = urlbase + str(userid)
    response = requests.get(url, timeout=30)

    # translate to BeautifulSoup object
    soup = BeautifulSoup(response.content, 'lxml')

    # find the inline script holding the athlete data
    script = soup.find("script", string=re.compile(r"var imathleteresultstable"))
    if script is None or not script.contents:
        # no data found, return null
        return None
    script_text = str(script.contents[0])

    # within the JS script, find the array with relevant data
    start = 'var imathleteresultsdata = '
    start_idx = script_text.find(start)
    if start_idx == -1:
        # results script present but without the data assignment
        return None

    # raw_decode parses exactly one JSON value and ignores whatever follows,
    # so a ';' earlier in the script or inside a JSON string cannot truncate
    # or misplace the payload
    payload = script_text[start_idx + len(start):].lstrip()
    data, _ = json.JSONDecoder().raw_decode(payload)
    if not data:
        # nothing to label; assigning dfcolumns to an empty frame would fail
        return None

    # get athlete name
    # assumes the <title> has a fixed 13-character prefix before the name and
    # contains " Race Results" after it -- TODO confirm against the live page
    title = soup.find("title").contents[0]
    athletename = title[13:title.find(" Race Results")]

    # turn data into df and clean/label
    dfath = pd.DataFrame(data)
    dfath.insert(loc=0, column='athletename', value=athletename)
    dfath.columns = dfcolumns
    # each racedate cell is a mapping; keep only its 'd' entry
    dfath['racedate'] = dfath['racedate'].apply(lambda x: x['d'])

    return dfath


# demonstrate use for one athlete

df = get_ath_df(184495)
# get_ath_df returns None when no data is found; fail with a clear message
# instead of an AttributeError on None.head()
assert df is not None, "no results found for athlete 184495"
df.head()

In [None]:
# get data for a few users and append to one df

df_array = []   # one dataframe per athlete that returned data
nodata = []     # athlete ids for which get_ath_df returned no dataframe

for i in range(23786, 23786 + 10):
    dfath = get_ath_df(i)
    if isinstance(dfath, pd.DataFrame):
        df_array.append(dfath)
    else:
        nodata.append(i)

# axis must be passed by keyword (positional axis is rejected by pandas >= 2.0);
# pd.concat raises on an empty list, so fall back to an empty labelled frame
if df_array:
    df_all = pd.concat(df_array, axis=0)
else:
    df_all = pd.DataFrame(columns=dfcolumns)

print("\ndone\n")
print("Total athletes captured: {}".format(len(df_all.athleteid.unique())))
print("Total athletes failed: {}".format(len(nodata)))

df_all.head()

## Wetsuit data

In [None]:
# New table with location and date
url = "https://www.roka.com/pages/average-ironman-water-temperatures"
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly rather than parse an error page

# pass the bytes straight to BeautifulSoup; a module-level `html` variable
# would shadow the `from lxml import html` import
soup = BeautifulSoup(response.content, 'lxml')

# find wetsuit data: one matching div per table row
wetsuit_data = soup.find_all("div", {'class': 'row_190425_water_temp_chart'})

# collect one plain record per row and build the frame once at the end
race_data_lst = []
for w in wetsuit_data:
    race_data = w.find_all("div", {'class': 'line_190425_water_temp_chart'})
    weather_data = w.find_all("div", {'class': 'small_stratum_bottom_190425_water_temp_chart'})
    location_data = w.find_all("div", {'class': 'small_stratum_top_190425_water_temp_chart'})

    # a '|' in the fourth cell means a second suit type follows in the fifth;
    # the length check keeps short rows from raising IndexError
    if len(race_data) > 4 and race_data[3].text.strip() == '|':
        suit2 = '/' + race_data[4].text
    else:
        suit2 = ''
    race_data_lst.append(
        {'venuename': race_data[0].text,
         'watertemp': race_data[1].text,
         'suittype': (race_data[2].text + suit2),
         'date': weather_data[0].text,
         'location': location_data[0].text})

dfwater = pd.DataFrame(
    race_data_lst,
    columns=['venuename', 'watertemp', 'suittype', 'date', 'location'])
display(dfwater.head())

# lowercase the venue names and suit type names
for v in ['venuename', 'suittype']:
    dfwater[v] = dfwater[v].str.lower()

    
# convert celsius to fahrenheit
def temp_to_f(x):
    '''
    Takes a temperature str "x" (e.g. "72°F" or "22°C") and returns the
    numeric temp in F.

    The whole leading number is parsed (any number of digits, optional sign
    and decimal part), not just the first two characters.  When the string
    ends in "C" the value is converted from Celsius and rounded to one
    decimal; otherwise the parsed number is returned unchanged.

    Raises ValueError if "x" does not start with a number.
    '''
    reading = x.strip()
    match = re.match(r'[-+]?\d+(?:\.\d+)?', reading)
    if match is None:
        raise ValueError("temperature string %r does not start with a number" % (x,))
    number = match.group()
    # keep integers as int so whole-degree Fahrenheit inputs return an int
    value = float(number) if '.' in number else int(number)
    if reading[-1] == "C":
        return round(value * 9/5 + 32, ndigits=1)
    return value
# quick sanity check of the converter on a three-digit Celsius value
print(temp_to_f('100°C'))

# print cleaned data sample
display(dfwater.head())

# numeric Fahrenheit column replaces the raw temperature string
dfwater['watertempavg'] = dfwater.watertemp.apply(temp_to_f)
# `columns=` keyword: a positional axis argument is rejected by pandas >= 2.0
dfwater = dfwater.drop(columns='watertemp')

# display the 60 venues whose average water temp is closest to the pro cutoff temp
dfwater['pro_diff'] = (dfwater.watertempavg - 71.42).abs()
dfwater = dfwater.sort_values(by='pro_diff', ascending=True)
dfwater['tempC'] = (dfwater.watertempavg - 32) * 5/9
display(dfwater.head(60))

# save as csv
#dfwater.to_csv('~/Documents/dfwater(2).csv', index=False)


In [None]:
# Capitalize the first letter of each word in the venuename and location;
# only the part of the location before the first comma is kept
dfwater["venuename"] = dfwater["venuename"].str.title()
dfwater["location"] = dfwater["location"].str.split(",").str[0].str.title()

# save as csv
dfwater.to_csv('~/Documents/dfwater(3).csv', index=False)


In [None]:
url = "https://www.roka.com/pages/average-ironman-water-temperatures"
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly rather than parse an error page

# pass the bytes straight to BeautifulSoup; a module-level `html` variable
# would shadow the `from lxml import html` import
soup = BeautifulSoup(response.content, 'lxml')

# find wetsuit data: one matching div per table row
wetsuit_data = soup.find_all("div", {'class': 'row_190425_water_temp_chart'})

# collect one plain record per row and build the frame once at the end
race_data_lst = []
for w in wetsuit_data:
    race_data = w.find_all("div", {'class': 'line_190425_water_temp_chart'})

    # a '|' in the fourth cell means a second suit type follows in the fifth;
    # the length check keeps short rows from raising IndexError
    if len(race_data) > 4 and race_data[3].text.strip() == '|':
        suit2 = '/' + race_data[4].text
    else:
        suit2 = ''
    race_data_lst.append(
        {'venuename': race_data[0].text,
         'watertemp': race_data[1].text,
         'suittype': (race_data[2].text + suit2)})

dfwater = pd.DataFrame(race_data_lst, columns=['venuename', 'watertemp', 'suittype'])
display(dfwater.head())


# lowercase the venue names and suit type names
for v in ['venuename', 'suittype']:
    dfwater[v] = dfwater[v].str.lower()

    
# convert celsius to fahrenheit
def temp_to_f(x):
    '''
    Takes a temperature str "x" (e.g. "72°F" or "22°C") and returns the
    numeric temp in F.

    The whole leading number is parsed (any number of digits, optional sign
    and decimal part), not just the first two characters.  When the string
    ends in "C" the value is converted from Celsius and rounded to one
    decimal; otherwise the parsed number is returned unchanged.

    Raises ValueError if "x" does not start with a number.
    '''
    reading = x.strip()
    match = re.match(r'[-+]?\d+(?:\.\d+)?', reading)
    if match is None:
        raise ValueError("temperature string %r does not start with a number" % (x,))
    number = match.group()
    # keep integers as int so whole-degree Fahrenheit inputs return an int
    value = float(number) if '.' in number else int(number)
    if reading[-1] == "C":
        return round(value * 9/5 + 32, ndigits=1)
    return value
# quick sanity check of the converter on a three-digit Celsius value
print(temp_to_f('100°C'))

# print cleaned data sample
display(dfwater.head())

# numeric Fahrenheit column replaces the raw temperature string
dfwater['watertempavg'] = dfwater.watertemp.apply(temp_to_f)
# `columns=` keyword: a positional axis argument is rejected by pandas >= 2.0
dfwater = dfwater.drop(columns='watertemp')

# display the 60 venues whose average water temp is closest to the pro cutoff temp
dfwater['pro_diff'] = (dfwater.watertempavg - 71.42).abs()
dfwater = dfwater.sort_values(by='pro_diff', ascending=True)
dfwater['tempC'] = (dfwater.watertempavg - 32) * 5/9
display(dfwater.head(60))

# display the top 15 venues that have the closest watertemp to the amateur cutoff temp
# (tempC depends only on watertempavg, so it does not need recomputing after the sort)
dfwater['amateur_diff'] = (dfwater.watertempavg - 76.1).abs()
dfwater = dfwater.sort_values(by='amateur_diff', ascending=True)
display(dfwater.head(15))

# check data
print(len(dfwater.venuename.unique()))
print(dfwater.suittype.value_counts())

# plot avg water temp distribution
fig, ax = plt.subplots()
ax.hist(dfwater.watertempavg, bins=20)
# optional/illegal cutoff temp (>76.1 wetsuits not allowed)
ax.axvline(76.1, color='C1', label='wetsuits not allowed above 76.1 F')
# mandatory/optional cutoff temp (<60.8 wetsuits mandatory)
ax.axvline(60.8, color='C2', label='wetsuits mandatory below 60.8 F')
ax.set(title='Average water temperature by venue',
       xlabel='average water temperature (F)',
       ylabel='number of venues')
ax.legend();


# save as csv

#dfwater.to_csv('dir/dfwater.csv', index=False)

## Export html

Before committing:
1. Save nb as html
2. Clear nb of output (cell -> All output -> clear)
3. Save nb
4. Commit!

In [51]:
# save html with results
# Shell escape: runs `jupyter nbconvert` on scrape_imdata.ipynb and writes the
# HTML rendering into ../jupyter_html/ (path is relative to the directory the
# command runs in). Save the notebook first: nbconvert reads the file on disk.

!jupyter nbconvert --output-dir='../jupyter_html/' --to html scrape_imdata.ipynb

[NbConvertApp] Converting notebook scrape_imdata.ipynb to html
[NbConvertApp] Writing 649467 bytes to ../jupyter_html/scrape_imdata.html
