<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Athlete-data" data-toc-modified-id="Athlete-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Athlete data</a></span></li><li><span><a href="#Wetsuit-data" data-toc-modified-id="Wetsuit-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Wetsuit data</a></span></li><li><span><a href="#Weather-and-water-temp-data" data-toc-modified-id="Weather-and-water-temp-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Weather and water temp data</a></span></li><li><span><a href="#Export-html" data-toc-modified-id="Export-html-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Export html</a></span></li></ul></div>

This code scrapes Ironman finisher data at the athlete level.

In [None]:
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
pd.set_option('display.max_columns', 100)

## Athlete data

In [None]:
# column labels for the scraped results table, listed in the same order
# as the fields appear on the webpage

dfcolumns = (
    # athlete / race identifiers
    ['athletename', 'athleteid', 'athleteurl', 'bib_n', 'gender', 'division',
     'divurl', 'konadiv', 'seriesid', 'seriesurl', 'raceid',
     'raceurl', 'racename', 'raceyear', 'racedate', 'racetype',
     'konayear', 'konaslots']
    # time + three rankings for each leg and for the overall result
    + [leg + stat
       for leg in ('swim', 'bike', 'run', 'overall')
       for stat in ('time', 'rank', 'rankdiv', 'rankgender')]
    + ['konarankdiv']
    # the same four fields for both transitions
    + [transition + stat
       for transition in ('trans1', 'trans2')
       for stat in ('time', 'rank', 'rankdiv', 'rankgender')]
    + ['finish', 'autokonaqual']
)


# define scraping function

def get_ath_df(userid):
    """Returns athlete-race level dataframe for one coachcox.co.uk athlete id.

    Downloads the athlete page, locates the inline script that declares
    ``imathleteresultstable``, decodes the JSON array assigned to
    ``imathleteresultsdata`` and labels the resulting frame with ``dfcolumns``.

    Parameters
    ----------
    userid : int
        Athlete id appended to the athlete URL.

    Returns
    -------
    pandas.DataFrame or None
        One row per entry of the embedded results array, or None when the
        page has no results script or no data array in it.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails or times out.
    ValueError
        If the embedded array is not valid JSON, or its width does not
        match ``dfcolumns``.
    """
    print("Aquiring data for user: %d       " % userid, end='\r')

    # get web response (the timeout keeps one stalled request from hanging a batch run)
    urlbase = "https://www.coachcox.co.uk/imstats/athlete/"
    response = requests.get(urlbase + str(userid), timeout=30)

    # translate to BeautifulSoup object
    soup = BeautifulSoup(response.content, 'lxml')

    # find the script holding the athlete data; pages without it mean "no data"
    script = soup.find("script", string=re.compile(r"var imathleteresultstable"))
    if script is None or not script.contents:
        return None
    script_text = str(script.contents[0])

    # within the JS script, find the array with relevant data
    marker = 'var imathleteresultsdata = '
    marker_pos = script_text.find(marker)
    if marker_pos == -1:
        return None
    data_pos = marker_pos + len(marker)
    while data_pos < len(script_text) and script_text[data_pos].isspace():
        data_pos += 1
    # Decode exactly one JSON value starting at the array. Unlike slicing up
    # to the first ';' in the script, this is not confused by semicolons that
    # occur before the marker or inside string values.
    data, _ = json.JSONDecoder().raw_decode(script_text, data_pos)

    # get athlete name from the page title
    # NOTE(review): the 13-character prefix length is hard-coded; confirm it
    # still matches the site's current <title> format.
    title = soup.find("title").contents[0]
    athletename = title[13:title.find(" Race Results")]

    # turn data into df and clean/label
    dfath = pd.DataFrame(data)
    dfath.insert(loc=0, column='athletename', value=athletename)
    dfath.columns = dfcolumns
    # each racedate cell is a mapping; keep only its 'd' entry
    dfath['racedate'] = dfath['racedate'].apply(lambda cell: cell['d'])

    return dfath


# example: fetch and preview the results of a single athlete

df = get_ath_df(userid=184495)
df.head(n=5)

In [None]:
# get data for a few users and append to one df

first_id = 23786   # first athlete id to request
n_ids = 10         # number of consecutive ids to try

df_array = []
nodata = []

for i in range(first_id, first_id + n_ids):
    dfath = get_ath_df(i)
    if isinstance(dfath, pd.DataFrame):
        df_array.append(dfath)
    else:
        # get_ath_df returns None when the page has no results
        nodata.append(i)

# `axis` has to be passed by keyword: pandas 2.x rejects positional arguments
# after `objs`. pd.concat also raises on an empty list, so fall back to an
# empty, correctly-labelled frame when every id failed.
if df_array:
    df_all = pd.concat(df_array, axis=0)
else:
    df_all = pd.DataFrame(columns=dfcolumns)

print("\ndone\n")
print("Total athletes captured: {}".format(len(df_all.athleteid.unique())))
print("Total athletes failed: {}".format(len(nodata)))

df_all.head()

## Wetsuit data

In [None]:
url = "https://www.roka.com/pages/average-ironman-water-temperatures"
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an error page

# keep the raw markup under its own name so the module-level `html`
# imported from lxml is not overwritten
page_html = response.content

soup = BeautifulSoup(page_html, 'lxml')

# find wetsuit data
wetsuit_data = soup.find_all("div", {'class': 'row_190425_water_temp_chart'})

# collect one plain record per table row and build the frame once at the end
race_data_lst = []
for w in wetsuit_data:
    race_data = w.find_all("div", {'class': 'line_190425_water_temp_chart'})
    if len(race_data) < 3:
        # row without the venue / temperature / suit cells: nothing to record
        continue
    # a '|' in the fourth cell is followed by a second suit type in the fifth
    if len(race_data) > 4 and race_data[3].text.strip() == '|':
        suit2 = '/' + race_data[4].text
    else:
        suit2 = ''
    race_data_lst.append(
        {'venuename': race_data[0].text,
         'watertemp': race_data[1].text,
         'suittype': (race_data[2].text + suit2)})

# explicit columns keep the frame well-formed even when no rows were found
dfwater = pd.DataFrame(race_data_lst, columns=['venuename', 'watertemp', 'suittype'])
display(dfwater.head())


# lowercase the venue names and suit type names
for v in ['venuename', 'suittype']:
    dfwater[v] = dfwater[v].str.lower()

    
# convert celsius to fahrenheit
def temp_to_f(x):
    '''
    Takes a temperature str "x" (e.g. "20°C" or "68°F") and returns the
    numeric temp in F.

    The leading number is parsed in full, so one- and three-digit, signed
    and decimal readings are handled. A string whose last non-blank
    character is "C"/"c" is converted from Celsius and rounded to one
    decimal; anything else is returned as read (int for whole numbers).

    Raises ValueError when "x" does not start with a number.
    '''
    text = x.strip()
    number = re.match(r'[-+]?\d+(?:\.\d+)?', text)
    if number is None:
        raise ValueError("cannot parse a temperature from {!r}".format(x))
    digits = number.group()
    value = float(digits) if '.' in digits else int(digits)
    if text[-1] in ('C', 'c'):
        return round(value * 9/5 + 32, ndigits=1)
    return value
# quick spot check of the converter on a three-digit Celsius reading
print(temp_to_f('100°C'))

# numeric Fahrenheit column; the raw text column is dropped by keyword because
# pandas 2.x no longer accepts the axis as a positional argument
dfwater['watertempavg'] = dfwater.watertemp.apply(temp_to_f)
dfwater = dfwater.drop(columns='watertemp')


# print cleaned data sample
display(dfwater.head())


# check data
print(len(dfwater.venuename.unique()))
print(dfwater.suittype.value_counts())

# plot avg water temp distribution
fig, ax = plt.subplots()
ax.hist(dfwater.watertempavg, bins=20)
# optional/illegal cutoff temp (>76.1 wetsuits not allowed)
ax.plot([76.1, 76.1], [0, 20], label='wetsuits not allowed above 76.1')
# mandatory/optional cutoff temp (<60.8 wetsuits mandatory)
ax.plot([60.8, 60.8], [0, 20], label='wetsuits mandatory below 60.8')
ax.set_xlabel('average water temperature (F)')
ax.set_ylabel('number of venues')
ax.legend()
plt.show()


# save as csv

#dfwater.to_csv('dir/dfwater.csv', index=False)

## Weather and water temp data

In [None]:
# define Haversine formula function

from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lng1, lat2, lng2, radius=6371):
    """
    Calculate the surface distance between two points
    on the earth from lat/lng pairs

    Parameters
    ----------
    lat1, lng1, lat2, lng2 : float
        Coordinates of the two points in decimal degrees.
    radius : float, optional
        Sphere radius; the result is in the same unit. Defaults to 6371
        (earth radius in km); pass 3956 for miles.

    Returns
    -------
    float
        Great-circle distance between the two points.
    """
    # convert degrees to radians
    lat1, lng1, lat2, lng2 = map(radians, [lat1, lng1, lat2, lng2])

    # haversine formula
    dlng = lng2 - lng1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlng/2)**2
    # floating-point rounding can leave `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make asin() raise a domain error
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    return c * radius

In [None]:
# load the race/weather template csv and preview it

template_path = '../data/api_temp/race_weather_data.csv'
df = pd.read_csv(template_path)
df.head()

In [None]:
# NOAA NCEI API request
# https://www.ncei.noaa.gov/support/access-data-service-api-user-documentation

# San Diego lat/lng: 32.7157° N, 117.1611° W

import requests

def get_temp_data(lat, lng, date):
    """
    Queries the NOAA NCEI API to retrieve surface sea temperature data taken by
    a ship or buoy closest to (lat, lng) on date.

    The search starts with a +/-0.5 degree bounding box around (lat, lng) and
    grows it by a factor of 1.5 per round until at least one observation with
    a sea surface temperature is returned, or the half-width exceeds 5 degrees.

    Parameters
    ----------
    lat, lng : float
        Decimal-degree coordinates of the point of interest.
    date : str or datetime-like
        Day to query; anything ``pd.to_datetime`` accepts (e.g. '2017-12-20').

    Returns
    -------
    pandas.DataFrame
        The observation(s) of the nearest station whose time of day is
        closest to 05:00, with added ``dist`` (km) and ``timediff`` (hours)
        columns. Empty when no station was found within the upper bound.

    Raises
    ------
    requests.RequestException
        If a request fails, times out, or returns an HTTP error status.
    """

    base = 'https://www.ncei.noaa.gov/access/services/data/v1?dataset=global-marine&dataTypes=AIR_TEMP,WIND_SPEED,AMT_PRECIP,SEA_SURF_TEMP&format=json&boundingBox='

    # Derive the query window from the `date` argument (previously the URL
    # used undefined `startdate`/`enddate` names and ignored `date`).
    # NOTE(review): assumes a date-only endDate covers that whole day - confirm
    # against the API docs, and widen to a date-time range if it does not.
    startdate = enddate = pd.to_datetime(date).strftime('%Y-%m-%d')

    # iteratively increase tol until at least one station is found
    tol = 0.5
    search_rate = 1.5 # each round update tol to be tol * search_rate
    ub = 5 # upper bound - stop searching if this level exceeded
    while True:
        # print current level for tol
        print(tol)

        # set bounding box bounds around swim start lat/lng pair
        nb = lat + tol
        wb = lng - tol # TODO should we multiply by 1/2? I think not
        sb = lat - tol
        eb = lng + tol

        # compose URL
        url = base + ','.join(str(bound) for bound in (nb, wb, sb, eb)) + \
            '&startDate=' + startdate + '&endDate=' + enddate

        # request json object, convert to data frame object
        resp = requests.get(url=url, timeout=60)
        resp.raise_for_status()
        dftmp = pd.DataFrame(resp.json())
        dftmp.columns = [str(x).lower() for x in dftmp.columns]

        # keep only obs with temperature observed; a response that lacks the
        # column altogether counts as "no observations"
        if 'sea_surf_temp' in dftmp.columns:
            dftmp = dftmp.loc[dftmp.sea_surf_temp.notnull()].copy()
        else:
            dftmp = dftmp.iloc[0:0].copy()

        # case with no stations
        if len(dftmp) == 0:
            tol *= search_rate
            if tol > ub:
                print("Failed to find close enough station.")
                break

        # case with at least one observation
        else:
            # get closest buoy/ship
            dftmp['dist'] = dftmp.apply(
                lambda x: haversine(float(x.latitude), float(x.longitude), lat, lng), axis=1)
            dftmp = dftmp.loc[dftmp.dist == min(dftmp.dist), :].copy()

            # in case multiple observations, take temp closest to 5AM (approx when water temp taken for races)
            # NOTE(review): uses the hour of the `date` field as delivered; confirm its time zone
            dftmp['timediff'] = abs(pd.to_datetime(dftmp.date).apply(lambda x: x.hour + x.minute/60 - 5))
            dftmp = dftmp.loc[dftmp.timediff == min(dftmp.timediff), :]
            break

    return dftmp

# try the lookup for two coastal locations on the same day
check_date = '2017-12-20'

# San Diego
dftmp = get_temp_data(lat=32.716, lng=-117.1611, date=check_date)
display(dftmp)

# Mar Del Plata
dftmp = get_temp_data(lat=-38.005, lng=-57.539, date=check_date)
display(dftmp)

# sea_surf_temp is degrees celsius * 10, I believe

## Export html

Before committing:
1. Save nb as html
2. Clear nb of output (cell -> All output -> clear)
3. Save nb
4. Commit!

In [None]:
# save html with results
# (converts scrape_imdata.ipynb to HTML and writes it into ../jupyter_html/;
# run this before clearing the notebook's outputs so they end up in the export)

!jupyter nbconvert --output-dir='../jupyter_html/' --to html scrape_imdata.ipynb