<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-and-explore-data" data-toc-modified-id="Import-and-explore-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import and explore data</a></span><ul class="toc-item"><li><span><a href="#Load-race-specific-variables" data-toc-modified-id="Load-race-specific-variables-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load race-specific variables</a></span></li><li><span><a href="#Set-up-RD" data-toc-modified-id="Set-up-RD-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Set up RD</a></span></li></ul></li><li><span><a href="#Effect-of-wetsuits" data-toc-modified-id="Effect-of-wetsuits-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Effect of wetsuits</a></span></li><li><span><a href="#Export-html" data-toc-modified-id="Export-html-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Export html</a></span></li></ul></div>

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
import matplotlib.pyplot as plt

## Import and explore data

In [None]:
# Read the main results CSV into `df`, print the number of missing values
# per column, and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/dfm_20201022.csv')
missing_per_column = df.isnull().sum()
print(missing_per_column)
df.head(n=5)

In [None]:
# integer vars
# NOTE: the cast raises if any of these columns contains missing values.

int_columns = ['seriesid', 'raceid', 'raceyear', 'bib_n', 'autokonaqual']
for col in int_columns:
    df[col] = df[col].astype('int')

In [None]:
# examine finishers by year: counts of each `finish` status per race year,
# with row/column totals (`margins` is a boolean flag; the totals are
# labelled 'All' by default)
display(pd.crosstab(df.raceyear, df.finish, margins=True))

# examine percent finishers by year: same table, but each row is normalized
# to sum to 1, i.e. values are fractions of that year's rows
display(pd.crosstab(df.raceyear, df.finish, normalize='index'))

In [None]:
# number of race-years: distinct race names, then distinct (name, date) pairs
race_names = df['racename'].unique()
print(len(race_names))
race_dates = df[['racename', 'racedate']].drop_duplicates()
print(len(race_dates))

# number of races by year
races_by_year = df[['racename', 'raceyear']].drop_duplicates()
display(races_by_year['raceyear'].value_counts().sort_index())

# number of years by race venue
# Race names in this notebook look like 'Ironman Chattanooga 2018', so
# dropping the first 8 characters ('Ironman ') and the last 5 (' 2018')
# leaves the venue name.
df['venuename'] = df['racename'].map(lambda name: name[8:-5])
venue_years = df[['venuename', 'raceyear']].drop_duplicates()
display(venue_years['venuename'].value_counts())

In [None]:
# clean swim times: set to zero if swim cancelled

# weird circumstances:
# Hamburg 2018: swim replaced with run

noswim_races = [
    'Ironman Chattanooga 2018',
    'Ironman Florida 2014',
    'Ironman Hamburg 2018',
    'Ironman Ireland 2019',
    'Ironman Louisville 2019',
    'Ironman Maryland 2016',
    'Ironman Western Australia 2017',
    'Ironman New Zealand 2006',
]

# 0/1 flag marking rows that belong to a race without a swim leg,
# followed by the share of rows in each category
swim_cancelled = df.racename.isin(noswim_races)
df['noswim'] = swim_cancelled.astype('int')
print(df.noswim.value_counts() / len(df))


# if swim cancelled, set swim time to 0 instead of missing
# (the first transition time is zeroed as well)
for time_col in ('swimtime', 'trans1time'):
    df.loc[swim_cancelled, time_col] = 0

In [None]:
# gen variable to designate races that were shortened or otherwise adjusted
#
# Every race in `noswim_races` counts as adjusted. Three of them were also
# changed in other ways; they are not repeated in the list below, but the
# details are kept here:
#   Ironman Maryland 2016: bike course shortened, swim cancelled
#   Ironman New Zealand 2006: bike and run halved, swim cancelled
#   Ironman Western Australia 2017: bike shortened for slower athletes only

short_races = noswim_races + [
    'Ironman Argentina 2018', # swim shortened
    "Ironman Coeur d'Alene 2007", # swim optional, though not for WC qualification
    'Ironman Cozumel 2013', # swim shortened
    'Ironman France 2019', # bike and run shortened b/c heat
    'Ironman Lake Placid 2014', # swim shortened for amateurs only (for 1/3 of amateurs)
    'Ironman Louisville 2018', # swim shortened
    'Ironman Maryland 2015', # swim shortened
    'Ironman Melbourne 2013', # swim shortened
    'Ironman New Zealand 2012', # all three events halved
    'Ironman North Carolina 2016', # bike shortened
    'Ironman South Africa 2019', # swim shortened
    'Ironman Taiwan 2018', # swim shortened
    'Ironman Texas 2016', # bike shortened
]

# other weird stuff:
# Cozumel 2017/9, Louisville 2014, New York 2012 had irregularly fast currents during swim
# Texas 2017/8 bike short by 2 miles
# Florida 2018: Hurricane Michael caused change in date and venue, field size less than half of registrants
# St George 2012: Strong winds made for a particularly challenging swim, lots of DNFs
# Frankfurt 2019, Chattanooga 2016/9, etc: Super hot, strong influence on DNFs, particularly during run

# 0/1 flag marking rows that belong to an adjusted race, followed by the
# share of rows in each category
df['short'] = df.racename.isin(short_races).astype(int)
print(df.short.value_counts() / len(df))

In [None]:
# check swim finishers per race
# Only races not flagged as shortened/adjusted are considered. The crosstab
# columns are the booleans False/True for `swimtime > 0` (a missing swim time
# compares as False), so rows are sorted by the True column, i.e. the share of
# rows with a positive swim time, lowest first. The column is addressed by its
# actual label `True` rather than the integer 1, which only matches when the
# column index treats 1 and True as the same key.
dfsub = df.loc[df.short == 0, ['racename', 'venuename', 'swimtime']]
display(pd.crosstab(dfsub.venuename, dfsub.swimtime > 0, normalize='index').sort_values(True))

# check a few of the offenders:
venuenames = ['St George',
              'New York',
              'Argentina',
              'Florida',
              'Vineman']
dfsub = dfsub.loc[dfsub.venuename.isin(venuenames), :]
display(pd.crosstab(dfsub.racename, dfsub.swimtime > 0, normalize='index').sort_values(True))

# check the worst races for DNFs: the 20 races with the lowest share of 'FIN'
# among rows whose finish status is either 'DNF' or 'FIN'
dfsub = df.loc[df.finish.isin(['DNF', 'FIN']), ['racename', 'finish']]
display(pd.crosstab(dfsub.racename, dfsub.finish, normalize='index').sort_values('FIN').head(20))

# Race temperature and winds likely affect performance enough that controlling for weather may
#  improve statistical power...
# When we pull water temperature data, it may be useful to get other weather data too.

In [None]:
# fix kona year and dates for a few races
# Each table maps a race name to the value written to every row of that race.
konayear_fixes = {
    "Ironman St George 2011": 2011,
    "Ironman St George 2010": 2010,
    "Ironman Regensburg 2011": 2011,
    "Ironman Malaysia 2006": 2006,
    "Ironman Malaysia 2007": 2007,
    "Ironman China 2009": 2009,
    "Ironman China 2010": 2010,
}
racedate_fixes = {
    "Ironman China 2010": "14 Mar 2010",
    "Ironman China 2009": "19 Apr 2009",
    "Ironman Malaysia 2006": "26 Feb 2006",
    "Ironman Malaysia 2007": "24 Feb 2007",
}

for fix_name, fix_year in konayear_fixes.items():
    df.loc[df.racename == fix_name, 'konayear'] = fix_year
for fix_name, fix_date in racedate_fixes.items():
    df.loc[df.racename == fix_name, 'racedate'] = fix_date

# Report remaining missing kona years BEFORE the integer cast: the cast fails
# on missing values, so a null count taken afterwards could only ever be 0.
print(df.konayear.isnull().sum())
df['konayear'] = df.konayear.astype(int)
print(df.konayear.value_counts().sort_index())

### Load race-specific variables

In [None]:
# load df with slots per ag per race, then preview the first rows
dfs = pd.read_csv(filepath_or_buffer='../data/dfslot_full_wide.csv')
dfs.head(n=5)

In [None]:
# merge race-specific vars, like # finishers, % finishers
#
# Builds `dfrace`, one row per (racename, venuename, seriesid, raceyear,
# racedate, konayear) combination, from a copy of `df`. Relies on the
# `venuename`, `noswim`, `short` and `konayear` columns set up in earlier cells.

dfrace = df.copy()

# get dnf rates by sport - only count one sport for DNF
# A DNF row is attributed to the first leg (swim, then bike, then run) whose
# time is missing.
is_dnf = dfrace.finish == "DNF"
dfrace['swimdnf'] = (dfrace.swimtime.isnull() & is_dnf).astype(int)
dfrace['bikednf'] = (dfrace.biketime.isnull() & is_dnf
                     & (dfrace.swimdnf == 0)).astype(int)
dfrace['rundnf'] = (dfrace.runtime.isnull() & is_dnf
                    & (dfrace.bikednf + dfrace.swimdnf == 0)).astype(int)
dfrace['dnf'] = is_dnf.astype(int)
dfrace['dq'] = (dfrace.finish == "DQ").astype(int)

# get participant and finisher counts
# Per-row 0/1 indicators are summed within each race name and broadcast back
# to every row, so the mean taken below returns the count itself.
dfrace['participants'] = dfrace.finish.notnull().astype(int)
dfrace['finishers'] = (dfrace.finish == "FIN").astype(int)
dfrace['participants'] = dfrace.groupby('racename')['participants'].transform('sum')
dfrace['finishers'] = dfrace.groupby('racename')['finishers'].transform('sum')

# gender split
dfrace['female'] = (dfrace.gender == 'Female').astype(int)

# aggregate to race-level
# The mean of a 0/1 indicator is a rate (dnf columns, dq, female); the mean of
# a per-race constant (noswim, short, participants, finishers) is that constant.
# `.mean()` is used instead of passing `np.mean`, which newer pandas versions
# warn about.
group_cols = ['racename', 'venuename', 'seriesid', 'raceyear',
              'racedate', 'konayear']
mean_cols = ['noswim', 'short', 'swimdnf', 'bikednf',
             'rundnf', 'dnf', 'dq', 'participants',
             'finishers', 'female']
dfrace = dfrace.groupby(group_cols, as_index=False, observed=False)[mean_cols].mean()

# integer vars
for col in ['noswim', 'short', 'participants', 'finishers']:
    dfrace[col] = dfrace[col].astype(int)

dfrace.head()

In [None]:
# TODO: fix this inconsistency
# (presumably seriesid 54 is attached to more than one race name -- confirm
# against the counts shown below)
df.loc[df.seriesid == 54, 'racename'].value_counts()

### Set up RD

In [None]:
# add cutoff times to dfs
# Division codes: a gender prefix (F/M) followed by an age band, plus the
# PC and PRO codes for each gender. The male codes additionally include
# 'M80-84' next to 'M80+'.
age_bands = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
             '55-59', '60-64', '65-69', '70-74', '75-79', '80+']
divlist = (['F' + band for band in age_bands] + ['FPC', 'FPRO']
           + ['M' + band for band in age_bands] + ['M80-84', 'MPC', 'MPRO'])
# distinct race names found in df
racelist = df['racename'].unique()

## Effect of wetsuits

## Export html

Before committing:
1. Save the notebook as HTML (e.g. with the `nbconvert` cell below)
2. Clear all notebook output (Cell -> All Output -> Clear)
3. Save the notebook
4. Commit!

In [None]:
# Shell command: convert analyze_imdata.ipynb to HTML and write the result to
# ../jupyter_html/. nbconvert reads the notebook file from disk.
# NOTE(review): assumes this notebook is saved as analyze_imdata.ipynb -- confirm.
!jupyter nbconvert --output-dir='../jupyter_html/' --to html analyze_imdata.ipynb