In [1]:
# Script for extracting, converting, and saving results from the 2024 USA Marathon Olympic Trials

In [2]:
import requests
import json
import os

import pandas as pd
import numpy as np

from scipy.interpolate import interp1d

flag_save = False

# Extract Results

In [3]:
# Function for converting API response to a table
# Takes the raw response as input
# Assumes that the data is in the `list` item 

def extract_results_table(_txt):
    
    _json = json.loads(_txt)
    _df = pd.DataFrame(_json['list'])

    return _df

In [4]:
payload = {
    'timesort': '1',
    'nohide': '1',
    'checksum': '', 
    'appid': '65328519e78ff0366f242153',
    'token': '0CB822ADFDC6C77C4394',
    'max': '999',
    'catloc': '1',
    'cattotal': '1',
    'units': 'standard',
    'source': 'webtracker'
}


In [5]:
# Build list of URLs

url_list = []    

url_base = 'https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-{}-marathon/splits/{}M'

for g in ['men', 'women']:
    for i in range(1,27):
        url = url_base.format(g, i)
        url_list.append(url)

    url_list.append('https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-{}-marathon/splits/FINISH'.format(g))

# Go get results!
    
list_results = []

for url in url_list:
    print(url)
    response = requests.post(url, data=payload)
    response_text = response.text
    response_code = response.status_code

    if response_code == 200:
        df_split_result = extract_results_table(response_text)
        list_results.append(df_split_result)

    else:

        print(response_code)
        print(response_text)  # Prints the response body

df_results = pd.concat(list_results, axis=0)

https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/1M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/2M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/3M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/4M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/5M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/6M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/7M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/8M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/9M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/10M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-men-marathon/splits/11M
https://api.rtrt.me/events/ORLANDO-TRIALS-2024/categories/top-m

In [6]:
# Do some conversions

df_results['point'] = np.where(df_results['point']=='FINISH', '26.2M', df_results['point'])

df_results['distance'] = df_results['point'].str.slice(0, -1).astype(np.float64)

df_results['time_sec'] = pd.to_timedelta(df_results['time']).dt.total_seconds()
df_results['time_min'] = df_results['time_sec'] / 60.0

In [7]:
# Save raw results
if flag_save:
    print('Saving raw results')
    df_results.to_csv(os.path.join('..', 'public', 'data', 'result_raw_by_mile.csv'), index=False)

In [8]:
# Extract final placing for each runner, so we can attach it to the results

# Final placing
df_results_final_by_runner = df_results.loc[df_results['isFinish']=='1', ['name', 'sex', 'place', 'isFinish']]

# Get info on DNFs
df_results_last_marker = df_results.groupby(['name', 'sex'], as_index=False).agg({'distance':'max', 'time_min':'max'})

# Combine:

df_results_runner_detail = df_results_last_marker.merge(df_results_final_by_runner, on=['name', 'sex'], how='left')
# Set place to -1 for non-finishers
df_results_runner_detail['place'] = df_results_runner_detail['place'].fillna(-1)
df_results_runner_detail['flag_finished'] = df_results_runner_detail['isFinish'].fillna(0).astype(np.int32)

df_results_runner_detail = df_results_runner_detail.rename(columns={'distance':'last_distance', 'time_min':'last_time', 'place':'final_place'})
df_results_runner_detail = df_results_runner_detail[['name', 'sex', 'flag_finished', 'last_distance', 'last_time', 'final_place']]


In [9]:
# Append df_results_runner_detail to df_results:

df_results_full = df_results.merge(df_results_runner_detail, on=['name', 'sex'], how='left')


In [10]:
# pd.set_option('display.max_columns', None)
# df_results.sample(5)

flds = ['name', 'sex', 'bib_display', 'label', 'place', 'placeChange', 'isFinish', 'distance', 
        'time', 'time_sec', 'time_min', 'splitTime', 'netTime', 'waveTime', 'epochTime', 'timestamp', 
        'flag_finished', 'last_distance', 'last_time', 'final_place']

df_results_clean_men = df_results_full.loc[df_results_full['sex'] == 'M', flds]
df_results_clean_women = df_results_full.loc[df_results_full['sex'] == 'F', flds]
df_results_clean_all = df_results_full[[flds]]


KeyError: "None of [Index([('name', 'sex', 'bib_display', 'label', 'place', 'placeChange', 'isFinish', 'distance', 'time', 'time_sec', 'time_min', 'splitTime', 'netTime', 'waveTime', 'epochTime', 'timestamp', 'flag_finished', 'last_distance', 'last_time', 'final_place')], dtype='object')] are in the [columns]"

In [None]:
# Save the raw results by mile:
# flag_save=True

if flag_save:
    print('Saving results by mile')
    df_results_clean_men.to_csv(os.path.join('..', 'public', 'data', 'result_men_by_mile.csv'), index=False)
    df_results_clean_women.to_csv(os.path.join('..', 'public', 'data', 'result_women_by_mile.csv'), index=False)
    df_results_clean_women.to_csv(os.path.join('..', 'public', 'data', 'result_all_by_mile.csv'), index=False)

Saving results by mile


In [None]:
# Interpolate distances for fixed time intervals

In [None]:
def find_interpolated_distances(_df, new_x, fld_x='time_min', fld_y='distance'):
    '''
        _df: Dataframe of actual values; must contain the fields specified by: 
        fld_x: Name of field with x values
        fld_y: Name of field with y values
        new_x: List of new x values
    '''

    # We need to deal with non-finishers:
    #max_len = len(_df)
    new_x_trimmed = new_x #[:max_len]

    f = interp1d(_df[fld_x], _df[fld_y], kind='linear', bounds_error=False, fill_value="extrapolate")
    _estimated_distances = f(new_x_trimmed)
    
    # Let's make sure we cap things at the finish
    _estimated_distances = np.where(_estimated_distances > 26.2, 26.2, _estimated_distances)

    return _estimated_distances

In [None]:
def make_interpolated_results_table(_df, runner_name, time_values):
    '''
        _df: Table with fields including 'name', 'distance', 'time_min'
        runner_name: Name of runner that appears in the 'name' field
        time_values: List of time values for which we interpolate distance
    '''

    _df_sample = _df[_df['name'] == runner_name].copy().reset_index()

    # This is not elegant; it adds zero values so we have complete data from 0 to 26.2:
    _df_sample.loc[len(_df_sample)] = {'name': runner_name, 'distance': 0.0, 'time_min': 0.0}

    _distances = find_interpolated_distances(_df_sample, time_values, 'time_min', 'distance')

    # Make new df with results; trim the times for non-finishers

    _df_new = pd.DataFrame({'time_min': time_values[:len(_distances)], 'distance': _distances})
    _df_new['name'] = runner_name

    return _df_new

    

In [None]:
# Get list of runners

df_runners = df_results.groupby(['name', 'sex'], as_index=False).agg({'distance':'nunique', 'time_min':'max'})

In [None]:
# Loop through and estimate distances for specific time splits

step = 1.0

times = np.arange(0, 181, step).tolist()

list_df_distances = []

for i, r in df_runners.iterrows():
    print(r['name'], r['distance'])
    df_dist_tmp = make_interpolated_results_table(df_results, r['name'], times)
    #df_dist_tmp['gender'] = r['sex']
    df_dist_tmp['sex'] = r['sex']

    list_df_distances.append(df_dist_tmp)

df_distances = pd.concat(list_df_distances, axis=0)


Aaron Davidson 13
Aaron Gruen 18
Abdi Abdirahman 8
Abigail McNulty 27
Abinet Adraro 15
Adam Sjolund 27
Adam Wollant 27
Adrian Walsh 27
Afewerki Zeru 27
Aidan Reed 27
Alan Peterson 27
Alberto Mena 27
Alex Norstrom 27
Alexander Burks 27
Alexander Taylor 18
Alexandra Greitzer 27
Aliphine Tuliamuk 10
Allie Schaich 27
Alyssa Bloomquist 17
Amanda Phillips 27
Amber Zimmerman 27
Amelia Keyser-Gibson 27
Amy Davis 27
Andrea Pomaranski 27
Andrew Bowman 17
Andrew Colley 19
Andrew McCann 27
Ann Marie Pierce 27
Ann Marie Tuxbury 27
Anna West 16
Anne-Marie Blaney 27
Annie Frisbie 27
Annie Heffernan 27
Ariane Hendrix 27
Ashlee Powers 27
Austin Bogina 27
Ava Nuttall 27
Awet Beraki 27
Ben Olson 18
Benjamin Decker 27
Benjamin Kendell 27
Benjamin Payne 27
Benjamin Schneiderman 27
Betsy Saina 22
Billie Hatch 24
Biya Simbassa 27
Bradley Taylor 12
Breanna Sieracki 27
Brendan Martin 27
Bria Wetsch 27
Brian Harvey 27
Brian Masterson 19
Brian Shrader 17
Bridget Lyons Belyeu 27
Britney Romero 27
Brittney Feivor 

In [None]:
# Next, add  overall results

df_distances = df_distances.merge(df_results_runner_detail, on=['name', 'sex'], how='left')

# Handle cases where we've interpolated distances that are beyond the last known distance for non-finishers
df_distances['distance'] = np.minimum(df_distances['distance'], df_distances['last_distance'])


In [None]:
# Now compute the distance from 3rd 

def difference_from_nth_largest(group, n=3):
    # Find the 3rd largest distance in the group
    if len(group) >= n:
        nth_largest_distance = group['distance'].nlargest(n).iloc[-1]
    else:
        # If the group has less than 3 elements, use the smallest distance
        nth_largest_distance = group['distance'].min()
    
    # Compute the distance gap, assuming distance is miles!
    group['distance_diff_m'] = 1609.0 * (group['distance'] - nth_largest_distance)
    return group

# Apply the function to each group
df_distances = df_distances.groupby(['time_min', 'sex'], as_index=False).apply(difference_from_nth_largest, n=3)

df_distances['rank'] = df_distances.groupby(['time_min', 'sex'], as_index=False)['distance'].rank(method='min', ascending=False)


In [None]:
# Save the raw results by mile:

step_i = int(step)
if flag_save:
    print('Saving results by {} minute interval'.format(step_i))
    df_distances[df_distances['sex']=='M'].to_csv(os.path.join('..', 'public', 'data', 'result_men_by_{}min.csv'.format(step_i)), index=False)
    df_distances[df_distances['sex']=='F'].to_csv(os.path.join('..', 'public', 'data', 'result_women_by_{}min.csv'.format(step_i)), index=False)
    df_distances.to_csv(os.path.join('..', 'public', 'data', 'result_all_by_{}min.csv'.format(step_i)), index=False)


Saving results by 1 minute interval


In [None]:
df_distances.agg({'time_min':'max'})

time_min    180.0
dtype: float64

# Analysis


df_results.groupby('isFinish').agg({'name':'count'})

In [None]:
df_results_final_by_runner

Unnamed: 0,name,sex,place,isFinish
0,Conner Mantz,M,1,1
1,Clayton Young,M,2,1
2,Leonard Korir,M,3,1
3,Elkanah Kibet,M,4,1
4,CJ Albertson,M,5,1
...,...,...,...,...
112,Kir Selert Faraud,F,113,1
113,Ariane Hendrix,F,114,1
114,Johanna Butler,F,115,1
115,Tara Welling,F,116,1


In [None]:
df_results_last_marker.groupby(['sex', 'distance']).agg({'name':'nunique'})


Unnamed: 0_level_0,Unnamed: 1_level_0,name
sex,distance,Unnamed: 2_level_1
F,6.0,2
F,8.0,1
F,10.0,4
F,12.0,3
F,13.0,1
F,16.0,1
F,17.0,3
F,18.0,7
F,19.0,2
F,20.0,3


In [None]:
df_results_runner_detail

Unnamed: 0,name,sex,flag_finished,last_distance,last_time
0,Aaron Davidson,M,0,13.0,69.296833
1,Aaron Gruen,M,0,18.0,92.631000
2,Abdi Abdirahman,M,0,8.0,40.095500
3,Abigail McNulty,F,1,26.2,157.303500
4,Abinet Adraro,M,0,15.0,77.486833
...,...,...,...,...,...
343,Zachary Hine,M,1,26.2,148.116667
344,Zachary Ornelas,M,1,26.2,144.129833
345,Zachary Ripley,M,1,26.2,157.798333
346,Zachery Panning,M,1,26.2,130.825333
