# UTA split scraping

In [116]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import sys

## Step 1: get the bib numbers

In [169]:
# grab the UTA100 results home page
URL = "https://uta.livetrail.run/classement.php?course=UTA100&cat=scratch"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'xml')

# find all the racer elements in the results table
results = soup.find_all(lambda tag: tag.name == 'c' and tag.has_attr('a3'))

# extract the bib number of each racer
bibs = []
for r in results:
    bibs.append(r['doss'])

print(bibs[:5])

['1723', '6', '89', '15', '2']


## Step 2: get the timing point information

In [170]:
# grab the results page for the first finisher
url = 'https://uta.livetrail.run/coureur.php?rech={:s}'.format(bibs[0])
page = requests.get(url)
soup = BeautifulSoup(page.content, 'xml')

# collect the details of each timing point
timing_points = soup.find_all('pt')
altitude = []
elev_gain = []
point_id = []
distance = []
point_name = []
for pt in timing_points:
    altitude.append(pt['a'])
    elev_gain.append(pt['d'])
    point_id.append(pt['idpt'])
    distance.append(pt['km'])
    point_name.append(pt['n'])
    
# form a DataFrame to organise the information
timing_points_df = pd.DataFrame(
    {'Name': point_name,
     'Distance (km)': np.float64(distance),
     'Altitude (m)': np.float64(altitude),
     'Elev. gain since start (m)': np.float64(elev_gain)},
    index=np.int32(point_id),
)
timing_points_df.index.name = 'ID'
timing_points_df.to_csv('../data/UTA100_2021_timing_points.csv', index=False)
timing_points_df

Unnamed: 0_level_0,Name,Distance (km),Altitude (m),Elev. gain since start (m)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,KCC OVAL,0.0,955.0,0.0
1,1km,1.0,984.0,58.0
3,3km,3.0,984.0,100.0
4,Narrow Neck,11.4,1024.0,690.0
8,Tarros Ladders,21.8,934.0,857.0
10,Foggy Knob,32.1,625.0,1090.0
12,Ironpot Ridge Turnaround,34.7,756.0,1253.0
14,Six Foot Track,45.7,576.0,1531.0
16,Katoomba Aquatic Centre,57.0,981.0,2038.0
18,Fairmont Resort Water Point,69.2,919.0,2990.0


## Step 3: get the splits

In [164]:
racers = []  # this list will hold all the data
i = 0
for b in bibs:
    sys.stdout.write(
        '\rProcessed {:d} of {:d} racers...          '.format(i, len(bibs))
    )
    
    # grab the results page for each racer
    url = 'https://uta.livetrail.run/coureur.php?rech={:s}'.format(b)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'xml')

    # collect the racer's information
    identity = soup.find('identite').attrs
    try:
        start_group = soup.find('divers2').contents[0]
    except IndexError:
        start_group = ''  # some runners don't have a group (?)
    
    identity = {**identity, 'Start Group': start_group}

    # find the racer's split data
    splits = soup.find_all(lambda tag: tag.name == 'e' and tag.has_attr('clt'))

    # collect the split data into lists
    place = []
    arrival_time = []
    departure_time = []
    point_id = []
    split_time = []
    for s in splits:
        place.append(s['clt'])

        try:
            arrival_time.append(s['ha'])
        except KeyError:
            arrival_time.append('')  # some split times are missed

        try:
            departure_time.append(s['hd'])  # only major checkpoints have this
        except KeyError:
            departure_time.append('')

        point_id.append(s['idpt'])
        split_time.append(s['tps'])

    # form a DataFrame to organise the data
    splits_df = pd.DataFrame(
        {'Split time': split_time,
         'Place': place,
         'Arrival time': arrival_time,
         'Departure time': departure_time},
        index=np.int32(point_id),
    )
    splits_df.index.name = 'ID'
    
    # condense the DataFrame into a single dict for each runner
    splits_dict = dict()
    for pt in timing_points_df.index:
        name = timing_points_df.loc[pt]['Name']
        try:
            split_time = splits_df.loc[pt]['Split time']
            place = splits_df.loc[pt]['Place']
            arrival_time = splits_df.loc[pt]['Arrival time']
            departure_time = splits_df.loc[pt]['Departure time']
        except KeyError:
            split_time = ''
            place = ''
            arrival_time = ''
            departure_time = ''
            
        splits_dict[name + ' split time'] = split_time
        splits_dict[name + ' place'] = place
        splits_dict[name + ' arrival time'] = arrival_time
        splits_dict[name + ' departure time'] = departure_time
    
    # output the runner's identity information and results in one dict
    racers.append({**identity, **splits_dict})
    i += 1
    
sys.stdout.write(
    '\rProcessed {:d} of {:d} racers...          '.format(i, len(bibs))
)
print('\nDone!')

Processed 1129 of 1129 racers...          
Done!


In [165]:
# tidy up and save the data
racers_df = pd.DataFrame(racers)
racers_df = racers_df.rename(columns={
    'nom': 'Surname',
    'prenom': 'First Name',
    'cat': 'Category',
    'sx': 'Gender',
    'descat': 'Age',
    'club': 'Club',
    'nat': 'Australia',
})
racers_df = racers_df.drop(columns=['cid', 'cio', 'a2', 'a3', 'ville', 'pays'])

def gender(code):
    if code == 'H':
        return 'Male'
    elif code == 'W':
        return 'Female'
    else:
        return 'Other'

racers_df['Gender'] = racers_df['Gender'].apply(gender)
racers_df.to_csv('../data/UTA100_2021_split_times.csv', index=False)
racers_df

Unnamed: 0,Surname,First Name,Category,Gender,Age,Club,Australia,Start Group,KCC OVAL split time,KCC OVAL place,...,Sewage Treatment Works arrival time,Sewage Treatment Works departure time,Base of Furber Steps split time,Base of Furber Steps place,Base of Furber Steps arrival time,Base of Furber Steps departure time,Scenic World split time,Scenic World place,Scenic World arrival time,Scenic World departure time
0,Pellow,Matthew,Vet M,Male,30-39,,Australia,Start Group 1,00:00:00,-,...,15:24:32,,09:36:57,1,15:56:57,,09:51:32,1,16:11:32,
1,Armstrong,Vajin,MasterM,Male,40-49,Sri Chinmoy Marathon Team,New Zealand,Start Group 1,00:00:00,-,...,15:32:12,,09:50:57,2,16:10:57,,10:04:28,2,16:24:28,
2,Dimuantes,Michael,OpenM,Male,18-29,UWA Running Club,Australia,Start Group 1,00:00:00,-,...,15:37:59,,09:56:58,3,16:16:58,,10:09:37,3,16:29:37,
3,Crehan,Matthew,Vet M,Male,30-39,La Sportiva,Australia,Start Group 1,00:00:00,-,...,15:47:00,,10:02:52,4,16:22:52,,10:16:08,4,16:36:08,
4,Davies,Brendan,MasterM,Male,40-49,UP Coaching,Australia,Start Group 1,00:00:00,-,...,15:50:38,,10:09:16,5,16:29:16,,10:24:07,5,16:44:07,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,Hill,James,OpenM,Male,18-29,,Australia,Start Group 7,00:00:00,-,...,09:55:39,,27:16:54,1052,11:10:54,,27:47:33,1124,11:41:33,
1125,Mcinnes@,Rachel,OpenW,Other,18-29,,Australia,Start Group 7,00:00:00,-,...,09:50:51,,27:16:38,1050,11:10:38,,27:47:42,1126,11:41:42,
1126,Rutherford,Rodney,GrdMasM,Male,60-69,,Australia,Start Group 7,00:00:00,-,...,09:40:31,,27:22:38,1053,11:16:38,,27:55:59,1127,11:49:59,
1127,Reynolds,Stephen,GrdMasM,Male,60-69,,Australia,Start Group 7,00:00:00,-,...,09:56:15,,27:25:15,1054,11:19:15,,27:57:14,1128,11:51:14,
