# Process data

In [None]:
import os
import numpy as np
import pandas as pd
from datetime import date
from pandas.api.types import CategoricalDtype

## load raw data

In [None]:
# Read every regular file in `data/` and stack the results into one frame.
# Names are visited in sorted order because `os.listdir` returns them in
# arbitrary order, which would make the combined row order vary between runs.
# Anything that is not a regular file (e.g. a sub-directory) is skipped,
# since `pd.read_csv` cannot read it.
tmp = pd.concat(
    [
        pd.read_csv(path)
        for path in (
            os.path.join('data', name) for name in sorted(os.listdir('data'))
        )
        if os.path.isfile(path)
    ],
    ignore_index=True,  # same effect as reset_index(drop=True) after concat
)

## process combined data

In [None]:
def get_time(x):
    """Convert a colon-separated time string into a number of minutes.

    Three fields are combined as ``first * 60 + second + third / 60``
    (i.e. read as ``H:M:S``); two fields as ``first + second / 60``
    (i.e. read as ``M:S``).

    Parameters
    ----------
    x : object
        The raw value. Anything that is not a ``str`` (for example a
        missing value read as ``NaN``) is treated as missing.

    Returns
    -------
    float
        The time in minutes, or ``np.nan`` when ``x`` is not a string or
        does not have exactly two or three colon-separated fields.

    Raises
    ------
    ValueError
        If a field cannot be parsed by ``int``.
    """
    if not isinstance(x, str):
        return np.nan

    parts = x.split(':')
    if len(parts) == 3:
        hours, minutes, seconds = (int(p) for p in parts)
        return hours * 60 + minutes + seconds / 60
    if len(parts) == 2:
        minutes, seconds = (int(p) for p in parts)
        return minutes + seconds / 60

    # Unsupported shape: return NaN explicitly instead of an implicit None so
    # that every "no usable time" case yields the same missing-value marker.
    return np.nan

In [None]:
# Build the processed table from the combined raw frame:
#   * gender   - 'M' when the category code contains an 'M', otherwise 'F'
#   * agegroup - last five characters of the category code, after the two
#                short junior codes are expanded; stored as an ordered categorical
#   * time     - 'Best Time' converted with get_time
# The raw columns that are no longer needed are then dropped, the remaining
# ones renamed, and the final dtypes applied.
df = (
    tmp.assign(
        gender=lambda d: (
            d['category']
            .str.contains('M')
            .map({True: 'M', False: 'F'})
            .astype('category')
        ),
        agegroup=lambda d: (
            d['category']
            .replace({'JM10': 'JM00-10', 'JW10': 'JW00-10'})
            .str[-5:]
            .astype(CategoricalDtype(ordered=True))
        ),
        time=lambda d: d['Best Time'].apply(get_time),
    )
    .drop(columns=['Rank', 'Club', 'Best Time', 'category'])
    .rename(columns={'Number of Runs': 'n', 'parkrunner': 'name'})
    .astype({'n': int, 'parkrun': 'category', 'state': 'category'})
)

In [None]:
# Save the processed table to a CSV whose name carries today's date;
# the row index is not written.
df.to_csv(
    f'processed_{date.today()}.csv',
    index=False,
)