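# Process data

Combine the per-parkrun CSV files in `../data` into a single table, derive gender, age group and best time (in minutes) for each row, and save the result to `../data/processed.parquet`.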

In [1]:
import json
import os
import numpy as np
import pandas as pd
from datetime import date
from pandas.api.types import CategoricalDtype

import pyarrow as pa
import pyarrow.parquet as pq

## Load raw data

In [2]:
def _read_json(path):
    """Parse the JSON document stored at ``path`` and return the result."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Lookup keyed by parkrun name; its values are stored in the `state` column.
locations = _read_json('../data/states.json')

# Lookup keyed by CSV file name (without '.csv'); its values are the parkrun names.
fullname = _read_json('../data/fullname.json')

In [3]:
# Read every per-parkrun CSV in ../data, tag each with its parkrun name and
# state, and stack them into the single frame `tmp`.
frames = []

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined frame reproducible from one run (or machine) to the next.
for run in sorted(os.listdir('../data')):
    if not run.endswith('.csv'):
        continue
    frame = pd.read_csv(f'../data/{run}')
    # The file name minus its '.csv' suffix is the key into `fullname`; the
    # resulting parkrun name is in turn the key into `locations`.
    parkrun = fullname[run[:-4]]
    frame['parkrun'] = parkrun
    frame['state'] = locations[parkrun]
    frames.append(frame)

# pd.concat raises ValueError here if no CSV files were found.
tmp = pd.concat(frames).reset_index(drop=True)

In [4]:
# Preview the first five rows of the combined frame.
tmp.head(n=5)

Unnamed: 0,parkrunner,Number of Runs,Best Time,category,parkrun,state
0,Oliver PURCELL,7,19:28,JM10,Airlie Beach,QLD
1,Jack OBERG,65,20:35,JM10,Airlie Beach,QLD
2,Flynn PURCELL,6,20:44,JM10,Airlie Beach,QLD
3,William MIDDLETON,1,22:29,JM10,Airlie Beach,QLD
4,Banjo MOSER,2,23:10,JM10,Airlie Beach,QLD


## Process combined data

In [5]:
def get_time(x):
    """Convert a colon-separated time string into a float number of minutes.

    Two layouts are recognised:

    * three fields, treated as ``H:MM:SS`` -> ``H*60 + MM + SS/60``
    * two fields, treated as ``MM:SS``     -> ``MM + SS/60``

    Parameters
    ----------
    x : object
        The value to convert. Anything that is not a ``str`` (for example a
        float NaN standing in for a missing cell) is treated as missing.

    Returns
    -------
    float
        The time in minutes, or ``np.nan`` when ``x`` is not a string or does
        not split into exactly two or three ``:``-separated fields.

    Raises
    ------
    ValueError
        If a field is not a valid integer literal (e.g. ``'ab:cd'``).
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    if not isinstance(x, str):
        return np.nan

    parts = x.split(':')
    if len(parts) == 3:
        hours, minutes, seconds = (int(part) for part in parts)
        return hours * 60 + minutes + seconds / 60
    if len(parts) == 2:
        minutes, seconds = (int(part) for part in parts)
        return minutes + seconds / 60

    # Any other field count is unrecognised: return the same missing marker as
    # for non-string input instead of implicitly returning None.
    return np.nan

In [6]:
# Build the processed frame from `tmp`: derive gender, age group and a numeric
# time column, drop the raw columns they came from, then shorten column names
# and set compact dtypes.
df = (
    tmp
    .assign(
        # A category code containing 'M' maps to 'M'; any other code maps to 'F'.
        gender=lambda frame: (
            frame['category']
            .str.contains('M')
            .map({True: 'M', False: 'F'})
            .astype('category')
        ),
        # 'JM10'/'JW10' are first rewritten with a '00-10' suffix, then the last
        # five characters of every code are kept as the age group. The ordered
        # categorical infers its categories from the values present.
        agegroup=lambda frame: (
            frame['category']
            .replace({'JM10': 'JM00-10', 'JW10':'JW00-10'})
            .str[-5:]
            .astype(CategoricalDtype(ordered=True))
        ),
        # 'Best Time' strings converted to a number by get_time.
        time=lambda frame: frame['Best Time'].apply(get_time),
    )
    .drop(columns=['Best Time', 'category'])
    .rename(columns={'Number of Runs': 'n', 'parkrunner': 'name'})
    .astype({'n': int, 'parkrun': 'category', 'state': 'category'})
)

In [7]:
# Row count of the processed frame.
df.shape[0]

2282437

In [8]:
# Persist the processed frame as Parquet; the pandas index is not stored.
pq.write_table(
    pa.Table.from_pandas(df, preserve_index=False),
    '../data/processed.parquet',
)