# Process data

In [None]:
import os
import numpy as np
import pandas as pd
from datetime import date

from pandas.api.types import CategoricalDtype

## load processed data

In [None]:
datafile = f'processed_{date.today()}.csv'

# Check if datafile is the most up-to-date: warn when a 'combined*' file in the
# working directory carries a later suffix (date stamp) than the file being loaded.
# The prefixes are stripped before comparing, because comparing the full names can
# never succeed: 'processed...' always sorts after 'combined...'.
# NOTE(review): assumes the inputs are named 'combined_<ISO date>.csv', mirroring
# the naming of datafile -- confirm against the step that writes them.
datafile_stamp = datafile[len('processed'):]
newer_files = [
    file
    for file in os.listdir()
    if file.startswith('combined') and file[len('combined'):] > datafile_stamp
]
if newer_files:
    print('not using the latest data')

# Load the processed data with explicit dtypes. 'agegroup' becomes an ordered
# categorical so that min()/max() can be taken on it later on.
df = pd.read_csv(datafile).astype({
    'n': int,
    'parkrun': 'category',
    'state': 'category',
    'gender': 'category',
    'agegroup': CategoricalDtype(ordered=True),
})

## clean data

### Exclude entries with shared name & gender and agegroups > 12 years apart

In [None]:
# Parkrun has been organized in Australia for 12 years and counting, so a single
# name/gender cannot be associated with age groups that lie more than 12 years apart.
def _age_gap_exceeds_12(agegroups):
    """Return True when the highest and lowest age group are more than 12 years apart."""
    # First two characters of the highest age group versus the last two
    # characters of the lowest age group.
    highest_start = int(agegroups.max()[:2])
    lowest_end = int(agegroups.min()[-2:])
    return (highest_start - lowest_end) > 12


df['EXCLUDE_AGE'] = df.groupby(['name', 'gender'])['agegroup'].transform(_age_gap_exceeds_12)

### Exclude parkruns with fewer than 100 runs

In [None]:
# Could be made stricter by keeping active parkruns only (merge with new data).
# Total of 'n' per parkrun, broadcast back onto every row of that parkrun.
runs_per_parkrun = df.groupby('parkrun')['n'].transform(lambda counts: counts.sum())
df['EXCLUDE_RUN'] = runs_per_parkrun.lt(100)

### Other exclusions?
- e.g. a more refined age exclusion: when one or more age groups are missing between a name's observed age groups (a large gap), the entries probably belong to two different people

## save cleaned data

In [None]:
# Keep only the rows that neither exclusion flag marks, remove the helper flag
# columns, and write the result to a CSV stamped with today's date.
kept_rows = df.query('not (EXCLUDE_AGE or EXCLUDE_RUN)')
cleaned = kept_rows.drop(columns=['EXCLUDE_AGE', 'EXCLUDE_RUN'])
cleaned.to_csv(f'cleaned_{date.today()}.csv', index=False)