# Clean data

In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date
import pyarrow as pa
import pyarrow.parquet as pq

In [9]:
# Read the processed parquet file with pyarrow and convert it to a pandas DataFrame
df = (
    pq.read_table(source='processed.parquet')
    .to_pandas()
)

### Exclude entries with shared name & gender and agegroups > 12 years apart

In [4]:
# Parkrun has been organised in Australia for 12 years and counting, so a single
# name/gender combination cannot be associated with age groups more than 12 years apart.
MAX_AGE_SPAN_YEARS = 12

# Parse the first and last number of every age-group label with a regex rather than
# fixed-width slicing ([:2] / [-2:]), so bounds that are not exactly two digits are
# still read correctly. Labels without any digits (including missing values) become NaN.
agegroup_labels = df['agegroup'].astype(str)
agegroup_lower = pd.to_numeric(agegroup_labels.str.extract(r'(\d+)', expand=False), errors='coerce')
agegroup_upper = pd.to_numeric(agegroup_labels.str.extract(r'(\d+)\D*$', expand=False), errors='coerce')

# For each name/gender: lower bound of the oldest bracket minus upper bound of the
# youngest bracket, i.e. the smallest possible number of years between the two.
# The bounds are compared numerically, so this does not rely on `agegroup` being an
# ordered categorical, and the built-in 'max'/'min' aggregations avoid calling a
# Python lambda once per group.
# NOTE(review): matches the previous max()/min()-of-category logic provided the
# brackets are non-overlapping and ordered by age -- confirm against the data.
person_keys = [df['name'], df['gender']]
oldest_lower = agegroup_lower.groupby(person_keys, observed=True).transform('max')
youngest_upper = agegroup_upper.groupby(person_keys, observed=True).transform('min')

# Rows with an unparseable age group compare as NaN > threshold, i.e. False (not excluded).
df['EXCLUDE_AGE'] = (oldest_lower - youngest_upper) > MAX_AGE_SPAN_YEARS

### Exclude parkruns with fewer than 100 runs

In [5]:
# Parkruns whose total number of runs is below this threshold are flagged for exclusion.
# Could make more strict to active parkruns only (merge with new data)
MIN_RUNS_PER_PARKRUN = 100

# Total of `n` per parkrun, broadcast back onto every row of that parkrun.
# The built-in 'sum' aggregation replaces a per-group Python lambda, and observed=True
# restricts the grouping to parkrun categories that actually occur in the data.
runs_per_parkrun = df.groupby('parkrun', observed=True)['n'].transform('sum')
df['EXCLUDE_RUN'] = runs_per_parkrun < MIN_RUNS_PER_PARKRUN

### Other exclusions?
- e.g. a more refined age exclusion: if one or more age groups are missing between the age groups recorded for a name/gender (a large gap), the entries probably belong to two different people

## Save cleaned data

In [8]:
# Keep only the rows that neither exclusion rule flagged, then discard the helper flag columns.
# NOTE: `df` is rebound without the flag columns, so this cell cannot be re-run
# without first re-running the cells that create EXCLUDE_AGE and EXCLUDE_RUN.
exclusion_flags = ['EXCLUDE_AGE', 'EXCLUDE_RUN']
df = (
    df
    .query('not (EXCLUDE_AGE or EXCLUDE_RUN)')
    .drop(columns=exclusion_flags)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1931708 entries, 0 to 2282436
Data columns (total 7 columns):
 #   Column    Dtype   
---  ------    -----   
 0   name      object  
 1   n         int64   
 2   parkrun   category
 3   state     category
 4   gender    category
 5   agegroup  category
 6   time      float64 
dtypes: category(4), float64(1), int64(1), object(1)
memory usage: 68.2+ MB


In [8]:
# Convert the cleaned frame to an Arrow table (the pandas index is not stored)
# and write it out as a parquet file.
table = pa.Table.from_pandas(df=df, preserve_index=False)
pq.write_table(table=table, where='cleaned.parquet')