In [None]:
import pickle
import json
import os
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
from datetime import datetime

# --- CONFIGURATION ---
# Pickle files read by the data-loading cell (paths are relative to the
# working directory).
race_details_input: str = "race_specific_simplified_grammar.pickle"
pace_to_token_input: str = "pace_to_bin_lookup_table.pickle"
runner_career_input: str = "runners_grouped_by_name_gender_age_18_and_up_dropping_overlapping_groups_with_3_or_more_consecutive_birth_years.pickle"

# Output path prefix handed to save_to_pickle, which appends ".pickle".
cleaned_runner_data_output: str = "runner_data_with_full_grammar_token_and_race_details_pared_down"

@dataclass
class RaceData:
    """Container for the per-race attributes listed below.

    Every field is declared without a default, so all eight values must be
    supplied when an instance is constructed. The class is not instantiated
    in the notebook cells visible here.
    """
    # Identifier of the race.
    race_id: str
    # Token encoding the race distance.
    distance_token: str
    # The five vc_* fields are presumably weather-related tokens (conditions,
    # humidity, temperature, feels-like, wind speed) — TODO confirm the
    # meaning of the "vc" prefix against the code that builds these records.
    vc_conditions_token: str
    vc_humidity_token: str
    vc_temperature_token: str
    vc_feels_like_token: str
    vc_wind_speed_token: str
    # Start date and time of the race.
    start_date_time: datetime

In [None]:
def save_to_pickle(my_dict, filename_prefix):
    """Pickle ``my_dict`` to ``<filename_prefix>.pickle`` and return that path.

    Args:
        my_dict: Object to serialize with ``pickle.dump``.
        filename_prefix: Target path without the ``.pickle`` extension. If it
            contains a directory component, that directory is created when
            missing.

    Returns:
        The full filename that was written.
    """
    filename = f"{filename_prefix}.pickle"

    # Only create a directory when the path actually has one; a bare filename
    # yields an empty dirname, which os.makedirs would reject.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "wb") as out_file:
        pickle.dump(my_dict, out_file)

    print(f"Saved to: {filename}")
    return filename

def pace_to_seconds(pace_str):
    """Convert an ``"M:SS"`` pace string into a total number of seconds.

    Args:
        pace_str: Pace text made of exactly two colon-separated integer
            fields (minutes, then seconds), e.g. ``"5:30"``.

    Returns:
        ``minutes * 60 + seconds`` as an int, or ``None`` when the value is
        not a string, is empty, contains no colon, does not split into
        exactly two fields, or has a field that is not an integer.
    """
    # Missing values can show up as None or as non-string placeholders such
    # as float NaN; the `in` test would raise TypeError on those, so reject
    # anything that is not a string up front.
    if not isinstance(pace_str, str) or ":" not in pace_str:
        return None
    try:
        minutes, seconds = map(int, pace_str.split(":"))
    except ValueError:
        # Raised by int() on a non-numeric field and by the unpacking when
        # there are not exactly two fields (e.g. "1:02:03").
        return None
    return minutes * 60 + seconds

def ppj(data):
    """Pretty-print ``data`` as JSON with 4-space indentation and sorted keys.

    Args:
        data: Either a JSON-serializable object or a string containing JSON;
            a string is parsed with ``json.loads`` before being printed.
    """
    parsed = json.loads(data) if isinstance(data, str) else data
    rendered = json.dumps(parsed, indent=4, sort_keys=True)
    print(rendered)

In [None]:
print("Loading data...")

# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# inputs must come from a trusted source.
with open(race_details_input, "rb") as race_details_file:
    race_details = pickle.load(race_details_file)

with open(pace_to_token_input, "rb") as pace_lookup_file:
    pace_to_token = pickle.load(pace_lookup_file)

with open(runner_career_input, "rb") as runner_career_file:
    runner_data = pickle.load(runner_career_file)

print(f"Loaded {len(race_details)} races, {len(pace_to_token)} pace tokens, and {len(runner_data)} runners.")

In [None]:
# Results of this cell:
#   cleaned_data     - runner key -> list of rebuilt race records
#   missing_race_ids - race ids seen on runners but absent from race_details
#   culled_data      - number of runners dropped for having fewer than two
#                      usable races
cleaned_data = {}
missing_race_ids = set()
culled_data = 0

for runner_key, runner_races in runner_data.items():
    kept_races = []

    for entry in runner_races:
        # A race is usable only if its pace parses and has a lookup token.
        seconds = pace_to_seconds(entry['pace'])
        has_pace_token = seconds is not None and seconds in pace_to_token
        if not has_pace_token:
            continue

        rid = entry['raceId']
        if rid in race_details:
            age = int(entry['age'])
            gender = entry['gender']
            kept_races.append({
                'age': age,
                'age_token': f"age_{age}",
                'gender': gender,
                'gender_token': f"gender_{gender.lower()}",
                'pace': entry['pace'],
                'paceToken': pace_to_token[seconds],
                'birthYear': entry['birthYear'],
                'raceDetails': race_details[rid]
            })
        else:
            # Remember the unknown id so it can be reported after the loop.
            missing_race_ids.add(rid)

    # Keep a runner only when at least two races survived the filters.
    if len(kept_races) <= 1:
        culled_data += 1
    else:
        cleaned_data[runner_key] = kept_races

print(f"Processing complete. Cleaned {len(cleaned_data)} runners. Culled {culled_data} runners.")
if missing_race_ids:
    print(f"Missing {len(missing_race_ids)} race IDs in grammar.")

In [None]:
# Flatten the ages of every retained race record into a single list.
all_ages = []
for runner_races in cleaned_data.values():
    all_ages.extend(entry['age'] for entry in runner_races)

if all_ages:
    # One bin per integer age: edges run from min to max + 1 inclusive, so
    # the oldest age gets its own bin.
    age_bin_edges = range(min(all_ages), max(all_ages) + 2)

    plt.figure(figsize=(12, 6))
    plt.hist(all_ages, bins=age_bin_edges, alpha=0.7, edgecolor='black')
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    plt.title('Distribution of Ages in Cleaned Dataset')
    plt.show()

# Persist the cleaned dataset; as the cell's last expression, the returned
# filename is also displayed.
save_to_pickle(cleaned_data, cleaned_runner_data_output)

In [None]:
# Spot check: print every retained race record for the first runner in
# cleaned_data (skipped when nothing survived cleaning).
if cleaned_data:
    sample_key = next(iter(cleaned_data))
    print(f"Sample data for runner: {sample_key}")
    for record in cleaned_data[sample_key]:
        print(record)