In [None]:
from dataclasses import dataclass
from datetime import datetime
import pickle
import os
import glob

# --- CONFIGURATION ---
# All paths are bare filenames, so they resolve against the notebook's
# current working directory.

# Inputs: grammar pickles written by notebooks 04 and 05.
race_details_input = "full_race_details_with_mapped_tokens.pickle"
weather_categorical_input = "weather_conditions_grammar_bins.pickle"
weather_numerical_input = (
    "numerical_weather_conditions_grammar_and_inverted_indices.pickle"
)

# Output: filename *prefix* only; save_to_pickle appends ".pickle".
unified_grammar_output = "race_specific_simplified_grammar"

# Per-year finisher pickles, discovered by wildcard. glob does not promise any
# particular order; the cell that loads these files sorts them first.
race_finisher_pickle_files = glob.glob("race_finishers_for_year_*.pickle")

@dataclass
class RaceData:
    """Simplified grammar record for a single race.

    One instance is built per race and stored under its race id in
    ``race_specific_simplified_grammar``, which this notebook then pickles.
    The ``vc_*`` fields are derived from the race's
    ``"visual_crossing_weather"`` entry.
    """
    # Race identifier; the same value is used as the key in the output dict.
    race_id: str
    # Copied from the race record's "distanceNameMappedToken" entry.
    distance_token: str
    # Looked up in the categorical weather mapping by the "conditions" value.
    vc_conditions_token: str
    # The four tokens below come from the numerical weather grammar: the raw
    # reading is rounded to an int and used to index that measurement's
    # "inverted_index" ("humidity PCT", "temperature F", "feels like F",
    # "wind speed mph" respectively).
    vc_humidity_token: str
    vc_temperature_token: str
    vc_feels_like_token: str
    vc_wind_speed_token: str
    # Race start, taken from the record's "startDateTime" entry.
    start_date_time: datetime

In [None]:
def save_to_pickle(my_dict, filename_prefix):
    """Pickle ``my_dict`` to ``<filename_prefix>.pickle``.

    Args:
        my_dict: Object to serialize with ``pickle.dump``.
        filename_prefix: Target path without the ``.pickle`` extension. If it
            contains a directory part, that directory is created when missing.

    Returns:
        The full path of the file that was written (also printed).
    """
    target_path = "{}.pickle".format(filename_prefix)

    # A bare filename has an empty dirname; only create real directories.
    parent_dir = os.path.dirname(target_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(target_path, "wb") as out_file:
        pickle.dump(my_dict, out_file)

    print(f"Saved to: {target_path}")
    return target_path

def ppj(data):
    """Pretty-print ``data`` as indented, key-sorted JSON.

    Args:
        data: Either an object to serialize, or a string holding a JSON
            document (the string is parsed first, then re-printed).

    Values the JSON encoder does not support natively (for example
    ``datetime`` objects) are rendered with ``str()`` rather than raising
    ``TypeError``, so the helper can be used on arbitrary sample records.

    Raises:
        json.JSONDecodeError: If ``data`` is a string that is not valid JSON.
    """
    import json  # kept local so the helper is self-contained

    if isinstance(data, str):
        data = json.loads(data)
    # default=str is only consulted for objects json cannot encode itself.
    print(json.dumps(data, indent=4, sort_keys=True, default=str))

In [None]:
print("Loading grammar components...")

# NOTE: pickle.load can execute arbitrary code while deserializing; these
# files are expected to be ones produced by the earlier notebooks.

# Race details with mapped tokens.
with open(race_details_input, "rb") as f:
    full_race_details = pickle.load(f)

# Categorical weather-condition tokens.
with open(weather_categorical_input, "rb") as f:
    weather_conditions = pickle.load(f)

# Numerical weather grammar and its inverted indices.
with open(weather_numerical_input, "rb") as f:
    numerical_weather_conditions = pickle.load(f)

print(f"Loaded {len(full_race_details)} years of race data.")

In [None]:
def parse_datetime(dt_string):
    """Return ``dt_string`` as a ``datetime``.

    ``datetime`` instances are handed back unchanged. Anything else is parsed
    with the fixed layout ``%Y-%m-%dT%H:%M:%S`` (no fractional seconds, no UTC
    offset); ``datetime.strptime`` raises ``ValueError`` for text that does
    not match that layout.
    """
    if not isinstance(dt_string, datetime):
        return datetime.strptime(dt_string, "%Y-%m-%dT%H:%M:%S")
    return dt_string

def _numeric_weather_token(weather, measurement):
    """Map one numeric weather reading to its grammar token.

    The reading ``weather[measurement]`` is rounded to an int and used to
    index the ``"inverted_index"`` stored for that measurement in
    ``numerical_weather_conditions``.
    """
    rounded_reading = int(round(weather[measurement]))
    return numerical_weather_conditions[measurement]["inverted_index"][rounded_reading]


# race_id -> RaceData for every race that could be fully converted.
race_specific_simplified_grammar = {}
# (year, race_id) -> short description of why the race was left out.
skipped_races = {}

for year, races_in_year in full_race_details.items():
    for race_id, race_record in races_in_year.items():
        try:
            weather = race_record["visual_crossing_weather"]

            race_specific_simplified_grammar[race_id] = RaceData(
                race_id=race_id,
                distance_token=race_record["distanceNameMappedToken"],
                vc_conditions_token=weather_conditions[weather["conditions"]],
                vc_humidity_token=_numeric_weather_token(weather, "humidity PCT"),
                vc_temperature_token=_numeric_weather_token(weather, "temperature F"),
                vc_feels_like_token=_numeric_weather_token(weather, "feels like F"),
                vc_wind_speed_token=_numeric_weather_token(weather, "wind speed mph"),
                start_date_time=parse_datetime(race_record["startDateTime"]),
            )
        except Exception as error:
            # Best effort: a race with missing or unmappable fields is still
            # skipped, but the reason is recorded instead of being discarded
            # so that data loss is visible in the summary below.
            skipped_races[(year, race_id)] = f"{type(error).__name__}: {error}"

print(f"Processed {len(race_specific_simplified_grammar)} races.")
if skipped_races:
    print(f"Skipped {len(skipped_races)} races that could not be converted. Examples:")
    for (skipped_year, skipped_race_id), reason in list(skipped_races.items())[:5]:
        print(f"  {skipped_year} / {skipped_race_id}: {reason}")

In [None]:
# Persist the unified grammar. As the cell's last expression, the path
# returned by save_to_pickle is echoed as the cell output.
save_to_pickle(
    my_dict=race_specific_simplified_grammar,
    filename_prefix=unified_grammar_output,
)

In [None]:
# Load finisher data for verification.
# NOTE: pickle.load can execute arbitrary code while deserializing; only run
# this against finisher files you trust.
race_finishers = {}
if not race_finisher_pickle_files:
    # Say so explicitly; otherwise an empty glob looks like a successful run.
    print("No finisher files found; skipping finisher verification.")
else:
    print(f"Found {len(race_finisher_pickle_files)} finisher files.")
    for file_path in sorted(race_finisher_pickle_files):
        # The year is whatever remains of the basename once the fixed prefix
        # and the extension are stripped.
        year = os.path.basename(file_path).replace("race_finishers_for_year_", "").replace(".pickle", "")
        with open(file_path, "rb") as f:
            race_finishers[year] = pickle.load(f)
        print(f"Loaded {year}: {len(race_finishers[year])} races")

    # Show one sample record from the first year (smallest key). At least one
    # file was loaded above, so race_finishers is non-empty here.
    first_year = min(race_finishers)
    first_year_races = race_finishers[first_year]
    if first_year_races:
        sample_race_id = next(iter(first_year_races))
        sample_finishers = first_year_races[sample_race_id]
        if len(sample_finishers) > 0:
            print(f"\nSample finisher data for {sample_race_id} ({first_year}):")
            ppj(sample_finishers[0])
        else:
            # Avoid an IndexError when the sampled race has no finisher rows.
            print(f"\nRace {sample_race_id} ({first_year}) has no finisher records to sample.")