In [7]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import glob
import os

# Weather-enriched per-year race pickles, looked up relative to the current
# working directory. glob.glob() returns matches in arbitrary order, so the
# result is sorted to make the load order reproducible from run to run.
race_details_pickle_files = sorted(glob.glob('race_details_for_year_with_visual_crossing_weather_*.pickle'))


In [3]:
def ppj(data):
    """
    Pretty-print a JSON-like object with 4-space indentation and sorted keys.

    Args:
    data (dict, list, str, bytes or bytearray): The object to print. Text
        input (str, bytes, bytearray) is parsed as JSON first; any other
        object is serialized as-is.

    Returns:
    None

    Raises:
    json.JSONDecodeError: If text input is not valid JSON.
    """
    if isinstance(data, (str, bytes, bytearray)):
        # Text input is treated as a JSON document and parsed before printing
        data = json.loads(data)

    # default=str keeps values that JSON cannot represent natively (dates,
    # numpy scalars, ...) printable instead of raising TypeError.
    print(json.dumps(data, indent=4, sort_keys=True, default=str))

In [4]:
import pickle

def save_to_pickle(my_dict, filename_prefix):
    """
    Pickle ``my_dict`` to ``<filename_prefix>.pickle`` and report what was written.

    Any missing parent directory of the target file is created first.

    Args:
        my_dict (dict): Object to serialize; its ``len()`` is printed afterwards
        filename_prefix (str): Target path without the ".pickle" extension

    Returns:
        str: Path of the pickle file that was written
    """
    filename = "{}.pickle".format(filename_prefix)

    # Only create directories when the target is not in the working directory
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "wb") as pickle_file:
        pickle.dump(my_dict, pickle_file)

    print(f"Dictionary saved to: {filename}")
    print(f"File contains {len(my_dict)} keys")

    return filename


In [6]:
import glob
import pickle

# Load every per-year pickle listed in race_details_pickle_files into a single
# dictionary keyed by year (as a string taken from the file name).
all_races_with_weather = {}

print(f"Found {len(race_details_pickle_files)} pickle files")

# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# run this cell against pickle files you created yourself.
for file_path in race_details_pickle_files:
    try:
        # The year is the last "_"-separated piece of the file name, e.g.
        # "race_details_for_year_with_visual_crossing_weather_1970.pickle" -> "1970"
        year = file_path.split('_')[-1].replace('.pickle', '')

        with open(file_path, 'rb') as f:
            year_data = pickle.load(f)

        all_races_with_weather[year] = year_data

        print(f"Loaded {year}: {len(year_data)} races")

    except Exception as e:
        # Best effort: report the failing file and keep loading the others.
        print(f"Error loading {file_path}: {e}")

print(f"\nTotal years loaded: {len(all_races_with_weather)}")
print(f"Years available: {sorted(all_races_with_weather.keys())}")

# Preview one race from the earliest year to show the record structure
if all_races_with_weather:
    first_year = sorted(all_races_with_weather.keys())[0]
    first_year_races = all_races_with_weather[first_year]
    if first_year_races:
        first_race = next(iter(first_year_races))
        print(f"\nExample race structure from {first_year}:")
        print(f"Race ID: {first_race}")
        ppj(first_year_races[first_race])
    else:
        # A year file with no races used to raise IndexError here.
        print(f"\nNo races recorded for {first_year}; nothing to preview.")

Found 0 pickle files

Total years loaded: 0
Years available: []


In [None]:
# Per-race distance overrides, keyed by race ID. The value is the distance
# name to use for that race, or 'DROP' to mark the race for exclusion.
# NOTE(review): both dictionaries in this cell are assigned again further down
# in the notebook, immediately before the mapping loop. The two copies must
# stay in sync (this one was missing '22RBKH3'); consider removing one.
race_distance_overrides = {
    '870405': '12 kilometers',
    '870516': '2 miles',
    '871128a': 'DROP',
    '880410': '12 kilometers',
    '900331': '12 kilometers',
    '901124c': '6 kilometers',
    '940116': '10 miles',
    '940416a': '2 miles',
    '940913': '5 kilometers',
    '940913a': '5 kilometers',
    '950604': '4 miles',
    '960331': '6 kilometers',
    '971012': '5 kilometers',
    '990228': 'DROP',
    'a00920': '5 kilometers',
    'a11104a': 'DROP',
    'a20521': '5 kilometers',
    'a20622': '5 kilometers',
    'a30520': '5 kilometers',
    'a31012': 'half-marathon',
    'a40606': '5 kilometers',
    'a50605': '5 kilometers',
    'a60517': '5 kilometers',
    'a60604': '5 miles',
    'a60627': '5 kilometers',
    'JOG': '6 kilometers',
    'b11120d': 'DROP',
    'b11120e': 'DROP',
    'HM34': 'DROP',
    'SI34': 'DROP',
    'YCS34': 'DROP',
    'YCS342': 'DROP',
    'MYI': 'DROP',
    '16YI': 'DROP',
    'MINI-Girls': 'DROP',
    '18TCSTS15': '25 kilometers',
    '2018RMINI': 'DROP',
    '19RMINI': 'DROP',
    '19TSQ3': 'DROP',
    'R2R7': '5 kilometers',
    'RTR4': 'DROP',
    '21FOUNDER': '5 kilometers',
    '21PRIDE': '6 kilometers',
    'BRUN33': '5 kilometers',
    '22RMINI3': 'DROP',
    '25GM3': 'DROP',
    '22RBKH3': 'DROP'
}

# Distance name overrides (not race ID overrides): maps a lowercase distance
# name to the distance name it should be replaced with.
distance_overrides = {
    '0.93 miles': '1.5 kilometers',
    '1.4 kilometers': '1.5 kilometers',
    '1.4 miles': '1.5 miles',
    '3 miles': '5 kilometers',
    '7.5 miles': '12 kilometers',
    '15 miles': '25 kilometers',
    '5.8 miles': '10 kilometers',
    '8 kilometers': '5 miles'
}


In [None]:
# Race distance overrides, keyed by race ID. The mapping loop below checks
# these first, so they take priority over the distance name overrides.
# A value of 'DROP' excludes the race from the mapped output entirely.
race_distance_overrides = {
    '870405': '12 kilometers',
    '870516': '2 miles',
    '871128a': 'DROP',
    '880410': '12 kilometers',
    '900331': '12 kilometers',
    '901124c': '6 kilometers',
    '940116': '10 miles',
    '940416a': '2 miles',
    '940913': '5 kilometers',
    '940913a': '5 kilometers',
    '950604': '4 miles',
    '960331': '6 kilometers',
    '971012': '5 kilometers',
    '990228': 'DROP',
    'a00920': '5 kilometers',
    'a11104a': 'DROP',
    'a20521': '5 kilometers',
    'a20622': '5 kilometers',
    'a30520': '5 kilometers',
    'a31012': 'half-marathon',
    'a40606': '5 kilometers',
    'a50605': '5 kilometers',
    'a60517': '5 kilometers',
    'a60604': '5 miles',
    'a60627': '5 kilometers',
    'JOG': '6 kilometers',
    'b11120d': 'DROP',
    'b11120e': 'DROP',
    'HM34': 'DROP',
    'SI34': 'DROP',
    'YCS34': 'DROP',
    'YCS342': 'DROP',
    'MYI': 'DROP',
    '16YI': 'DROP',
    'MINI-Girls': 'DROP',
    '18TCSTS15': '25 kilometers',
    '2018RMINI': 'DROP',
    '19RMINI': 'DROP',
    '19TSQ3': 'DROP',
    'R2R7': '5 kilometers',
    'RTR4': 'DROP',
    '21FOUNDER': '5 kilometers',
    '21PRIDE': '6 kilometers',
    'BRUN33': '5 kilometers',
    '22RMINI3': 'DROP',
    '25GM3': 'DROP',
    '22RBKH3': 'DROP'
}

# Distance name overrides, keyed by distance name. The mapping loop looks up
# the lowercased 'distanceName' of a race here, so keys must be lowercase.
# Only consulted when the race has no race ID override.
distance_overrides = {
    '0.93 miles': '1.5 kilometers',
    '1.4 kilometers': '1.5 kilometers', 
    '1.4 miles': '1.5 miles',
    '3 miles': '5 kilometers',
    '7.5 miles': '12 kilometers',
    '15 miles': '25 kilometers',
    '5.8 miles': '10 kilometers',
    '8 kilometers': '5 miles'
}

# Build all_races_with_weather_distance_mapped: a per-year copy of the loaded
# races in which every kept race carries two extra fields:
#   distanceNameMapped      - the distance name after overrides are applied
#   distanceNameMappedToken - the same name prefixed with "distance_name_token_"
#                             and with ".", " " and "-" replaced
# Races whose mapped distance is "DROP" are left out of the result.
all_races_with_weather_distance_mapped = {}

races_processed = 0
races_dropped = 0
race_id_overrides_applied = 0
distance_name_overrides_applied = 0

for year, races_for_year in all_races_with_weather.items():
    mapped_races_for_year = {}
    all_races_with_weather_distance_mapped[year] = mapped_races_for_year

    for race_id, source_race_data in races_for_year.items():
        races_processed += 1

        # `or ''` covers both a missing key and an explicit None, so .lower()
        # cannot fail on a race that has no usable distance name.
        original_distance = source_race_data.get('distanceName') or ''
        normalized_distance = original_distance.lower()

        if race_id in race_distance_overrides:
            # Race ID override (higher priority)
            distance_name_mapped = race_distance_overrides[race_id]
            race_id_overrides_applied += 1
        elif normalized_distance in distance_overrides:
            # Distance name override (lower priority)
            distance_name_mapped = distance_overrides[normalized_distance]
            distance_name_overrides_applied += 1
        else:
            # No override: the lowercased original name is used as-is
            distance_name_mapped = normalized_distance

        if distance_name_mapped == "DROP":
            # Dropped races are counted but never copied into the result
            races_dropped += 1
            continue

        # Shallow copy: the new fields are added without touching the source
        # entry, but nested values are still shared with it.
        race_data = source_race_data.copy()
        race_data['distanceNameMapped'] = distance_name_mapped
        race_data['distanceNameMappedToken'] = (
            "distance_name_token_"
            + distance_name_mapped.replace(".", "POINT").replace(" ", "_").replace("-", "_")
        )
        mapped_races_for_year[race_id] = race_data

print(f"Created all_races_with_weather_distance_mapped:")
print(f"  Total races processed: {races_processed}")
print(f"  Races dropped: {races_dropped}")
print(f"  Races with race ID overrides applied: {race_id_overrides_applied}")
print(f"  Races with distance name overrides applied: {distance_name_overrides_applied}")
#ppj(all_races_with_weather_distance_mapped)





In [None]:
from collections import Counter

# Distances that occur at most this many times are reported as "rare"
RARE_DISTANCE_MAX_COUNT = 5

# Collect the (lowercased) mapped distance name of every race
distance_names = [
    race_data['distanceNameMapped'].lower()
    for races_for_year in all_races_with_weather_distance_mapped.values()
    for race_data in races_for_year.values()
    if 'distanceNameMapped' in race_data
]

# Count the frequency of each distance name
distance_counts = Counter(distance_names)

print(f"Total races with distanceNameMapped: {len(distance_names)}")
print(f"Unique distance names: {len(distance_counts)}")
print("\nDistance Name Counts:")
print("=" * 50)

# Listed alphabetically by distance name (plain string ordering), which puts
# near-duplicate names next to each other.
sorted_distances = sorted(distance_counts.items())

for distance_name, count in sorted_distances:
    print(f"{distance_name:>25}: {count:>4} races")

# Show some statistics
print(f"\nStatistics:")
print(f"  Total races: {len(distance_names)}")
print(f"  Unique distance types: {len(distance_counts)}")
if distance_counts:
    most_common_name, most_common_count = distance_counts.most_common(1)[0]
    least_common_name, least_common_count = distance_counts.most_common()[-1]
    print(f"  Most common distance: {most_common_name} ({most_common_count} races)")
    print(f"  Least common distance: {least_common_name} ({least_common_count} races)")
else:
    # With no races loaded, most_common() is empty and indexing it would raise
    print("  Most common distance: n/a (no races)")
    print("  Least common distance: n/a (no races)")

# Find distance names that occur RARE_DISTANCE_MAX_COUNT times or fewer
rare_distances = {
    name: count
    for name, count in distance_counts.items()
    if count <= RARE_DISTANCE_MAX_COUNT
}

print(f"\nRare Distance Types (frequency ≤ {RARE_DISTANCE_MAX_COUNT}):")
print("=" * 60)
print(f"Found {len(rare_distances)} distance types with frequency ≤ {RARE_DISTANCE_MAX_COUNT}")

# Print each race instance with rare distance names
print(f"\nIndividual Race Instances with Rare Distance Types:")
print("=" * 60)

for year in sorted(all_races_with_weather_distance_mapped.keys()):
    races_for_year = all_races_with_weather_distance_mapped[year]
    for race_id in sorted(races_for_year.keys()):
        race_data = races_for_year[race_id]
        if 'distanceNameMapped' not in race_data:
            continue
        distance_name = race_data['distanceNameMapped']
        # Counts are keyed by the lowercased name, so look up the same way
        frequency = rare_distances.get(distance_name.lower())
        if frequency is None:
            continue
        # eventName is only used for display; tolerate races without one
        event_name = race_data.get('eventName', '<unknown event>')
        print(f"Year: {year}, Race ID: {race_id}, Name: {event_name}, Distance: '{distance_name}' (frequency: {frequency})")



In [None]:
# Persist the distance-mapped races. save_to_pickle appends ".pickle" to the
# prefix, so with this prefix (note the trailing underscore) the file written
# is "full_race_details_with_mapped_tokens_.pickle".
save_to_pickle(all_races_with_weather_distance_mapped, "full_race_details_with_mapped_tokens_")


In [None]:
# Pretty-print the "2017" entry of a previously saved snapshot.
# NOTE(review): this timestamped file name is not the name save_to_pickle
# produces for the prefix used in this notebook ("<prefix>.pickle"), so this
# cell only works if that older file is present - confirm before Run All.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open("full_race_details_with_mapped_tokens_2025-10-07_16:38.pickle", "rb") as snapshot_file:
    # The context manager closes the file handle, which the bare open() leaked
    snapshot = pickle.load(snapshot_file)
ppj(snapshot["2017"])