In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import glob
import os
from collections import Counter

# Production binning parameters (handed to create_adaptive_percentile_bins).
num_bins, max_width_seconds, min_paces = 200, 10, 1000

# Toy values for quick test runs -- uncomment to override the production ones.
# num_bins, max_width_seconds, min_paces = 10, 100, 10

# Per-year input pickles, matched relative to the current working directory.
race_finishers_pickle_files = glob.glob('race_finishers_for_year_*.pickle')

In [None]:
def ppj(data):
    """Pretty-print ``data`` as JSON with 4-space indentation and sorted keys.

    A string argument is parsed as JSON text first; any other value is
    serialised as-is.
    """
    parsed = json.loads(data) if isinstance(data, str) else data
    rendered = json.dumps(parsed, indent=4, sort_keys=True)
    print(rendered)

In [None]:
def save_to_pickle(my_dict, filename_prefix):
    """Pickle ``my_dict`` to ``<filename_prefix>.pickle`` and return that path.

    If the target path has a directory component, the directory is created
    first when it does not exist yet. The saved path is also printed.
    """
    target_path = f"{filename_prefix}.pickle"

    parent_dir = os.path.dirname(target_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(target_path, "wb") as handle:
        pickle.dump(my_dict, handle)

    print(f"Saved to: {target_path}")
    return target_path

In [None]:
# Load each per-year pickle into one dict keyed by the year label from its filename.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
all_race_finishers = {}
print(f"Found {len(race_finishers_pickle_files)} files")
for pickle_path in sorted(race_finishers_pickle_files):
    # The label is whatever follows the last underscore, minus ".pickle".
    year = pickle_path.rsplit('_', 1)[-1].replace('.pickle', '')
    with open(pickle_path, 'rb') as handle:
        all_race_finishers[year] = pickle.load(handle)
    print(f"Loaded {year}")

In [None]:
def pace_to_seconds(pace_str):
    """Convert a ``"minutes:seconds"`` pace string to a total number of seconds.

    Args:
        pace_str: pace text with exactly one ``:`` separating two integers,
            e.g. ``"7:30"``.

    Returns:
        ``int(minutes) * 60 + int(seconds)``, or ``None`` when ``pace_str`` is
        empty, not a string, does not have exactly two ``:``-separated parts,
        or either part is not an integer.
    """
    if not pace_str or not isinstance(pace_str, str):
        return None

    parts = pace_str.split(':')
    if len(parts) != 2:
        return None

    try:
        return int(parts[0]) * 60 + int(parts[1])
    except ValueError:
        # Only malformed numbers are expected here; anything else (e.g. a
        # KeyboardInterrupt) should propagate instead of being swallowed.
        return None

# Flatten every finisher's pace (year -> race -> finisher) into one list of seconds.
paces_in_seconds = []
for year_key in all_race_finishers:
    races = all_race_finishers[year_key]
    for race_key in races:
        converted = (pace_to_seconds(runner.get("pace")) for runner in races[race_key])
        # filter(None, ...) keeps only truthy results, i.e. drops None and 0.
        paces_in_seconds.extend(filter(None, converted))
print(f"Collected {len(paces_in_seconds)} paces.")

In [None]:
def plot_paces(paces, title="Distribution of Race Paces"):
    """Show a 100-bin histogram of ``paces`` with mean and median marker lines.

    Args:
        paces: sequence of numeric pace values (labelled as seconds on the x axis).
        title: title drawn above the plot.
    """
    mean_pace = np.mean(paces)
    median_pace = np.median(paces)

    _, ax = plt.subplots(figsize=(12, 6))
    ax.hist(paces, bins=100, alpha=0.7, color='skyblue', edgecolor='black')

    # Vertical reference lines; their labels feed the legend.
    ax.axvline(mean_pace, color='red', linestyle='--', label=f'Mean: {mean_pace:.1f}s')
    ax.axvline(median_pace, color='green', linestyle='-', label=f'Median: {median_pace:.1f}s')

    ax.set_title(title)
    ax.set_xlabel('Pace (seconds)')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

# Show the distribution of all collected paces, using the default plot title.
plot_paces(paces_in_seconds)

In [None]:
def create_adaptive_percentile_bins(paces_list, num_bins=200, max_width_seconds=10, min_paces=1000):
    """Split paces into ``num_bins`` equal-probability (percentile) bins.

    Bin edges are the percentiles ``0, 100/num_bins, ..., 100`` of the
    non-``None`` paces. Every bin is closed on both ends, so a pace that
    equals an interior edge is counted in both neighbouring bins (the counts
    can therefore sum to more than the number of paces).

    Args:
        paces_list: iterable of numeric paces; ``None`` entries are ignored.
        num_bins: number of bins to create.
        max_width_seconds: accepted for interface compatibility; not
            currently read by the binning.
        min_paces: accepted for interface compatibility; not currently read
            by the binning.

    Returns:
        dict mapping ``"pace_<i>"`` to a dict with the keys ``start``, ``end``,
        ``count``, ``width``, ``median_pace`` and ``token``. ``median_pace``
        falls back to the bin midpoint when no pace lies inside the bin.

    Raises:
        ValueError: if ``paces_list`` contains no non-``None`` paces.
    """
    valid_paces = np.sort(np.asarray([p for p in paces_list if p is not None]))
    if valid_paces.size == 0:
        raise ValueError("paces_list contains no valid paces to bin")

    # Initial equal-probability bins
    percentiles = np.linspace(0, 100, num_bins + 1)
    boundaries = np.percentile(valid_paces, percentiles)

    # On the sorted array, the paces with start <= p <= end form one contiguous
    # slice; locate it by binary search instead of rescanning every pace per bin.
    slice_starts = np.searchsorted(valid_paces, boundaries[:-1], side='left')
    slice_stops = np.searchsorted(valid_paces, boundaries[1:], side='right')

    bins = {}
    for i in range(num_bins):
        start, end = boundaries[i], boundaries[i + 1]
        bin_paces = valid_paces[slice_starts[i]:slice_stops[i]]
        token = f"pace_{i}"

        bins[token] = {
            'start': float(start),
            'end': float(end),
            'count': int(bin_paces.size),
            'width': float(end - start),
            'median_pace': float(np.median(bin_paces)) if bin_paces.size else float((start + end) / 2),
            'token': token
        }
    return bins

# Bin every collected pace using the parameters from the config cell.
adaptive_bins = create_adaptive_percentile_bins(
    paces_in_seconds,
    num_bins=num_bins,
    max_width_seconds=max_width_seconds,
    min_paces=min_paces,
)
print(f"Created {len(adaptive_bins)} bins.")

In [None]:
# Create pace to bin lookup: map every whole-second pace to a bin token.
# Bin edges come from np.percentile and can be fractional, so the start is
# rounded UP and the end DOWN: truncating a start such as 150.5 to 150 would
# hand second 150 to a bin that only begins after it, overwriting the bin
# that really contains it. Adjacent bins share an edge, so when that edge is
# a whole second the later bin still overwrites the earlier one.
pace_to_bin_dict = {}
for token, info in adaptive_bins.items():
    first_second = int(np.ceil(info['start']))
    last_second = int(np.floor(info['end']))
    for s in range(first_second, last_second + 1):
        pace_to_bin_dict[s] = token

print(f"Lookup table has {len(pace_to_bin_dict)} entries.")

In [None]:
# Save the two essential files (save_to_pickle appends ".pickle" to each prefix).
# 1. pace_lookup.pickle: the bin definitions keyed by token (adaptive_bins).
#    Intended as the grammar used by the model for de-tokenization and MAE
#    calculation; that consumer lives outside this notebook.
save_to_pickle(adaptive_bins, "pace_lookup")

# 2. pace_to_bin_lookup_table.pickle: the whole-second pace -> token table
#    (pace_to_bin_dict), intended for use during dataset generation.
save_to_pickle(pace_to_bin_dict, "pace_to_bin_lookup_table")