In [None]:
# REFACTORED VERSION - NO TIMESTAMP
import os
import json
import pickle
import glob
import numpy as np
from collections import Counter
from datetime import datetime
import matplotlib.pyplot as plt

# Config
# Per-file weather pickles matched by a relative glob pattern; the filename segment after
# the last underscore is used further down as the dictionary key for each file.
weather_pickle_files = glob.glob('race_details_for_year_with_visual_crossing_weather_*.pickle')
# Requested number of equal-frequency bins for each numeric weather field.
weather_bucket_size = 20  # Set to 20 for production
# Filename prefix for the numeric grammar output; save_to_pickle appends ".pickle".
weather_grammar_base_file = "numerical_weather_conditions_grammar_and_inverted_indices"

In [None]:
def ppj(data):
    """Pretty-print ``data`` as JSON with 4-space indentation and sorted keys.

    A ``str`` argument is treated as JSON text and decoded first; any other
    object is passed to ``json.dumps`` as-is.
    """
    payload = json.loads(data) if isinstance(data, str) else data
    rendered = json.dumps(payload, indent=4, sort_keys=True)
    print(rendered)

In [None]:
def save_to_pickle(my_dict, filename_prefix):
    """
    Pickle ``my_dict`` to ``<filename_prefix>.pickle`` and return that path.

    If the target path has a directory component, that directory is created
    first (existing directories are left alone). The written path is also
    printed.
    """
    filename = "{}.pickle".format(filename_prefix)

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as handle:
        pickle.dump(my_dict, handle)

    print("Saved to: " + filename)
    return filename

In [None]:
def create_bins_with_equal_frequency(numbers, n_bins=20, bin_prefix="bin_"):
    """
    Bucket numeric readings into (approximately) equal-frequency bins.

    Every usable value is rounded to an integer with Python's ``round`` (exact
    halves go to the even neighbour), the rounded values are sorted, and the
    sorted list is cut into ``n_bins`` consecutive slices whose sizes differ by
    at most one.

    Args:
        numbers: Iterable of values accepted by ``float()``. ``None`` entries and
            non-finite values (NaN, +/-inf) are treated as missing and skipped.
        n_bins: Requested number of bins (must be >= 1). Lowered to the number of
            distinct rounded values when the data has fewer distinct values.
        bin_prefix: Prefix for bin labels; bin ``i`` is labelled
            ``f"{bin_prefix}{i}"``.

    Returns:
        A dict with two entries:
          * ``'bins'``: label -> dict with ``start``, ``end``, ``width``,
            ``count``, ``median`` (rounded to 2 decimals) and ``token``.
          * ``'inverted_index'``: every integer from the overall minimum to the
            overall maximum (inclusive) -> label of the first bin whose ``end``
            is >= that integer.
        Both entries are empty dicts when there are no usable values.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1.

    Note:
        Bins are cut by position in the sorted list, so a heavily repeated value
        can span several consecutive bins. The inverted index always resolves
        such a value to the first of them, which means later bins sharing the
        same ``end`` are never referenced by the index.
    """
    if n_bins < 1:
        raise ValueError(f"n_bins must be at least 1, got {n_bins}")

    rounded_numbers = []
    for num in numbers:
        if num is None:
            continue
        as_float = float(num)
        # round() raises on NaN/inf, so treat them like missing readings.
        if not np.isfinite(as_float):
            continue
        rounded_numbers.append(round(as_float))

    # Nothing to bin: return an empty grammar instead of dividing by zero below.
    if not rounded_numbers:
        return {'bins': {}, 'inverted_index': {}}

    # Never ask for more bins than there are distinct values (sparse data).
    n_bins = min(n_bins, len(set(rounded_numbers)))

    sorted_numbers = sorted(rounded_numbers)
    numbers_per_bin, remainder = divmod(len(sorted_numbers), n_bins)

    bins_dict = {}
    bin_labels = []
    bin_ends = []
    start_idx = 0

    for i in range(n_bins):
        # The first `remainder` bins each take one extra element.
        end_idx = start_idx + numbers_per_bin + (1 if i < remainder else 0)
        bin_numbers = sorted_numbers[start_idx:end_idx]
        start_idx = end_idx

        bin_label = f"{bin_prefix}{i}"
        # The slice is sorted, so its extremes are its first and last items.
        bin_min = bin_numbers[0]
        bin_max = bin_numbers[-1]

        bins_dict[bin_label] = {
            "start": bin_min,
            "end": bin_max,
            "width": bin_max - bin_min,
            "count": len(bin_numbers),
            "median": round(float(np.median(bin_numbers)), 2),
            "token": bin_label
        }
        bin_labels.append(bin_label)
        bin_ends.append(bin_max)

    # Values ascend and bin ends are non-decreasing, so one forward-moving
    # pointer finds "the first bin whose end >= value" without rescanning.
    # The last bin's end equals the overall maximum, so the pointer stays in range.
    inverted_index = {}
    bin_idx = 0
    for value in range(sorted_numbers[0], sorted_numbers[-1] + 1):
        while value > bin_ends[bin_idx]:
            bin_idx += 1
        inverted_index[value] = bin_labels[bin_idx]

    return {'bins': bins_dict, 'inverted_index': inverted_index}

In [None]:
def create_histogram(data, n_bins, title="Histogram", xlabel="Values", ylabel="Frequency"):
    """Show a histogram of ``data`` with vertical markers at its mean and median.

    The mean is drawn as a dashed red line and the median as a solid green line,
    both labelled with their value to two decimals. When ``data`` is falsy
    (for example an empty list) a notice is printed and nothing is plotted.
    """
    if not data:
        print(f"No data to plot for {title}")
        return

    plt.figure(figsize=(10, 6))
    plt.hist(data, bins=n_bins, alpha=0.7, edgecolor='black')

    # (legend name, statistic, line colour, line style) for each marker line.
    marker_specs = (
        ("Mean", np.mean, 'red', '--'),
        ("Median", np.median, 'green', '-'),
    )
    for stat_name, stat_fn, line_color, line_style in marker_specs:
        stat_value = stat_fn(data)
        plt.axvline(stat_value, color=line_color, linestyle=line_style,
                    label=f'{stat_name}: {stat_value:.2f}')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

In [None]:
# Load every discovered weather pickle into one dict, keyed by the filename
# segment that follows the last underscore (with ".pickle" removed).
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
print(f"Found {len(weather_pickle_files)} files")

all_races_with_weather = {}
for file_path in sorted(weather_pickle_files):
    year = file_path.rsplit('_', 1)[-1].replace('.pickle', '')
    with open(file_path, 'rb') as pickle_file:
        all_races_with_weather[year] = pickle.load(pickle_file)

print(f"Loaded {len(all_races_with_weather)} years.")

In [None]:
# For each numeric weather field: gather all non-missing readings, plot their
# distribution, and store an equal-frequency binning under the field name.
keys_to_parse = ['humidity PCT', 'temperature F', 'feels like F', 'wind speed mph']
numerical_weather_conditions = {}

for key in keys_to_parse:
    values = []
    for year in all_races_with_weather:
        races = all_races_with_weather[year]
        for race in races:
            w = races[race].get('visual_crossing_weather')
            if not w:
                continue  # race has no weather record
            reading = w.get(key)
            if reading is not None:
                values.append(reading)

    # Plot distribution
    create_histogram(values, n_bins=20, title=f"Distribution of {key}", xlabel=key)

    # Token prefix is the lower-cased field name with spaces turned into underscores.
    token_prefix = f"{key.replace(' ', '_').lower()}_token_"
    bins = create_bins_with_equal_frequency(values, weather_bucket_size, token_prefix)
    numerical_weather_conditions[key] = bins

save_to_pickle(numerical_weather_conditions, weather_grammar_base_file)

In [None]:
def parse_and_count_conditions(conditions_list, delimiter=","):
    """Turn raw condition entries into tokens and count how often each occurs.

    Only the text before the first ``delimiter`` is kept. It is stripped,
    lower-cased, has its spaces replaced by underscores, and is prefixed with
    ``"vc_conditions_"``. Entries that are not strings (including ``None``) are
    treated as the text ``"Unknown"``.

    Returns:
        dict with ``'stats'`` (token -> occurrence count) and
        ``'original_to_parsed'`` (original entry -> its token).
    """
    token_counts = Counter()
    original_to_parsed = {}

    for original in conditions_list:
        text = original if isinstance(original, str) else "Unknown"
        label = text.split(delimiter, 1)[0].strip().lower().replace(" ", "_")
        token = "vc_conditions_" + label

        token_counts[token] += 1
        original_to_parsed[original] = token

    return {
        'stats': dict(token_counts),
        'original_to_parsed': original_to_parsed
    }

In [None]:
# Gather the raw 'conditions' entry of every race that has a weather record,
# tokenise them, show the token counts, and persist the original -> token map.
all_conditions = []
for year in all_races_with_weather:
    races = all_races_with_weather[year]
    for race in races:
        w = races[race].get('visual_crossing_weather')
        if not w or 'conditions' not in w:
            continue
        all_conditions.append(w['conditions'])

result = parse_and_count_conditions(all_conditions)

print("Unique conditions tokens:")
ppj(result['stats'])

save_to_pickle(result['original_to_parsed'], "weather_conditions_grammar_bins")