In [13]:
# --- Imports & Configuration ---
import pandas as pd
import numpy as np
import io
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
# DATA_PATH is concatenated directly with each CSV file name when loading
# (no path join), so it must end with a path separator.
DATA_PATH = 'data/'  # Folder where you unzipped the Kaggle dataset
# Inclusive lower bound on season: races with year >= START_YEAR are kept.
START_YEAR = 2000    # We focus on the modern era for data consistency

In [14]:
# --- Data Loading ---
def load_data():
    """Read the eight raw CSV tables found under ``DATA_PATH``.

    Each file is parsed with the literal ``\\N`` token treated as a missing
    value.

    Returns:
        tuple: ``(races, results, status, qualifying, standings, circuits,
        lap_times, weather)`` -- one DataFrame per file, in that order.
    """
    print("Loading Vopani Dataset CSVs...")

    # File stems, listed in the exact order the caller unpacks the result.
    table_stems = (
        'races',
        'results',
        'status',
        'qualifying',
        'driver_standings',
        'circuits',
        'lap_times',
        'weather',
    )

    # The files use '\N' as their null marker, so map it to NaN while reading.
    frames = tuple(
        pd.read_csv(f'{DATA_PATH}{stem}.csv', na_values='\\N')
        for stem in table_stems
    )
    print("Datasets are loaded in...")

    return frames

# Load the raw dataframes; the names are unpacked in the same order that
# load_data() returns them.
races_df, results_df, status_df, qual_df, stand_df, circuits_df, laps_df, weather_df = load_data()

Loading Vopani Dataset CSVs...
Datasets are loaded in...


In [15]:
# --- Cleaning & Weather Merging ---
def clean_and_merge_weather(races, weather, start_year=None):
    """Attach per-race weather to the race list and keep only recent seasons.

    Args:
        races: DataFrame of races. Must contain ``year`` and ``round``.
        weather: DataFrame with ``datetime``, ``round``, ``temperature`` and
            ``precipitation`` columns. It is NOT modified by this function.
        start_year: First season to keep (inclusive). When omitted, the
            module-level ``START_YEAR`` is used.

    Returns:
        A new DataFrame with one row per row of ``races`` whose year is
        ``>= start_year``, plus ``temperature``, ``precipitation`` and
        ``is_rain`` columns. Races without a weather match get the defaults
        20.0 / 0 / 0. Session-schedule and ``url`` columns are dropped when
        present.
    """
    print("Processing & Merging Weather Data...")

    if start_year is None:
        start_year = START_YEAR

    # 1-2. Derive the merge key (year) and the rain flag on a NEW frame so the
    # caller's weather DataFrame is left untouched.
    # Logic: if precipitation > 0.0, the track is wet.
    weather_features = weather[['round', 'temperature', 'precipitation']].assign(
        year=pd.to_datetime(weather['datetime']).dt.year,
        is_rain=(weather['precipitation'] > 0).astype(int),
    )

    # Guard against several weather rows for the same race: keep the first one
    # so the left merge below can never duplicate race rows.
    weather_features = weather_features.drop_duplicates(
        subset=['year', 'round'], keep='first'
    )

    # 3. Merge weather with the official race list.
    # We join on ['year', 'round'] to align the weather with the correct race.
    merged_races = pd.merge(
        races,
        weather_features[['year', 'round', 'temperature', 'precipitation', 'is_rain']],
        on=['year', 'round'],
        how='left',
    )

    # 4. Filter for the modern era.
    merged_races = merged_races[merged_races['year'] >= start_year].copy()

    # 5. Handle missing weather.
    # Fill with defaults (dry, 20°C) so downstream code never sees NaN here.
    merged_races['precipitation'] = merged_races['precipitation'].fillna(0)
    merged_races['temperature'] = merged_races['temperature'].fillna(20.0)
    merged_races['is_rain'] = merged_races['is_rain'].fillna(0).astype(int)

    drop_cols = [
        'url',
        'fp1_date', 'fp1_time',
        'fp2_date', 'fp2_time',
        'fp3_date', 'fp3_time',
        'quali_date', 'quali_time',
        'sprint_date', 'sprint_time',
    ]

    # Only drop columns that actually exist in the dataframe to avoid errors.
    cols_to_drop = [c for c in drop_cols if c in merged_races.columns]
    df_clean = merged_races.drop(columns=cols_to_drop)

    print(f"Total Races processed: {merged_races.shape[0]}")
    return df_clean

# Execute cleaning: merge weather onto the race list, keep START_YEAR onward,
# and drop the unused schedule/url columns.
main_races = clean_and_merge_weather(races_df, weather_df)

# Verify what's left: list the surviving columns and preview the first rows.
print("Columns kept:", main_races.columns.tolist())
main_races.head()

Processing & Merging Weather Data...
Total Races processed: 479
Columns kept: ['raceId', 'year', 'round', 'circuitId', 'name', 'date', 'time', 'temperature', 'precipitation', 'is_rain']


Unnamed: 0,raceId,year,round,circuitId,name,date,time,temperature,precipitation,is_rain
0,1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,23.0,0.0,0
1,2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,26.4,4.2,1
2,3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,19.2,0.9,1
3,4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,30.1,0.0,0
4,5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,21.3,0.0,0


In [None]:
# --- Generating Labels (Y) ---
def generate_labels(races, results, status, laps):
    """Label each race as exciting (1) or not (0).

    A race is "exciting" when it is strictly above the 75th percentile in
    either of two per-race scores:

    * chaos_score  -- number of result rows that are NOT a classified finish.
    * action_score -- sum over all drivers of the absolute lap-to-lap change
      in ``position``.

    Args:
        races: DataFrame with a ``raceId`` column; defines which races are
            labelled.
        results: DataFrame with ``raceId`` and ``statusId`` columns.
        status: Status lookup table. When it has ``statusId`` and ``status``
            columns, every status whose text is ``Finished`` or ``+N Lap(s)``
            is treated as a classified finish, in addition to the baseline
            IDs 1 and 11-19. Presumably this is the dataset's status.csv --
            confirm the column names. If the columns are absent (or ``status``
            is None) only the baseline IDs are used.
        laps: DataFrame with ``raceId``, ``driverId``, ``lap`` and
            ``position`` columns.

    Returns:
        DataFrame with columns ``raceId`` and ``is_exciting`` (int 0/1), one
        row per distinct ``raceId`` in ``races``. Races with no results or lap
        data score 0 on the missing metric.
    """
    print("Calculating Excitement Labels...")

    # Distinct race ids to label.
    target_race_ids = races['raceId'].unique()

    # --- A. Calculate CHAOS Score (DNFs) ---
    results_filtered = results[results['raceId'].isin(target_race_ids)]

    # Baseline: status id 1 is a normal finish and ids 11-19 are lapped
    # finishes (laps down = id - 10). All of them count as a safe finish.
    normal_finish_ids = set([1] + list(range(11, 20)))

    # The hard-coded range misses any other "+N Laps" codes, so widen it from
    # the lookup table when the table provides the needed columns.
    if status is not None and {'statusId', 'status'}.issubset(status.columns):
        is_classified = (
            status['status']
            .astype(str)
            .str.strip()
            .str.match(r'^(?:Finished|\+\d+ Laps?)$')
        )
        normal_finish_ids |= set(status.loc[is_classified, 'statusId'].dropna())

    # is_dnf is True for every result row that is not a classified finish;
    # summing the booleans per race gives the DNF count.
    chaos_df = (
        results_filtered
        .assign(is_dnf=~results_filtered['statusId'].isin(sorted(normal_finish_ids)))
        .groupby('raceId')['is_dnf']
        .sum()
        .reset_index()
        .rename(columns={'is_dnf': 'chaos_score'})
    )

    # --- B. Calculate ACTION Score (Overtakes/Position Changes) ---
    # Sort so that diff() compares each lap with the same driver's previous lap.
    laps_filtered = (
        laps[laps['raceId'].isin(target_race_ids)]
        .sort_values(['raceId', 'driverId', 'lap'])
    )
    pos_change = (
        laps_filtered.groupby(['raceId', 'driverId'])['position'].diff().abs()
    )
    action_df = (
        laps_filtered
        .assign(pos_change=pos_change)
        .groupby('raceId')['pos_change']
        .sum()
        .reset_index()
        .rename(columns={'pos_change': 'action_score'})
    )

    # --- C. Create Final Label ---
    # De-duplicate race ids so a repeated race row cannot produce repeated
    # labels (which would fan out any later merge on raceId).
    metrics = (
        races[['raceId']]
        .drop_duplicates()
        .merge(chaos_df, on='raceId', how='left')
        .merge(action_df, on='raceId', how='left')
        .fillna(0)
    )

    # Define "Exciting" as Top 25% in Chaos OR Top 25% in Action.
    chaos_thresh = metrics['chaos_score'].quantile(0.75)
    action_thresh = metrics['action_score'].quantile(0.75)

    metrics['is_exciting'] = (
        (metrics['chaos_score'] > chaos_thresh)
        | (metrics['action_score'] > action_thresh)
    ).astype(int)

    print(f"Thresholds -> Chaos: >{chaos_thresh} DNFs, Action: >{action_thresh} Changes")
    return metrics[['raceId', 'is_exciting']]

# Execute label generation for the cleaned race list; the result holds
# raceId plus a binary is_exciting flag.
labels_df = generate_labels(main_races, results_df, status_df, laps_df)

Calculating Excitement Labels...
Thresholds -> Chaos: >7.0 DNFs, Action: >377.5 Changes
