In [None]:
# --- Imports & Configuration ---
import pandas as pd
import numpy as np
import io
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
# DATA_PATH is prepended verbatim to each CSV filename when loading
# (e.g. DATA_PATH + 'races.csv'), so it must end with a path separator.
DATA_PATH = 'data/'  # Folder where you unzipped the Kaggle dataset
# START_YEAR is an inclusive lower bound: races with year >= START_YEAR are kept.
START_YEAR = 2000    # We focus on the modern era for data consistency

In [None]:
# --- Data Loading ---
def load_data():
    """Read the eight dataset CSVs from DATA_PATH and return them as a tuple.

    Every file is read with the literal string '\\N' treated as a missing
    value. The returned tuple is ordered as:
    (races, results, status, qualifying, driver_standings, circuits,
    lap_times, weather).
    """
    print("Loading Vopani Dataset CSVs...")

    # File stems, listed in the exact order the caller unpacks them.
    csv_stems = (
        'races',
        'results',
        'status',
        'qualifying',
        'driver_standings',
        'circuits',
        'lap_times',
        'weather',
    )

    # '\N' is the dataset's marker for null fields.
    frames = tuple(
        pd.read_csv(f'{DATA_PATH}{stem}.csv', na_values='\\N')
        for stem in csv_stems
    )
    print("Datasets are loaded in...")

    return frames

# Load the raw dataframes. The names are unpacked positionally, in the order
# load_data() returns them: races, results, status, qualifying,
# driver standings, circuits, lap times, weather.
races_df, results_df, status_df, qual_df, stand_df, circuits_df, laps_df, weather_df = load_data()

In [3]:
# --- Cleaning & Weather Merging ---
def clean_and_merge_weather(races, weather, start_year=None, default_temperature=20.0):
    """Attach weather columns to the race list and keep races from start_year on.

    The caller's ``weather`` frame is left untouched: the derived columns
    ('datetime' parsed, 'year', 'is_rain') are computed on a copy, so
    re-running the cell does not depend on (or alter) earlier state.

    Parameters
    ----------
    races : pd.DataFrame
        Race list; must contain 'year' and 'round' columns.
    weather : pd.DataFrame
        Weather rows; must contain 'datetime', 'round', 'temperature' and
        'precipitation' columns.
    start_year : int, optional
        Inclusive lower bound on the race 'year'. When omitted, the
        module-level START_YEAR constant is used.
    default_temperature : float, optional
        Value substituted where a race has no temperature (default 20.0).

    Returns
    -------
    pd.DataFrame
        A copy of the filtered race rows with 'temperature', 'precipitation'
        and integer 'is_rain' columns added and no missing values in them.
    """
    print("Processing & Merging Weather Data...")

    if start_year is None:
        start_year = START_YEAR

    # 1-2. Derive the join key and the rain flag on a copy (assign returns a
    # new frame), so the caller's weather DataFrame is not mutated.
    # is_rain is 1 when precipitation > 0; a missing precipitation compares
    # False and therefore yields 0.
    weather_features = weather.assign(
        datetime=lambda w: pd.to_datetime(w['datetime']),
        year=lambda w: w['datetime'].dt.year,
        is_rain=lambda w: (w['precipitation'] > 0).astype(int),
    )

    # 3. Left-join so every race is kept even without a weather row.
    # NOTE(review): assumes weather has at most one row per (year, round);
    # repeated keys would duplicate race rows in the result - confirm.
    merged_races = pd.merge(
        races,
        weather_features[['year', 'round', 'temperature', 'precipitation', 'is_rain']],
        on=['year', 'round'],
        how='left'
    )

    # 4. Keep only races from start_year onward (copy to avoid chained-
    # assignment warnings on the fills below).
    merged_races = merged_races[merged_races['year'] >= start_year].copy()

    # 5. Fill gaps left by races with no matching weather row (or missing
    # readings): treat them as dry at default_temperature (20°C by default)
    # so downstream steps never see NaN in these columns.
    merged_races['precipitation'] = merged_races['precipitation'].fillna(0)
    merged_races['temperature'] = merged_races['temperature'].fillna(default_temperature)
    merged_races['is_rain'] = merged_races['is_rain'].fillna(0).astype(int)

    print(f"Total Races processed: {merged_races.shape[0]}")
    return merged_races

# Execute Cleaning: combine the raw race list with the raw weather table.
# main_races holds one weather-annotated row per kept race and is the base
# table for the rest of the notebook.
main_races = clean_and_merge_weather(races_df, weather_df)

Processing & Merging Weather Data...
Total Races processed: 479
