# Feature Engineering for Final Submission

In [120]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import fastf1 as f1

In [121]:
# Season -> number of rounds to load for that season. The race-assembly cell
# iterates rounds 1..N for each season listed here.
# NOTE(review): presumably these counts reflect how many rounds have CSV
# exports under ../data rather than the full calendar -- confirm.
rounds_by_year = {2022: 21, 2023: 21, 2024: 19, 2025: 10}

In [122]:
def load_data(year, round_number):
    """Load the lap and weather CSV exports for one race.

    Parameters
    ----------
    year : int
        Season, used both as the sub-directory name and the file-name prefix.
    round_number : int
        Round within the season.

    Returns
    -------
    tuple
        ``(laps, weather)`` as DataFrames, or ``(None, None)`` when either
        file does not exist. The race-assembly loop already checks for
        ``None`` and skips such races, so a missing export no longer aborts
        the whole run.
    """
    lap_file = f'../data/{year}/{year}_round_{round_number}_laps.csv'
    weather_file = f'../data/{year}/{year}_round_{round_number}_weather.csv'
    try:
        laps = pd.read_csv(lap_file)
        weather = pd.read_csv(weather_file)
    except FileNotFoundError as err:
        # Report the skip so a gap in the data set is visible, not silent.
        print(f"Skipping {year} round {round_number}: {err}")
        return None, None
    return laps, weather

In [123]:
def process_laps_data(laps):
    """Order the laps and build one summary row per (Driver, Stint).

    Parameters
    ----------
    laps : pandas.DataFrame
        Lap table with at least the columns Driver, Stint, LapNumber, Time,
        Compound and TyreLife.

    Returns
    -------
    tuple
        ``(stint_intervals, laps)`` where ``stint_intervals`` has one row per
        (Driver, Stint) with StartTime / EndTime (min / max of Time),
        StintLength (number of laps counted), Compound and StartingTyreLife
        (pandas ``first`` in lap order) and ``time_tire_stint``
        (EndTime - StartTime), and ``laps`` is the input sorted by
        Driver, Stint, LapNumber.
    """
    ordered_laps = laps.sort_values(['Driver', 'Stint', 'LapNumber'])

    # Output column -> (source column, aggregation).
    stint_aggregations = {
        'StartTime': ('Time', 'min'),
        'EndTime': ('Time', 'max'),
        'StintLength': ('LapNumber', 'count'),
        'Compound': ('Compound', 'first'),
        'StartingTyreLife': ('TyreLife', 'first'),
    }
    per_stint = ordered_laps.groupby(['Driver', 'Stint'])
    stint_summary = per_stint.agg(**stint_aggregations).reset_index()

    # Duration of the stint in whatever unit the Time column uses.
    stint_summary['time_tire_stint'] = stint_summary['EndTime'] - stint_summary['StartTime']
    return stint_summary, ordered_laps

In [124]:
def summarize_weather(weather, stint_intervals):
    """Aggregate weather readings over the time span of each stint.

    Parameters
    ----------
    weather : pandas.DataFrame
        Weather samples with columns Time, AirTemp, TrackTemp, Humidity,
        Rainfall and WindSpeed. Time must be comparable with the stint
        StartTime / EndTime values.
    stint_intervals : pandas.DataFrame
        One row per stint with columns Driver, Stint, StartTime, EndTime.

    Returns
    -------
    pandas.DataFrame
        One row per stint with Driver, Stint, Avg_AirTemp, Max_TrackTemp,
        Avg_Humidity, Max_Rainfall and Avg_WindSpeed. Stints with no weather
        sample inside [StartTime, EndTime] get NaN for every statistic. The
        column set is fixed even when ``stint_intervals`` is empty, so the
        result can always be merged on ['Driver', 'Stint'].
    """
    summary_columns = ['Driver', 'Stint', 'Avg_AirTemp', 'Max_TrackTemp',
                       'Avg_Humidity', 'Max_Rainfall', 'Avg_WindSpeed']
    weather_summaries = []
    for _, stint in stint_intervals.iterrows():
        # Both interval ends are inclusive.
        stint_weather = weather[(weather['Time'] >= stint['StartTime']) &
                                (weather['Time'] <= stint['EndTime'])]
        summary = {'Driver': stint['Driver'], 'Stint': stint['Stint']}
        if not stint_weather.empty:
            summary.update({
                'Avg_AirTemp': stint_weather['AirTemp'].mean(),
                'Max_TrackTemp': stint_weather['TrackTemp'].max(),
                'Avg_Humidity': stint_weather['Humidity'].mean(),
                'Max_Rainfall': stint_weather['Rainfall'].max(),
                'Avg_WindSpeed': stint_weather['WindSpeed'].mean(),
            })
        else:
            # No reading fell inside the stint: mark every statistic missing.
            summary.update({name: np.nan for name in summary_columns[2:]})
        weather_summaries.append(summary)
    # Passing ``columns`` keeps the schema stable for an empty record list.
    return pd.DataFrame(weather_summaries, columns=summary_columns)

In [125]:
# Every (season, round) pair listed in rounds_by_year, rounds numbered from 1.
races = [
    (year, round_number)
    for year, n_rounds in rounds_by_year.items()
    for round_number in range(1, n_rounds + 1)
]

# Event schedule per season; used below to label each race with its location.
schedules = {}
for year in rounds_by_year:
    schedules[year] = f1.get_event_schedule(year)

all_laps_data = []

for year, round_number in races:
    laps, weather = load_data(year, round_number)
    if laps is None or weather is None:
        continue

    stint_intervals, laps = process_laps_data(laps)
    weather_summary = summarize_weather(weather, stint_intervals)

    # Look up the circuit label: the Location of the schedule row whose
    # RoundNumber matches this race.
    schedule_year = schedules.get(year)
    circuit_row = schedule_year[schedule_year['RoundNumber'] == round_number]
    circuit = circuit_row['Location'].values[0]

    # Attach the per-stint aggregates and weather summary to every lap, then
    # tag the laps with the race identifiers.
    stint_features = stint_intervals[['Driver', 'Stint', 'time_tire_stint', 'StintLength', 'StartingTyreLife']]
    laps = (
        laps
        .merge(stint_features, on=['Driver', 'Stint'], how='left')
        .merge(weather_summary, on=['Driver', 'Stint'], how='left')
        .assign(Year=year, RoundNumber=round_number, Circuit=circuit)
    )
    all_laps_data.append(laps)

# Keep a plain copy of the driver code under 'driv': 'Driver' is one-hot
# encoded later, while sequence building still groups by 'driv'.
all_data = pd.concat(all_laps_data, ignore_index=True).assign(driv=lambda df: df['Driver'])
print("Columns in all_data:", all_data.columns.tolist())

Columns in all_data: ['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint', 'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest', 'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime', 'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason', 'FastF1Generated', 'IsAccurate', 'time_tire_stint', 'StintLength', 'StartingTyreLife', 'Avg_AirTemp', 'Max_TrackTemp', 'Avg_Humidity', 'Max_Rainfall', 'Avg_WindSpeed', 'Year', 'RoundNumber', 'Circuit', 'driv']


In [126]:

# Columns that identify one stint across the whole data set.
stint_keys = ['Year', 'RoundNumber', 'Driver', 'Stint']

# Order laps within each stint, number them from 1, and derive the
# RemainingLaps column: laps left in the stint after the current one.
all_data = (
    all_data
    .sort_values([*stint_keys, 'LapNumber'])
    .assign(
        StintLapNumber=lambda df: df.groupby(stint_keys).cumcount() + 1,
        RemainingLaps=lambda df: df['StintLength'] - df['StintLapNumber'],
    )
)

In [127]:
# Weather summary columns that may be NaN for stints with no weather sample.
weather_features = ['Avg_AirTemp', 'Max_TrackTemp', 'Avg_Humidity', 'Max_Rainfall', 'Avg_WindSpeed']

# Fill each gap with the mean of that column over the same race.
for feature in weather_features:
    per_race = all_data.groupby(['Year', 'RoundNumber'])[feature]
    all_data[feature] = per_race.transform(lambda values: values.fillna(values.mean()))

  all_data[feature] = all_data.groupby(['Year', 'RoundNumber'])[feature].transform(lambda x: x.fillna(x.mean()))


In [128]:
# One-hot encode the categorical columns. Candidates that are not present in
# all_data are dropped from the list first, so get_dummies is never asked for
# a missing column.
categorical_features = [
    col
    for col in ('Compound', 'FreshTyre', 'Rainfall', 'Circuit', 'Driver')
    if col in all_data.columns
]
all_data = pd.get_dummies(all_data, columns=categorical_features)
print(all_data.columns.tolist())

['Time', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint', 'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest', 'TyreLife', 'Team', 'LapStartTime', 'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason', 'FastF1Generated', 'IsAccurate', 'time_tire_stint', 'StintLength', 'StartingTyreLife', 'Avg_AirTemp', 'Max_TrackTemp', 'Avg_Humidity', 'Max_Rainfall', 'Avg_WindSpeed', 'Year', 'RoundNumber', 'driv', 'StintLapNumber', 'RemainingLaps', 'Compound_HARD', 'Compound_INTERMEDIATE', 'Compound_MEDIUM', 'Compound_SOFT', 'Compound_WET', 'FreshTyre_False', 'FreshTyre_True', 'Circuit_Austin', 'Circuit_Baku', 'Circuit_Barcelona', 'Circuit_Budapest', 'Circuit_Imola', 'Circuit_Jeddah', 'Circuit_Las Vegas', 'Circuit_Le Castellet', 'Circuit_Lusail', 'Circuit_Marina Bay', 'Circuit_Melbourne', 'Circuit_Mexico City', 'Circuit_Miami', 'Circuit_Monac

In [129]:
# NOTE(review): exclude_cols is not referenced anywhere else in the visible
# notebook, and it lists 'LapNumber', which is nevertheless used as a feature
# below -- confirm which is intended.
exclude_cols = ['Year', 'RoundNumber', 'Driver', 'Stint', 'LapNumber', 'Time', 'RemainingLaps', 'StintLength', 'TrackStatus']

# Column-name prefixes produced by the one-hot encoding step.
one_hot_prefixes = ('Compound_', 'FreshTyre_', 'Rainfall_', 'Circuit_', 'Driver_')

# NOTE(review): time_tire_stint and the weather summaries are computed over
# the whole stint, including laps after the current one, while the target is
# RemainingLaps -- confirm this is not target leakage.
base_feature_cols = [
    'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
    'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
    'TyreLife', 'Position', 'StintLapNumber', 'time_tire_stint', 'StartingTyreLife',
    'Avg_AirTemp', 'Max_TrackTemp', 'Avg_Humidity', 'Max_Rainfall', 'Avg_WindSpeed',
    'LapNumber',
]
one_hot_cols = [col for col in all_data.columns if col.startswith(one_hot_prefixes)]

# Base features first, then the one-hot columns in frame order; anything
# missing from all_data is dropped.
feature_cols = [col for col in base_feature_cols + one_hot_cols if col in all_data.columns]

# Columns that get standardised later: everything except the one-hot columns
# and LapNumber. LapNumber therefore ends up in categorical_cols and is left
# unscaled.
unscaled_prefixes = one_hot_prefixes + ('LapNumber',)
numeric_cols = [col for col in feature_cols if not col.startswith(unscaled_prefixes)]
categorical_cols = [col for col in feature_cols if col not in numeric_cols]

# Positions of the numeric columns along the feature axis of the sequences.
numeric_indices = [feature_cols.index(col) for col in numeric_cols]

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

Numeric columns: ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'TyreLife', 'Position', 'StintLapNumber', 'time_tire_stint', 'StartingTyreLife', 'Avg_AirTemp', 'Max_TrackTemp', 'Avg_Humidity', 'Max_Rainfall', 'Avg_WindSpeed']
Categorical columns: ['LapNumber', 'Compound_HARD', 'Compound_INTERMEDIATE', 'Compound_MEDIUM', 'Compound_SOFT', 'Compound_WET', 'FreshTyre_False', 'FreshTyre_True', 'Circuit_Austin', 'Circuit_Baku', 'Circuit_Barcelona', 'Circuit_Budapest', 'Circuit_Imola', 'Circuit_Jeddah', 'Circuit_Las Vegas', 'Circuit_Le Castellet', 'Circuit_Lusail', 'Circuit_Marina Bay', 'Circuit_Melbourne', 'Circuit_Mexico City', 'Circuit_Miami', 'Circuit_Monaco', 'Circuit_Montréal', 'Circuit_Monza', 'Circuit_Sakhir', 'Circuit_Shanghai', 'Circuit_Silverstone', 'Circuit_Spa-Francorchamps', 'Circuit_Spielberg', 'Circuit_Suzuka', 'Circuit_São Paulo', 'Circuit_Zandvoort', 'Driver_ALB', 'Driver_ALO', 'Driver_ANT', 'Driver_BEA', 'Driver_BOR', 'D

In [130]:
def create_sequences(data, feature_cols, window_size=5):
    """Build sliding-window sequences and their targets, stint by stint.

    Rows are grouped by (Year, RoundNumber, driv, Stint) and windows never
    cross a group boundary. Rows are taken in the order they appear in
    ``data``, so ``data`` should already be sorted by lap within each stint.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain Year, RoundNumber, driv, Stint, RemainingLaps and every
        name in ``feature_cols``.
    feature_cols : list of str
        Columns that make up the feature axis, in output order.
    window_size : int, default 5
        Number of consecutive laps per sequence; must be at least 1.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, targets)``. ``sequences`` has shape
        ``(n_windows, window_size, len(feature_cols))`` and ``targets`` holds
        the RemainingLaps value of the last lap in each window. When no stint
        is long enough, ``sequences`` is an empty array that keeps the 3-D
        shape so downstream slicing still works.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    sequences, targets = [], []
    for _, group in data.groupby(['Year', 'RoundNumber', 'driv', 'Stint']):
        n_windows = len(group) - window_size + 1
        if n_windows < 1:
            # Stint shorter than one window: nothing to emit.
            continue
        # Convert once per stint instead of slicing the frame per window.
        features = group[feature_cols].values
        remaining = group['RemainingLaps'].values
        for start in range(n_windows):
            end = start + window_size
            sequences.append(features[start:end])
            targets.append(remaining[end - 1])

    if not sequences:
        # np.array([]) would be 1-D; keep the (samples, steps, features) rank.
        return np.empty((0, window_size, len(feature_cols))), np.array(targets)
    return np.array(sequences), np.array(targets)

# Number of consecutive laps in each sequence.
window_size = 2

# Sequences over the full data set.
# NOTE(review): X and y are not referenced again in the visible notebook; the
# per-year splits are built separately -- confirm whether this call is needed.
X, y = create_sequences(data=all_data, feature_cols=feature_cols, window_size=window_size)

In [131]:
# Split by season: 2022-2023 for training, 2024 for validation, 2025 for test.
season = all_data['Year']
train_data = all_data[season.isin([2022, 2023])]
val_data = all_data[season == 2024]
test_data = all_data[season == 2025]

window_size = 2

# Build the sequences for each split with the same window length.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    create_sequences(split_frame, feature_cols, window_size)
    for split_frame in (train_data, val_data, test_data)
)


In [132]:
# Standardise the numeric feature columns. The scaler is fitted on the
# training split only and then applied to all three splits; the remaining
# columns are passed through unchanged.
# NOTE(review): X_train / X_val / X_test are overwritten at the end of this
# cell, so running the cell twice scales already-scaled data.
scaler = StandardScaler()


def _numeric_block(X_split):
    """Return the numeric slice of a split and that slice flattened to 2-D."""
    numeric = X_split[:, :, numeric_indices]
    return numeric, numeric.reshape(-1, len(numeric_indices))


def _scaled_block(X_split, numeric_flat):
    """Scale a flattened numeric slice and restore the per-window layout."""
    scaled_flat = scaler.transform(numeric_flat)
    return scaled_flat.reshape(X_split.shape[0], window_size, len(numeric_indices))


def _with_numeric_replaced(X_split, numeric_scaled):
    """Copy a split and overwrite its numeric columns with scaled values."""
    merged = np.copy(X_split)
    merged[:, :, numeric_indices] = numeric_scaled
    return merged


X_train_numeric, X_train_numeric_flat = _numeric_block(X_train)
X_val_numeric, X_val_numeric_flat = _numeric_block(X_val)
X_test_numeric, X_test_numeric_flat = _numeric_block(X_test)

scaler.fit(X_train_numeric_flat)

X_train_numeric_scaled = _scaled_block(X_train, X_train_numeric_flat)
X_val_numeric_scaled = _scaled_block(X_val, X_val_numeric_flat)
X_test_numeric_scaled = _scaled_block(X_test, X_test_numeric_flat)

X_train_scaled = _with_numeric_replaced(X_train, X_train_numeric_scaled)
X_val_scaled = _with_numeric_replaced(X_val, X_val_numeric_scaled)
X_test_scaled = _with_numeric_replaced(X_test, X_test_numeric_scaled)

X_train, X_val, X_test = X_train_scaled, X_val_scaled, X_test_scaled

In [133]:
# Shape check: one line per split, sequences next to their targets.
shape_report = [
    f"Training sequences: {X_train.shape}, Targets: {y_train.shape}",
    f"Validation sequences: {X_val.shape}, Targets: {y_val.shape}",
    f"Test sequences: {X_test.shape}, Targets: {y_test.shape}",
]
print("\n".join(shape_report))

Training sequences: (43275, 2, 81), Targets: (43275,)
Validation sequences: (20319, 2, 81), Targets: (20319,)
Test sequences: (10208, 2, 81), Targets: (10208,)


In [134]:
def save_sequences_to_csv(X, y, feature_cols, window_size, filename):
    """Flatten windowed sequences to one row per sample and write them as CSV.

    Parameters
    ----------
    X : numpy.ndarray
        Array of shape ``(n_samples, n_timesteps, n_features)``.
    y : array-like
        One target per sample; written as the last column, 'RemainingLaps'.
    feature_cols : list of str
        Names for the feature axis of ``X``. Output columns are named
        ``<feature>_t<k>`` with ``k`` counting timesteps from 1, ordered
        timestep-major to match the row-major flattening of ``X``.
    window_size : int
        Unused; the number of timesteps is read from ``X.shape``. Kept so
        existing calls keep working.
    filename : str
        Destination path of the CSV file (written without the index).

    Raises
    ------
    ValueError
        If ``feature_cols`` does not match the feature axis of ``X``.
    """
    n_samples, n_timesteps, n_features = X.shape
    if len(feature_cols) != n_features:
        raise ValueError(
            f"feature_cols has {len(feature_cols)} names but X has {n_features} features"
        )

    columns = [f"{col}_t{step}" for step in range(1, n_timesteps + 1) for col in feature_cols]
    columns.append('RemainingLaps')

    # Explicit width: reshape(n_samples, -1) cannot infer it when n_samples is 0.
    X_flat = X.reshape(n_samples, n_timesteps * n_features)
    data = np.hstack((X_flat, np.asarray(y).reshape(-1, 1)))
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(filename, index=False)
    print(f"Saved {filename} with shape {df.shape}")

# Write each split to its own CSV file in the working directory.
for X_split, y_split, csv_name in (
    (X_train, y_train, 'train_data.csv'),
    (X_val, y_val, 'val_data.csv'),
    (X_test, y_test, 'test_data.csv'),
):
    save_sequences_to_csv(X_split, y_split, feature_cols, window_size, csv_name)

Saved train_data.csv with shape (43275, 163)
Saved val_data.csv with shape (20319, 163)
Saved test_data.csv with shape (10208, 163)
