Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
import warnings
warnings.filterwarnings('ignore')

Upload and load dataset

In [2]:
filename = "/content/f1_pitstops_2018_2024.csv"
df = pd.read_csv(filename)

Explore dataset structure

In [3]:
# Cell 4 - Explore the new data structure
print("📊 NEW DATA STRUCTURE EXPLORATION")
print("=" * 50)

print(f"Dataset shape: {df.shape}")
print(f"Seasons: {sorted(df['Season'].unique())}")
print(f"Number of races: {df['Race Name'].nunique()}")
print(f"Number of drivers: {df['Driver'].nunique()}")

# Check what stint data we have
print(f"\n📋 Stint-related columns:")
stint_cols = [col for col in df.columns if 'stint' in col.lower() or 'Stint' in col]
print(stint_cols)

# Check tyre compounds
print(f"\n🎯 Tyre compounds available:")
print(df['Tire Compound'].value_counts())

print("\n📋 First 3 rows:")
df.head()

📊 NEW DATA STRUCTURE EXPLORATION
Dataset shape: (7374, 30)
Seasons: [np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024)]
Number of races: 30
Number of drivers: 40

📋 Stint-related columns:
['Stint', 'Stint Length']

🎯 Tyre compounds available:
Tire Compound
MEDIUM          2559
HARD            2096
SOFT            1663
INTERMEDIATE     463
ULTRASOFT        173
SUPERSOFT        164
WET               78
HYPERSOFT         54
UNKNOWN           15
Name: count, dtype: int64

📋 First 3 rows:


Unnamed: 0,Season,Round,Circuit,Driver,Constructor,Laps,Position,TotalPitStops,AvgPitStopTime,Race Name,...,Tire Usage Aggression,Fast Lap Attempts,Position Changes,Driver Aggression Score,Abbreviation,Stint,Tire Compound,Stint Length,Pit_Lap,Pit_Time
0,2018,1,Albert Park Grand Prix Circuit,Sebastian Vettel,Ferrari,58,1,1,21.787,Australian Grand Prix,...,0.017241,44.76882,0.0,6.755003,VET,1.0,ULTRASOFT,25.0,26.0,21.787
1,2018,1,Albert Park Grand Prix Circuit,Sebastian Vettel,Ferrari,58,1,1,21.787,Australian Grand Prix,...,0.017241,44.76882,0.0,6.755003,VET,2.0,SOFT,32.0,,Final Stint
2,2018,1,Albert Park Grand Prix Circuit,Lewis Hamilton,Mercedes,58,2,1,21.821,Australian Grand Prix,...,0.017241,44.73482,0.043478,6.754254,HAM,1.0,ULTRASOFT,17.0,19.0,21.821
3,2018,1,Albert Park Grand Prix Circuit,Lewis Hamilton,Mercedes,58,2,1,21.821,Australian Grand Prix,...,0.017241,44.73482,0.043478,6.754254,HAM,2.0,SOFT,39.0,,Final Stint
4,2018,1,Albert Park Grand Prix Circuit,Kimi RÃƒÂ¤ikkÃƒÂ¶nen,Ferrari,58,3,1,21.421,Australian Grand Prix,...,0.017241,45.13482,0.086957,6.818562,RAI,1.0,ULTRASOFT,17.0,18.0,21.421


FEATURE ENGINEERING FOR NEW TARGETS

In [4]:
# Feature Engineering for Stint Length & Compound Prediction
print("🔧 FEATURE ENGINEERING FOR NEW TARGETS")
print("=" * 50)

features_df = df.copy()

# 1. CURRENT STINT FEATURES
features_df['current_tyre_age'] = features_df['Stint Length']
features_df['stint_progress'] = features_df['Laps'] / features_df.groupby(['Season', 'Round', 'Driver', 'Stint'])['Laps'].transform('max')

# 2. RACE CONTEXT FEATURES
features_df['race_progress'] = features_df['Laps'] / features_df.groupby(['Season', 'Round'])['Laps'].transform('max')
features_df['position_pressure'] = 1 / features_df['Position']

# 3. PERFORMANCE FEATURES
features_df['performance_degradation'] = features_df['Lap Time Variation']
features_df['driver_aggression'] = features_df['Driver Aggression Score']

# 4. TRACK & WEATHER FEATURES
features_df['temp_difference'] = features_df['Track_Temp_C'] - features_df['Air_Temp_C']
features_df['is_raining'] = (features_df['Humidity_%'] > 80).astype(int)

# 5. HISTORICAL FEATURES (if available)
# Average stint length for this driver-track combination
driver_track_avg = features_df.groupby(['Driver', 'Circuit'])['Stint Length'].mean().reset_index()
driver_track_avg.columns = ['Driver', 'Circuit', 'driver_track_avg_stint']
features_df = features_df.merge(driver_track_avg, on=['Driver', 'Circuit'], how='left')

# 6. ENCODE CATEGORICAL VARIABLES
# Circuit encoding
circuit_encoder = LabelEncoder()
features_df['circuit_encoded'] = circuit_encoder.fit_transform(features_df['Circuit'])

# Driver encoding
driver_encoder = LabelEncoder()
features_df['driver_encoded'] = driver_encoder.fit_transform(features_df['Driver'])

# Current compound encoding
compound_map = {'SOFT': 0, 'MEDIUM': 1, 'HARD': 2, 'INTERMEDIATE': 3, 'WET': 4,
                'ULTRASOFT': 0, 'SUPERSOFT': 0, 'HYPERSOFT': 0}
features_df['current_compound_encoded'] = features_df['Tire Compound'].map(compound_map).fillna(0)

print("✅ Feature engineering completed!")
print(f"New features created: {[col for col in features_df.columns if col not in df.columns]}")

🔧 FEATURE ENGINEERING FOR NEW TARGETS
✅ Feature engineering completed!
New features created: ['current_tyre_age', 'stint_progress', 'race_progress', 'position_pressure', 'performance_degradation', 'driver_aggression', 'temp_difference', 'is_raining', 'driver_track_avg_stint', 'circuit_encoded', 'driver_encoded', 'current_compound_encoded']


CREATE TARGET VARIABLES

In [5]:
# Cell - FIXED TARGET CREATION
print("🎯 CREATING TARGET VARIABLES")
print("=" * 50)

# Sort by driver, race, stint to get next stint information
features_df = features_df.sort_values(['Driver', 'Season', 'Round', 'Stint']).reset_index(drop=True)

# Initialize target columns
features_df['next_stint_length'] = np.nan
features_df['next_compound'] = None

# Counter for tracking
targets_created = 0

# For each driver in each race, get next stint info
for driver in features_df['Driver'].unique():
    driver_races = features_df[features_df['Driver'] == driver]['Race Name'].unique()

    for race in driver_races:
        race_data = features_df[(features_df['Driver'] == driver) & (features_df['Race Name'] == race)]

        if len(race_data) > 1:
            stints = race_data['Stint'].unique()
            stints.sort()

            for i in range(len(stints) - 1):
                current_stint = stints[i]
                next_stint = stints[i + 1]

                # Get current stint row (take first one if multiple)
                current_stint_data = race_data[race_data['Stint'] == current_stint]
                next_stint_data = race_data[race_data['Stint'] == next_stint]

                # Only proceed if both stints have data
                if len(current_stint_data) > 0 and len(next_stint_data) > 0:
                    current_idx = current_stint_data.index[0]
                    next_stint_row = next_stint_data.iloc[0]

                    # Set targets
                    features_df.loc[current_idx, 'next_stint_length'] = next_stint_row['Stint Length']
                    features_df.loc[current_idx, 'next_compound'] = next_stint_row['Tire Compound']
                    targets_created += 1

print(f"✅ Targets created for {targets_created} stints")
print("📊 Target variable distribution:")

# Check if we have any targets created
if targets_created > 0:
    print(f"Next stint length stats:")
    print(features_df['next_stint_length'].describe())
    print(f"\nNext compounds:")
    print(features_df['next_compound'].value_counts())
else:
    print(" No targets were created. Checking data structure...")
    print(f"Unique stints per driver-race:")
    for driver in features_df['Driver'].unique()[:3]:  # Check first 3 drivers
        driver_races = features_df[features_df['Driver'] == driver]['Race Name'].unique()
        for race in driver_races[:2]:  # Check first 2 races per driver
            race_data = features_df[(features_df['Driver'] == driver) & (features_df['Race Name'] == race)]
            print(f"Driver: {driver}, Race: {race}, Stints: {race_data['Stint'].unique()}")

🎯 CREATING TARGET VARIABLES
✅ Targets created for 1937 stints
📊 Target variable distribution:
Next stint length stats:
count    1937.000000
mean       20.402685
std        13.369586
min         1.000000
25%        11.000000
50%        19.000000
75%        29.000000
max        68.000000
Name: next_stint_length, dtype: float64

Next compounds:
next_compound
SOFT            565
HARD            563
MEDIUM          500
INTERMEDIATE    152
SUPERSOFT        73
ULTRASOFT        63
WET              12
HYPERSOFT         9
Name: count, dtype: int64


PREPARE DATA FOR MODELING

In [6]:
# PREPARE DATA FOR MODELING
print("📦 PREPARING DATA FOR LIGHTGBM")
print("=" * 50)

# Select features for prediction
feature_columns = [
    'current_tyre_age', 'stint_progress', 'race_progress',
    'position_pressure', 'performance_degradation', 'driver_aggression',
    'current_compound_encoded', 'circuit_encoded', 'driver_encoded',
    'Air_Temp_C', 'Track_Temp_C', 'Humidity_%', 'temp_difference', 'is_raining',
    'driver_track_avg_stint', 'Position', 'Laps'
]

# Remove rows without target values
model_data = features_df[features_df['next_stint_length'].notna() & features_df['next_compound'].notna()]
model_data = model_data[feature_columns + ['next_stint_length', 'next_compound']].dropna()

print(f"✅ Final modeling dataset: {model_data.shape}")

# Encode next compound for classification
from sklearn.preprocessing import LabelEncoder
compound_encoder = LabelEncoder()
model_data['next_compound_encoded'] = compound_encoder.fit_transform(model_data['next_compound'])

X = model_data[feature_columns]
y_regression = model_data['next_stint_length']  # Regression target
y_classification = model_data['next_compound_encoded']  # Classification target

print(f"Features shape: {X.shape}")
print(f"Regression target stats:")
print(f"  Mean: {y_regression.mean():.2f}, Std: {y_regression.std():.2f}")
print(f"  Min: {y_regression.min():.2f}, Max: {y_regression.max():.2f}")
print(f"Classification target distribution:")
compound_counts = pd.Series(compound_encoder.inverse_transform(y_classification)).value_counts()
print(compound_counts)

# Show class mapping
print(f"\n🎯 Compound encoding:")
for i, compound in enumerate(compound_encoder.classes_):
    count = compound_counts.get(compound, 0)
    print(f"  {i}: {compound} ({count} samples)")

📦 PREPARING DATA FOR LIGHTGBM
✅ Final modeling dataset: (1895, 19)
Features shape: (1895, 17)
Regression target stats:
  Mean: 20.28, Std: 13.40
  Min: 1.00, Max: 68.00
Classification target distribution:
SOFT            562
HARD            539
MEDIUM          487
INTERMEDIATE    150
SUPERSOFT        73
ULTRASOFT        63
WET              12
HYPERSOFT         9
Name: count, dtype: int64

🎯 Compound encoding:
  0: HARD (539 samples)
  1: HYPERSOFT (9 samples)
  2: INTERMEDIATE (150 samples)
  3: MEDIUM (487 samples)
  4: SOFT (562 samples)
  5: SUPERSOFT (73 samples)
  6: ULTRASOFT (63 samples)
  7: WET (12 samples)


TRAIN LIGHTGBM FOR REGRESSION (Stint Length)

In [7]:
#  TRAIN REGRESSION MODEL
print("🤖 TRAINING REGRESSION MODEL (Next Stint Length)")
print("=" * 50)

from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
    X, y_regression, y_classification, test_size=0.2, random_state=42,
    stratify=y_classification  # Important for imbalanced compounds
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Train Regression Model for Stint Length
reg_model = lgb.LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    objective='regression',
    metric='mae'
)

reg_model.fit(X_train, y_reg_train,
              eval_set=[(X_test, y_reg_test)],
              eval_metric='mae')

print("✅ Regression model trained!")

🤖 TRAINING REGRESSION MODEL (Next Stint Length)
Training set: (1516, 17)
Test set: (379, 17)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1371
[LightGBM] [Info] Number of data points in the train set: 1516, number of used features: 16
[LightGBM] [Info] Start training from score 20.277704
✅ Regression model trained!


TRAIN CLASSIFICATION MODEL

In [8]:
#  TRAIN CLASSIFICATION MODEL
print("🤖 TRAINING CLASSIFICATION MODEL (Next Compound)")
print("=" * 50)

clf_model = lgb.LGBMClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    class_weight='balanced'
)

# FIXED: Remove verbose parameter
clf_model.fit(X_train, y_clf_train,
              eval_set=[(X_test, y_clf_test)])

print("✅ Classification model trained!")

🤖 TRAINING CLASSIFICATION MODEL (Next Compound)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000102 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1371
[LightGBM] [Info] Number of data points in the train set: 1516, number of used features: 16
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
✅ Classification model trained!


EVALUATE BOTH MODELS

In [9]:
#  EVALUATE MODELS
print("📊 MODEL EVALUATION")
print("=" * 50)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report
import numpy as np

# Regression Evaluation
y_reg_pred = reg_model.predict(X_test)
reg_mae = mean_absolute_error(y_reg_test, y_reg_pred)
reg_rmse = np.sqrt(mean_squared_error(y_reg_test, y_reg_pred))
reg_r2 = r2_score(y_reg_test, y_reg_pred)

print("🎯 REGRESSION RESULTS (Next Stint Length):")
print(f"MAE: {reg_mae:.3f} laps")
print(f"RMSE: {reg_rmse:.3f} laps")
print(f"R² Score: {reg_r2:.3f}")

# Show some actual vs predicted
print(f"\n📋 Sample Predictions vs Actual:")
sample_indices = np.random.choice(len(X_test), min(5, len(X_test)), replace=False)
for i, idx in enumerate(sample_indices):
    print(f"  Actual: {y_reg_test.iloc[idx]:.1f} laps, Predicted: {y_reg_pred[idx]:.1f} laps")

# Classification Evaluation
y_clf_pred = clf_model.predict(X_test)
y_clf_proba = clf_model.predict_proba(X_test)
clf_accuracy = accuracy_score(y_clf_test, y_clf_pred)

print(f"\n🎯 CLASSIFICATION RESULTS (Next Compound):")
print(f"Accuracy: {clf_accuracy:.3f}")
print("\nDetailed Classification Report:")
print(classification_report(y_clf_test, y_clf_pred,
                          target_names=compound_encoder.classes_))

# Show class distribution
print(f"\n📈 Class Distribution in Test Set:")
test_compound_counts = pd.Series(compound_encoder.inverse_transform(y_clf_test)).value_counts()
for compound, count in test_compound_counts.items():
    print(f"  {compound}: {count} samples")

📊 MODEL EVALUATION
🎯 REGRESSION RESULTS (Next Stint Length):
MAE: 8.293 laps
RMSE: 11.114 laps
R² Score: 0.342

📋 Sample Predictions vs Actual:
  Actual: 2.0 laps, Predicted: 5.7 laps
  Actual: 13.0 laps, Predicted: 19.4 laps
  Actual: 17.0 laps, Predicted: 18.5 laps
  Actual: 25.0 laps, Predicted: 15.9 laps
  Actual: 26.0 laps, Predicted: 26.3 laps

🎯 CLASSIFICATION RESULTS (Next Compound):
Accuracy: 0.483

Detailed Classification Report:
              precision    recall  f1-score   support

        HARD       0.57      0.52      0.54       108
   HYPERSOFT       0.00      0.00      0.00         2
INTERMEDIATE       0.59      0.67      0.62        30
      MEDIUM       0.42      0.41      0.41        97
        SOFT       0.49      0.54      0.51       112
   SUPERSOFT       0.29      0.40      0.33        15
   ULTRASOFT       0.25      0.08      0.12        13
         WET       0.00      0.00      0.00         2

    accuracy                           0.48       379
   macro avg  

CREATE PREDICTION FUNCTION WITH TOP 3 COMPOUNDS

In [10]:
# Cell - CREATE PREDICTION FUNCTION
print("CREATING PREDICTION FUNCTION")
print("=" * 50)

def predict_strategy(features):
    """
    Predict next stint length and top 3 compounds with probabilities
    Returns exactly the format you need:
    - Next Stint Length (laps): 12.4
    - Top-3 Compounds with probabilities
    """
    # Ensure features are in correct order and shape
    if isinstance(features, pd.Series):
        features = features[feature_columns].values.reshape(1, -1)
    else:
        features = np.array(features).reshape(1, -1)

    # Predict stint length
    stint_length = reg_model.predict(features)[0]

    # Predict compound probabilities
    compound_probs = clf_model.predict_proba(features)[0]

    # Get top 3 compounds with probabilities
    top_3_indices = compound_probs.argsort()[-3:][::-1]  # Descending order
    top_3_compounds = []

    for idx in top_3_indices:
        compound_name = compound_encoder.inverse_transform([idx])[0]
        probability = compound_probs[idx]
        top_3_compounds.append((compound_name, probability))

    return stint_length, top_3_compounds

def format_prediction(stint_length, compounds):
    """Format the prediction in your required output format"""
    print(" FINAL PREDICTION:")
    print(f"Next Stint Length(laps): {stint_length:.1f}")
    print("Top-3 Compounds(probabilities):")
    for compound, prob in compounds:
        print(f"  {compound}: {prob:.3f}")

# Test the function
print("TESTING PREDICTION FUNCTION:")
sample_idx = np.random.randint(0, len(X_test))
sample_features = X_test.iloc[sample_idx]

stint_pred, compounds_pred = predict_strategy(sample_features)
format_prediction(stint_pred, compounds_pred)

# Show actual values for comparison
print(f"\n ACTUAL VALUES:")
print(f"Next Stint Length: {y_reg_test.iloc[sample_idx]:.1f} laps")
actual_compound = compound_encoder.inverse_transform([y_clf_test.iloc[sample_idx]])[0]
print(f"Next Compound: {actual_compound}")

# Calculate accuracy for this prediction
stint_error = abs(stint_pred - y_reg_test.iloc[sample_idx])
compound_correct = (compounds_pred[0][0] == actual_compound)
print(f"\n Prediction Quality:")
print(f"Stint Length Error: {stint_error:.1f} laps")
print(f"Top Compound Correct: {compound_correct}")

CREATING PREDICTION FUNCTION
TESTING PREDICTION FUNCTION:
 FINAL PREDICTION:
Next Stint Length(laps): 19.1
Top-3 Compounds(probabilities):
  MEDIUM: 0.927
  SOFT: 0.057
  HARD: 0.016

 ACTUAL VALUES:
Next Stint Length: 23.0 laps
Next Compound: MEDIUM

 Prediction Quality:
Stint Length Error: 3.9 laps
Top Compound Correct: True
