### Best Team Lineup
Build the best possible team lineup based on driver performance trends.


In [59]:
import pandas as pd
import numpy as np

df = pd.read_csv('../../../data/processed/fully_integrated_data.csv')
print("Columns in your dataset:", df.columns.tolist())
df.head()

Columns in your dataset: ['raceId', 'season', 'raceNumber', 'circuitId', 'prixName', 'raceDate', 'driverId', 'constructorId', 'driverStartGridPos', 'driverFinalGridPos', 'driverFinalRank', 'driverRacePoints', 'driverLapCount', 'driverFatestLapNumber', 'driverFastestLapTime', 'driverFastestLapSpeed', 'constructorName', 'constructorNationality', 'constructorChampionshipStandingPoints', 'constructorChampionshipStandingPosition', 'constructorChampionshipStandingWins', 'constructorRacePoints', 'driverDateOfBirth', 'driverNationality', 'driverChampionshipStandingPoints', 'driverChampionshipStandingPosition', 'driverChampionshipStandingWins', 'circuitName', 'circuitLocation', 'circuitCountry', 'lat', 'lng', 'alt', 'driverRaceResultStatus', 'driverName', 'driverAge', 'race_time']


Unnamed: 0,raceId,season,raceNumber,circuitId,prixName,raceDate,driverId,constructorId,driverStartGridPos,driverFinalGridPos,...,circuitName,circuitLocation,circuitCountry,lat,lng,alt,driverRaceResultStatus,driverName,driverAge,race_time
0,1,2009,1,1,Australian Grand Prix,2009-03-29,18,23,1,1.0,...,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,Finished,Jenson Button,45.0,01:34:15.784
1,1,2009,1,1,Australian Grand Prix,2009-03-29,22,23,2,2.0,...,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,Finished,Rubens Barrichello,53.0,01:34:16.591
2,1,2009,1,1,Australian Grand Prix,2009-03-29,15,7,20,3.0,...,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,Finished,Jarno Trulli,51.0,01:34:17.388
3,1,2009,1,1,Australian Grand Prix,2009-03-29,10,7,19,4.0,...,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,Finished,Timo Glock,43.0,01:34:20.219
4,1,2009,1,1,Australian Grand Prix,2009-03-29,4,4,10,5.0,...,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,Finished,Fernando Alonso,44.0,01:34:20.663


In [60]:
df = df.drop_duplicates()
# Handle outliers

# Remove invalid driverFinalRank (e.g., 0 or >20)
df = df[(df['driverFinalRank'] > 0) & (df['driverFinalRank'] <= 20)]

# Remove invalid grid positions (driverStartGridPos > 20)
df = df[df['driverStartGridPos'] <= 20]

# Remove rows where driverFastestLapTime is invalid
df = df[df['driverFastestLapTime'] != '0:00.000']

In [61]:
df.to_csv("cleaned_driver_data.csv", index=False)


# Feature Extraction

In [62]:
df = pd.read_csv("cleaned_driver_data.csv")

# Group by driver and season (if needed)
driver_group = df.groupby(['driverName', 'season'])

In [63]:
# --- Driver Performance Metrics ---
# Total races per driver
total_races = driver_group.size().reset_index(name='total_races')

# --- Driver Performance Metrics ---
# Total races per driver
total_races = driver_group.size().reset_index(name='total_races')

# Wins (driverFinalRank = 1)
wins = driver_group['driverFinalRank'].apply(lambda x: (x == 1).sum()).reset_index(name='wins')

# Podiums (driverFinalRank ≤ 3)
podiums = driver_group['driverFinalRank'].apply(lambda x: (x <= 3).sum()).reset_index(name='podiums')

avg_points = driver_group['driverRacePoints'].mean().reset_index(name='avg_points')

# Consistency (std of driverFinalRank)
consistency = driver_group['driverFinalRank'].std().reset_index(name='consistency')

# Qualifying vs Race Performance (driverStartGridPos - driverFinalGridPos)
qualifying_delta = driver_group.apply(lambda x: (x['driverStartGridPos'] - x['driverFinalGridPos']).mean()).reset_index(name='qualifying_delta')

# Lap Efficiency (avg driverFastestLapSpeed)
lap_efficiency = driver_group['driverFastestLapSpeed'].mean().reset_index(name='lap_efficiency')

# ... (previous code for wins, podiums, etc.)

# --- Constructor Strength ---
# Calculate constructor strength (avg constructorChampionshipStandingPoints)
constructor_strength = (
    df.groupby('constructorId')['constructorChampionshipStandingPoints']
    .mean()
    .reset_index(name='constructor_strength')
)

# Merge constructor strength into df with SUFFIXES to avoid conflicts
df = df.merge(constructor_strength, on='constructorId', suffixes=('', '_constructor'))

# --- Merge All Driver Metrics ---
driver_stats = (
    total_races
    .merge(wins, on=['driverName', 'season'])
    .merge(podiums, on=['driverName', 'season'])
    .merge(avg_points, on=['driverName', 'season'])
    .merge(consistency, on=['driverName', 'season'])
    .merge(qualifying_delta, on=['driverName', 'season'])
    .merge(lap_efficiency, on=['driverName', 'season'])
)

# Add constructor_strength (no need to merge again - already in driver_stats via df)
# Ensure each driver-season has 1 constructor_strength value
constructor_mapping = df[['driverName', 'season', 'constructor_strength']].drop_duplicates()
driver_stats = driver_stats.merge(constructor_mapping, on=['driverName', 'season'], how='left')

# Filter drivers with sufficient races (e.g., ≥10 races in a season)
driver_stats = driver_stats[driver_stats['total_races'] >= 10]
driver_stats

  qualifying_delta = driver_group.apply(lambda x: (x['driverStartGridPos'] - x['driverFinalGridPos']).mean()).reset_index(name='qualifying_delta')


Unnamed: 0,driverName,season,total_races,wins,podiums,avg_points,consistency,qualifying_delta,lap_efficiency,constructor_strength
3,Adrian Sutil,2008,16,0,0,0.000000,2.061553,-7.500000,197.972000,48.967005
4,Adrian Sutil,2009,17,0,0,0.294118,4.201715,-4.705882,201.857412,48.967005
5,Adrian Sutil,2010,17,0,0,2.764706,4.534184,-2.470588,199.594412,48.967005
6,Adrian Sutil,2011,18,0,0,2.333333,3.449448,0.222222,197.285722,48.967005
7,Adrian Sutil,2013,15,0,0,1.933333,4.434712,-0.666667,199.642467,48.967005
...,...,...,...,...,...,...,...,...,...,...
2683,Yuki Tsunoda,2024,24,0,0,1.208333,4.400263,-4.458333,213.041500,27.375000
2689,Zsolt Baumgartner,2004,18,0,0,0.055556,3.451342,-2.833333,203.968667,1.422460
2695,Éric Bernard,1990,10,0,0,0.400000,5.638164,-5.200000,204.894000,1.673913
2700,Érik Comas,1992,10,0,0,0.300000,5.186521,-9.000000,204.894000,12.144638


In [64]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Calculate win and podium ratios
driver_stats['win_ratio'] = driver_stats['wins'] / driver_stats['total_races']
driver_stats['podium_ratio'] = driver_stats['podiums'] / driver_stats['total_races']

# Columns to normalize (higher = better)
metrics = ['win_ratio', 'podium_ratio', 'avg_points', 'qualifying_delta', 'lap_efficiency', 'constructor_strength']

# Invert consistency (lower std = better), handle division by zero
driver_stats['consistency'] = 1 / (driver_stats['consistency'] + 1e-9) 

scaler = MinMaxScaler()
# Ensure the DataFrame slice includes only the columns to scale
driver_stats[metrics + ['consistency']] = scaler.fit_transform(driver_stats[metrics + ['consistency']])

# Rename consistency
driver_stats.rename(columns={'consistency': 'consistency_score'}, inplace=True)

# Recalculate composite score with weights summing to 1
weights = {
    'win_ratio': 0.25,
    'podium_ratio': 0.25,
    'avg_points': 0.15,
    'qualifying_delta': 0.15,
    'lap_efficiency': 0.1,
    'consistency_score': 0.1
}

driver_stats['composite_score'] = (
    driver_stats['win_ratio'] * weights['win_ratio'] +
    driver_stats['podium_ratio'] * weights['podium_ratio'] +
    driver_stats['avg_points'] * weights['avg_points'] +
    driver_stats['qualifying_delta'] * weights['qualifying_delta'] +
    driver_stats['lap_efficiency'] * weights['lap_efficiency'] +
    driver_stats['consistency_score'] * weights['consistency_score']
)

# Save updated profiles
driver_stats.to_csv("driver_profiles.csv", index=False)

In [65]:
driver_stats.head(10)

Unnamed: 0,driverName,season,total_races,wins,podiums,avg_points,consistency_score,qualifying_delta,lap_efficiency,constructor_strength,win_ratio,podium_ratio,composite_score
3,Adrian Sutil,2008,16,0,0,0.0,0.24611,0.541434,0.266178,0.180435,0.0,0.0,0.132444
4,Adrian Sutil,2009,17,0,0,0.012209,0.081585,0.635186,0.384447,0.180435,0.0,0.0,0.143712
5,Adrian Sutil,2010,17,0,0,0.114761,0.069965,0.710189,0.315563,0.180435,0.0,0.0,0.162295
6,Adrian Sutil,2011,18,0,0,0.096855,0.116147,0.800542,0.245288,0.180435,0.0,0.0,0.170753
7,Adrian Sutil,2013,15,0,0,0.080252,0.073259,0.770717,0.317025,0.180435,0.0,0.0,0.166674
8,Adrian Sutil,2014,16,0,0,0.0,0.17134,0.591764,0.190133,0.04881,0.0,0.0,0.124912
11,Aguri Suzuki,1990,10,0,1,0.024906,0.042831,0.652161,0.476878,0.006168,0.0,0.1,0.178531
13,Aguri Suzuki,1992,10,0,0,0.0,0.08158,0.860193,0.476878,0.009734,0.0,0.0,0.184875
21,Alain Prost,1981,11,3,6,0.162264,0.009876,0.356889,0.476878,0.169337,0.315789,0.545455,0.341859
22,Alain Prost,1982,15,2,4,0.094088,0.022612,0.283071,0.476878,0.169337,0.154386,0.266667,0.211786


# Optimization

In [66]:
import pandas as pd
from pulp import LpMaximize, LpProblem, LpVariable, lpSum

# Load driver profiles from Step 2
driver_stats = pd.read_csv("driver_profiles.csv")

# optimization parameters
BUDGET = 100_000_000 #Sample
MAX_DRIVERS = 2 # Maximum drivers in the lineup
MAX_DRIVERS_PER_CONSTRUCTOR = 1 # Max drivers from the same team

# Create a list of drivers
drivers = driver_stats.to_dict(orient="records")

In [67]:
# Initialize the problem
prob = LpProblem("BestTeamLineup", LpMaximize)

# Get unique driver names that exist in both dataframes
valid_drivers = set(df['driverName'].unique()) & set(driver['driverName'] for driver in drivers)

# Create binary variables for each valid driver (1 = selected, 0 = not selected)
driver_vars = {
    driver['driverName']: LpVariable(f"driver_{driver['driverName']}", 0, 1, cat="Binary")
    for driver in drivers if driver['driverName'] in valid_drivers
}

# Add objective function: Maximize total composite score
prob += lpSum(
    driver['composite_score'] * driver_vars[driver['driverName']]
    for driver in drivers if driver['driverName'] in valid_drivers
)

# Add constraints
# 1. Total number of drivers <= MAX_DRIVERS
prob += lpSum(driver_vars.values()) <= MAX_DRIVERS

# 2. Maximum drivers per constructor
constructor_groups = df.groupby('constructorId')
for constructor, group in constructor_groups:
    valid_group_drivers = set(group['driverName']) & valid_drivers
    if valid_group_drivers:
        prob += lpSum(
            driver_vars[driver_name] for driver_name in valid_group_drivers
        ) <= MAX_DRIVERS_PER_CONSTRUCTOR
prob.solve()
print("Status:", prob.status)

Status: 1


### Since Status is 1 It is Optimal

In [71]:
# Extract selected drivers
selected_drivers = []
for driver in drivers:
    if driver_vars[driver['driverName']].value() == 1:
        selected_drivers.append(driver)

# Create a DataFrame for the optimal lineup
optimal_lineup = pd.DataFrame(selected_drivers)[
    ['driverName', 'season', 'composite_score', 'win_ratio', 'podium_ratio']
]

optimal_lineup.sort_values('composite_score', ascending=False).head(2)



Unnamed: 0,driverName,season,composite_score,win_ratio,podium_ratio
13,Lewis Hamilton,2020,0.803986,0.796053,0.875
23,Sebastian Vettel,2013,0.777404,0.836257,0.888889


In [75]:
def greedy_team_selection(driver_stats, df, max_drivers=2, max_per_team=1):
    # Sort drivers by composite score (descending)
    drivers_sorted = driver_stats.sort_values('composite_score', ascending=False)
    
    selected = []
    team_counts = {}  # Track drivers per constructor
    
    for _, driver in drivers_sorted.iterrows():
        # Get the constructor for this driver from the original df
        driver_constructor = df[df['driverName'] == driver['driverName']]['constructorName'].iloc[0]
        
        # Skip if team is already full
        if team_counts.get(driver_constructor, 0) >= max_per_team:
            continue
        
        # Add driver to lineup
        selected.append(driver)
        team_counts[driver_constructor] = team_counts.get(driver_constructor, 0) + 1
        
        # Stop when lineup is full
        if len(selected) >= max_drivers:
            break
    
    return pd.DataFrame(selected)

# Run greedy algorithm
greedy_lineup = greedy_team_selection(driver_stats, df)
print("\nGreedy Lineup:")
print(greedy_lineup[['driverName', 'season', 'composite_score']])


Greedy Lineup:
         driverName  season  composite_score
656  Max Verstappen    2023         0.912693
594  Lewis Hamilton    2020         0.803986


# Results Analysis: Greedy Algorithm vs Linear Programming

The greedy algorithm demonstrated better performance compared to linear programming, producing a more optimal driver lineup with higher composite scores. This suggests that the simplified, step-by-step approach of the greedy algorithm is more effective for this specific team selection problem.

# Analysis of Team Selection Algorithm

## Algorithm Overview

The code implements a greedy team selection algorithm for Formula 1 drivers with the following characteristics:

### Key Parameters


In [None]:
max_drivers = 2      # Maximum number of drivers to select
max_per_team = 1     # Maximum drivers allowed per constructor



### Selection Criteria
- Uses a composite score to rank drivers
- Considers constructor (team) constraints
- Implements greedy selection strategy

## Code Structure Breakdown

### 1. Main Function


In [None]:
def greedy_team_selection(driver_stats, df, max_drivers=2, max_per_team=1):
    """
    Selects optimal driver lineup based on composite scores while respecting team constraints
    
    Parameters:
    - driver_stats: DataFrame with driver performance metrics
    - df: Original dataset with constructor information
    - max_drivers: Maximum drivers in lineup (default: 2)
    - max_per_team: Maximum drivers per constructor (default: 1)
    """



### 2. Selection Process
1. **Initial Sorting**
   ```python
   drivers_sorted = driver_stats.sort_values('composite_score', ascending=False)
   ```
   - Sorts drivers by composite score in descending order
   - Ensures best performers are considered first

2. **Team Tracking**
   ```python
   team_counts = {}  # Dictionary to track drivers per constructor
   ```
   - Maintains constructor quota compliance
   - Prevents over-selection from same team

3. **Driver Selection**
   - Iterates through sorted drivers
   - Checks team constraints
   - Adds qualified drivers to lineup

## Key Features

### Advantages
1. **Efficiency**: O(n log n) complexity due to sorting
2. **Constraint Handling**: Maintains team balance
3. **Flexibility**: Adjustable parameters for different scenarios


## Usage Example



In [None]:
# Example usage
greedy_lineup = greedy_team_selection(driver_stats, df)
print("\nGreedy Lineup:")
print(greedy_lineup[['driverName', 'season', 'composite_score']])