In [None]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Data Access
# Folder holding the 2024 race CSV exports. Kept in one place so the notebook
# can be pointed at another machine/location by editing a single line.
DATA_DIR = "C:/Users/yunus/Downloads/Dataset-f1/ocean_data_challenge_F1_data/races_2024"

car_data = pd.read_csv(f"{DATA_DIR}/car_data_2024.csv")
circuit_data = pd.read_csv(f"{DATA_DIR}/circuit_2024.csv")
control_message_data = pd.read_csv(f"{DATA_DIR}/control_message_2024.csv")
# low_memory=False: read the lap file in one pass instead of chunk-wise dtype inference
lap_data = pd.read_csv(f"{DATA_DIR}/lap_2024.csv", low_memory=False)
position_data = pd.read_csv(f"{DATA_DIR}/position_2024.csv")
result_data = pd.read_csv(f"{DATA_DIR}/result_2024.csv")
session_status_data = pd.read_csv(f"{DATA_DIR}/session_status_2024.csv")
track_status_data = pd.read_csv(f"{DATA_DIR}/track_status_2024.csv")
weather_data = pd.read_csv(f"{DATA_DIR}/weather_2024.csv")

In [None]:
# 1.Car_Data
# Structural summary: info() prints dtypes / non-null counts and returns None,
# so it is called on its own instead of being packed into a displayed tuple
# (which would show a stray None and plain-text frames).
car_data.info()
# First rows plus numeric and text-column summaries, rendered with the
# notebook's rich display (display is the IPython/Jupyter built-in).
display(car_data.head(), car_data.describe(), car_data.describe(include='object'))

In [None]:
# Data cleaning and preparation: missing-value overview for car_data.
# The boolean null mask is drawn as a heatmap (rows x columns), without a
# colour bar since the mask only has two states.
sns.heatmap(data=car_data.isna(), cmap='viridis', cbar=False)

In [None]:
# Duplicate entries: count rows that repeat an earlier row in every column.
# This cell only reports the count; the removal below is left disabled.
car_data.duplicated(keep='first').sum()
# car_data.drop_duplicates()

In [None]:
# Categorical analysis: number of car_data rows per event (horizontal bars)
sns.countplot(data=car_data, y='EventName')
plt.show()

In [None]:
# Categorical analysis: number of car_data rows per driver (horizontal bars)
sns.countplot(data=car_data, y='DriverName')
plt.show()

In [None]:
# DriverName vs. EventName
# Row counts of car_data for every (event, driver) combination
crosstab = pd.crosstab(car_data['EventName'], car_data['DriverName'])

# DataFrame.plot opens its own figure when no `ax` is passed, so a bare
# plt.figure(figsize=...) beforehand would be left blank and its size ignored.
# Create the axes explicitly and hand them to pandas instead.
fig, ax = plt.subplots(figsize=(12, 8))
crosstab.plot(kind='bar', stacked=True, ax=ax, color=sns.color_palette("Paired"))

# Customize the plot
ax.set_title('Driver Performance Across Events', fontsize=16)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Event Name', fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)  # Rotate x-axis labels for readability

# Place the legend outside the plot
ax.legend(title='Driver Name', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout to avoid cutting off labels, then render
fig.tight_layout()
plt.show()

In [None]:
# DriverName vs. EventName
# Row counts of car_data for every (event, driver) combination
crosstab = pd.crosstab(car_data['EventName'], car_data['DriverName'])

# Explicit axes, slightly larger for readability
fig, ax = plt.subplots(figsize=(14, 8))
# width=0.9 is given to the plot call so pandas centres the wider bars on the
# ticks; widening the patches afterwards with set_width() keeps their left
# edge fixed and shifts every bar to the right of its tick.
crosstab.plot(kind='bar', stacked=True, ax=ax, width=0.9, color=sns.color_palette("tab20"))

# Customize the plot
ax.set_title('Driver Distribution Across Events', fontsize=16)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Event Name', fontsize=12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)  # Rotate x-axis labels for readability

# Horizontal gridlines only
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Place the legend outside the plot and reduce font size
ax.legend(title='Driver Name', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)

# Label each stacked segment with its count, centred inside the segment
for container in ax.containers:
    ax.bar_label(container, fmt='%d', label_type='center', fontsize=8)

# Adjust layout to avoid cutting off labels
plt.tight_layout()
plt.show()

In [None]:
# Distribution analysis: 50-bin histograms of the selected telemetry channels
car_data.hist(column=['RPM', 'Speed', 'Throttle', 'DRS'], figsize=(15, 6), bins=50)
plt.show()

In [None]:
# Time series analysis
# Parse the Date column into pandas datetimes so it can be grouped/plotted as time
car_data['Date'] = car_data['Date'].pipe(pd.to_datetime)

In [None]:
# Time series of mean RPM and Speed per timestamp
sns.set(style="whitegrid")

# Average each metric over all rows that share the same Date value
# ('Throttle' and 'DRS' are deliberately left out)
time_series_data = car_data.groupby('Date')[['RPM', 'Speed']].mean()

fig, ax = plt.subplots(figsize=(14, 8))

# One line per metric column
for metric_name, metric_series in time_series_data.items():
    ax.plot(time_series_data.index, metric_series, label=metric_name, linewidth=2)

ax.set_title('Time Series of Key Metrics Over Time', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Average Value', fontsize=12)
ax.legend(title='Metrics')

# Slanted date labels and a visible grid
plt.xticks(rotation=45)
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Speed vs. DriverName: one box per driver over all events
sns.boxplot(data=car_data, y='Speed', x='DriverName')
plt.xticks(rotation=90)  # vertical driver names
plt.show()

In [None]:
# Speed distribution per event, events ordered by their median speed
sns.set(style="whitegrid")  # Clean background
plt.figure(figsize=(12, 8))  # Larger figure

# Events from slowest to fastest median speed
order = car_data.groupby('EventName')['Speed'].median().sort_values().index

# seaborn >= 0.13 deprecates `palette` without `hue`; colouring by the x
# variable with legend=False gives the same picture without the warning.
# hue_order=order keeps the coolwarm gradient aligned with the x ordering.
sns.boxplot(
    x='EventName', y='Speed', data=car_data,
    order=order, hue='EventName', hue_order=order,
    palette="coolwarm", legend=False,
)

# Add title and labels
plt.title('Speed Distribution by Event', fontsize=16)
plt.xlabel('Event Name', fontsize=12)
plt.ylabel('Speed (km/h)', fontsize=12)

# Improve xticks readability
plt.xticks(rotation=45, ha='right', fontsize=10)

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
# Driver vs Event vs Speed: one boxplot figure per event, one box per driver
events = car_data['EventName'].unique()
drivers = car_data['DriverName'].unique()

# Set3 colour list with as many entries as there are driver names in the data
palette = sns.color_palette("Set3", n_colors=len(drivers))

for event in events:
    # Rows belonging to the current event only
    event_data = car_data.loc[car_data['EventName'] == event]

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(
        data=event_data, x='DriverName', y='Speed',
        hue='DriverName', palette=palette, legend=False, ax=ax,
    )

    # Vertical labels so driver names stay legible
    plt.xticks(rotation=90)

    ax.set_title(f'Speed Distribution for {event}', fontsize=16)
    ax.set_xlabel('Driver Name', fontsize=12)
    ax.set_ylabel('Speed', fontsize=12)

    fig.tight_layout()
    plt.show()

In [None]:
# Speed Distributions by Events, drivers ordered by their median speed

# Unique events and drivers present in the telemetry
events = car_data['EventName'].unique()
drivers = car_data['DriverName'].unique()

# One distinct colour per driver, shared by every figure. "husl" spaces
# len(drivers) hues evenly; a fixed qualitative palette such as Set3 (12
# colours) would repeat colours once there are more drivers than entries.
palette = sns.color_palette("husl", n_colors=len(drivers))
driver_palette = dict(zip(drivers, palette))  # Map each driver to a unique color

# Loop over each event and create a separate plot
for event in events:
    # Filter the data for the current event
    event_data = car_data[car_data['EventName'] == event]

    # Drivers from lowest to highest median speed in this event
    sorted_drivers = event_data.groupby('DriverName')['Speed'].median().sort_values().index

    # The ordering is passed to seaborn via `order=` rather than by writing a
    # Categorical back into the filtered slice: that assignment triggers
    # SettingWithCopyWarning and, with pandas 2.x in-place .loc semantics,
    # keeps the object dtype so the category order is silently dropped.
    plt.figure(figsize=(12, 8))
    sns.boxplot(
        x='DriverName', y='Speed', data=event_data,
        order=sorted_drivers,
        hue='DriverName', palette=driver_palette, legend=False,  # palette needs hue in seaborn >= 0.13
        dodge=False,
    )

    # Rotate x-axis labels for readability
    plt.xticks(rotation=90)

    # Add titles and labels
    plt.title(f'Speed Distribution for {event}', fontsize=16)
    plt.xlabel('Driver Name (Ordered by Speed)', fontsize=12)
    plt.ylabel('Speed', fontsize=12)

    # Display the plot
    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation of the telemetry channels, shown as an annotated heatmap
corr = car_data.loc[:, ['RPM', 'Speed', 'Throttle', 'Brake', 'DRS']].corr()
sns.heatmap(data=corr, cmap='coolwarm', annot=True)
plt.show()

In [None]:
# Feature Engineering: Acceleration Based on RPM and Speed Changes

# Parse both time columns as timedeltas (no-op if they already are)
car_data['SessionTime'] = pd.to_timedelta(car_data['SessionTime'])
car_data['Time'] = pd.to_timedelta(car_data['Time'])

# Samples must be in chronological order within each (event, driver) pair
# before row-to-row differences mean anything
group_keys = ['EventName', 'DriverName']
car_data = car_data.sort_values(by=group_keys + ['SessionTime'])

# Change in RPM and Speed between consecutive samples of the same driver/event;
# the first sample of every group has no predecessor and gets NaN
car_data['DeltaRPM'] = car_data.groupby(group_keys)['RPM'].diff()
car_data['DeltaSpeed'] = car_data.groupby(group_keys)['Speed'].diff()

# Elapsed time between consecutive samples, in seconds
delta_seconds = car_data.groupby(group_keys)['SessionTime'].diff().dt.total_seconds()

# Keep only strictly positive intervals: zero would divide by zero and a
# negative interval would flip the sign of the acceleration. Everything else
# becomes NaN (previously only exact zeros were masked).
car_data['DeltaTime'] = delta_seconds.where(delta_seconds > 0)

# Acceleration simplified as speed change over elapsed time
car_data['Acceleration'] = car_data['DeltaSpeed'] / car_data['DeltaTime']

# Undefined accelerations (group starts, masked intervals) are set to 0;
# note this pulls per-group averages slightly towards zero
car_data['Acceleration'] = car_data['Acceleration'].fillna(0)

In [None]:
# Acceleration vs. DriverName vs. EventName: one boxplot figure per event

events = car_data['EventName'].unique()

# Set3 colour list sized to the number of distinct driver names
palette = sns.color_palette("Set3", n_colors=len(car_data['DriverName'].unique()))

for event in events:
    # Rows belonging to the current event only
    event_data = car_data.loc[car_data['EventName'] == event]

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(
        data=event_data, x='DriverName', y='Acceleration',
        hue='DriverName', palette=palette, legend=False, ax=ax,
    )

    # Vertical labels so driver names stay legible
    plt.xticks(rotation=90)

    ax.set_title(f'Acceleration Distribution for {event}', fontsize=16)
    ax.set_xlabel('Driver Name', fontsize=12)
    ax.set_ylabel('Acceleration', fontsize=12)

    fig.tight_layout()
    plt.show()

In [None]:
# Event and Driver Comparisons
# Compare Metrics Across Events
# Mean speed, acceleration and RPM per event
event_comparison = car_data.groupby(['EventName']).agg({
    'Speed': 'mean',
    'Acceleration': 'mean',
    'RPM': 'mean'
}).reset_index()

# One ascending ordering per metric, so each panel reads low -> high
sorted_events_speed = event_comparison.sort_values(by='Speed')
sorted_events_acceleration = event_comparison.sort_values(by='Acceleration')
sorted_events_rpm = event_comparison.sort_values(by='RPM')

# Three stacked panels, one per metric
fig, ax = plt.subplots(3, 1, figsize=(14, 12))

# (metric column, sorted frame, panel title, y label) for each panel
event_panels = [
    ('Speed', sorted_events_speed, 'Average Speed by Event', 'Average Speed (km/h)'),
    ('Acceleration', sorted_events_acceleration, 'Average Acceleration by Event', 'Average Acceleration (km/h/s)'),
    ('RPM', sorted_events_rpm, 'Average RPM by Event', 'Average RPM'),
]

for panel_ax, (metric, panel_data, panel_title, panel_ylabel) in zip(ax, event_panels):
    # seaborn >= 0.13 deprecates `palette` without `hue`; colour by the x
    # variable and hide the redundant legend to keep the same appearance
    sns.barplot(
        x='EventName', y=metric, data=panel_data,
        hue='EventName', palette='viridis', legend=False, ax=panel_ax,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Event Name')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Compare Metrics Across Drivers
# Mean speed, acceleration and RPM per driver
driver_comparison = car_data.groupby(['DriverName']).agg({
    'Speed': 'mean',
    'Acceleration': 'mean',
    'RPM': 'mean'
}).reset_index()

# One ascending ordering per metric, so each panel reads low -> high
sorted_drivers_speed = driver_comparison.sort_values(by='Speed')
sorted_drivers_acceleration = driver_comparison.sort_values(by='Acceleration')
sorted_drivers_rpm = driver_comparison.sort_values(by='RPM')

# Three stacked panels, one per metric
fig, ax = plt.subplots(3, 1, figsize=(14, 12))

# (metric column, sorted frame, panel title, y label) for each panel
driver_panels = [
    ('Speed', sorted_drivers_speed, 'Average Speed by Driver', 'Average Speed (km/h)'),
    ('Acceleration', sorted_drivers_acceleration, 'Average Acceleration by Driver', 'Average Acceleration (km/h/s)'),
    ('RPM', sorted_drivers_rpm, 'Average RPM by Driver', 'Average RPM'),
]

for panel_ax, (metric, panel_data, panel_title, panel_ylabel) in zip(ax, driver_panels):
    # seaborn >= 0.13 deprecates `palette` without `hue`; colour by the x
    # variable and hide the redundant legend to keep the same appearance
    sns.barplot(
        x='DriverName', y=metric, data=panel_data,
        hue='DriverName', palette='viridis', legend=False, ax=panel_ax,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Driver Name')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# 2. Circuit_data
# Structural summary: info() prints dtypes / non-null counts and returns None,
# so it is called on its own instead of being packed into a displayed tuple.
circuit_data.info()
# First rows plus numeric and text-column summaries, rendered with the
# notebook's rich display (display is the IPython/Jupyter built-in).
display(circuit_data.head(), circuit_data.describe(), circuit_data.describe(include='object'))

In [None]:
# Data cleaning and preparation: missing-value overview for circuit_data.
# The boolean null mask is drawn as a heatmap (rows x columns), no colour bar.
sns.heatmap(data=circuit_data.isna(), cmap='viridis', cbar=False)

In [None]:
# Duplicate entries: count rows that repeat an earlier row in every column.
# This cell only reports the count; the removal below is left disabled.
circuit_data.duplicated(keep='first').sum()
# circuit_data.drop_duplicates()

In [None]:
# Categorical analysis: number of circuit_data rows per TrackMarker value
sns.countplot(data=circuit_data, y='TrackMarker')
plt.show()

In [None]:
# Distribution analysis: 50-bin histograms of marker number, angle and distance
circuit_data.hist(column=['Number', 'Angle', 'Distance'], figsize=(15, 6), bins=50)
plt.show()

In [None]:
# Visualize Track Layout: one figure per event, markers plotted at their X/Y

# Distinct events present in the circuit table (reused by later cells)
tracks = circuit_data['EventName'].unique()

for track in tracks:
    fig, ax = plt.subplots(figsize=(12, 8))
    track_data = circuit_data.loc[circuit_data['EventName'] == track]

    # Faint grey line joining the markers in row order
    ax.plot(track_data['X'], track_data['Y'], marker='', color='gray', alpha=0.5)

    # Marker type drives both colour and glyph
    sns.scatterplot(
        data=track_data, x='X', y='Y',
        hue='TrackMarker', style='TrackMarker',
        palette='tab10', s=100, alpha=0.8, ax=ax,
    )

    ax.set_title(f'Track Layout for {track}')
    ax.set_xlabel('X Position')
    ax.set_ylabel('Y Position')
    ax.legend(title='Track Marker', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Distance vs. Marker Number for Each Track (uses `tracks` from the layout cell)

for track in tracks:
    fig, ax = plt.subplots(figsize=(12, 6))
    track_data = circuit_data.loc[circuit_data['EventName'] == track]

    # Marker type drives both colour and glyph
    sns.scatterplot(
        data=track_data, x='Distance', y='Number',
        hue='TrackMarker', style='TrackMarker',
        palette='tab10', s=100, alpha=0.8, ax=ax,
    )

    ax.set_title(f'Distance vs. Marker Number for {track}')
    ax.set_xlabel('Distance from Start/Finish Line')
    ax.set_ylabel('Marker Number')
    ax.legend(title='Track Marker', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Angle Distribution with Track Labels (uses `tracks` from the layout cell)

for track in tracks:
    fig, ax = plt.subplots(figsize=(12, 6))
    track_data = circuit_data.loc[circuit_data['EventName'] == track]

    # Overlay one semi-transparent histogram per marker type
    for marker in track_data['TrackMarker'].unique():
        marker_angles = track_data.loc[track_data['TrackMarker'] == marker, 'Angle']
        ax.hist(marker_angles, bins=30, alpha=0.5, label=marker)

    ax.set_title(f'Angle Distribution for {track}')
    ax.set_xlabel('Angle (degrees)')
    ax.set_ylabel('Frequency')
    ax.legend(title='Track Marker')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# 3. Control_message_data
# Structural summary: info() prints dtypes / non-null counts and returns None,
# so it is called on its own instead of being packed into a displayed tuple.
control_message_data.info()
# First rows plus numeric and text-column summaries, rendered with the
# notebook's rich display (display is the IPython/Jupyter built-in).
display(
    control_message_data.head(),
    control_message_data.describe(),
    control_message_data.describe(include='object'),
)

In [None]:
# Data cleaning and preparation: missing-value overview for control_message_data.
# The boolean null mask is drawn as a heatmap (rows x columns), no colour bar.
sns.heatmap(data=control_message_data.isna(), cmap='viridis', cbar=False)

In [None]:
# Duplicate entries: count rows that repeat an earlier row in every column.
# This cell only reports the count; the removal below is left disabled.
control_message_data.duplicated(keep='first').sum()
# control_message_data.drop_duplicates()

In [None]:
# Categorical analysis: number of control messages per Category value
sns.countplot(data=control_message_data, y='Category')
plt.show()

In [None]:
# Distribution analysis: 50-bin histograms of sector, racing number and lap
control_message_data.hist(column=['Sector', 'RacingNumber', 'Lap'], figsize=(15, 6), bins=50)
plt.show()

In [None]:
# Event and Category Distribution

# Parse 'Time' as datetimes using the fixed 'YYYY-MM-DD HH:MM:SS' layout
control_message_data['Time'] = pd.to_datetime(control_message_data['Time'], format='%Y-%m-%d %H:%M:%S')

# Messages per event, fewest first
plt.figure(figsize=(14, 7))
event_counts = control_message_data.groupby('EventName').size().sort_values()
# seaborn >= 0.13 deprecates `palette` without `hue`; colour by the x values
# and hide the redundant legend to keep the same appearance
sns.barplot(
    x=event_counts.index, y=event_counts.values,
    hue=event_counts.index, palette='viridis', legend=False,
)
plt.title('Number of Messages by Event')
plt.xlabel('Event Name')
plt.ylabel('Number of Messages')
plt.xticks(rotation=90)
plt.show()

# Message counts per event, split by Category
plt.figure(figsize=(14, 10))
sns.countplot(data=control_message_data, x='EventName', hue='Category', palette='viridis')
plt.title('Message Categories by Event')
plt.xlabel('Event Name')
plt.ylabel('Number of Messages')
plt.xticks(rotation=90)
plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Message Scope distribution: number of messages per Scope value, most frequent first
scope_counts = control_message_data['Scope'].value_counts()
plt.figure(figsize=(8, 6))
# seaborn >= 0.13 deprecates `palette` without `hue`; colour by the x values
# and hide the redundant legend to keep the same appearance
sns.barplot(
    x=scope_counts.index, y=scope_counts.values,
    hue=scope_counts.index, palette='viridis', legend=False,
)
plt.title('Message Scope Distribution')
plt.xlabel('Scope')
plt.ylabel('Number of Messages')
plt.show()

In [None]:
# Scope distribution by Event: message counts per event, split by Scope
fig, ax = plt.subplots(figsize=(14, 10))
sns.countplot(data=control_message_data, x='EventName', hue='Scope', palette='viridis', ax=ax)
ax.set_title('Message Scope Distribution by Event')
ax.set_xlabel('Event Name')
ax.set_ylabel('Number of Messages')
plt.xticks(rotation=90)
# Legend moved outside the axes, to the right
ax.legend(title='Scope', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Flag and Status Analysis
# Two figures with the same layout: message counts per event split first by
# Flag, then by Status. Each entry is (hue column / legend title, figure title).
flag_status_panels = [
    ('Flag', 'Flag Distribution by Event'),
    ('Status', 'Status Distribution by Event'),
]

for hue_column, panel_title in flag_status_panels:
    fig, ax = plt.subplots(figsize=(14, 10))
    sns.countplot(data=control_message_data, x='EventName', hue=hue_column, palette='tab10', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Event Name')
    ax.set_ylabel('Number of Messages')
    plt.xticks(rotation=90)
    # Legend moved outside the axes, to the right
    ax.legend(title=hue_column, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

In [None]:
# Pit Related Messages vs. Events

# Messages whose text contains "pit" (case-insensitive substring match);
# rows with a missing Message are treated as non-matching
pit_stop_data = control_message_data[control_message_data['Message'].str.contains('pit', case=False, na=False)]

# Number of matching messages per event
pit_stop_counts = pit_stop_data.groupby('EventName').size().reset_index(name='PitStopCount')

# Visualization: pit-related message count by event
plt.figure(figsize=(12, 6))
# seaborn >= 0.13 deprecates `palette` without `hue`; colour by the x
# variable and hide the redundant legend to keep the same appearance
sns.barplot(
    data=pit_stop_counts, x='EventName', y='PitStopCount',
    hue='EventName', palette='tab20', legend=False,
)
plt.title('Total Number of Pit Related Messages by Event')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Number of Pit Related Messages')
plt.xlabel('Event Name')
plt.tight_layout()
plt.show()

In [None]:
# 4. Lap_data
# Structural summary: info() prints dtypes / non-null counts and returns None,
# so it is called on its own instead of being packed into a displayed tuple.
lap_data.info()
# First rows plus numeric and text-column summaries, rendered with the
# notebook's rich display (display is the IPython/Jupyter built-in).
display(lap_data.head(), lap_data.describe(), lap_data.describe(include='object'))

In [None]:
# Data cleaning and preparation: missing-value overview for lap_data.
# The boolean null mask is drawn as a heatmap; row tick labels are suppressed
# and no colour bar is drawn.

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data=lap_data.isna(), cmap='viridis', cbar=False, yticklabels=False, ax=ax)
ax.set_title('Missing Values Heatmap')
ax.set_xlabel('Columns')
ax.set_ylabel('Rows')
fig.tight_layout()
plt.show()

In [None]:
# Parse the lap / sector duration columns as timedeltas.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
timedelta_columns = [
    'LapTime',
    'Sector1Time', 'Sector2Time', 'Sector3Time',
    'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
]
for timedelta_column in timedelta_columns:
    lap_data[timedelta_column] = pd.to_timedelta(lap_data[timedelta_column], errors='coerce')

In [None]:
# Distribution of Lap Times

# Lap durations as float seconds so they can sit on a numeric axis
lap_data['LapTimeSeconds'] = lap_data['LapTime'].dt.total_seconds()

# One panel per event, three per row, each with its own x/y range
g = sns.FacetGrid(data=lap_data, col='EventName', col_wrap=3, height=4, sharex=False, sharey=False)

# Histogram + KDE in every panel, then shared axis labels and per-panel titles
(
    g.map(sns.histplot, 'LapTimeSeconds', bins=50, kde=True, color='blue')
    .set_axis_labels('Lap Time (seconds)', 'Frequency')
    .set_titles('{col_name}')
)

# Overall title placed slightly above the grid
plt.suptitle('Distribution of Lap Times by Event', y=1.03, fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Lap Time vs Lap Number for different drivers

# Force both plotting columns to numeric; unparseable values become NaN
for numeric_column in ('LapNumber', 'LapTimeSeconds'):
    lap_data[numeric_column] = pd.to_numeric(lap_data[numeric_column], errors='coerce')

# Distinct events in the lap table (reused by the following cells)
events = lap_data['EventName'].unique()

for event in events:
    event_data = lap_data.loc[lap_data['EventName'] == event]

    fig, ax = plt.subplots(figsize=(14, 8))

    # One line (with point markers) per driver in this event
    for driver in event_data['Driver'].unique():
        driver_laps = event_data.loc[event_data['Driver'] == driver]
        ax.plot(
            driver_laps['LapNumber'], driver_laps['LapTimeSeconds'],
            label=driver, linestyle='-', marker='o',
        )

    ax.set_title(f'Lap Time vs Lap Number (per Driver) - {event}')
    ax.set_xlabel('Lap Number')
    ax.set_ylabel('Lap Time (seconds)')
    ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Average Lap Time per Driver (uses `events` from the lap-time cell above)

for event in events:
    # Laps of this event that have a lap time
    event_data = lap_data[lap_data['EventName'] == event].dropna(subset=['LapTimeSeconds'])

    # Mean lap time per driver, fastest first
    average_lap_time_per_driver = event_data.groupby('Driver')['LapTimeSeconds'].mean().sort_values()

    # Create bar plot for the current event
    plt.figure(figsize=(10, 6))
    # seaborn >= 0.13 deprecates `palette` without `hue`; colour by the x
    # values and hide the redundant legend to keep the same appearance
    sns.barplot(
        x=average_lap_time_per_driver.index,
        y=average_lap_time_per_driver.values,
        hue=average_lap_time_per_driver.index,
        palette='coolwarm',
        legend=False,
    )

    # Add title and labels
    plt.title(f'Average Lap Time per Driver - {event}')
    plt.xlabel('Driver')
    plt.ylabel('Average Lap Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True)

    # Reference line: mean over all laps of the event (not the mean of driver means)
    overall_avg_lap_time = event_data['LapTimeSeconds'].mean()
    plt.axhline(overall_avg_lap_time, color='red', linestyle='--', label='Overall Average')
    plt.legend()

    # Show the plot
    plt.tight_layout()
    plt.show()

In [None]:
# Track Driver Positions Throughout the Race (uses `events` from the lap-time cell)

for event in events:
    # Laps of this event that have both a position and a lap number
    event_data = lap_data[lap_data['EventName'] == event].dropna(subset=['Position', 'LapNumber'])

    # Top 5 by finishing order. The previous min() picked the best position a
    # driver ever held, so anyone who briefly ran near the front qualified.
    # Here each driver is described by the laps they completed and the position
    # on their last recorded lap; most laps first, then best position.
    final_state = (
        event_data.sort_values('LapNumber')
        .groupby('Driver')
        .agg(LapsCompleted=('LapNumber', 'max'), FinalPosition=('Position', 'last'))
    )
    top_drivers = (
        final_state
        .sort_values(['LapsCompleted', 'FinalPosition'], ascending=[False, True])
        .head(5)
        .index
    )

    # Plot Driver Position Throughout the Race for the current event
    plt.figure(figsize=(12, 6))

    for driver in top_drivers:
        driver_laps = event_data[event_data['Driver'] == driver]
        plt.plot(driver_laps['LapNumber'], driver_laps['Position'], label=driver, linewidth=2, marker='o')

    # Add title and labels
    plt.title(f'Top 5 Drivers Position Throughout the Race: {event}')
    plt.xlabel('Lap Number')
    plt.ylabel('Position')
    plt.gca().invert_yaxis()  # Invert y-axis so that 1st position is at the top
    plt.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()  # keep the outside legend inside the rendered figure
    plt.show()

In [None]:
# Tire Compound Performance (uses `events` from the lap-time cell)

for event in events:
    # Laps of this event that have both a lap time and a compound
    event_data = lap_data[lap_data['EventName'] == event].dropna(subset=['LapTimeSeconds', 'Compound'])

    # Lap-time distribution per tire compound
    plt.figure(figsize=(10, 6))
    # seaborn >= 0.13 deprecates `palette` without `hue`; colour by the x
    # variable and hide the redundant legend to keep the same appearance
    sns.boxplot(
        x='Compound', y='LapTimeSeconds', data=event_data,
        hue='Compound', palette='Set3', legend=False,
    )

    # Add title and labels
    plt.title(f'Tire Compound Performance: Lap Time Distribution by Tire Compound - {event}')
    plt.xlabel('Tire Compound')
    plt.ylabel('Lap Time (seconds)')
    plt.grid(True)
    plt.show()

In [None]:
# Tire Compound Performance vs. Driver vs. Event
# Relies on lap_data['LapTimeSeconds'] and `events` prepared in earlier cells.

# One figure per event: lap-time boxes grouped by compound, split by driver
for event in events:
    event_data = lap_data.loc[lap_data['EventName'] == event]

    fig, ax = plt.subplots(figsize=(14, 8))
    # showfliers=False hides the outlier points of each box
    sns.boxplot(
        data=event_data, x='Compound', y='LapTimeSeconds', hue='Driver',
        palette='Set3', showfliers=False, ax=ax,
    )

    ax.set_title(f'Tire Compound Performance: Lap Time Distribution by Tire Compound for All Drivers in {event}')
    ax.set_xlabel('Tire Compound')
    ax.set_ylabel('Lap Time (seconds)')
    ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Tire Wear: lap time against tyre life, per event (uses `events` from earlier)
for event in events:
    # Laps of this event with tyre life, lap time and compound all present
    required_columns = ['TyreLife', 'LapTimeSeconds', 'Compound']
    event_data = lap_data.loc[lap_data['EventName'] == event].dropna(subset=required_columns)

    fig, ax = plt.subplots(figsize=(12, 6))
    # Colour encodes the compound, marker shape encodes FreshTyre
    sns.scatterplot(
        data=event_data, x='TyreLife', y='LapTimeSeconds',
        hue='Compound', style='FreshTyre',
        palette='Set1', alpha=0.7, ax=ax,
    )

    ax.set_title(f'Tire Wear: Lap Time vs Tyre Life by Tire Compound and Fresh Tyre for {event}')
    ax.set_xlabel('Tire Life (Laps)')
    ax.set_ylabel('Lap Time (seconds)')
    ax.legend(title='Tire Compound & Fresh Tyre', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Tire wear by driver: lap time against tyre life, per event

# Make sure LapTime is a timedelta (no-op if an earlier cell already converted it)
lap_data['LapTime'] = pd.to_timedelta(lap_data['LapTime'])

# Convert LapTime to seconds for easier plotting
lap_data['LapTimeSeconds'] = lap_data['LapTime'].dt.total_seconds()

# Get unique events
events = lap_data['EventName'].unique()

# Filled marker shapes handed out to compounds in order of appearance. The
# unfilled '+' is deliberately absent: seaborn refuses to mix filled and
# line-art markers in one style mapping.
compound_marker_shapes = ['o', 's', '^', 'D', 'v', 'p', '*', 'H', 'X']

# Loop through each event and plot tire wear analysis
for event in events:
    event_data = lap_data[lap_data['EventName'] == event]

    # Explicit compound -> marker mapping sized to the compounds actually used
    # in this event; a fixed-length list makes seaborn warn whenever its length
    # differs from the number of style levels.
    event_compounds = event_data['Compound'].dropna().unique()
    compound_markers = {
        compound: compound_marker_shapes[position % len(compound_marker_shapes)]
        for position, compound in enumerate(event_compounds)
    }

    plt.figure(figsize=(14, 8))
    sns.scatterplot(
        x='TyreLife',
        y='LapTimeSeconds',
        hue='Driver',          # colour per driver
        style='Compound',      # marker shape per compound
        size='FreshTyre',      # point size per FreshTyre value
        data=event_data,
        alpha=0.7,
        palette='tab10',
        markers=compound_markers,
        sizes=(50, 200)
    )

    # Add title and labels
    plt.title(f'Tire Wear: Lap Time vs Tire Life by Driver and Tire Compound for {event}')
    plt.xlabel('Tire Life (Laps)')
    plt.ylabel('Lap Time (seconds)')
    plt.legend(title='Driver & Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)

    # Show plot
    plt.tight_layout()
    plt.show()

In [None]:
# Sector Times Comparison
# Make sure each sector time is a timedelta (no-op if already converted) and
# add a float-seconds companion column named '<SectorNTime>Seconds'
for sector in ['Sector1Time', 'Sector2Time', 'Sector3Time']:
    lap_data[sector] = pd.to_timedelta(lap_data[sector])
    lap_data[f'{sector}Seconds'] = lap_data[sector].dt.total_seconds()

# Sector name prefixes (reused by the following cells)
sectors = ['Sector1', 'Sector2', 'Sector3']

# One figure per sector: distribution of that sector's time for every event
for sector in sectors:
    plt.figure(figsize=(12, 6))
    # seaborn >= 0.13 deprecates `palette` without `hue`; colour by the x
    # variable and hide the redundant legend to keep the same appearance
    sns.boxplot(
        x='EventName', y=f'{sector}TimeSeconds', data=lap_data,
        hue='EventName', palette='Set2', legend=False,
    )

    # Add title and labels
    plt.title(f'{sector} Performance: Time Distribution Across Events')
    plt.xlabel('Event')
    plt.ylabel('Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True)

    # Show plot
    plt.tight_layout()
    plt.show()

In [None]:
# Sector Times Comparison by Drivers
# One figure per sector (names come from `sectors` defined in the previous
# cell): sector-time boxes per event, split by driver
for sector in sectors:
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.boxplot(
        data=lap_data, x='EventName', y=f'{sector}TimeSeconds',
        hue='Driver', palette='Set2', ax=ax,
    )

    ax.set_title(f'{sector} Performance: Time Distribution Across Events by Driver')
    ax.set_xlabel('Event')
    ax.set_ylabel('Time (seconds)')
    plt.xticks(rotation=45)
    ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Session Time Trends
# Float-seconds companion column for each sector's session timestamp
for session_column in ('Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime'):
    lap_data[session_column + 'Seconds'] = lap_data[session_column].dt.total_seconds()

# One figure per sector: sector duration against the session time at which it
# was recorded, coloured by driver
for sector in ('Sector1', 'Sector2', 'Sector3'):
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.scatterplot(
        data=lap_data,
        x=f'{sector}SessionTimeSeconds', y=f'{sector}TimeSeconds',
        hue='Driver', palette='Set1', alpha=0.7, ax=ax,
    )

    ax.set_title(f'{sector} Time vs Session Time')
    ax.set_xlabel('Session Time (seconds)')
    ax.set_ylabel('Sector Time (seconds)')
    ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Comparison of Sector Times Across Session
# Long format: one row per (lap, sector) with the sector's time in seconds
melted_lap_data = lap_data.melt(id_vars=['EventName', 'Driver', 'LapNumber'],
                                value_vars=['Sector1TimeSeconds', 'Sector2TimeSeconds', 'Sector3TimeSeconds'],
                                var_name='Sector',
                                value_name='SectorTime')

plt.figure(figsize=(12, 6))
# seaborn >= 0.13 deprecates `palette` without `hue`; colour by the x
# variable and hide the redundant legend to keep the same appearance
sns.boxplot(
    x='Sector', y='SectorTime', data=melted_lap_data,
    hue='Sector', palette='Set2', legend=False,
)

# Add title and labels
plt.title('Sector Time Distribution Across All Sessions')
plt.xlabel('Sector')
plt.ylabel('Sector Time (seconds)')
plt.grid(True)

# Show plot
plt.tight_layout()
plt.show()

In [None]:
# Identify Key Moments
# Gap = difference between consecutive sector session times (in seconds).
# Needs the '<sector>SessionTimeSeconds' columns created in an earlier cell.
lap_data['Sector1_Sector2_Gap'] = (
    lap_data['Sector2SessionTimeSeconds'] - lap_data['Sector1SessionTimeSeconds']
)
lap_data['Sector2_Sector3_Gap'] = (
    lap_data['Sector3SessionTimeSeconds'] - lap_data['Sector2SessionTimeSeconds']
)

# Overlay both gap distributions on the same axes
fig, ax = plt.subplots(figsize=(12, 6))
for gap_column, gap_color, gap_label in (
    ('Sector1_Sector2_Gap', 'blue', 'Sector 1 to Sector 2'),
    ('Sector2_Sector3_Gap', 'red', 'Sector 2 to Sector 3'),
):
    sns.histplot(lap_data[gap_column], kde=True, bins=30, color=gap_color, label=gap_label, ax=ax)

ax.set_title('Time Gaps Between Sectors')
ax.set_xlabel('Time Gap (seconds)')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Time gap analysis
# Scatter of consecutive sector times against each other (1 vs 2, then 2 vs 3),
# coloured by event.
for first_sector, second_sector in ((1, 2), (2, 3)):
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.scatterplot(
        data=lap_data,
        x=f'Sector{first_sector}TimeSeconds',
        y=f'Sector{second_sector}TimeSeconds',
        hue='EventName',
        alpha=0.7,
        palette='Set1',
        ax=ax,
    )

    ax.set_title(f'Sector {first_sector} Time vs Sector {second_sector} Time')
    ax.set_xlabel(f'Sector {first_sector} Time (seconds)')
    ax.set_ylabel(f'Sector {second_sector} Time (seconds)')
    ax.legend(title='Event', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Correlation Between Sector Times and Lap Times
sector_times = ['Sector1TimeSeconds', 'Sector2TimeSeconds', 'Sector3TimeSeconds']
lap_times = 'LapTimeSeconds'

# Restrict to the three sector columns plus the lap-time column, then correlate
correlation_df = lap_data[[*sector_times, lap_times]]
correlation_matrix = correlation_df.corr()

# Only the sector-vs-lap-time column is of interest for the printout
sector_lap_corr = correlation_matrix.loc[sector_times, lap_times]

print("Correlation between Sector Times and Lap Times:")
print(sector_lap_corr)

# Full matrix as an annotated heatmap, colour scale centred on zero
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Between Sector Times and Lap Times')
plt.show()

In [None]:
# Scatter plots for each sector time vs lap time
# Relies on `sector_times` and `lap_times` from the correlation cell.
for sector in sector_times:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.scatterplot(
        data=lap_data,
        x=sector,
        y=lap_times,
        hue='EventName',
        palette='tab20',
        alpha=0.7,
        ax=ax,
    )

    ax.set_title(f'{sector} vs Lap Time')
    ax.set_xlabel(f'{sector} (seconds)')
    ax.set_ylabel('Lap Time (seconds)')
    ax.legend(title='Event', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Speed and Track Performance
# The four speed columns recorded per lap in lap_data
speed_columns = ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']

# Long format: one row per (lap, speed column), so a single box plot can
# compare all four. Note this overwrites the earlier `melted_lap_data`.
melted_lap_data = pd.melt(
    lap_data,
    id_vars=['EventName', 'Driver'],
    value_vars=speed_columns,
    var_name='TrackSection',
    value_name='Speed',
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=melted_lap_data, x='TrackSection', y='Speed', palette='Set2', ax=ax)

ax.set_title('Speed Distribution at Different Track Sections')
ax.set_xlabel('Track Section')
ax.set_ylabel('Speed (km/h)')
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Mean speed per (event, track section), pivoted so that each track section
# becomes a column and each event a row. Uses the speed-based melted_lap_data.
mean_speed_per_event = (
    melted_lap_data
    .groupby(['EventName', 'TrackSection'])['Speed']
    .mean()
    .unstack()
)

# Grouped bar chart: one cluster of bars per event
ax = mean_speed_per_event.plot(kind='bar', figsize=(14, 8), colormap='viridis')

ax.set_title('Average Speed at Different Track Sections by Event')
ax.set_xlabel('Event')
ax.set_ylabel('Average Speed (km/h)')
ax.legend(title='Track Section')
plt.xticks(rotation=45, ha='right')
ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Compare Speeds Across Drivers — single-event example (Bahrain Grand Prix)
event_name = 'Bahrain Grand Prix'  # Example event
event_data = lap_data.loc[lap_data['EventName'] == event_name]

# Long format: one row per (lap, speed column) for this event only
melted_event_data = pd.melt(
    event_data,
    id_vars=['Driver'],
    value_vars=speed_columns,
    var_name='TrackSection',
    value_name='Speed',
)

# One line per driver across the speed columns
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(
    data=melted_event_data,
    x='TrackSection',
    y='Speed',
    hue='Driver',
    marker='o',
    palette="tab20",
    ax=ax,
)

ax.set_title(f'Speed Comparison at Different Track Sections - {event_name}')
ax.set_xlabel('Track Section')
ax.set_ylabel('Speed (km/h)')
ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Compare Speeds Across Drivers — the same line plot, repeated for every event
events = lap_data['EventName'].unique()

for event_name in events:
    # Laps belonging to this event only
    event_data = lap_data.loc[lap_data['EventName'] == event_name]

    # Long format: one row per (lap, speed column)
    melted_event_data = pd.melt(
        event_data,
        id_vars=['Driver'],
        value_vars=speed_columns,
        var_name='TrackSection',
        value_name='Speed',
    )

    # One line per driver across the speed columns
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.lineplot(
        data=melted_event_data,
        x='TrackSection',
        y='Speed',
        hue='Driver',
        marker='o',
        palette="tab20",
        ax=ax,
    )

    ax.set_title(f'Speed Comparison at Different Track Sections - {event_name}')
    ax.set_xlabel('Track Section')
    ax.set_ylabel('Speed (km/h)')
    ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Speed vs Lap Time
# For every event, scatter each recorded speed against the lap time of the
# lap it was measured on, coloured by which speed column it came from.
speed_columns = ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']
events = lap_data['EventName'].unique()

for event_name in events:
    event_data = lap_data.loc[lap_data['EventName'] == event_name]

    # Long format, keeping the lap time alongside each speed reading
    melted_event_data = pd.melt(
        event_data,
        id_vars=['Driver', 'LapTimeSeconds'],
        value_vars=speed_columns,
        var_name='SpeedColumn',
        value_name='Speed',
    )

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(
        data=melted_event_data,
        x='Speed',
        y='LapTimeSeconds',
        hue='SpeedColumn',
        palette='Set1',
        alpha=0.7,
        ax=ax,
    )

    ax.set_title(f'Speed vs Lap Time - {event_name}')
    ax.set_xlabel('Speed (km/h)')
    ax.set_ylabel('Lap Time (seconds)')
    ax.legend(title='Speed Column')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Lap Times vs Track Status
# One box plot per event (uses `events` from the previous cell): lap-time
# distribution for each distinct TrackStatus value.
for event_name in events:
    event_data = lap_data.loc[lap_data['EventName'] == event_name]

    fig, ax = plt.subplots(figsize=(14, 6))
    sns.boxplot(data=event_data, x='TrackStatus', y='LapTimeSeconds', palette='Set3', ax=ax)

    ax.set_title(f'Lap Times vs Track Status - {event_name}')
    ax.set_xlabel('Track Status')
    ax.set_ylabel('Lap Time (seconds)')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Pit Stops
# Parse both pit timestamps as timedeltas; unparseable values become NaT.
for pit_column in ('PitInTime', 'PitOutTime'):
    lap_data[pit_column] = pd.to_timedelta(lap_data[pit_column], errors='coerce')

# Duration in seconds, computed row by row as pit-out minus pit-in.
# NOTE(review): this only yields a value when both timestamps sit on the same
# row; if pit-in and pit-out are recorded on different laps the result is NaN.
# Confirm against the dataset.
pit_time_delta = lap_data['PitOutTime'] - lap_data['PitInTime']
lap_data['PitStopDuration'] = pit_time_delta.dt.total_seconds()

In [None]:
# Visualize Lap Times Before and After Pit Stops

# Laps slower than this many seconds are excluded from the plot
lap_time_threshold = 200

# Take an explicit copy of the filtered rows: the new column below is then
# added to an independent frame rather than to a slice of lap_data, which
# avoids pandas' SettingWithCopyWarning and leaves lap_data untouched.
filtered_lap_data = lap_data[lap_data['LapTimeSeconds'] <= lap_time_threshold].copy()

# Label each lap by whether a pit-stop duration is recorded on that row
# ('After Pit Stop') or not ('Before Pit Stop').
# NOTE(review): the label depends entirely on PitStopDuration being non-null;
# confirm this matches the intended before/after definition.
filtered_lap_data['PitStopPhase'] = np.where(
    filtered_lap_data['PitStopDuration'].notna(),
    'After Pit Stop',
    'Before Pit Stop',
)

# Plot Lap Times Before and After Pit Stops
plt.figure(figsize=(12, 6))
sns.boxplot(x='PitStopPhase', y='LapTimeSeconds', data=filtered_lap_data, palette='Set3')

# Add title and labels
plt.title('Lap Times Before and After Pit Stops')
plt.xlabel('Pit Stop Phase')
plt.ylabel('Lap Time (seconds)')
plt.grid(True)

# Show plot
plt.tight_layout()
plt.show()

In [None]:
# Lap times by pit-stop phase, split by tire compound.
# Uses `filtered_lap_data` (with its 'PitStopPhase' column) from the previous cell.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(
    data=filtered_lap_data,
    x='PitStopPhase',
    y='LapTimeSeconds',
    hue='Compound',
    palette='Set1',
    ax=ax,
)

ax.set_title('Lap Times Before and After Pit Stops by Tire Compound')
ax.set_xlabel('Pit Stop Phase')
ax.set_ylabel('Lap Time (seconds)')
ax.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Lap Time Improvements After a Tire Change

# Work on a copy so lap_data itself is left untouched
lap_data_copy = lap_data.copy()

# A driver's laps must be ordered within each race. Sorting and grouping by
# Driver alone would interleave laps from different events, so "previous lap"
# comparisons would cross race boundaries.
lap_group_keys = ['EventName', 'Driver']
lap_data_copy = lap_data_copy.sort_values(by=lap_group_keys + ['LapNumber'])

# Flag laps whose compound differs from the same driver's previous lap in the
# same event. The first lap of each group has no previous lap, and laps with a
# missing compound cannot be compared, so neither counts as a tire change.
previous_compound = lap_data_copy.groupby(lap_group_keys)['Compound'].shift()
lap_data_copy['TireChangeAfterPit'] = (
    previous_compound.notna()
    & lap_data_copy['Compound'].notna()
    & (previous_compound != lap_data_copy['Compound'])
)

# Laps slower than this many seconds are excluded
lap_time_threshold = 200
lap_data_copy = lap_data_copy[lap_data_copy['LapTimeSeconds'] <= lap_time_threshold]

# Lap time of the previous remaining lap for the same driver in the same event
lap_data_copy['LapTimeBefore'] = lap_data_copy.groupby(lap_group_keys)['LapTimeSeconds'].shift(1)

# Rows without a previous lap (the first remaining lap of each group) are dropped
lap_data_copy = lap_data_copy.dropna(subset=['LapTimeBefore'])

# Positive improvement means this lap was faster than the previous one
lap_data_copy['LapTimeImprovement'] = lap_data_copy['LapTimeBefore'] - lap_data_copy['LapTimeSeconds']

# Laps with a recorded pit-out time (no tire-change condition is applied here)
post_pit_data = lap_data_copy[pd.notnull(lap_data_copy['PitOutTime'])]

# 1. Lap Time Improvement After Tire Change
plt.figure(figsize=(12, 6))
sns.boxplot(x='Compound', y='LapTimeImprovement', data=lap_data_copy[lap_data_copy['TireChangeAfterPit']], palette='Set2')

# Add title and labels
plt.title('Lap Time Improvement After Tire Change')
plt.xlabel('Tire Compound')
plt.ylabel('Lap Time Improvement (seconds)')
plt.grid(True)

# Show plot
plt.tight_layout()
plt.show()

# 2. Compound Performance After Pit Stops
plt.figure(figsize=(12, 6))
sns.boxplot(x='Compound', y='LapTimeSeconds', data=post_pit_data, palette='Set3')

# Add title and labels
plt.title('Compound Performance After Pit Stops')
plt.xlabel('Tire Compound')
plt.ylabel('Lap Time (seconds)')
plt.grid(True)

# Show plot
plt.tight_layout()
plt.show()

In [None]:
# Flag laps whose compound differs from the same driver's previous lap in the
# same event. Grouping by Driver alone would compare laps across different
# races. The first row of each group has no previous lap and rows with a
# missing compound cannot be compared, so neither is counted as a change.
previous_compound = lap_data_copy.groupby(['EventName', 'Driver'])['Compound'].shift()
lap_data_copy['TireChangeAfterPit'] = (
    previous_compound.notna()
    & lap_data_copy['Compound'].notna()
    & (previous_compound != lap_data_copy['Compound'])
)

# Keep only laps with a recorded pit-out time
pit_stop_data = lap_data_copy[pd.notnull(lap_data_copy['PitOutTime'])]

# Count of pit-out laps with / without a compound change, split by compound
plt.figure(figsize=(12, 6))
sns.countplot(x='TireChangeAfterPit', hue='Compound', data=pit_stop_data, palette='Set1')

# Add title and labels
plt.title('Tire Compound Changes After Pit Stops')
plt.xlabel('Tire Changed After Pit Stop')
plt.ylabel('Count of Pit Stops')
plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Show plot
plt.tight_layout()
plt.show()

In [None]:
# Speed Features Correlation Analysis
# NOTE: this cell works on `lap_data_copy` as left by the earlier tire-change
# cells (it is not re-copied from lap_data here) and adds/overwrites columns on it.

# Timedelta columns -> float seconds, stored as '<column>Seconds'
for time_column in ('LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time'):
    lap_data_copy[f'{time_column}Seconds'] = lap_data_copy[time_column].dt.total_seconds()

# Columns that take part in the correlation matrix
numeric_columns = [
    'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
    'LapTimeSeconds', 'Sector1TimeSeconds', 'Sector2TimeSeconds', 'Sector3TimeSeconds',
    'Position', 'TyreLife',
]

# Force numeric dtype; anything that cannot be parsed becomes NaN
for column in numeric_columns:
    lap_data_copy[column] = pd.to_numeric(lap_data_copy[column], errors='coerce')

# Keep only rows where every one of those columns has a value, then correlate
lap_data_clean = lap_data_copy.dropna(subset=numeric_columns)
corr_matrix = lap_data_clean[numeric_columns].corr()

# Annotated heatmap on the full [-1, 1] correlation scale
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Heatmap of Correlations Between Speed and Time Features')
fig.tight_layout()
plt.show()

In [None]:
# Stint Comparison by Driver and Event
# One box plot of lap time per stint for every (event, driver) pair that
# actually has usable laps.
events = lap_data['EventName'].unique()
drivers = lap_data['Driver'].unique()

# Loop through each event
for event in events:
    # Filter data for the specific event
    event_data = lap_data[lap_data['EventName'] == event]

    # Loop through each driver
    for driver in drivers:
        # Filter data for the specific driver
        driver_data = event_data[event_data['Driver'] == driver]

        # Drop laps without a lap time or stint; copy so the column added
        # below is not written onto a slice of lap_data
        driver_data = driver_data.dropna(subset=['LapTime', 'Stint']).copy()

        # `drivers` spans the whole dataset, so a driver may have no laps in
        # this event; skip instead of drawing an empty figure
        if driver_data.empty:
            continue

        # Calculate lap time (in seconds)
        driver_data['LapTimeSeconds'] = driver_data['LapTime'].dt.total_seconds()

        # Create a plot comparing lap times across stints for each driver
        plt.figure(figsize=(12, 6))
        sns.boxplot(x='Stint', y='LapTimeSeconds', data=driver_data, palette='Set2')

        # Add title and labels
        plt.title(f'Lap Times Comparison by Stint for {driver} - {event}')
        plt.xlabel('Stint')
        plt.ylabel('Lap Time (seconds)')
        plt.grid(True)

        # Show plot
        plt.tight_layout()
        plt.show()

In [None]:
# Lap Time Trends Across Stints
# Per event: lap time against stint number, one line per driver.
events = lap_data['EventName'].unique()

for event in events:
    event_data = lap_data.loc[lap_data['EventName'] == event]

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=event_data,
        x='Stint',
        y='LapTimeSeconds',
        hue='Driver',
        marker='o',
        palette='tab20',
        ax=ax,
    )

    ax.set_title(f'Lap Time Trends Across Stints - {event}')
    ax.set_xlabel('Stint Number')
    ax.set_ylabel('Lap Time (seconds)')
    ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Stint Performance Comparison by Driver
# Per event (using `events` from the previous cell): lap-time box plots per
# stint, with one box per driver inside each stint.
for event in events:
    event_data = lap_data.loc[lap_data['EventName'] == event]

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(
        data=event_data,
        x='Stint',
        y='LapTimeSeconds',
        hue='Driver',
        palette='Set1',
        ax=ax,
    )

    ax.set_title(f'Stint Performance Comparison by Driver - {event}')
    ax.set_xlabel('Stint Number')
    ax.set_ylabel('Lap Time (seconds)')
    ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Analyze if longer stints lead to worse performance

# Stint length = number of laps sharing the same (driver, event, stint);
# transform('count') writes that count onto every lap of the stint
stint_groups = lap_data.groupby(['Driver', 'EventName', 'Stint'])
lap_data['StintLength'] = stint_groups['LapNumber'].transform('count')

# Keep laps strictly under 200 seconds
filtered_data = lap_data.loc[lap_data['LapTimeSeconds'] < 200]

# Scatter of lap time against the length of the stint the lap belongs to
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=filtered_data,
    x='StintLength',
    y='LapTimeSeconds',
    hue='Driver',
    palette='tab20',
    alpha=0.7,
    ax=ax,
)

ax.set_title('Impact of Stint Length on Lap Times (Filtered)')
ax.set_xlabel('Stint Length (laps)')
ax.set_ylabel('Lap Time (seconds)')
ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')

ax.grid(True)
fig.tight_layout()

plt.show()

Exploratory Data Analysis (EDA)

Total Number of Pit Stops
For each driver and team in each race of the 2024 F1 season, determine the total number of pit stops made. Identify any patterns or trends in pit stop frequency.


In [None]:
# Total Number of Pit Stops for each driver and team

# A lap counts as a pit stop when it has a PitInTime
pit_stops = lap_data.dropna(subset=['PitInTime'])

# Number of pit-in laps per (event, driver, team)
pit_stop_counts = (
    pit_stops.groupby(['EventName', 'Driver', 'Team'])
    .size()  # size() counts the rows in each group, i.e. the number of pit stops
    .reset_index(name='TotalPitStops')
)

# Most frequent pit stoppers first
sorted_pit_stops = pit_stop_counts.sort_values(by='TotalPitStops', ascending=False)

# Plotting Total Pit Stops by Driver
plt.figure(figsize=(14, 7))
sns.barplot(data=sorted_pit_stops, x='EventName', y='TotalPitStops', hue='Driver', palette='tab20', dodge=True)

# Add labels and title for drivers
plt.title('Total Number of Pit Stops by Driver Across Events')
plt.xlabel('Event Name')
plt.ylabel('Total Pit Stops')
plt.xticks(rotation=45)
plt.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

# Sum the per-driver counts into one row per (event, team). Plotting the
# per-driver rows with hue='Team' would make barplot draw the MEAN of the
# drivers' counts (plus an error bar) rather than the team total.
# sort=False keeps the event order used by the driver plot above.
team_pit_stop_totals = (
    sorted_pit_stops
    .groupby(['EventName', 'Team'], sort=False, as_index=False)['TotalPitStops']
    .sum()
)

# Plotting Total Pit Stops by Team
plt.figure(figsize=(14, 7))
sns.barplot(data=team_pit_stop_totals, x='EventName', y='TotalPitStops', hue='Team', palette='tab10', dodge=True)

# Add labels and title for teams
plt.title('Total Number of Pit Stops by Team Across Events')
plt.xlabel('Event Name')
plt.ylabel('Total Pit Stops')
plt.xticks(rotation=45)
plt.legend(title='Team', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Total Number of Pit Stops for each driver and team (counted by pit exit)

# A lap counts as a pit stop when it has a PitOutTime. Only PitOutTime is
# checked here; PitInTime is not required to be present.
pit_stops = lap_data.dropna(subset=['PitOutTime'])

# Number of pit-out laps per (event, driver, team)
pit_stop_counts = (
    pit_stops.groupby(['EventName', 'Driver', 'Team'])
    .size()  # size() counts the rows in each group, i.e. the number of pit stops
    .reset_index(name='TotalPitStops')
)

# Most frequent pit stoppers first
sorted_pit_stops = pit_stop_counts.sort_values(by='TotalPitStops', ascending=False)

# Plotting Total Pit Stops by Driver
plt.figure(figsize=(14, 7))
sns.barplot(data=sorted_pit_stops, x='EventName', y='TotalPitStops', hue='Driver', palette='tab20', dodge=True)

# Add labels and title for drivers
plt.title('Total Number of Pit Stops by Driver Across Events')
plt.xlabel('Event Name')
plt.ylabel('Total Pit Stops')
plt.xticks(rotation=45)
plt.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

# Sum the per-driver counts into one row per (event, team). Plotting the
# per-driver rows with hue='Team' would make barplot draw the MEAN of the
# drivers' counts (plus an error bar) rather than the team total.
# sort=False keeps the event order used by the driver plot above.
team_pit_stop_totals = (
    sorted_pit_stops
    .groupby(['EventName', 'Team'], sort=False, as_index=False)['TotalPitStops']
    .sum()
)

# Plotting Total Pit Stops by Team
plt.figure(figsize=(14, 7))
sns.barplot(data=team_pit_stop_totals, x='EventName', y='TotalPitStops', hue='Team', palette='tab10', dodge=True)

# Add labels and title for teams
plt.title('Total Number of Pit Stops by Team Across Events')
plt.xlabel('Event Name')
plt.ylabel('Total Pit Stops')
plt.xticks(rotation=45)
plt.legend(title='Team', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Total Number of Pit Stops
# Reuses `pit_stop_counts` computed by the preceding pit-stop cell, so that
# cell must have been run first.

# Pit stops per (event, driver, team)
print(pit_stop_counts)

# Most frequent pit stoppers first
sorted_pit_stops = pit_stop_counts.sort_values(by='TotalPitStops', ascending=False)
print(sorted_pit_stops)

# The 40 lowest counts. A bare expression in the middle of a cell is
# evaluated and discarded, so it has to be printed explicitly.
print(sorted_pit_stops.tail(40))

# Plotting pit stop trends across events
# NOTE(review): with hue='Team' and one row per driver, each bar is the mean
# of that team's per-driver counts (with an error bar), not a team total.
plt.figure(figsize=(14, 7))
sns.barplot(data=sorted_pit_stops, x='EventName', y='TotalPitStops', hue='Team', dodge=True, palette='tab10')

# Add labels and title
plt.title('Total Number of Pit Stops by Driver and Team Across Events')
plt.xlabel('Event Name')
plt.ylabel('Total Pit Stops')
plt.xticks(rotation=45)
plt.legend(title='Team', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Total Number of Pit Stops for Each Driver by Event
# Reuses `pit_stop_counts` (one row per event/driver/team) from an earlier
# pit-stop cell and draws a separate bar chart for every event.
events = pit_stop_counts['EventName'].unique()

for event in events:
    event_data = pit_stop_counts.loc[pit_stop_counts['EventName'] == event]

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(
        data=event_data,
        x='Driver',
        y='TotalPitStops',
        hue='Team',
        palette='tab20',
        dodge=True,
        ax=ax,
    )

    ax.set_title(f'Total Number of Pit Stops by Driver - {event}')
    ax.set_xlabel('Driver')
    ax.set_ylabel('Total Pit Stops')
    plt.xticks(rotation=45)
    ax.legend(title='Team', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Tables
# Season-wide pit stop totals per driver and per team, built from `pit_stops`
# (the pit-stop rows selected in an earlier cell), most stops first.

# 1. Total Number of Pit Stops by Driver
driver_counts = pit_stops.groupby(['Driver']).size().reset_index(name='TotalPitStops')
driver_pit_stops = driver_counts.sort_values(by='TotalPitStops', ascending=False)

# 2. Total Number of Pit Stops by Team
team_counts = pit_stops.groupby(['Team']).size().reset_index(name='TotalPitStops')
team_pit_stops = team_counts.sort_values(by='TotalPitStops', ascending=False)

# Plain pandas printouts
print("Total Number of Pit Stops by Driver:")
print(driver_pit_stops)

print("\nTotal Number of Pit Stops by Team:")
print(team_pit_stops)

# The same two tables again, rendered as bordered text tables by tabulate
from tabulate import tabulate

print("\nFormatted Table for Drivers:")
print(tabulate(driver_pit_stops, headers='keys', tablefmt='pretty'))

print("\nFormatted Table for Teams:")
print(tabulate(team_pit_stops, headers='keys', tablefmt='pretty'))

Tire Compounds Used During the Race
Identify the tire compounds used by each driver in every race. Create visualizations to compare the tire compound strategies across different teams and drivers.

In [None]:
# Tire Compounds Used During the Race

# Event/driver/team/compound for every lap that has a compound recorded
tire_data = lap_data[['EventName', 'Driver', 'Team', 'Compound']].dropna(subset=['Compound'])

# Number of laps per (event, driver, team, compound)
compound_usage = (
    tire_data
    .groupby(['EventName', 'Driver', 'Team', 'Compound'])
    .size()
    .reset_index(name='Count')
)

print(compound_usage)

# Usage per event, coloured by driver
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=compound_usage, x='EventName', y='Count', hue='Driver', palette='Set2', dodge=True, ax=ax)

ax.set_title('Tire Compounds Used by Each Driver Across Events')
ax.set_xlabel('Event Name')
ax.set_ylabel('Number of Occurrences')
plt.xticks(rotation=45)
ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

fig.tight_layout()
plt.show()

# Usage per event, coloured by team
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=compound_usage, x='EventName', y='Count', hue='Team', palette='Set1', dodge=True, ax=ax)

ax.set_title('Tire Compounds Used by Each Team Across Events')
ax.set_xlabel('Event Name')
ax.set_ylabel('Number of Occurrences')
plt.xticks(rotation=45)
ax.legend(title='Team', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Lap counts per (event, driver, compound) — input for the driver-level plot
driver_compound_groups = lap_data.groupby(['EventName', 'Driver', 'Compound'])
compound_usage_driver = driver_compound_groups.size().reset_index(name='Count')

# Lap counts per (event, team, compound) — all drivers of a team are pooled
team_compound_groups = lap_data.groupby(['EventName', 'Team', 'Compound'])
compound_usage_team = team_compound_groups.size().reset_index(name='TotalCount')

# Driver-level counts per event, coloured by compound
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=compound_usage_driver, x='EventName', y='Count', hue='Compound', palette='Set2', dodge=True, ax=ax)

ax.set_title('Tire Compounds Used by Each Driver Across Events')
ax.set_xlabel('Event Name')
ax.set_ylabel('Number of Occurrences')
plt.xticks(rotation=45)
ax.legend(title='Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

fig.tight_layout()
plt.show()

# Team-level counts per event, coloured by compound
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=compound_usage_team, x='EventName', y='TotalCount', hue='Compound', palette='Set1', dodge=True, ax=ax)

ax.set_title('Tire Compounds Used by Each Team Across Events (Aggregated)')
ax.set_xlabel('Event Name')
ax.set_ylabel('Total Number of Occurrences')
plt.xticks(rotation=45)
ax.legend(title='Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Driver x event matrix of summed 'Count' values from `compound_usage`
# (all compounds pooled); combinations with no rows are filled with 0
compound_pivot_driver = pd.pivot_table(
    compound_usage,
    index='Driver',
    columns='EventName',
    values='Count',
    aggfunc='sum',
    fill_value=0,
)

# Heatmap for drivers, each cell annotated with its integer count
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(compound_pivot_driver, cmap='YlGnBu', annot=True, fmt="d", ax=ax)

ax.set_title('Heatmap of Tire Compound Usage by Driver Across Events')
ax.set_xlabel('Event Name')
ax.set_ylabel('Driver')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Same matrix at team level
compound_pivot_team = pd.pivot_table(
    compound_usage,
    index='Team',
    columns='EventName',
    values='Count',
    aggfunc='sum',
    fill_value=0,
)

# Heatmap for teams
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(compound_pivot_team, cmap='YlGnBu', annot=True, fmt="d", ax=ax)

ax.set_title('Heatmap of Tire Compound Usage by Team Across Events')
ax.set_xlabel('Event Name')
ax.set_ylabel('Team')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Text versions of the usage matrices: summed 'Count' from `compound_usage`
# per event (columns), by driver and then by team (rows); gaps filled with 0.
compound_pivot_driver = pd.pivot_table(
    compound_usage, index='Driver', columns='EventName',
    values='Count', aggfunc='sum', fill_value=0,
)
print(compound_pivot_driver)

compound_pivot_team = pd.pivot_table(
    compound_usage, index='Team', columns='EventName',
    values='Count', aggfunc='sum', fill_value=0,
)
print(compound_pivot_team)

In [None]:
# Tire Compounds Used During the Race — per event, by team

# Laps with a recorded compound, reduced to the identifying columns
compound_columns = ['EventName', 'Driver', 'Team', 'Compound']
compound_data = lap_data[compound_columns].dropna(subset=['Compound'])

events = compound_data['EventName'].unique()

# One count plot per event: laps per team, split by compound
for event in events:
    event_data = compound_data.loc[compound_data['EventName'] == event]

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=event_data, x='Team', hue='Compound', palette='Set2', ax=ax)

    ax.set_title(f'Tire Compounds Used by Each Team - {event}')
    ax.set_xlabel('Team')
    ax.set_ylabel('Count of Tire Compounds Used')
    ax.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Tire Compounds Used During the Race — per event, by driver

# Laps with a recorded compound, reduced to the identifying columns
compound_columns = ['EventName', 'Driver', 'Team', 'Compound']
compound_data = lap_data[compound_columns].dropna(subset=['Compound'])

events = compound_data['EventName'].unique()

# One count plot per event: laps per driver, split by compound
for event in events:
    event_data = compound_data.loc[compound_data['EventName'] == event]

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=event_data, x='Driver', hue='Compound', palette='Set2', ax=ax)

    ax.set_title(f'Tire Compounds Used by Each Driver - {event}')
    ax.set_xlabel('Driver')
    ax.set_ylabel('Count of Tire Compounds Used')
    ax.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Laps with a recorded compound, reduced to the identifying columns
compound_data = lap_data[['EventName', 'Driver', 'Team', 'Compound']].dropna(subset=['Compound'])

# Lap counts: rows are (event, driver), columns are compounds,
# plus a 'Total' row and column from margins=True
compound_summary = pd.crosstab(
    [compound_data['EventName'], compound_data['Driver']],
    compound_data['Compound'],
    margins=True,
    margins_name='Total',
)

print(compound_summary)

# (Optional) Save the summary table to a CSV file for further analysis
# compound_summary.to_csv('tire_compound_summary.csv')

In [None]:
# Rows that report a compound, reduced to the identifying columns
compound_data = lap_data.loc[
    lap_data['Compound'].notna(),
    ['EventName', 'Driver', 'Compound'],
]

# Frequency table: one row per (event, compound), one column per driver.
# margins=True appends a 'Total' row and a 'Total' column.
row_keys = [compound_data['EventName'], compound_data['Compound']]
compound_summary = pd.crosstab(
    index=row_keys,
    columns=compound_data['Driver'],
    margins=True,
    margins_name='Total',
)

print(compound_summary)

# To keep a copy on disk for further analysis:
# compound_summary.to_csv('tire_compound_summary_by_event_and_driver.csv')

Number of Laps Completed on Each Tire Compound


Analyze the number of laps completed by each driver on each tire compound during their stints. 
Use visuals to illustrate the relationship between stint length and tire compound usage.

In [None]:
# Number of Laps Completed on Each Tire Compound

# Stint length = number of rows with a LapNumber inside each
# (event, driver, compound, stint) group; the count is stored as 'StintLength'.
laps_per_compound = (
    lap_data.groupby(['EventName', 'Driver', 'Compound', 'Stint'])['LapNumber']
    .count()
    .reset_index(name='StintLength')
)

# Events in order of first appearance; one figure is drawn for each
events = laps_per_compound['EventName'].unique()

for event in events:
    event_data = laps_per_compound.loc[laps_per_compound['EventName'] == event]

    fig, ax = plt.subplots(figsize=(12, 6))

    # A driver with several stints on the same compound contributes several rows;
    # barplot shows their mean as the bar height.
    sns.barplot(
        data=event_data,
        x='Driver',
        y='StintLength',
        hue='Compound',
        dodge=True,
        palette='Set2',
        ax=ax,
    )

    ax.set(
        title=f'Relationship Between Stint Length and Tire Compound Usage - {event}',
        xlabel='Driver',
        ylabel='Stint Length (Laps)',
    )
    plt.xticks(rotation=45)
    ax.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Total laps recorded on each compound, per event.
# 'count' tallies the rows with a non-null LapNumber, so after the aggregation the
# 'LapNumber' column holds a lap count rather than a lap index.
laps_per_compound = (
    lap_data.groupby(['EventName', 'Compound'])
    .agg({'LapNumber': 'count'})
    .reset_index()  # Flatten the group keys back into ordinary columns
)

# Compounds on the x-axis, one bar per event within each compound
plt.figure(figsize=(14, 8))
sns.barplot(x='Compound', y='LapNumber', hue='EventName', data=laps_per_compound, palette='tab20')

# Add labels and title
plt.title('Total Lap Numbers for Each Tire Compound by Event', fontsize=16)
plt.xlabel('Tire Compound', fontsize=12)
plt.ylabel('Total Lap Numbers', fontsize=12)

# Display the legend outside the plot for better readability
plt.legend(title='Event', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.xticks(rotation=45)
plt.grid(True)

# tight_layout must run after every decoration (rotated ticks, legend) is in place:
# it sizes the axes from the artists that exist at the moment it is called.
plt.tight_layout()
plt.show()

In [None]:
# Lap count per (event, compound): number of rows with a non-null LapNumber.
# The count stays in the 'LapNumber' column.
laps_per_compound = (
    lap_data.groupby(['EventName', 'Compound'])['LapNumber']
    .count()
    .reset_index()
)

# Events on the x-axis, one bar per compound within each event
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    data=laps_per_compound,
    x='EventName',
    y='LapNumber',
    hue='Compound',
    palette='Set2',
    ax=ax,
)

ax.set_title('Total Lap Numbers for Each Tire Compound by Event', fontsize=16)
ax.set_xlabel('Event', fontsize=12)
ax.set_ylabel('Total Lap Numbers', fontsize=12)

# Legend sits outside the axes, to the right
ax.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')

# Event names are long, so tilt them
plt.xticks(rotation=45)
ax.grid(True)

# Final layout pass, then render
fig.tight_layout()
plt.show()

Average Lap Time per Stint and Delta Time per Tire Compound
Calculate the average lap time for each stint. Visualize the changes in lap times within each stint to understand how tire performance evolved over the course of a race.


In [None]:
# Average Lap Time per Stint and Delta Time per Tire Compound - Tables

# Mean lap time of every (event, driver, team, stint, compound) group
stint_avg_lap_time = (
    lap_data.groupby(['EventName', 'Driver', 'Team', 'Stint', 'Compound'])['LapTimeSeconds']
    .mean()
    .reset_index(name='AvgLapTimeSeconds')
)

# Lap-to-lap change in lap time inside each (event, driver, stint) group.
# The first row of every group has no predecessor and therefore gets NaN.
# NOTE: this adds/overwrites a 'DeltaTime' column on lap_data itself.
stint_groups = lap_data.groupby(['EventName', 'Driver', 'Stint'])
lap_data['DeltaTime'] = stint_groups['LapTimeSeconds'].diff()

# Events in order of first appearance; tables and two figures are produced for each
events = stint_avg_lap_time['EventName'].unique()

for event in events:
    event_data_avg = stint_avg_lap_time.loc[stint_avg_lap_time['EventName'] == event]
    event_data_delta = lap_data.loc[
        lap_data['EventName'].eq(event) & lap_data['DeltaTime'].notna()
    ]

    # Wide tables (rows: driver or team, columns: stint); '-' marks stints that did not occur
    driver_table = (
        event_data_avg.groupby(['Driver', 'Stint'])['AvgLapTimeSeconds']
        .mean()
        .unstack()
        .fillna('-')
    )
    team_table = (
        event_data_avg.groupby(['Team', 'Stint'])['AvgLapTimeSeconds']
        .mean()
        .unstack()
        .fillna('-')
    )

    print(f"\nAverage Lap Time per Stint for Drivers - {event}")
    print(driver_table)

    print(f"\nAverage Lap Time per Stint for Teams - {event}")
    print(team_table)

    # Figure 1: bar chart of the stint averages, coloured by compound
    fig_avg, ax_avg = plt.subplots(figsize=(12, 6))
    sns.barplot(
        data=event_data_avg,
        x='Driver',
        y='AvgLapTimeSeconds',
        hue='Compound',
        dodge=True,
        palette='Set2',
        ax=ax_avg,
    )
    ax_avg.set(
        title=f'Average Lap Time per Stint - {event}',
        xlabel='Driver',
        ylabel='Average Lap Time (seconds)',
    )
    plt.xticks(rotation=45)
    ax_avg.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax_avg.grid(True)
    fig_avg.tight_layout()
    plt.show()

    # Figure 2: lap-to-lap delta over the race; colour = compound, line style = driver
    fig_delta, ax_delta = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=event_data_delta,
        x='LapNumber',
        y='DeltaTime',
        hue='Compound',
        style='Driver',
        marker='o',
        palette='Set2',
        ax=ax_delta,
    )
    ax_delta.set(
        title=f'Delta Lap Time per Tire Compound - {event}',
        xlabel='Lap Number',
        ylabel='Delta Lap Time (seconds)',
    )
    plt.xticks(rotation=45)
    ax_delta.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax_delta.grid(True)
    fig_delta.tight_layout()
    plt.show()


In [None]:
# Average Lap Time per Stint Across All Events - Tables

# Mean lap time for each (driver, team, stint number, compound), pooled over every event
stint_avg_lap_time_all_events = (
    lap_data.groupby(['Driver', 'Team', 'Stint', 'Compound'])
    .agg({'LapTimeSeconds': 'mean'})  # Average lap time for each stint
    .reset_index()
    .rename(columns={'LapTimeSeconds': 'AvgLapTimeSeconds'})  # Rename for clarity
)

# Lap-to-lap change in lap time inside each stint.
# EventName MUST be part of the grouping key: stint numbers restart at every race, so
# grouping by (Driver, Stint) alone would subtract the last lap of stint N at one circuit
# from the first lap of stint N at the next circuit, producing meaningless deltas.
# The plots below still pool all events; only the differencing is done per event.
# NOTE: this adds/overwrites a 'DeltaTime' column on lap_data, which later cells read.
lap_data['DeltaTime'] = lap_data.groupby(['EventName', 'Driver', 'Stint'])['LapTimeSeconds'].diff()

# Wide tables (rows: driver or team, columns: stint number); '-' marks missing combinations
driver_stint_table_all_events = stint_avg_lap_time_all_events.groupby(['Driver', 'Stint'])['AvgLapTimeSeconds'].mean().unstack().fillna('-')
team_stint_table_all_events = stint_avg_lap_time_all_events.groupby(['Team', 'Stint'])['AvgLapTimeSeconds'].mean().unstack().fillna('-')

print("\nAverage Lap Time per Stint for Drivers Across All Events")
print(driver_stint_table_all_events)

print("\nAverage Lap Time per Stint for Teams Across All Events")
print(team_stint_table_all_events)

# Plot Average Lap Time per Stint by Driver Across All Events
plt.figure(figsize=(12, 6))
sns.barplot(data=stint_avg_lap_time_all_events, x='Driver', y='AvgLapTimeSeconds', hue='Compound', dodge=True, palette='Set2')
plt.title('Average Lap Time per Stint by Driver Across All Events')
plt.xlabel('Driver')
plt.ylabel('Average Lap Time (seconds)')
plt.xticks(rotation=45)
plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

# Plot Average Lap Time per Stint by Team Across All Events
plt.figure(figsize=(12, 6))
sns.barplot(data=stint_avg_lap_time_all_events, x='Team', y='AvgLapTimeSeconds', hue='Compound', dodge=True, palette='Set1')
plt.title('Average Lap Time per Stint by Team Across All Events')
plt.xlabel('Team')
plt.ylabel('Average Lap Time (seconds)')
plt.xticks(rotation=45)
plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

# Delta plot: drop the first lap of every stint, which has no predecessor (NaN delta)
lap_data_delta = lap_data[lap_data['DeltaTime'].notna()]

plt.figure(figsize=(12, 6))
sns.lineplot(data=lap_data_delta, x='LapNumber', y='DeltaTime', hue='Compound', style='Driver', marker='o', palette='Set2')
plt.title('Delta Lap Time per Tire Compound Across All Events')
plt.xlabel('Lap Number')
plt.ylabel('Delta Lap Time (seconds)')
plt.xticks(rotation=45)
plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Average lap time per (event, driver, compound), ignoring very slow laps

# Upper bound, in seconds, for a lap to be included; tune as needed
lap_time_threshold = 200

# Laps at or below the bound (rows with a missing lap time fail the comparison and drop out)
filtered_lap_data = lap_data.loc[lap_data['LapTimeSeconds'].le(lap_time_threshold)]

average_lap_time = (
    filtered_lap_data.groupby(['EventName', 'Driver', 'Compound'])['LapTimeSeconds']
    .mean()
    .reset_index()
)

# Events on the x-axis, compounds as hue. Each bar aggregates the per-driver means
# for that (event, compound) pair.
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    data=average_lap_time,
    x='EventName',
    y='LapTimeSeconds',
    hue='Compound',
    palette='Set2',
    ax=ax,
)

ax.set_title('Average Lap Time by Tire Compound for Each Event and Driver', fontsize=16)
ax.set_xlabel('Event', fontsize=12)
ax.set_ylabel('Average Lap Time (seconds)', fontsize=12)

# Legend sits outside the axes, to the right
ax.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')

# Event names are long; print them vertically
plt.xticks(rotation=90)
ax.grid(True)

# Final layout pass, then render
fig.tight_layout()
plt.show()

In [None]:
# Mean lap time per (event, stint number, compound); the mean stays in 'LapTimeSeconds'
stint_avg_lap_time = (
    lap_data.groupby(['EventName', 'Stint', 'Compound'])['LapTimeSeconds']
    .mean()
    .reset_index()
)

# Stint number on the x-axis, compounds as hue. Each bar pools the per-event means
# for that (stint, compound) pair; error bars are switched off.
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    data=stint_avg_lap_time,
    x='Stint',
    y='LapTimeSeconds',
    hue='Compound',
    palette='Set2',
    errorbar=None,
    ax=ax,
)

ax.set_title('Average Lap Time per Stint for Each Event', fontsize=16)
ax.set_xlabel('Stint Number', fontsize=12)
ax.set_ylabel('Average Lap Time (seconds)', fontsize=12)

# Legend sits outside the axes, to the right
ax.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.xticks(rotation=45)
ax.grid(True)

# Final layout pass, then render
fig.tight_layout()
plt.show()

In [None]:
# Mean lap-to-lap delta per (driver, team, compound), pooled over all events.
# Reads the 'DeltaTime' column that an earlier cell added to lap_data.
delta_time_per_compound = (
    lap_data.groupby(['Driver', 'Team', 'Compound'])['DeltaTime']
    .mean()
    .reset_index(name='AvgDeltaTime')
)

# Wide tables (rows: driver or team, columns: compound); '-' marks missing combinations
driver_delta_table_all_events = (
    delta_time_per_compound.groupby(['Driver', 'Compound'])['AvgDeltaTime']
    .mean()
    .unstack()
    .fillna('-')
)
team_delta_table_all_events = (
    delta_time_per_compound.groupby(['Team', 'Compound'])['AvgDeltaTime']
    .mean()
    .unstack()
    .fillna('-')
)

print("\nAverage Delta Time per Tire Compound for Drivers Across All Events")
print(driver_delta_table_all_events)

print("\nAverage Delta Time per Tire Compound for Teams Across All Events")
print(team_delta_table_all_events)

# Figure 1: drivers on the x-axis
fig_driver, ax_driver = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=delta_time_per_compound,
    x='Driver',
    y='AvgDeltaTime',
    hue='Compound',
    dodge=True,
    palette='Set2',
    ax=ax_driver,
)
ax_driver.set(
    title='Average Delta Time per Tire Compound by Driver Across All Events',
    xlabel='Driver',
    ylabel='Average Delta Time (seconds)',
)
plt.xticks(rotation=45)
ax_driver.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
ax_driver.grid(True)
fig_driver.tight_layout()
plt.show()

# Figure 2: teams on the x-axis (bars aggregate the team's driver rows)
fig_team, ax_team = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=delta_time_per_compound,
    x='Team',
    y='AvgDeltaTime',
    hue='Compound',
    dodge=True,
    palette='Set1',
    ax=ax_team,
)
ax_team.set(
    title='Average Delta Time per Tire Compound by Team Across All Events',
    xlabel='Team',
    ylabel='Average Delta Time (seconds)',
)
plt.xticks(rotation=45)
ax_team.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
ax_team.grid(True)
fig_team.tight_layout()
plt.show()


In [None]:
# Calculate delta time per tire compound for each driver and team across all events - DRY CONDITIONS
#
# Same analysis as the all-compound version, restricted to laps that were not run on
# INTERMEDIATE or WET tires. Every printed header and plot title carries a
# "(Dry Compounds Only)" suffix so this output cannot be mistaken for the unfiltered one.
# Reads the 'DeltaTime' column that an earlier cell added to lap_data.

# Exclude INTERMEDIATE and WET compounds
lap_data_dry = lap_data[~lap_data['Compound'].isin(['INTERMEDIATE', 'WET'])]


delta_time_per_compound = (
    lap_data_dry.groupby(['Driver', 'Team', 'Compound'])
    .agg({'DeltaTime': 'mean'})  # Average delta time for each driver and compound
    .reset_index()
    .rename(columns={'DeltaTime': 'AvgDeltaTime'})  # Rename for clarity
)

# Wide tables (rows: driver or team, columns: compound); '-' marks missing combinations
driver_delta_table_all_events = delta_time_per_compound.groupby(['Driver', 'Compound'])['AvgDeltaTime'].mean().unstack().fillna('-')
team_delta_table_all_events = delta_time_per_compound.groupby(['Team', 'Compound'])['AvgDeltaTime'].mean().unstack().fillna('-')

print("\nAverage Delta Time per Tire Compound for Drivers Across All Events (Dry Compounds Only)")
print(driver_delta_table_all_events)

print("\nAverage Delta Time per Tire Compound for Teams Across All Events (Dry Compounds Only)")
print(team_delta_table_all_events)

# Plot Average Delta Time per Tire Compound by Driver Across All Events
plt.figure(figsize=(12, 6))
sns.barplot(data=delta_time_per_compound, x='Driver', y='AvgDeltaTime', hue='Compound', dodge=True, palette='Set2')
plt.title('Average Delta Time per Tire Compound by Driver Across All Events (Dry Compounds Only)')
plt.xlabel('Driver')
plt.ylabel('Average Delta Time (seconds)')
plt.xticks(rotation=45)
plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

# Plot Average Delta Time per Tire Compound by Team Across All Events
plt.figure(figsize=(12, 6))
sns.barplot(data=delta_time_per_compound, x='Team', y='AvgDeltaTime', hue='Compound', dodge=True, palette='Set1')
plt.title('Average Delta Time per Tire Compound by Team Across All Events (Dry Compounds Only)')
plt.xlabel('Team')
plt.ylabel('Average Delta Time (seconds)')
plt.xticks(rotation=45)
plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()


Time Spent in Pits.
Measure the total time spent in the pits by each driver during the races. Create visual representations to show how pit time varies across drivers and teams.

In [None]:
# Time Spent in Pits - Tables

# Work on a copy so lap_data keeps its original PitInTime/PitOutTime columns
pit_data1 = lap_data.copy()

# Pair each row's PitInTime with the PitOutTime found on the same driver's next row.
# The shift is done per (event, driver): an ungrouped shift would hand a driver's last
# row the PitOutTime of whichever driver or event happens to follow in the file,
# inventing pit stops with arbitrary (even negative) durations.
# Assumes each driver's rows are already in lap order within an event - TODO confirm.
pit_data1['PitOutTime'] = pit_data1.groupby(['EventName', 'Driver'])['PitOutTime'].shift(-1)
pit_data1['PitInTime'] = pd.to_timedelta(pit_data1['PitInTime'], errors='coerce')
pit_data1['PitOutTime'] = pd.to_timedelta(pit_data1['PitOutTime'], errors='coerce')

# Calculate the time spent in pits (PitOutTime - PitInTime), in seconds
pit_data1['PitTime'] = (pit_data1['PitOutTime'] - pit_data1['PitInTime']).dt.total_seconds()

# Keep only rows on which a pit entry was recorded
pit_data1 = pit_data1.dropna(subset=['PitInTime'])

# Total pit time per driver per event. Pit entries without a matching exit have a
# NaN PitTime, which sum() skips.
total_pit_time = pit_data1.groupby(['EventName', 'Driver', 'Team'])['PitTime'].sum().reset_index()

# Longest total pit times first
sorted_pit_times_driver = total_pit_time.sort_values(by='PitTime', ascending=False)

# Aggregate by team to get total pit time per team
sorted_pit_times_team = total_pit_time.groupby(['EventName', 'Team'])['PitTime'].sum().reset_index()

# Tables and bar charts for every event, by driver and by team
for event in total_pit_time['EventName'].unique():
    driver_table = sorted_pit_times_driver[sorted_pit_times_driver['EventName'] == event].pivot_table(
        index='Driver', values='PitTime', aggfunc='sum'
    ).fillna('-')
    
    team_table = sorted_pit_times_team[sorted_pit_times_team['EventName'] == event].pivot_table(
        index='Team', values='PitTime', aggfunc='sum'
    ).fillna('-')
    
    print(f"\nTotal Pit Time by Driver - {event}")
    print(driver_table)

    print(f"\nTotal Pit Time by Team - {event}")
    print(team_table)

    # Plotting Total Pit Time by Driver
    plt.figure(figsize=(14, 7))
    sns.barplot(data=sorted_pit_times_driver[sorted_pit_times_driver['EventName'] == event],
                x='Driver', y='PitTime', palette='tab20')
    plt.title(f'Total Pit Time by Driver - {event}')
    plt.xlabel('Driver')
    plt.ylabel('Total Pit Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Plotting Total Pit Time by Team
    plt.figure(figsize=(14, 7))
    sns.barplot(data=sorted_pit_times_team[sorted_pit_times_team['EventName'] == event],
                x='Team', y='PitTime', palette='Set1')
    plt.title(f'Total Pit Time by Team - {event}')
    plt.xlabel('Team')
    plt.ylabel('Total Pit Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Time Spent in Pits - Tables with Averages

# Work on a copy so lap_data keeps its original PitInTime/PitOutTime columns
pit_data1 = lap_data.copy()

# Pair each row's PitInTime with the PitOutTime found on the same driver's next row.
# The shift is done per (event, driver): an ungrouped shift would hand a driver's last
# row the PitOutTime of whichever driver or event happens to follow in the file,
# inventing pit stops with arbitrary (even negative) durations.
# Assumes each driver's rows are already in lap order within an event - TODO confirm.
pit_data1['PitOutTime'] = pit_data1.groupby(['EventName', 'Driver'])['PitOutTime'].shift(-1)
pit_data1['PitInTime'] = pd.to_timedelta(pit_data1['PitInTime'], errors='coerce')
pit_data1['PitOutTime'] = pd.to_timedelta(pit_data1['PitOutTime'], errors='coerce')

# Calculate the time spent in pits (PitOutTime - PitInTime), in seconds
pit_data1['PitTime'] = (pit_data1['PitOutTime'] - pit_data1['PitInTime']).dt.total_seconds()

# Keep only rows on which a pit entry was recorded
pit_data1 = pit_data1.dropna(subset=['PitInTime'])

# Per driver per event: total pit time, mean pit time and number of timed stops.
# 'count' ignores NaN, so pit entries without a matching exit are not counted as stops.
total_pit_time = pit_data1.groupby(['EventName', 'Driver', 'Team'])['PitTime'].agg(['sum', 'mean', 'count']).reset_index()
total_pit_time.rename(columns={'sum': 'TotalPitTime', 'mean': 'AvgPitTime', 'count': 'PitStops'}, inplace=True)

# Longest total pit times first
sorted_pit_times_driver = total_pit_time.sort_values(by='TotalPitTime', ascending=False)

# Team level: totals and stop counts are summed; AvgPitTime is the mean of the
# drivers' averages (not weighted by each driver's number of stops)
sorted_pit_times_team = total_pit_time.groupby(['EventName', 'Team']).agg(
    {'TotalPitTime': 'sum', 'AvgPitTime': 'mean', 'PitStops': 'sum'}
).reset_index()

# Tables and bar charts for every event, by driver and by team
for event in total_pit_time['EventName'].unique():
    driver_table = sorted_pit_times_driver[sorted_pit_times_driver['EventName'] == event].pivot_table(
        index='Driver', values=['TotalPitTime', 'AvgPitTime', 'PitStops'], aggfunc='sum'
    ).fillna('-')
    
    team_table = sorted_pit_times_team[sorted_pit_times_team['EventName'] == event].pivot_table(
        index='Team', values=['TotalPitTime', 'AvgPitTime', 'PitStops'], aggfunc='sum'
    ).fillna('-')
    
    print(f"\nTotal and Average Pit Time by Driver - {event}")
    print(driver_table)

    print(f"\nTotal and Average Pit Time by Team - {event}")
    print(team_table)

    # Plotting Total Pit Time by Driver
    plt.figure(figsize=(14, 7))
    sns.barplot(data=sorted_pit_times_driver[sorted_pit_times_driver['EventName'] == event],
                x='Driver', y='TotalPitTime', palette='tab20')
    plt.title(f'Total Pit Time by Driver - {event}')
    plt.xlabel('Driver')
    plt.ylabel('Total Pit Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Plotting Average Pit Time by Driver
    plt.figure(figsize=(14, 7))
    sns.barplot(data=sorted_pit_times_driver[sorted_pit_times_driver['EventName'] == event],
                x='Driver', y='AvgPitTime', palette='tab20')
    plt.title(f'Average Pit Time by Driver - {event}')
    plt.xlabel('Driver')
    plt.ylabel('Average Pit Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Plotting Total Pit Time by Team
    plt.figure(figsize=(14, 7))
    sns.barplot(data=sorted_pit_times_team[sorted_pit_times_team['EventName'] == event],
                x='Team', y='TotalPitTime', palette='Set1')
    plt.title(f'Total Pit Time by Team - {event}')
    plt.xlabel('Team')
    plt.ylabel('Total Pit Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Plotting Average Pit Time by Team
    plt.figure(figsize=(14, 7))
    sns.barplot(data=sorted_pit_times_team[sorted_pit_times_team['EventName'] == event],
                x='Team', y='AvgPitTime', palette='Set1')
    plt.title(f'Average Pit Time by Team - {event}')
    plt.xlabel('Team')
    plt.ylabel('Average Pit Time (seconds)')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Time Spent in Pits - Across All Events (Total and Average)

# Work on a copy so lap_data keeps its original PitInTime/PitOutTime columns
pit_data1 = lap_data.copy()

# Pair each row's PitInTime with the PitOutTime found on the same driver's next row.
# The shift is done per (event, driver): an ungrouped shift would hand a driver's last
# row the PitOutTime of whichever driver or event happens to follow in the file,
# inventing pit stops with arbitrary (even negative) durations. Negative values would
# pass the "< 100" filter below and distort the totals.
# Assumes each driver's rows are already in lap order within an event - TODO confirm.
pit_data1['PitOutTime'] = pit_data1.groupby(['EventName', 'Driver'])['PitOutTime'].shift(-1)
pit_data1['PitInTime'] = pd.to_timedelta(pit_data1['PitInTime'], errors='coerce')
pit_data1['PitOutTime'] = pd.to_timedelta(pit_data1['PitOutTime'], errors='coerce')

# Calculate the time spent in pits (PitOutTime - PitInTime), in seconds
pit_data1['PitTime'] = (pit_data1['PitOutTime'] - pit_data1['PitInTime']).dt.total_seconds()

# Keep only rows on which a pit entry was recorded
pit_data1 = pit_data1.dropna(subset=['PitInTime'])

# Keep stops shorter than 100 seconds; rows with a NaN PitTime fail the comparison
# and are dropped as well
pit_data1 = pit_data1[pit_data1['PitTime'] < 100]

# Per (driver, team) over all events: total pit time, mean pit time and number of stops
total_pit_time_all_events = pit_data1.groupby(['Driver', 'Team'])['PitTime'].agg(['sum', 'mean', 'count']).reset_index()
total_pit_time_all_events.rename(columns={'sum': 'TotalPitTime', 'mean': 'AvgPitTime', 'count': 'PitStops'}, inplace=True)

# Longest total pit times first
sorted_pit_times_driver_all_events = total_pit_time_all_events.sort_values(by='TotalPitTime', ascending=False)

# Team level: totals and stop counts are summed; AvgPitTime is the mean of the
# per-driver averages (not weighted by each driver's number of stops)
sorted_pit_times_team_all_events = total_pit_time_all_events.groupby(['Team']).agg(
    {'TotalPitTime': 'sum', 'AvgPitTime': 'mean', 'PitStops': 'sum'}
).reset_index()

# Show tables for time spent in pits by driver and by team across all events.
# NOTE(review): the driver table sums rows per driver, so a driver listed under more
# than one team would get the sum of two averages in AvgPitTime - verify if that occurs.
print("\nTotal and Average Pit Time by Driver Across All Events")
print(sorted_pit_times_driver_all_events.pivot_table(index='Driver', values=['TotalPitTime', 'AvgPitTime', 'PitStops'], aggfunc='sum').fillna('-'))

print("\nTotal and Average Pit Time by Team Across All Events")
print(sorted_pit_times_team_all_events.pivot_table(index='Team', values=['TotalPitTime', 'AvgPitTime', 'PitStops'], aggfunc='sum').fillna('-'))

# Plotting Total Pit Time by Driver Across All Events
plt.figure(figsize=(14, 7))
sns.barplot(data=sorted_pit_times_driver_all_events, x='Driver', y='TotalPitTime', palette='tab20')
plt.title('Total Pit Time by Driver Across All Events')
plt.xlabel('Driver')
plt.ylabel('Total Pit Time (seconds)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

# Plotting Average Pit Time by Driver Across All Events
plt.figure(figsize=(14, 7))
sns.barplot(data=sorted_pit_times_driver_all_events, x='Driver', y='AvgPitTime', palette='tab20')
plt.title('Average Pit Time by Driver Across All Events')
plt.xlabel('Driver')
plt.ylabel('Average Pit Time (seconds)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

# Plotting Total Pit Time by Team Across All Events
plt.figure(figsize=(14, 7))
sns.barplot(data=sorted_pit_times_team_all_events, x='Team', y='TotalPitTime', palette='Set1')
plt.title('Total Pit Time by Team Across All Events')
plt.xlabel('Team')
plt.ylabel('Total Pit Time (seconds)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

# Plotting Average Pit Time by Team Across All Events
plt.figure(figsize=(14, 7))
sns.barplot(data=sorted_pit_times_team_all_events, x='Team', y='AvgPitTime', palette='Set1')
plt.title('Average Pit Time by Team Across All Events')
plt.xlabel('Team')
plt.ylabel('Average Pit Time (seconds)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()


Relationship 1

Tire Compound Choice vs Lap Time

Task: Investigate how different tire compounds (e.g., Soft, Medium, Hard) correlate with lap times throughout the race.

Objective: Identify which tire compounds lead to faster lap times and whether this varies depending on the stint or race phase.

In [None]:
# Plot Lap Time vs Tire Compound

# Keep laps below 200 seconds. Both the scatter plot and the averages below are built
# from this filtered frame, so they describe the same set of laps.
lap_data_revised = lap_data[lap_data['LapTimeSeconds'] < 200]

plt.figure(figsize=(12, 6))
sns.scatterplot(data=lap_data_revised, x='Compound', y='LapTimeSeconds', hue='Compound', palette='Set1', alpha=0.7)

# Add title and labels
plt.title('Lap Time vs Tire Compound')
plt.xlabel('Tire Compound')
plt.ylabel('Lap Time (seconds)')
plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

# Average lap time by tire compound, computed from the thresholded laps.
# The threshold has to be applied to individual laps BEFORE averaging: filtering the
# per-compound means afterwards leaves every slow outlier lap inside the mean and can
# only drop a whole compound from the chart.
avg_lap_time_by_compound = lap_data_revised.groupby('Compound')['LapTimeSeconds'].mean().reset_index()

# Plot average lap time by tire compound
plt.figure(figsize=(12, 6))
sns.barplot(data=avg_lap_time_by_compound, x='Compound', y='LapTimeSeconds', palette='Set1')

# Add title and labels
plt.title('Average Lap Time by Tire Compound')
plt.xlabel('Tire Compound')
plt.ylabel('Average Lap Time (seconds)')
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Tire Compound Choice vs Lap Time by Event

# Only rows that have both a compound and a lap time can be plotted
filtered_data = lap_data.dropna(subset=['Compound', 'LapTimeSeconds'])

# Events in order of first appearance; one box plot is drawn for each
events = filtered_data['EventName'].unique()

for event in events:
    event_data = filtered_data.loc[filtered_data['EventName'] == event]

    fig, ax = plt.subplots(figsize=(12, 6))
    # One box per compound showing the spread of lap times in this event
    sns.boxplot(data=event_data, x='Compound', y='LapTimeSeconds', palette='Set1', ax=ax)

    ax.set(
        title=f'Tire Compound vs Lap Time - {event}',
        xlabel='Tire Compound',
        ylabel='Lap Time (seconds)',
    )
    ax.grid(True)

    fig.tight_layout()
    plt.show()

In [None]:
# Tire Compound vs Lap Time by Event and Driver

# Only rows that have both a compound and a lap time can be plotted
filtered_data = lap_data.dropna(subset=['Compound', 'LapTimeSeconds'])

# Every event and every driver present in the filtered data
events = filtered_data['EventName'].unique()
drivers = filtered_data['Driver'].unique()

# One box plot per (event, driver) pair that has any rows
for event in events:
    # Narrow to the event once, then slice per driver
    event_rows = filtered_data[filtered_data['EventName'] == event]

    for driver in drivers:
        event_driver_data = event_rows[event_rows['Driver'] == driver]

        # Skip drivers with no rows in this event
        if event_driver_data.empty:
            continue

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.boxplot(data=event_driver_data, x='Compound', y='LapTimeSeconds', palette='Set1', ax=ax)

        ax.set(
            title=f'Tire Compound vs Lap Time - {event} - {driver}',
            xlabel='Tire Compound',
            ylabel='Lap Time (seconds)',
        )
        ax.grid(True)

        fig.tight_layout()
        plt.show()


In [None]:
# Lap Time by Tire Compound for Each Driver

# Only rows that have both a compound and a lap time can be plotted
filtered_data = lap_data.dropna(subset=['Compound', 'LapTimeSeconds'])

# One line per driver across the compound categories; lineplot aggregates the
# lap times that share a compound before drawing each point
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    data=filtered_data,
    x='Compound',
    y='LapTimeSeconds',
    hue='Driver',
    marker='o',
    palette='tab20',
    ax=ax,
)

ax.set(
    title='Lap Time by Tire Compound for Each Driver',
    xlabel='Tire Compound',
    ylabel='Lap Time (seconds)',
)
# Legend sits outside the axes, to the right
ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

fig.tight_layout()
plt.show()


In [None]:
# Lap Time by Tire Compound for Each Event

# Only rows that have both a compound and a lap time can be plotted
filtered_data = lap_data.dropna(subset=['Compound', 'LapTimeSeconds'])

# One line per event across the compound categories; lineplot aggregates the
# lap times that share a compound before drawing each point
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    data=filtered_data,
    x='Compound',
    y='LapTimeSeconds',
    hue='EventName',
    marker='o',
    palette='tab20',
    ax=ax,
)

ax.set(
    title='Lap Time by Tire Compound for Each Event',
    xlabel='Tire Compound',
    ylabel='Lap Time (seconds)',
)
# Legend sits outside the axes, to the right
ax.legend(title='Event', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

fig.tight_layout()
plt.show()


Relationship 2

Starting Tire Type vs Final Classification

Task: Explore how starting the race on different tire compounds (Hard, Soft, Medium) impacts the final classification.

Objective: Understand whether the initial tire choice plays a significant role in determining race outcomes.

In [None]:
# Starting Tire Type vs Final Classification

# Per-lap rows with both columns present. Not used by the plot below, but the
# following per-driver cell reads `filtered_data`, so it stays defined here.
filtered_data = lap_data.dropna(subset=['Compound', 'Position'])

# Reduce the lap table to one row per (event, driver):
#   StartCompound - first non-null Compound in lap order (the tyre the race was started on)
#   FinalPosition - last non-null Position in lap order
# Plotting every lap (as before) compared the compound of each lap with the
# running position on that lap, which is not "starting tyre vs final result".
# The stable sort plus sort=False keeps events in the order they appear in lap_data.
# NOTE(review): assumes the position on a driver's last recorded lap is an
# acceptable stand-in for the official classification - confirm against result_data.
start_vs_finish = (
    lap_data.sort_values('LapNumber', kind='stable')
    .groupby(['EventName', 'Driver'], sort=False)
    .agg(StartCompound=('Compound', 'first'), FinalPosition=('Position', 'last'))
    .dropna()
    .reset_index()
)

# Distribution of finishing positions for each starting compound
plt.figure(figsize=(12, 6))
sns.boxplot(data=start_vs_finish, x='StartCompound', y='FinalPosition', palette='Set1')

# Add labels and title
plt.title('Starting Tire Type vs Final Classification')
plt.xlabel('Starting Tire Type')
plt.ylabel('Final Classification (Position)')
plt.gca().invert_yaxis()  # Invert y-axis to have lower positions at the top
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()


In [None]:
# Relationship Between Starting Tire Type and Final Classification by Driver

# Build the per-(event, driver) table here rather than reusing `filtered_data`
# left behind by the previous cell, so this cell does not depend on hidden state.
#   StartCompound - first non-null Compound in lap order
#   FinalPosition - last non-null Position in lap order
# Using every lap would compare per-lap compound with running position instead.
# NOTE(review): assumes the position on the last recorded lap is an acceptable
# stand-in for the official classification - confirm against result_data.
start_vs_finish = (
    lap_data.sort_values('LapNumber', kind='stable')
    .groupby(['EventName', 'Driver'], sort=False)
    .agg(StartCompound=('Compound', 'first'), FinalPosition=('Position', 'last'))
    .dropna()
    .reset_index()
)

# Get unique drivers
drivers = start_vs_finish['Driver'].unique()

# One boxplot per driver: finishing positions across events, split by starting compound
for driver in drivers:
    driver_data = start_vs_finish[start_vs_finish['Driver'] == driver]

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=driver_data, x='StartCompound', y='FinalPosition', palette='Set1')

    # Add labels and title
    plt.title(f'Starting Tire Type vs Final Classification - {driver}')
    plt.xlabel('Starting Tire Type')
    plt.ylabel('Final Classification (Position)')
    plt.gca().invert_yaxis()  # Invert y-axis to have lower positions at the top
    plt.grid(True)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()


In [None]:
# Relationship Between Starting Tire Type and Final Classification by Event

# One row per (event, driver):
#   StartCompound - first non-null Compound in lap order
#   FinalPosition - last non-null Position in lap order
# Using every lap would compare per-lap compound with running position instead
# of the starting tyre with the finishing position.
# The stable sort plus sort=False keeps events in the order they appear in lap_data.
# NOTE(review): assumes the position on the last recorded lap is an acceptable
# stand-in for the official classification - confirm against result_data.
start_vs_finish = (
    lap_data.sort_values('LapNumber', kind='stable')
    .groupby(['EventName', 'Driver'], sort=False)
    .agg(StartCompound=('Compound', 'first'), FinalPosition=('Position', 'last'))
    .dropna()
    .reset_index()
)

# Get unique events
events = start_vs_finish['EventName'].unique()

# One boxplot per event: finishing positions of the field, split by starting compound
for event in events:
    event_data = start_vs_finish[start_vs_finish['EventName'] == event]

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=event_data, x='StartCompound', y='FinalPosition', palette='Set1')

    # Add labels and title
    plt.title(f'Starting Tire Type vs Final Classification - {event}')
    plt.xlabel('Starting Tire Type')
    plt.ylabel('Final Classification (Position)')
    plt.gca().invert_yaxis()  # Invert y-axis to have lower positions at the top
    plt.grid(True)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()


In [None]:
# Relationship between Starting Tire Type and Final Classification for each event

# One row per (event, driver):
#   StartCompound - first non-null Compound in lap order
#   FinalPosition - last non-null Position in lap order
# Using every lap would compare per-lap compound with running position instead
# of the starting tyre with the finishing position.
# NOTE(review): assumes the position on the last recorded lap is an acceptable
# stand-in for the official classification - confirm against result_data.
start_vs_finish = (
    lap_data.sort_values('LapNumber', kind='stable')
    .groupby(['EventName', 'Driver'], sort=False)
    .agg(StartCompound=('Compound', 'first'), FinalPosition=('Position', 'last'))
    .dropna()
    .reset_index()
)

# Get unique events
events = start_vs_finish['EventName'].unique()

# One figure per event; each driver contributes a single marker
# (their starting compound against their finishing position).
for event in events:
    event_data = start_vs_finish[start_vs_finish['EventName'] == event]

    plt.figure(figsize=(14, 8))
    # tab20 instead of tab10: a 10-colour palette repeats colours once the hue
    # has more than 10 drivers, making markers indistinguishable.
    sns.lineplot(data=event_data, x='StartCompound', y='FinalPosition', hue='Driver', marker='o', palette='tab20')

    # Add labels and title
    plt.title(f'Starting Tire Type vs Final Classification - {event}')
    plt.xlabel('Starting Tire Type')
    plt.ylabel('Final Classification (Position)')
    plt.gca().invert_yaxis()  # Invert y-axis to have lower positions at the top
    plt.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()

In [None]:
# Relationship between Starting Tire Type and Final Classification for each driver

# One row per (event, driver):
#   StartCompound - first non-null Compound in lap order
#   FinalPosition - last non-null Position in lap order
# Using every lap would compare per-lap compound with running position instead
# of the starting tyre with the finishing position.
# The stable sort plus sort=False keeps events in the order they appear in
# lap_data, so the x-axis is not re-ordered alphabetically.
# NOTE(review): assumes the position on the last recorded lap is an acceptable
# stand-in for the official classification - confirm against result_data.
start_vs_finish = (
    lap_data.sort_values('LapNumber', kind='stable')
    .groupby(['EventName', 'Driver'], sort=False)
    .agg(StartCompound=('Compound', 'first'), FinalPosition=('Position', 'last'))
    .dropna()
    .reset_index()
)

# Get unique drivers
drivers = start_vs_finish['Driver'].unique()

# One figure per driver: finishing position per event, coloured by starting compound
for driver in drivers:
    driver_data = start_vs_finish[start_vs_finish['Driver'] == driver]

    plt.figure(figsize=(14, 8))
    sns.lineplot(data=driver_data, x='EventName', y='FinalPosition', hue='StartCompound', marker='o', palette='Set1')

    # Add labels and title
    plt.title(f'Starting Tire Type vs Final Classification - {driver}')
    plt.xlabel('Event Name')
    plt.xticks(rotation=45)
    plt.ylabel('Final Classification (Position)')
    plt.gca().invert_yaxis()  # Invert y-axis to have lower positions at the top
    plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()

Relationship 3

Number of Laps on a Compound vs Delta Time

Task: Explore the relationship between the number of laps driven on a particular tire compound and the delta time (change in lap times) over those laps.

Objective: Understand how tire wear affects performance, particularly how lap times degrade as tires wear out during a stint.

In [None]:
# Number of Laps on a Compound vs Delta Time

# Keep only laps that have every column the analysis needs
filtered_data = lap_data.dropna(subset=['Driver', 'EventName', 'LapTimeSeconds', 'Compound'])

# Put each driver's laps in race order. Sorting by LapTimeSeconds (as before)
# made diff() measure the gap between neighbouring *sorted* lap times, which is
# always >= 0 and says nothing about how lap time evolves over a stint.
filtered_data = filtered_data.sort_values(by=['Driver', 'EventName', 'LapNumber'])

# Delta time = change in lap time versus the driver's previous lap in the same event.
# The first lap of each (driver, event) has no predecessor and is set to 0.
# NOTE(review): the diff still spans compound changes within a race; group by a
# stint identifier as well if the data provides one.
filtered_data['DeltaTime'] = filtered_data.groupby(['Driver', 'EventName'])['LapTimeSeconds'].diff().fillna(0)

# Drop implausible jumps in either direction (e.g. pit in/out laps).
# Deltas can now be negative, so the threshold is applied to the magnitude.
delta_time_threshold = 20  # seconds; adjust this value based on data
filtered_data = filtered_data[filtered_data['DeltaTime'].abs() < delta_time_threshold]

# Per (driver, event, compound): number of laps kept and mean lap-to-lap delta
summary = (
    filtered_data.groupby(['Driver', 'EventName', 'Compound'])
    .agg(
        NumLaps=('LapTimeSeconds', 'size'),
        AvgDeltaTime=('DeltaTime', 'mean')
    )
    .reset_index()
)

# Plot relationship between Number of Laps and Delta Time for each Tire Compound
plt.figure(figsize=(14, 8))
sns.scatterplot(data=summary, x='NumLaps', y='AvgDeltaTime', hue='Compound', palette='Set1', style='Compound')

# Add labels and title
plt.title('Relationship Between Number of Laps and Delta Time by Tire Compound')
plt.xlabel('Number of Laps')
plt.ylabel('Average Delta Time (seconds)')
plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Number of Laps vs Average Delta Time by Event

# Keep only laps that have every column the analysis needs
filtered_data = lap_data.dropna(subset=['Driver', 'EventName', 'LapTimeSeconds', 'Compound'])

# Put each driver's laps in race order. Sorting by LapTimeSeconds (as before)
# made diff() measure the gap between neighbouring *sorted* lap times, which is
# always >= 0 and says nothing about how lap time evolves over a stint.
filtered_data = filtered_data.sort_values(by=['Driver', 'EventName', 'LapNumber'])

# Delta time = change in lap time versus the driver's previous lap in the same event.
# The first lap of each (driver, event) has no predecessor and is set to 0.
# NOTE(review): the diff still spans compound changes within a race; group by a
# stint identifier as well if the data provides one.
filtered_data['DeltaTime'] = filtered_data.groupby(['Driver', 'EventName'])['LapTimeSeconds'].diff().fillna(0)

# Drop implausible jumps in either direction (e.g. pit in/out laps).
# Deltas can now be negative, so the threshold is applied to the magnitude.
delta_time_threshold = 20  # seconds; adjust this value based on data
filtered_data = filtered_data[filtered_data['DeltaTime'].abs() < delta_time_threshold]

# Per (driver, event, compound): number of laps kept and mean lap-to-lap delta
summary = (
    filtered_data.groupby(['Driver', 'EventName', 'Compound'])
    .agg(
        NumLaps=('LapTimeSeconds', 'size'),
        AvgDeltaTime=('DeltaTime', 'mean')
    )
    .reset_index()
)

# One figure per event. (The extra plt.figure() that used to precede the loop
# only produced an empty figure and has been removed.)
for event in summary['EventName'].unique():
    event_data = summary[summary['EventName'] == event]

    plt.figure(figsize=(14, 8))
    # Marker shapes come from style='Compound'; a fixed marker='o' is not needed
    sns.scatterplot(data=event_data, x='NumLaps', y='AvgDeltaTime', hue='Compound', palette='Set1', style='Compound')

    # Add labels and title
    plt.title(f'Number of Laps vs Average Delta Time - {event}')
    plt.xlabel('Number of Laps')
    plt.ylabel('Average Delta Time (seconds)')
    plt.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()


Relationship 4

Number of Stops vs Final Position

Task: Analyze the relationship between the number of pit stops a driver makes and their final position in the race.

Objective: Understand how the frequency of pit stops impacts race outcomes, particularly whether fewer or more stops lead to better results.

In [None]:
# Number of Pit Stops vs Final Position

# Step 1: Count pit stops per driver per event
# A lap counts as a pit stop when PitInTime is present on that row
pit_stops = lap_data.dropna(subset=['PitInTime'])

# Group by EventName, Driver and Team only. Including the per-lap 'Position' in
# the grouping keys (as before) split a driver's stops into one group per
# position held on the pit lap, so the count was wrong and the plotted position
# was the pit-lap position rather than the finishing position.
pit_stop_counts = (
    pit_stops.groupby(['EventName', 'Driver', 'Team'])
    .size()  # rows per group = number of pit stops
    .reset_index(name='TotalPitStops')
)

# Step 2: Final position = last non-null Position recorded for each driver in each event
final_positions = (
    lap_data.dropna(subset=['Position'])
    .groupby(['EventName', 'Driver'])
    .agg({'Position': 'last'})
    .reset_index()
)

# Step 3: Attach the final position to the pit stop counts
# (inner join: only drivers with at least one recorded stop are plotted)
pit_stop_counts = pit_stop_counts.merge(final_positions, on=['EventName', 'Driver'])

# Step 4: Plot the Relationship Between Total Pit Stops and Final Position
plt.figure(figsize=(14, 8))

# Scatter plot to show total pit stops vs final position
sns.scatterplot(data=pit_stop_counts, x='TotalPitStops', y='Position', hue='Driver', style='EventName', palette='tab20', s=100)

# Add labels and title
plt.title('Relationship Between Total Pit Stops and Final Position')
plt.xlabel('Total Pit Stops')
plt.ylabel('Final Position')
plt.gca().invert_yaxis()  # Lower positions (e.g., 1st) are better, so invert y-axis
plt.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Pit stop count against finishing position

# Step 1: laps on which a pit entry was recorded (PitInTime present)
pit_stops = lap_data.dropna(subset=['PitInTime'])

# Rows per (event, driver, team) = number of pit stops for that entry
pit_stop_counts = (
    pit_stops.groupby(['EventName', 'Driver', 'Team'])
    .size()
    .rename('TotalPitStops')
    .reset_index()
)

# Step 2: last non-null Position recorded for each driver in each event
final_positions = (
    lap_data.dropna(subset=['Position'])
    .groupby(['EventName', 'Driver'])['Position']
    .last()
    .reset_index()
)

# Step 3: join counts and positions on the event/driver pair
pit_stop_counts = pd.merge(pit_stop_counts, final_positions, on=['EventName', 'Driver'])

# Step 4: scatter of stops vs position, coloured by driver, marker shape by event
plt.figure(figsize=(14, 8))
ax = sns.scatterplot(
    x='TotalPitStops',
    y='Position',
    hue='Driver',
    style='EventName',
    data=pit_stop_counts,
    palette='tab20',
    s=100,
)

# Decorate the axes
ax.set_title('Relationship Between Total Pit Stops and Final Position')
ax.set_xlabel('Total Pit Stops')
ax.set_ylabel('Final Position')
ax.invert_yaxis()  # 1st place at the top
ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

# Fit everything into the figure and render it
plt.tight_layout()
plt.show()


Relationship 5
Race Length vs Strategy

Task: Examine how the total number of laps in each race influenced the pit stop strategy and tire choices for teams and drivers.

Objective: Understand how race length affects strategic decisions, particularly in terms of the number of stops and tire choices.

In [None]:
# Race length compared with pit stop count and tyre usage

# Step 1: race length per event = highest LapNumber seen for that event
total_laps_per_event = (
    lap_data.groupby('EventName')['LapNumber']
    .max()
    .rename('TotalLaps')
    .reset_index()
)

# Step 2: pit stops per driver per event
# A lap counts as a pit stop when PitInTime is present on that row
pit_stops = lap_data.dropna(subset=['PitInTime'])
pit_stop_counts = (
    pit_stops.groupby(['EventName', 'Driver'])
    .size()
    .rename('TotalPitStops')
    .reset_index()
)

# Attach the race length to every (event, driver) pit stop count
pit_stop_laps = pd.merge(pit_stop_counts, total_laps_per_event, on='EventName')

# Step 3: laps run on each compound, per driver per event
tire_choices = (
    lap_data.groupby(['EventName', 'Driver', 'Compound'])
    .size()
    .rename('LapCount')
    .reset_index()
)

# Step 4: race length vs number of stops
plt.figure(figsize=(14, 8))
ax = sns.scatterplot(
    x='TotalLaps',
    y='TotalPitStops',
    hue='Driver',
    style='EventName',
    data=pit_stop_laps,
    palette='tab20',
    s=100,
)
ax.set_title('Relationship Between Total Laps and Total Pit Stops')
ax.set_xlabel('Total Laps (per Event)')
ax.set_ylabel('Total Pit Stops (per Driver)')
ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

plt.tight_layout()
plt.show()

# Step 5: laps per compound for each event (bars summarise the per-driver rows)
plt.figure(figsize=(14, 8))
ax = sns.barplot(x='EventName', y='LapCount', hue='Compound', data=tire_choices, palette='Set1')
ax.set_title('Tire Compound Choices Across Events')
ax.set_xlabel('Event Name')
ax.set_ylabel('Number of Laps on Each Tire Compound')
plt.xticks(rotation=45)
ax.legend(title='Tire Compound', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# 5. Position_data
# Display the first few rows and summary info
# The cell's last expression is a 4-tuple, so the notebook renders the tuple of:
# head(), info(), describe() for numeric columns, and describe() for object columns.
# DataFrame.info() prints its report as a side effect and returns None, so the
# second tuple element is rendered as None.
position_data.head(), position_data.info(), position_data.describe(), position_data.describe(include='object')

In [None]:
# Category counts for the two categorical columns, one figure each
for categorical_column in ('Source', 'Status'):
    sns.countplot(data=position_data, y=categorical_column)
    plt.show()

# Histograms of the three coordinate columns
position_data.hist(column=['X', 'Y', 'Z'], bins=50, figsize=(15, 6))
plt.show()

In [None]:
# Track Position Analysis
# One scatter plot of (X, Y) samples per event, coloured by driver.

# Loop through each event
for event in position_data['EventName'].unique():
    event_data = position_data[position_data['EventName'] == event]

    # Plot positions for each driver
    plt.figure(figsize=(12, 8))
    # tab20 rather than tab10: a 10-colour palette is cycled when the hue has
    # more than 10 levels, so several drivers would share a colour. The other
    # driver-coloured plots in this notebook already use tab20.
    # NOTE(review): assumes more than 10 distinct DriverName values per event - confirm.
    sns.scatterplot(x='X', y='Y', hue='DriverName', data=event_data, alpha=0.9, palette='tab20')

    # Add title and labels
    plt.title(f'Track Positions for Drivers - {event}')
    plt.xlabel('X Coordinate')
    plt.ylabel('Y Coordinate')
    plt.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)

    # Show plot
    plt.tight_layout()
    plt.show()


In [None]:
# Hexbin density of (X, Y) samples, one figure per event

for event in position_data['EventName'].unique():
    # Rows belonging to the current event
    event_data = position_data.loc[position_data['EventName'] == event]

    # Explicit figure/axes pair so the colorbar can be attached to the hexbin artist
    fig, ax = plt.subplots(figsize=(12, 8))
    hexes = ax.hexbin(event_data['X'], event_data['Y'], gridsize=50, cmap='Blues', alpha=0.9)

    # Title, axis labels, colorbar and grid
    ax.set_title(f'Track Position Heatmap - {event}')
    ax.set_xlabel('X Coordinate')
    ax.set_ylabel('Y Coordinate')
    fig.colorbar(hexes, ax=ax, label='Frequency')
    ax.grid(True)

    # Render
    fig.tight_layout()
    plt.show()


In [None]:
# (X, Y) samples coloured by the Status column, one figure per event
for event in position_data['EventName'].unique():
    # Rows belonging to the current event
    event_data = position_data.loc[position_data['EventName'] == event]

    # Scatter on a fresh figure; keep the Axes handle for decoration
    plt.figure(figsize=(12, 8))
    ax = sns.scatterplot(data=event_data, x='X', y='Y', hue='Status', palette='viridis', alpha=0.6)

    # Title, axis labels, legend outside the axes, grid
    ax.set_title(f'Driver Positions by Track Status - {event}')
    ax.set_xlabel('X Coordinate')
    ax.set_ylabel('Y Coordinate')
    ax.legend(title='Track Status', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    # Render
    plt.tight_layout()
    plt.show()


In [None]:
# 6. Result_data
# Display the first few rows and summary info
# The cell's last expression is a 4-tuple, so the notebook renders the tuple of:
# head(), info(), describe() for numeric columns, and describe() for object columns.
# DataFrame.info() prints its report as a side effect and returns None, so the
# second tuple element is rendered as None.
result_data.head(), result_data.info(), result_data.describe(), result_data.describe(include='object')

In [None]:
# Row counts per distinct value of EventName and of Points, one figure each
for counted_column in ('EventName', 'Points'):
    sns.countplot(data=result_data, y=counted_column)
    plt.show()

In [None]:
# Race results: points by finishing position, one bar colour per driver

# Store 'Time' as timedelta (the conversion is repeated in later cells and is idempotent)
result_data['Time'] = pd.to_timedelta(result_data['Time'])

# Grouped bars on a fresh figure; keep the Axes handle for decoration
plt.figure(figsize=(14, 8))
ax = sns.barplot(
    data=result_data,
    x='Position',
    y='Points',
    hue='DriverId',
    palette='tab20',
    dodge=True,
)

# Title, axis labels, legend, grid
ax.set_title('Driver Positions and Points - Race Results')
ax.set_xlabel('Position')
ax.set_ylabel('Points')
ax.legend(title='Driver')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Race results: average finishing position per driver

# Store 'Time' as timedelta (the conversion is repeated across cells and is idempotent)
result_data['Time'] = pd.to_timedelta(result_data['Time'])

# Per driver: mean Position and summed Points
per_driver = result_data.groupby('DriverId')
average_position_data = per_driver.agg(
    AveragePosition=('Position', 'mean'),
    TotalPoints=('Points', 'sum'),  # carried along, not plotted here
).reset_index()

# Best (lowest) average position first
average_position_data = average_position_data.sort_values(by='AveragePosition')

# Horizontal bars, one per driver
plt.figure(figsize=(14, 8))
ax = sns.barplot(
    data=average_position_data,
    x='AveragePosition',
    y='DriverId',
    palette='tab20',
    alpha=0.9,
    edgecolor='black',
)

# Title, axis labels, grid
ax.set_title('Average Positions of Drivers Across Events')
ax.set_xlabel('Average Position')
ax.set_ylabel('Driver')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Race results: total points per driver

# Store 'Time' as timedelta (the conversion is repeated across cells and is idempotent)
result_data['Time'] = pd.to_timedelta(result_data['Time'])

# Per driver: summed Points and the last Position value in row order
per_driver = result_data.groupby('DriverId')
total_points_data = per_driver.agg(
    TotalPoints=('Points', 'sum'),
    FinalPosition=('Position', 'last'),  # last row per driver; computed but not plotted
).reset_index()

# Highest scorer first
total_points_data = total_points_data.sort_values(by='TotalPoints', ascending=False)

# Horizontal bars, one per driver
plt.figure(figsize=(14, 8))
ax = sns.barplot(
    data=total_points_data,
    x='TotalPoints',
    y='DriverId',
    palette='tab20',
    alpha=0.9,
    edgecolor='black',
)

# Title, axis labels, grid
ax.set_title('Total Points of Drivers')
ax.set_xlabel('Total Points')
ax.set_ylabel('Driver')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Race results: points by finishing position, one figure per event

# Store 'Time' as timedelta (the conversion is repeated across cells and is idempotent)
result_data['Time'] = pd.to_timedelta(result_data['Time'])

# Events in order of first appearance
events = result_data['EventName'].unique()

for event in events:
    # Results of the current event only
    event_data = result_data.loc[result_data['EventName'] == event]

    # Grouped bars with outlined edges and no error bars
    plt.figure(figsize=(14, 8))
    ax = sns.barplot(
        data=event_data,
        x='Position',
        y='Points',
        hue='DriverId',
        palette='dark',
        dodge=True,
        alpha=0.95,
        errorbar=None,
        width=0.8,
        linewidth=1.5,
        edgecolor='black',
    )

    # Title, axis labels, legend outside the axes, grid
    ax.set_title(f'Driver Positions and Points - {event}')
    ax.set_xlabel('Position')
    ax.set_ylabel('Points')
    ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    # Render
    plt.tight_layout()
    plt.show()


In [None]:
# Grid position against finishing position, all events together
# Scatter on a fresh figure; keep the Axes handle for decoration
plt.figure(figsize=(12, 8))
ax = sns.scatterplot(
    data=result_data,
    x='GridPosition',
    y='Position',
    hue='DriverId',
    palette='tab20',
    alpha=0.9,
    s=100,
)

# Title, axis labels, legend outside the axes, grid
ax.set_title('Grid Position vs Final Position - Race Results')
ax.set_xlabel('Grid Position')
ax.set_ylabel('Final Position')
ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Grid position against finishing position, one figure per event
# Events in order of first appearance
events = result_data['EventName'].unique()

for event in events:
    # Results of the current event only
    event_data = result_data.loc[result_data['EventName'] == event]

    # Large, slightly transparent markers coloured by driver
    plt.figure(figsize=(12, 8))
    ax = sns.scatterplot(
        data=event_data,
        x='GridPosition',
        y='Position',
        hue='DriverId',
        palette='tab20',
        alpha=0.9,
        s=100,
    )

    # Title, axis labels, legend outside the axes, grid
    ax.set_title(f'Grid Position vs Final Position - {event}')
    ax.set_xlabel('Grid Position')
    ax.set_ylabel('Final Position')
    ax.legend(title='Driver', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True)

    # Render
    plt.tight_layout()
    plt.show()


In [None]:
# Race Times Analysis
# Bar chart of the 'Time' column per driver (bars summarise a driver's rows across events).

# seaborn's barplot needs one numeric axis, and a timedelta column is not
# treated as numeric, so express the time as float seconds in a plotting copy.
# pd.to_timedelta is applied here as well so the cell does not depend on an
# earlier cell having converted the column (the call is idempotent).
# NOTE(review): whether 'Time' holds total race time or the gap to the winner
# is not visible here - confirm before interpreting the bars.
race_times = result_data.assign(
    TimeSeconds=pd.to_timedelta(result_data['Time']).dt.total_seconds()
)

# Plot race completion times
plt.figure(figsize=(12, 8))
sns.barplot(x='DriverId', y='TimeSeconds', data=race_times, palette='tab10')

# Add title and labels
plt.title('Race Completion Times - Each Driver')
plt.xlabel('Driver')
plt.ylabel('Completion Time (seconds)')
plt.xticks(rotation=90)
plt.grid(True)

# Show plot
plt.tight_layout()
plt.show()


In [None]:
# Histogram of the Points column with a KDE overlay
# Draw on a fresh figure; keep the Axes handle for decoration
plt.figure(figsize=(12, 8))
ax = sns.histplot(data=result_data, x='Points', kde=True, bins=20, color='blue')

# Title, axis labels, grid
ax.set_title('Distribution of Points Across Drivers')
ax.set_xlabel('Points')
ax.set_ylabel('Frequency')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Driver Performance by Team
# Mean finishing position and mean points per team, drawn on twin y-axes.
team_performance = result_data.groupby('TeamName').agg({'Position': 'mean', 'Points': 'mean'}).reset_index()

# Order teams by average position (best first)
team_performance = team_performance.sort_values(by='Position')

fig, ax1 = plt.subplots(figsize=(14, 8))

# Plot average positions on the left y-axis
color = 'skyblue'
ax1.set_xlabel('Team')
ax1.set_ylabel('Average Position', color=color)
ax1.bar(team_performance['TeamName'], team_performance['Position'], color=color, label='Average Position')
ax1.tick_params(axis='y', labelcolor=color)
ax1.invert_yaxis()  # Invert y-axis for position (lower is better)

# Rotate the team labels on ax1. plt.xticks() would act on the current axes,
# which after twinx() is the twin whose x-axis is hidden, so the visible
# labels would stay unrotated.
ax1.tick_params(axis='x', rotation=45)

# Create a second y-axis (sharing the x-axis) for average points
ax2 = ax1.twinx()
color = 'salmon'
ax2.set_ylabel('Average Points', color=color)
ax2.bar(team_performance['TeamName'], team_performance['Points'], color=color, alpha=0.6, label='Average Points')
ax2.tick_params(axis='y', labelcolor=color)

# Add titles and legends (one legend per axes, in opposite corners)
plt.title('Driver Performance by Team')
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

# Show plot
plt.tight_layout()
plt.show()


In [None]:
# Correlation between finishing position, grid position and points
# Keep the three metric columns and drop rows missing any of them
metric_columns = ['Position', 'GridPosition', 'Points']
result_data_corr = result_data[metric_columns].dropna()

# Pairwise correlation matrix
corr_matrix = result_data_corr.corr()

# Annotated heatmap on a fresh figure
plt.figure(figsize=(10, 7))
ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

# Title and grid
ax.set_title('Correlation Heatmap of Performance Metrics')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# 7. Session_status_data
# Display the first few rows and summary info
# The cell's last expression is a 4-tuple, so the notebook renders the tuple of:
# head(), info(), describe() for numeric columns, and describe() for object columns.
# DataFrame.info() prints its report as a side effect and returns None, so the
# second tuple element is rendered as None.
session_status_data.head(), session_status_data.info(), session_status_data.describe(), session_status_data.describe(include='object')

In [None]:
# Row counts per session status value
sns.countplot(data=session_status_data, y='Status')
plt.show()

In [None]:
# Number of rows per session status, most frequent first
status_counts = session_status_data['Status'].value_counts()

# Bar chart of the counts on a fresh figure
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=status_counts.index, y=status_counts.values, palette='Set2')
ax.set_title('Frequency of Session Statuses')
ax.set_xlabel('Status')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Session status against time, drawn as a line with markers

# Store 'Time' as timedelta (idempotent; later cells rely on this dtype)
session_status_data['Time'] = pd.to_timedelta(session_status_data['Time'])

# Line plot on a fresh figure; keep the Axes handle for decoration
plt.figure(figsize=(14, 6))
ax = sns.lineplot(data=session_status_data, x='Time', y='Status', marker='o', palette='Set2')

# Title, axis labels, grid
ax.set_title('Timeline of Session Statuses')
ax.set_xlabel('Time')
ax.set_ylabel('Status')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Session status against time, scatter with a fixed colour per status

# Store 'Time' as timedelta (idempotent)
session_status_data['Time'] = pd.to_timedelta(session_status_data['Time'])

# Hand-picked colour for each status label
status_colors = {
    'Inactive': 'grey',
    'Started': 'blue',
    'Finished': 'green',
    'Finalised': 'orange',
    'Aborted': 'red',
    'Ends': 'yellow',
}

# Scatter on a fresh figure; keep the Axes handle for decoration
plt.figure(figsize=(14, 6))
ax = sns.scatterplot(
    data=session_status_data,
    x='Time',
    y='Status',
    hue='Status',
    palette=status_colors,
    s=100,
)

# Title, axis labels, grid
ax.set_title('Timeline of Session Statuses')
ax.set_xlabel('Time')
ax.set_ylabel('Status')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Session status against time, scatter with colours derived from the data

# Store 'Time' as timedelta (idempotent)
session_status_data['Time'] = pd.to_timedelta(session_status_data['Time'])

# Status labels in order of first appearance
unique_statuses = session_status_data['Status'].unique()

# Pair each status with one Set2 colour, so every label present has a colour
status_colors = dict(zip(unique_statuses, sns.color_palette("Set2", len(unique_statuses))))

# Scatter on a fresh figure; keep the Axes handle for decoration
plt.figure(figsize=(14, 6))
ax = sns.scatterplot(
    data=session_status_data,
    x='Time',
    y='Status',
    hue='Status',
    palette=status_colors,
    s=100,
)

# Title, axis labels, grid
ax.set_title('Timeline of Session Statuses')
ax.set_xlabel('Time')
ax.set_ylabel('Status')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Distribution of Session Status Durations

# The subtraction below needs timedelta values; converting here (idempotent)
# keeps the cell runnable even if the earlier conversion cell was skipped.
session_status_data['Time'] = pd.to_timedelta(session_status_data['Time'])

# Order the rows by time *within each event*. Sorting by Time alone interleaves
# rows from different events, so "next row" would often belong to another race.
session_status_data = session_status_data.sort_values(by=['EventName', 'Time'])

# Duration of a status = time until the next status change in the same event.
# The group-wise shift leaves NaT on the last row of every event.
# NOTE(review): assumes one session per EventName in this table - confirm.
session_status_data['NextTime'] = session_status_data.groupby('EventName')['Time'].shift(-1)
session_status_data['Duration'] = (session_status_data['NextTime'] - session_status_data['Time']).dt.total_seconds()

# Drop the last row of each event, which has no following status
duration_data = session_status_data.dropna(subset=['Duration'])

# Plot durations of statuses (bars summarise all occurrences of a status)
plt.figure(figsize=(12, 8))
sns.barplot(x='Status', y='Duration', data=duration_data, palette='Set1')

# Add title and labels
plt.title('Duration of Each Session Status')
plt.xlabel('Status')
plt.ylabel('Duration (seconds)')
plt.grid(True)

# Show plot
plt.tight_layout()
plt.show()


In [None]:
# How often each session status occurs
# Number of rows per status, most frequent first
status_counts = session_status_data['Status'].value_counts()

# Bar chart of the counts on a fresh figure
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=status_counts.index, y=status_counts.values, palette='Set1')

# Title, axis labels, grid
ax.set_title('Frequency of Each Session Status')
ax.set_xlabel('Status')
ax.set_ylabel('Frequency')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Session status counts per event, as stacked bars
# Rows = events, columns = statuses, cells = occurrences (0 where a status never occurs)
event_status_counts = session_status_data.groupby('EventName')['Status'].value_counts().unstack().fillna(0)

# pandas creates the figure; keep the returned Axes for decoration
ax = event_status_counts.plot.bar(stacked=True, figsize=(14, 8), colormap='Set1')

# Title, axis labels, legend, rotated event names, grid
ax.set_title('Session Status Distribution by Event')
ax.set_xlabel('Event')
ax.set_ylabel('Count')
ax.legend(title='Status')
plt.xticks(rotation=45)
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# 8. Track_status_data
# Display the first few rows and summary info
# The cell's last expression is a 4-tuple, so the notebook renders the tuple of:
# head(), info(), describe() for numeric columns, and describe() for object columns.
# DataFrame.info() prints its report as a side effect and returns None, so the
# second tuple element is rendered as None.
track_status_data.head(), track_status_data.info(), track_status_data.describe(), track_status_data.describe(include='object')

In [None]:
# Row counts per track status message
sns.countplot(data=track_status_data, y='Message')
plt.show()

# Histogram of the Status column
track_status_data.hist(column='Status', bins=50, figsize=(15, 6))
plt.show()

In [None]:
# Track status against time, one line per status message

# Store 'Time' as timedelta (idempotent; the duration cell relies on this dtype)
track_status_data['Time'] = pd.to_timedelta(track_status_data['Time'])

# Line plot on a fresh figure; keep the Axes handle for decoration
plt.figure(figsize=(14, 6))
ax = sns.lineplot(
    data=track_status_data,
    x='Time',
    y='Status',
    hue='Message',
    marker='o',
    palette='Set2',
)

# Title, axis labels, grid
ax.set_title('Timeline of Track Statuses')
ax.set_xlabel('Time')
ax.set_ylabel('Track Status')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# How often each track status message occurs
# Number of rows per message, most frequent first
status_message_counts = track_status_data['Message'].value_counts()

# Bar chart of the counts on a fresh figure
plt.figure(figsize=(12, 8))
ax = sns.barplot(x=status_message_counts.index, y=status_message_counts.values, palette='Set1')

# Title, axis labels, grid
ax.set_title('Frequency of Track Status Messages')
ax.set_xlabel('Track Status Message')
ax.set_ylabel('Frequency')
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# Track Status Duration Analysis

# The subtraction below needs timedelta values; converting here (idempotent)
# keeps the cell runnable even if the earlier conversion cell was skipped.
track_status_data['Time'] = pd.to_timedelta(track_status_data['Time'])

# Order the rows by time *within each event*. Sorting by Time alone interleaves
# rows from different events, so "next row" would often belong to another race.
track_status_data = track_status_data.sort_values(by=['EventName', 'Time'])

# Duration of a status = time until the next status change in the same event.
# The group-wise shift leaves NaT on the last row of every event.
# NOTE(review): assumes one session per EventName in this table - confirm.
track_status_data['NextTime'] = track_status_data.groupby('EventName')['Time'].shift(-1)
track_status_data['Duration'] = (track_status_data['NextTime'] - track_status_data['Time']).dt.total_seconds()

# Drop the last row of each event, which has no following status
duration_data = track_status_data.dropna(subset=['Duration'])

# Plot durations of statuses (bars summarise all occurrences of a message)
plt.figure(figsize=(12, 8))
sns.barplot(x='Message', y='Duration', data=duration_data, palette='Set1')

# Add title and labels
plt.title('Duration of Each Track Status')
plt.xlabel('Track Status Message')
plt.ylabel('Duration (seconds)')
plt.grid(True)

# Show plot
plt.tight_layout()
plt.show()


In [None]:
# Track status message counts per event, as stacked bars
# Rows = events, columns = messages, cells = occurrences (0 where a message never occurs)
event_status_counts = track_status_data.groupby('EventName')['Message'].value_counts().unstack().fillna(0)

# pandas creates the figure; keep the returned Axes for decoration
ax = event_status_counts.plot.bar(stacked=True, figsize=(14, 8), colormap='Set1')

# Title, axis labels, legend, rotated event names, grid
ax.set_title('Track Status Distribution by Event')
ax.set_xlabel('Event')
ax.set_ylabel('Count')
ax.legend(title='Track Status Message')
plt.xticks(rotation=45)
ax.grid(True)

# Render
plt.tight_layout()
plt.show()


In [None]:
# 9. Weather_data
# Display the first few rows and summary info
# The cell's last expression is a 4-tuple, so the notebook renders the tuple of:
# head(), info(), describe() for numeric columns, and describe() for object columns.
# DataFrame.info() prints its report as a side effect and returns None, so the
# second tuple element is rendered as None.
weather_data.head(), weather_data.info(), weather_data.describe(), weather_data.describe(include='object')

In [None]:
# Histograms of the numeric weather measurements, one panel per column
weather_data.hist(
    column=['AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindDirection', 'WindSpeed'],
    bins=50,
    figsize=(15, 6),
)
plt.show()

In [None]:
# Four weather measurements plotted against time on one set of axes

# Store 'Time' as timedelta (idempotent)
weather_data['Time'] = pd.to_timedelta(weather_data['Time'])

# (column, legend label, line colour) for each series, in drawing order
weather_series = [
    ('AirTemp', 'Air Temperature (°C)', 'blue'),
    ('TrackTemp', 'Track Temperature (°C)', 'red'),
    ('Humidity', 'Humidity (%)', 'green'),
    ('WindSpeed', 'Wind Speed (m/s)', 'orange'),
]

fig, ax = plt.subplots(figsize=(14, 8))
for column, series_label, line_color in weather_series:
    ax.plot(weather_data['Time'], weather_data[column], label=series_label, color=line_color)

# Title, axis labels, legend, grid
ax.set_title('Weather Trends Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.legend()
ax.grid(True)

# Render
fig.tight_layout()
plt.show()


In [None]:
# Rainfall Impact on Track Conditions
# Box plot of TrackTemp, with one box per distinct Rainfall value
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=weather_data, x='Rainfall', y='TrackTemp', palette='Set1', ax=ax)

# Title, axis labels and grid
ax.set_title('Track Temperature with and without Rainfall')
ax.set_xlabel('Rainfall')
ax.set_ylabel('Track Temperature (°C)')
ax.grid(True)

# Render the figure
fig.tight_layout()
plt.show()


In [None]:
# Correlation Between Weather Parameters

# Columns included in the pairwise correlation (WindDirection is not part of this selection)
corr_columns = ['AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindSpeed']
correlation_matrix = weather_data[corr_columns].corr()

# Annotated heatmap of the coefficients, shown to two decimals
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5, ax=ax)

# Title
ax.set_title('Correlation Heatmap of Weather Parameters')

# Render the figure
fig.tight_layout()
plt.show()


In [None]:
# Weather Conditions by Event

# (column, panel title, y-axis label, bar colour) for each subplot, in reading order.
# NOTE(review): wind speed is labelled m/s to agree with the "Weather Trends Over Time"
# plot; the previous 'km/h' label contradicted it. Confirm the unit against the data source.
weather_panels = [
    ('AirTemp', 'Average Air Temperature by Event', 'Air Temp (°C)', 'skyblue'),
    ('TrackTemp', 'Average Track Temperature by Event', 'Track Temp (°C)', 'salmon'),
    ('Humidity', 'Average Humidity by Event', 'Humidity (%)', 'lightgreen'),
    ('WindSpeed', 'Average Wind Speed by Event', 'Wind Speed (m/s)', 'gold'),
]
panel_columns = [column for column, *_ in weather_panels]

# Average only the plotted parameters for each event. Selecting the columns explicitly
# leaves out wind direction and any non-numeric column; a bare groupby(...).mean()
# raises TypeError on non-numeric columns with pandas >= 2.0.
avg_weather_by_event = (
    weather_data
    .groupby('EventName')[panel_columns]
    .mean()
    .reset_index()
)

# 2x2 grid: one bar chart per weather parameter
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (column, title, ylabel, colour) in zip(axes.flat, weather_panels):
    avg_weather_by_event.plot(x='EventName', y=column, kind='bar', ax=ax, color=colour)
    ax.set_title(title)
    ax.set_xlabel('Event')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

# Adjust layout and render
plt.tight_layout()
plt.show()


In [None]:
# The End 