In [1]:
import ast
import json
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
def load_and_combine_data(years_range=(2018, 2024), data_dir='./data'):
    """Load the per-season F1 CSV files and concatenate them into one DataFrame.

    For every year in the inclusive range ``years_range[0]..years_range[1]`` the
    file ``<data_dir>/f1_data_<year>.csv`` is read if it exists. Missing files and
    files that fail to load are reported on stdout and skipped, so one bad season
    does not prevent the remaining seasons from being combined.

    Args:
        years_range: ``(first_year, last_year)``, both inclusive.
        data_dir: Directory containing the ``f1_data_<year>.csv`` files.

    Returns:
        A DataFrame with the rows of every loaded file, re-indexed from 0.

    Raises:
        ValueError: If not a single file could be loaded.
    """
    data_dir = Path(data_dir)
    dfs = []

    for year in range(years_range[0], years_range[1] + 1):
        filepath = data_dir / f'f1_data_{year}.csv'
        # Deliberately broad: any failure for one season is reported and the
        # loop moves on to the next season.
        try:
            if not filepath.exists():
                # Make gaps in the requested range visible instead of silent.
                print(f"No data file found for {year} (expected {filepath})")
                continue
            dfs.append(pd.read_csv(filepath))
            print(f"Loaded data for {year}")
        except Exception as e:
            print(f"Error loading data for {year}: {e}")
            continue

    if not dfs:
        raise ValueError("No data files found")

    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"\nCombined dataset shape: {combined_df.shape}")
    return combined_df

In [3]:
def process_for_prediction(df):
    """Clean the combined race data and build the feature and target frames.

    The input frame is left untouched; all work happens on a copy:

    1. Rows containing any null value are dropped.
    2. ``PitStopLaps`` and ``TyreCompounds`` cells are parsed into lists and
       trimmed to ``NumPitStops`` and ``NumPitStops + 1`` entries respectively
       ('UNKNOWN' compounds are removed first). Length mismatches are reported
       on stdout.
    3. ``Q1Time`` is cast to float and ``Laps`` to int.
    4. ``Season``, ``Circuit``, ``Team`` and ``Driver`` are label-encoded into
       ``<column>_encoded`` columns.
    5. Every tyre compound is mapped to its position in the sorted list of
       distinct compounds, producing ``TyreCompounds_encoded``.
    6. ``ClassificationResult`` is negated and used as the target.

    Args:
        df: DataFrame containing at least the columns listed in
            ``feature_columns`` below (before encoding) plus
            ``TyreCompounds`` and ``ClassificationResult``.

    Returns:
        Tuple ``(X, y, numerical_features, categorical_features,
        target_columns, mappings)`` where ``X``/``y`` are DataFrames, the three
        lists hold column names, and ``mappings`` maps a column name to its
        ``{label: code}`` dictionary (JSON-serialisable).

    Raises:
        ValueError: If no rows remain after dropping nulls, or if a list cell
            is a string that is not a plain Python literal.
    """

    # Print initial data shape
    print(f"\nInitial data shape: {df.shape}")

    # Work on a copy with every incomplete row removed
    df_clean = df.copy().dropna()

    if df_clean.empty:
        raise ValueError("No rows left after removing rows with missing values")

    def parse_list_cell(value):
        """Turn a stringified list read from CSV into a real list."""
        if not isinstance(value, str):
            return value
        # literal_eval only accepts Python literals, so a crafted cell in the
        # CSV cannot execute code the way eval() would.
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError) as exc:
            raise ValueError(f"Cannot parse list value {value!r}") from exc

    # Process PitStopLaps to ensure it matches number of pit stops
    def process_pit_stop_laps(num_pit_stops, pit_stop_laps):
        num_pit_stops = int(num_pit_stops)
        pit_stop_laps = parse_list_cell(pit_stop_laps)

        # Ensure the number of pit stop laps matches the number of pit stops
        if len(pit_stop_laps) != num_pit_stops:
            print(f"Warning: Pit stops ({num_pit_stops}) and pit stop laps ({len(pit_stop_laps)}) do not match")

        return pit_stop_laps[:num_pit_stops]

    # Process TyreCompounds to ensure it matches number of pit stops + 1 (starting tyre)
    def process_tyre_compounds_with_pit_stops(num_pit_stops, compounds_list):
        num_pit_stops = int(num_pit_stops)
        compounds_list = parse_list_cell(compounds_list)

        # Remove 'UNKNOWN' compounds
        cleaned_compounds = [comp for comp in compounds_list if comp != 'UNKNOWN']

        # Ensure the number of compounds is number of pit stops + 1 (starting tyre)
        if len(cleaned_compounds) != num_pit_stops + 1:
            print(f"Warning: Pit stops + 1 ({num_pit_stops + 1}) and tyre compounds ({len(cleaned_compounds)}) do not match")

        return cleaned_compounds[:num_pit_stops + 1]

    def as_list_column(values):
        """Wrap a list of lists in an object Series aligned with df_clean."""
        # dtype=object keeps each list as a single cell.
        return pd.Series(values, index=df_clean.index, dtype=object)

    # Apply processing to PitStopLaps and TyreCompounds. Iterating the two
    # columns directly avoids a row-wise DataFrame.apply.
    pit_stop_counts = df_clean['NumPitStops']
    df_clean['PitStopLaps'] = as_list_column([
        process_pit_stop_laps(count, laps)
        for count, laps in zip(pit_stop_counts, df_clean['PitStopLaps'])
    ])
    df_clean['TyreCompounds'] = as_list_column([
        process_tyre_compounds_with_pit_stops(count, compounds)
        for count, compounds in zip(pit_stop_counts, df_clean['TyreCompounds'])
    ])

    # Convert Q1Time to float if it's not already
    df_clean['Q1Time'] = df_clean['Q1Time'].astype(float)

    # Convert Laps to integer
    df_clean['Laps'] = df_clean['Laps'].astype(int)

    # Create mappings for categorical variables
    mappings = {}

    # Handle categorical variables including Season
    categorical_columns = ['Season', 'Circuit', 'Team', 'Driver']
    for col in categorical_columns:
        # Nulls were already removed above, so every value is a valid label.
        le = LabelEncoder()
        le.fit(df_clean[col].unique())

        df_clean[f'{col}_encoded'] = le.transform(df_clean[col])

        # The code of a label is its position in classes_. Convert numpy types
        # to native Python types for JSON serialization.
        mappings[col] = {str(label): int(code) for code, label in enumerate(le.classes_)}

    # Create tyre compound mapping: sorted distinct compounds -> 0..n-1
    unique_compounds = sorted({
        compound
        for compounds in df_clean['TyreCompounds']
        for compound in compounds
    })
    tyre_mapping = {compound: idx for idx, compound in enumerate(unique_compounds)}
    mappings['TyreCompounds'] = tyre_mapping

    # Convert TyreCompounds to encoded lists
    df_clean['TyreCompounds_encoded'] = as_list_column([
        [tyre_mapping[compound] for compound in compounds]
        for compounds in df_clean['TyreCompounds']
    ])

    # Select features for prediction
    feature_columns = [
        'Season_encoded', 'Round', 'Circuit_encoded',
        'Laps', 'NumParticipants', 'AirTemp',
        'Humidity', 'Pressure', 'TrackTemp',
        'WindDirection', 'WindSpeed', 'Team_encoded',
        'Driver_encoded', 'Q1Time', 'GridPosition',
        'NumPitStops', 'PitStopLaps', 'TyreCompounds_encoded'  # Encoded tyre compounds
    ]

    # Split features into numerical and categorical
    numerical_features = [
        'Round', 'GridPosition',
        'NumParticipants', 'NumPitStops',
        'AirTemp', 'Humidity', 'Pressure',
        'TrackTemp', 'WindDirection', 'WindSpeed', 'Q1Time', 'Laps'
    ]

    categorical_features = [
        'Season_encoded', 'Circuit_encoded', 'Team_encoded', 'Driver_encoded',
        'TyreCompounds_encoded', 'PitStopLaps'  # Encoded lists as categorical features
    ]

    # Create target variables.
    # NOTE(review): the sign flip presumably makes a better (numerically lower)
    # classification a larger target value - confirm against the model code.
    target_columns = ['ClassificationResult']
    df_clean['ClassificationResult'] = -1 * df_clean['ClassificationResult']

    # Create the feature matrix and target variables
    X = df_clean[feature_columns].copy()
    y = df_clean[target_columns].copy()

    return X, y, numerical_features, categorical_features, target_columns, mappings

In [4]:
def save_mappings(mappings, year_range):
    """Write each feature mapping to its own JSON file in ./data/mappings.

    Files are named ``<feature>_mapping_<first>_<last>.json`` using the two
    entries of ``year_range``; the directory is created when missing. Progress
    is reported on stdout before and after each file is written.
    """
    out_dir = Path('./data/mappings')
    out_dir.mkdir(parents=True, exist_ok=True)

    for feature in mappings:
        mapping = mappings[feature]
        target = out_dir / f'{feature}_mapping_{year_range[0]}_{year_range[1]}.json'
        print(f"Saving mapping for {feature} with {len(mapping)} items")
        with target.open('w') as handle:
            # Sorted keys keep the files stable between runs.
            json.dump(mapping, handle, indent=4, sort_keys=True)
        print(f"Saved mapping for {feature} to {target}")

def analyze_data(X, y, numerical_features, categorical_features, target_columns):
    """Print summary statistics for the processed data and save them to a file.

    The per-feature and per-target lines are built once and reused for both
    the console output and ``./data/prediction_data_analysis.txt`` (the two
    differ only in their headers). The ``./data`` directory is created if it
    does not exist yet.

    Args:
        X: Feature DataFrame.
        y: Target DataFrame.
        numerical_features: Columns of ``X`` summarised by mean/std/min/max.
        categorical_features: Columns of ``X`` summarised by distinct-value
            counts; 'TyreCompounds_encoded' and 'PitStopLaps' are treated as
            columns of lists.
        target_columns: Columns of ``y`` summarised by mean/std/min/max.
    """
    list_features = ('TyreCompounds_encoded', 'PitStopLaps')

    def stat_lines(series, indent=''):
        """Mean/Std/Min/Max lines for one numeric column."""
        return [
            f"{indent}Mean: {series.mean():.2f}",
            f"{indent}Std: {series.std():.2f}",
            f"{indent}Min: {series.min():.2f}",
            f"{indent}Max: {series.max():.2f}",
        ]

    # Feature section shared by console and file; empty strings are blank lines.
    feature_lines = ["", "Numerical Features:"]
    for col in numerical_features:
        feature_lines.append(f"- {col}")
        feature_lines.extend(stat_lines(X[col], indent="  "))

    feature_lines += ["", "Categorical Features:"]
    for col in categorical_features:
        feature_lines.append(f"- {col}")
        if col in list_features:
            # Lists are unhashable, so compare them as tuples.
            feature_lines.append(f"  Unique compound/lap combinations: {len(set(tuple(x) for x in X[col]))}")
            feature_lines.append(f"  Average list length: {X[col].apply(len).mean():.2f}")
        else:
            feature_lines.append(f"  Unique values: {X[col].nunique()}")
    feature_lines.append("")

    # Target section shared by console and file.
    target_lines = []
    for col in target_columns:
        target_lines.append(f"{col}:")
        target_lines.extend(stat_lines(y[col]))

    console_lines = [
        "",
        "Feature Statistics:",
        "------------------",
        f"Number of numerical features: {len(numerical_features)}",
        f"Number of categorical features: {len(categorical_features)}",
        *feature_lines,
        "Target Statistics:",
        "-----------------",
        *target_lines,
    ]
    print("\n".join(console_lines))

    report_lines = [
        "F1 Race Prediction Data Analysis",
        "===============================",
        "",
        f"Total samples: {len(X)}",
        f"Numerical features: {len(numerical_features)}",
        f"Categorical features: {len(categorical_features)}",
        *feature_lines,
        "Target Variables:",
        *target_lines,
    ]

    # Save analysis
    report_path = Path('./data/prediction_data_analysis.txt')
    report_path.parent.mkdir(parents=True, exist_ok=True)
    with open(report_path, 'w') as f:
        f.write("\n".join(report_lines) + "\n")

In [5]:
# Seasons to process (inclusive); also used in the mapping file names.
year_range = (2018, 2024)

# Load and combine data
print("Loading and combining data...")
combined_df = load_and_combine_data(year_range)

# Process data for prediction
print("\nProcessing data for prediction...")
X, y, numerical_features, categorical_features, target_columns, mappings = process_for_prediction(combined_df)

# Save feature mappings
print("\nSaving feature mappings...")
save_mappings(mappings, year_range)

# Analyze processed data
print("\nAnalyzing processed data...")
analyze_data(X, y, numerical_features, categorical_features, target_columns)

# Save processed data.
# NOTE(review): X contains list-valued columns (PitStopLaps,
# TyreCompounds_encoded); CSV stores them as text, so they will likely need
# parsing again when the file is reloaded - confirm in the consuming notebook.
print("\nSaving processed data...")
data_dir = Path('./data')
X.to_csv(data_dir / 'processed_features.csv', index=False)
y.to_csv(data_dir / 'processed_targets.csv', index=False)

# Save feature lists for later use (json is imported in the first cell)
feature_info = {
    'numerical_features': numerical_features,
    'categorical_features': categorical_features,
    'target_columns': target_columns
}
with open(data_dir / 'feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=4)

print("\nData processing complete!")

Loading and combining data...
Loaded data for 2018
Loaded data for 2019
Loaded data for 2020
Loaded data for 2021
Loaded data for 2022
Loaded data for 2023
Loaded data for 2024

Combined dataset shape: (2424, 22)

Processing data for prediction...

Initial data shape: (2424, 22)

Saving feature mappings...
Saving mapping for Season with 7 items
Saved mapping for Season to data\mappings\Season_mapping_2018_2024.json
Saving mapping for Circuit with 34 items
Saved mapping for Circuit to data\mappings\Circuit_mapping_2018_2024.json
Saving mapping for Team with 18 items
Saved mapping for Team to data\mappings\Team_mapping_2018_2024.json
Saving mapping for Driver with 40 items
Saved mapping for Driver to data\mappings\Driver_mapping_2018_2024.json
Saving mapping for TyreCompounds with 8 items
Saved mapping for TyreCompounds to data\mappings\TyreCompounds_mapping_2018_2024.json

Analyzing processed data...

Feature Statistics:
------------------
Number of numerical features: 12
Number of cate