# 02 Feature Engineering - Extract telemetry features

Goal: Build a pipeline that turns raw lap data into features I can feed into the Bayesian model.

Pipeline flow:
1. Single lap â†’ extract telemetry features (speed, throttle, braking, etc.)
2. Driver session â†’ aggregate all their laps
3. Full session â†’ calculate relative performance (who's fastest?)
4. Export â†’ ready for predictions

This will move to `src/` later as production code.

In [1]:
import fastf1
import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import warnings

warnings.filterwarnings('ignore')

import logging
logging.getLogger("fastf1").setLevel(logging.ERROR)

cache_dir = Path('../data/raw/.fastf1_cache')
fastf1.Cache.enable_cache(str(cache_dir))

print("Feature Engineering Pipeline")

Feature Engineering Pipeline


## Part 1: Single lap features

Core building block - extract features from one lap of telemetry.

In [2]:
class LapFeatureExtractor:
    """Extract telemetry features from a single F1 lap."""
    
    def __init__(self, corner_speed_thresholds=None):
        """
        Corner speed thresholds for classification.
        Default: slow <100, medium 100-200, high 200-250 km/h
        """
        if corner_speed_thresholds is None:
            self.corner_thresholds = {
                'slow': (0, 100),
                'medium': (100, 200),
                'high': (200, 250)
            }
        else:
            self.corner_thresholds = corner_speed_thresholds
    
    def extract_corner_speeds(self, telemetry):
        """Average speed in slow/medium/high-speed corners."""
        # Corners are anywhere under 250 km/h (arbitrary but works)
        corners = telemetry[telemetry['Speed'] < 250]
        
        speeds = {}
        for corner_type, (min_speed, max_speed) in self.corner_thresholds.items():
            mask = (corners['Speed'] >= min_speed) & (corners['Speed'] < max_speed)
            corner_data = corners[mask]
            
            if len(corner_data) > 0:
                speeds[f'{corner_type}_corner_speed'] = corner_data['Speed'].mean()
            else:
                speeds[f'{corner_type}_corner_speed'] = np.nan
        
        return speeds
    
    def extract_throttle_metrics(self, telemetry):
        """Throttle usage - percentage at full throttle, average, smoothness."""
        throttle = telemetry['Throttle']
        
        return {
            'pct_full_throttle': (throttle == 100).sum() / len(throttle) * 100,
            'avg_throttle': throttle.mean(),
            'throttle_smoothness': throttle.std()  # lower = smoother
        }
    
    def extract_braking_metrics(self, telemetry):
        """Braking zones and intensity."""
        brake = telemetry['Brake']
        
        # Count braking zones (transitions from 0 to >0)
        braking_points = ((brake > 0) & (brake.shift(1) == 0)).sum()
        
        return {
            'braking_pct': (brake > 0).sum() / len(brake) * 100,
            'braking_zones': braking_points,
            'avg_brake_intensity': brake[brake > 0].mean() if (brake > 0).any() else 0
        }
    
    def extract_straight_line_speed(self, telemetry):
        """Top speed and speed at full throttle."""
        full_throttle = telemetry[telemetry['Throttle'] == 100]
        
        # Max gear (usually 8th) indicates straight-line running
        max_gear = telemetry['nGear'].max()
        top_gear = telemetry[telemetry['nGear'] == max_gear]
        
        return {
            'avg_speed_full_throttle': full_throttle['Speed'].mean() if len(full_throttle) > 0 else np.nan,
            'max_speed': telemetry['Speed'].max(),
            'pct_at_max_gear': len(top_gear) / len(telemetry) * 100
        }
    
    def extract_drs_usage(self, telemetry):
        """How much DRS was available and used."""
        drs = telemetry['DRS']
        return {'drs_active_pct': (drs > 0).sum() / len(drs) * 100}
    
    def extract_features(self, lap) -> Dict[str, float]:
        """
        Extract all features from a lap.
        Returns dict of feature_name -> value.
        """
        try:
            telemetry = lap.get_telemetry()
            
            if telemetry is None or len(telemetry) == 0:
                return {}
            
            # Combine all feature extractors
            features = {}
            features.update(self.extract_corner_speeds(telemetry))
            features.update(self.extract_throttle_metrics(telemetry))
            features.update(self.extract_braking_metrics(telemetry))
            features.update(self.extract_straight_line_speed(telemetry))
            features.update(self.extract_drs_usage(telemetry))
            
            return features
            
        except Exception as e:
            # Sometimes telemetry fails to load
            return {}


# Quick test on a real lap
print("\nTest: Extract features from Verstappen's lap in 2024 testing")

session = fastf1.get_session(2024, 'Testing', 1)
session.load()

ver_laps = session.laps.pick_drivers('VER')
lap = ver_laps.iloc[len(ver_laps) // 2] if len(ver_laps) > 0 else ver_laps.iloc[0]

extractor = LapFeatureExtractor()
features = extractor.extract_features(lap)

print(f"Lap {lap['LapNumber']}: {lap['LapTime']}")
print(f"\nExtracted {len(features)} features:")
for k, v in list(features.items())[:5]:
    print(f"  {k}: {v:.1f}")

print("\nðŸŸ¢ Single lap extraction works")


Test: Extract features from Verstappen's lap in 2024 testing
Lap 12.0: 0 days 00:01:33.991000

Extracted 13 features:
  slow_corner_speed: 91.3
  medium_corner_speed: 147.0
  high_corner_speed: 225.4
  pct_full_throttle: 57.4
  avg_throttle: 69.7

ðŸŸ¢ Single lap extraction works


## Part 2: Aggregate driver session

One driver does ~60 laps in a session. I need to aggregate them into representative features.

In [3]:
class SessionFeatureAggregator:
    """Aggregate lap-level features into session-level features for a driver."""
    
    def __init__(self, lap_extractor):
        self.lap_extractor = lap_extractor
    
    def filter_clean_laps(self, laps):
        """
        Remove outliers and invalid laps.
        Keep laps where: in-lap, out-lap, yellow flags, accidents filtered out.
        """
        # Basic filters
        clean = laps[
            (laps['IsAccurate'] == True) &
            (laps['TrackStatus'] == '1')  # Green flag
        ].copy()
        
        # Remove statistical outliers (more than 3 std from median)
        if len(clean) > 5:
            lap_times = clean['LapTime'].dt.total_seconds()
            median = lap_times.median()
            std = lap_times.std()
            
            clean = clean[abs(lap_times - median) < 3 * std]
        
        return clean
    
    def extract_driver_session(self, laps) -> Dict[str, float]:
        """
        Extract features for one driver's session.
        Returns aggregated features (median across clean laps).
        """
        if len(laps) == 0:
            return {}
        
        # Basic info
        driver_info = {
            'driver_number': str(laps.iloc[0]['DriverNumber']),
            'driver_code': laps.iloc[0]['Driver'],
            'team': laps.iloc[0]['Team'],
            'total_laps': len(laps)
        }
        
        # Filter to clean laps only
        clean_laps = self.filter_clean_laps(laps)
        driver_info['clean_laps'] = len(clean_laps)
        
        if len(clean_laps) == 0:
            return driver_info
        
        # Fastest lap (key metric for practice sessions)
        fastest = clean_laps.pick_fastest()
        driver_info['fastest_lap'] = fastest['LapTime'].total_seconds()
        
        # Extract features from all clean laps
        lap_features = []
        for idx, lap in clean_laps.iterrows():
            features = self.lap_extractor.extract_features(lap)
            if features:  # Skip if telemetry failed
                lap_features.append(features)
        
        if len(lap_features) == 0:
            return driver_info
        
        # Aggregate: median across all laps (robust to outliers)
        df = pd.DataFrame(lap_features)
        aggregated = df.median().to_dict()
        
        # Also track consistency (std)
        for col in df.columns:
            aggregated[f'{col}_std'] = df[col].std()
        
        # Merge with driver info
        return {**driver_info, **aggregated}
    
    def extract_all_drivers(self, session) -> pd.DataFrame:
        """Extract features for all drivers in a session."""
        driver_features = []
        
        for driver in session.laps['Driver'].unique():
            driver_laps = session.laps.pick_drivers(driver)
            features = self.extract_driver_session(driver_laps)
            
            if features and 'fastest_lap' in features:
                driver_features.append(features)
        
        return pd.DataFrame(driver_features)


# Test on full session
print("\nTest: Extract features for all drivers in FP1")

aggregator = SessionFeatureAggregator(extractor)
session_features = aggregator.extract_all_drivers(session)

print(f"\nExtracted features for {len(session_features)} drivers")
print(f"Features per driver: {len(session_features.columns)}")
print(f"\nFastest 3:")
print(session_features.nsmallest(3, 'fastest_lap')[['driver_code', 'team', 'fastest_lap', 'clean_laps']])

print("\nðŸŸ¢ Session aggregation works")


Test: Extract features for all drivers in FP1

Extracted features for 20 drivers
Features per driver: 32

Fastest 3:
   driver_code             team  fastest_lap  clean_laps
16         SAI          Ferrari       93.602          14
5          LEC          Ferrari       93.623          16
0          VER  Red Bull Racing       93.855          11

ðŸŸ¢ Session aggregation works


## Part 3: Relative performance

Absolute lap times don't mean much (depends on track, conditions, etc.).
I need relative performance - how fast is this driver compared to the field?

In [4]:
class RelativePerformanceCalculator:
    """Convert absolute features to relative performance vs field."""
    
    def __init__(self, use_median=True):
        """
        use_median: If True, normalize to median (robust to outliers).
                   If False, normalize to mean.
        """
        self.use_median = use_median
    
    def normalize_features(self, features_df):
        """
        Add relative features: difference from field median/mean.
        Prefix: 'fastest_lap_rel', 'avg_throttle_rel', etc.
        """
        df = features_df.copy()
        
        # Identify numeric columns (skip metadata like driver_code)
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        
        for col in numeric_cols:
            if df[col].notna().sum() < 2:
                continue  # Skip if not enough data
            
            if self.use_median:
                baseline = df[col].median()
            else:
                baseline = df[col].mean()
            
            df[f'{col}_rel'] = df[col] - baseline
        
        return df
    
    def add_percentile_ranks(self, features_df):
        """
        Add percentile ranks for key features.
        Example: fastest_lap_pct = 95 means faster than 95% of field.
        """
        df = features_df.copy()
        
        # Lower is better for lap times
        if 'fastest_lap' in df.columns:
            df['fastest_lap_pct'] = df['fastest_lap'].rank(pct=True, ascending=True) * 100
        
        # Higher is better for speed metrics
        speed_cols = [col for col in df.columns if 'speed' in col.lower() and '_rel' not in col]
        for col in speed_cols:
            if col in df.columns:
                df[f'{col}_pct'] = df[col].rank(pct=True, ascending=False) * 100
        
        return df


# Test relative performance
print("\nTest: Calculate relative performance")

rel_calc = RelativePerformanceCalculator(use_median=True)
normalized = rel_calc.normalize_features(session_features)
with_ranks = rel_calc.add_percentile_ranks(normalized)

print(f"\nAdded relative features:")
rel_cols = [col for col in with_ranks.columns if '_rel' in col]
print(f"  {len(rel_cols)} relative columns")

print(f"\nTop 3 by fastest lap percentile:")
print(with_ranks.nlargest(3, 'fastest_lap_pct')[['driver_code', 'fastest_lap', 'fastest_lap_rel', 'fastest_lap_pct']])

print("\nðŸŸ¢ Relative performance works")


Test: Calculate relative performance

Added relative features:
  29 relative columns

Top 3 by fastest lap percentile:
   driver_code  fastest_lap  fastest_lap_rel  fastest_lap_pct
10         ZHO       97.219           2.8805            100.0
14         COL       95.248           0.9095             95.0
18         BOT       95.041           0.7025             90.0

ðŸŸ¢ Relative performance works


## Part 4: Production pipeline

Put it all together in one clean pipeline class.

In [5]:
class F1FeaturePipeline:
    """
    Complete feature extraction pipeline.
    
    Usage:
        pipeline = F1FeaturePipeline()
        features = pipeline.process_session(session)
    """
    
    def __init__(self):
        self.lap_extractor = LapFeatureExtractor()
        self.session_aggregator = SessionFeatureAggregator(self.lap_extractor)
        self.rel_calculator = RelativePerformanceCalculator(use_median=True)
    
    def process_session(self, session, add_metadata=True):
        """
        Complete pipeline: Session â†’ Features with relative performance.
        
        Returns DataFrame with one row per driver.
        """
        # Step 1: Extract raw features
        features = self.session_aggregator.extract_all_drivers(session)
        
        if len(features) == 0:
            return pd.DataFrame()
        
        # Step 2: Calculate relative performance
        normalized = self.rel_calculator.normalize_features(features)
        with_ranks = self.rel_calculator.add_percentile_ranks(normalized)
        
        # Step 3: Add metadata
        if add_metadata:
            with_ranks['year'] = session.event['EventDate'].year
            with_ranks['event'] = session.event['EventName']
            with_ranks['session_type'] = session.name
            with_ranks['session_date'] = session.date
        
        return with_ranks
    
    def process_multiple_sessions(self, sessions, verbose=True):
        """Process multiple sessions and combine."""
        all_features = []
        
        for i, session in enumerate(sessions):
            if verbose:
                print(f"Processing {i+1}/{len(sessions)}: {session.event['EventName']} - {session.name}")
            
            features = self.process_session(session)
            if len(features) > 0:
                all_features.append(features)
        
        if len(all_features) == 0:
            return pd.DataFrame()
        
        combined = pd.concat(all_features, ignore_index=True)
        
        if verbose:
            print(f"\nðŸŸ¢ Processed {len(all_features)} sessions")
            print(f"  {len(combined)} total rows, {combined['driver_number'].nunique()} drivers")
        
        return combined


# Test on 2024 testing (3 days)
print("\nTest: Process all 2024 testing sessions")

pipeline = F1FeaturePipeline()

testing_sessions = []
for day in range(1, 4):
    s = fastf1.get_session(2024, 'Testing', day)
    s.load()
    testing_sessions.append(s)

all_features = pipeline.process_multiple_sessions(testing_sessions)

print(f"\nDataset shape: {all_features.shape}")
print(f"Feature completeness: {all_features.notna().mean().mean() * 100:.1f}%")

print("\nðŸŸ¢ Production pipeline works")


Test: Process all 2024 testing sessions
Processing 1/3: United States Grand Prix - Practice 1
Processing 2/3: United States Grand Prix - Sprint Qualifying
Processing 3/3: United States Grand Prix - Sprint

ðŸŸ¢ Processed 3 sessions
  60 total rows, 20 drivers

Dataset shape: (60, 76)
Feature completeness: 100.0%

ðŸŸ¢ Production pipeline works


## Export features

Save to parquet for fast loading in Bayesian validation notebook.

In [6]:
output_dir = Path('../data/processed/testing_files')
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / '2024_testing_features.parquet'
all_features.to_parquet(output_file, index=False)

print(f"ðŸŸ¢ Saved to {output_file}")
print(f"  Shape: {all_features.shape}")
print(f"  Size: {output_file.stat().st_size / 1024:.1f} KB")

ðŸŸ¢ Saved to ../data/processed/testing_files/2024_testing_features.parquet
  Shape: (60, 76)
  Size: 73.6 KB
