# Step 1: Data Preparation

**Objective:**  
Export historic race session data from FastF1 API for use as the streaming source.

**Instructions:**
- Extract relevant fields (Speed, LapNumber, Timestamp, DriverID, Compound, etc.) from a FastF1 session
- Save as CSV, Parquet, or JSON file
- Validate: File loads without errors, data columns are as expected
- Calculate data frequency from timestamps


In [1]:
# Import required libraries
import sys
import os
import importlib
import pandas as pd
import numpy as np
from datetime import datetime

# Add project root directory to path
# Get current working directory
cwd = os.getcwd()
# If we're in notebooks/, go up one level to get project root
if os.path.basename(cwd) == 'notebooks':
    project_root = os.path.dirname(cwd)
else:
    # Otherwise, assume we're already in project root or need to navigate
    project_root = cwd
    # If src doesn't exist here, try going up from notebooks
    if not os.path.exists(os.path.join(project_root, 'src')):
        # Try to find project root by looking for src directory
        current = cwd
        while current != os.path.dirname(current):
            if os.path.exists(os.path.join(current, 'src')):
                project_root = current
                break
            current = os.path.dirname(current)

# Add project root to Python path
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import our modules
from src.data_preparation import (
    load_fastf1_session,
    extract_telemetry_fields,
    export_to_csv,
    export_to_parquet,
    prepare_data
)
from src.utils import calculate_frequency, validate_data, get_telemetry_fields

# Reload modules to pick up any changes (useful during development)
import src.utils
import src.data_preparation
importlib.reload(src.utils)
importlib.reload(src.data_preparation)

# Re-import after reload
from src.utils import calculate_frequency, validate_data, get_telemetry_fields
from src.data_preparation import (
    load_fastf1_session,
    extract_telemetry_fields,
    export_to_csv,
    export_to_parquet,
    prepare_data
)

print("✅ Imports successful")


✅ Imports successful




In [2]:
# Load configuration
import yaml

config_path = "../config/config.yaml"
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

race_config = config['race']
print(f"Race Configuration:")
print(f"  Year: {race_config['year']}")
print(f"  Grand Prix: {race_config['grand_prix']}")
print(f"  Session: {race_config['session']}")


Race Configuration:
  Year: 2023
  Grand Prix: Monaco
  Session: Race


In [3]:
# Load FastF1 session data
year = race_config['year']
grand_prix = race_config['grand_prix']
session = race_config['session']

print(f"Loading {year} {grand_prix} {session}...")
session_obj = load_fastf1_session(year, grand_prix, session, cache_dir="../../OLD_version/f1_cache")

print(f"✅ Session loaded successfully")
print(f"Drivers: {session_obj.results['Abbreviation'].tolist()}")


INFO:src.data_preparation:Loading 2023 Monaco Race...


Loading 2023 Monaco Race...


core           INFO 	Loading data for Monaco Grand Prix - Race [v3.6.1]
INFO:fastf1.fastf1.core:Loading data for Monaco Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
INFO:fastf1.fastf1.req:Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO:fastf1.fastf1.req:Using cached data for driver_info
DEBUG:fastf1.ergast:Failed to parse timestamp '-1:53:44.819' in Ergastresponse.
req            INFO 	Using cached data for session_status_data
INFO:fastf1.fastf1.req:Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
INFO:fastf1.fastf1.req:Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
INFO:fastf1.fastf1.req:Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO:fastf1.fastf1.req:Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_d

✅ Session loaded successfully
Drivers: ['VER', 'ALO', 'OCO', 'HAM', 'RUS', 'LEC', 'GAS', 'SAI', 'NOR', 'PIA', 'BOT', 'DEV', 'ZHO', 'ALB', 'TSU', 'PER', 'HUL', 'SAR', 'MAG', 'STR']


In [4]:
# Extract telemetry fields
print("Extracting telemetry fields...")
df = extract_telemetry_fields(session_obj)

print(f"\n✅ Data extraction complete")
print(f"Total records: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"\nFirst few rows:")
df.head()


INFO:src.data_preparation:Extracting telemetry for 20 drivers


Extracting telemetry fields...


  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
INFO:src.data_preparation:Extracted 49781 data points for driver VER
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, field] = value
  telemetry.loc[lap_mask, fie


✅ Data extraction complete
Total records: 982330
Columns: ['Date', 'SessionTime', 'DriverAhead', 'DistanceToDriverAhead', 'Time', 'RPM', 'Speed', 'nGear', 'Throttle', 'Brake', 'DRS', 'Source', 'Distance', 'RelativeDistance', 'Status_x', 'X', 'Y', 'Z', 'DriverID', 'Driver', 'LapNumber', 'LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'LapStartTime', 'LapStartDate', 'PitOutTime', 'PitInTime', 'IsPersonalBest', 'Compound', 'TyreLife', 'FreshTyre', 'Stint', 'TrackStatus', 'Position_x', 'Deleted', 'DeletedReason', 'LapTime_seconds', 'Sector2Time_seconds', 'Sector3Time_seconds', 'Sector2SessionTime_seconds', 'Sector3SessionTime_seconds', 'LapStartTime_seconds', 'Time_seconds', 'Sector1Time_seconds', 'Sector1SessionTime_seconds', 'PitInTime_seconds', 'PitOutTime_seconds', 'SessionTime_seconds', 'DriverNumber', 'FullName', 'FirstName', 'LastName', 'TeamName', 'GridPosition', 'Position_y', 'Points', 'Status_y', 'Q1', 'Q2

Unnamed: 0,Date,SessionTime,DriverAhead,DistanceToDriverAhead,Time,RPM,Speed,nGear,Throttle,Brake,...,FirstName,LastName,TeamName,GridPosition,Position_y,Points,Status_y,Q1,Q2,Q3
0,2023-05-28 13:03:03.927,0 days 01:02:02.950000,,0.066667,0 days 01:03:27.435000,10058.716787,0.0,1,16.0,True,...,Max,Verstappen,Red Bull Racing,1.0,1.0,25.0,Finished,NaT,NaT,NaT
1,2023-05-28 13:03:03.927,0 days 01:02:02.950000,,0.0,0 days 01:03:28.724000,9551.783309,0.0,1,25.0,False,...,Fernando,Alonso,Aston Martin,2.0,2.0,18.0,Finished,NaT,NaT,NaT
2,2023-05-28 13:03:03.927,0 days 01:02:02.950000,,0.0,0 days 01:03:32.305000,10395.508211,0.0,1,38.0,True,...,Charles,Leclerc,Ferrari,6.0,6.0,8.0,Finished,NaT,NaT,NaT
3,2023-05-28 13:03:03.927,0 days 01:02:02.950000,,0.0,0 days 01:03:30.587000,10236.883379,0.0,1,30.0,False,...,Carlos,Sainz,Ferrari,4.0,8.0,4.0,Finished,NaT,NaT,NaT
4,2023-05-28 13:03:03.927,0 days 01:02:02.950000,,0.0,0 days 01:04:03.289000,11255.425011,0.0,1,24.0,False,...,Nico,Hulkenberg,Haas F1 Team,18.0,17.0,0.0,Lapped,NaT,NaT,NaT


In [5]:
# Calculate data frequency from timestamps
if 'SessionTime' in df.columns:
    frequency = calculate_frequency(df['SessionTime'])
    print(f"✅ Data frequency calculated: {frequency:.2f} Hz (avg interval: {1000/frequency:.2f} ms)")
else:
    print("⚠️ Warning: SessionTime column not found, cannot calculate frequency")


INFO:src.utils:Calculated frequency: 7.62 Hz (avg interval: 131.18 ms)


✅ Data frequency calculated: 7.62 Hz (avg interval: 131.18 ms)


In [6]:
# Validate data
required_fields = ['SessionTime', 'DriverID', 'Speed']
is_valid, error_msg = validate_data(df, required_fields)

if is_valid:
    print("✅ Data validation passed")
    print(f"Required fields present: {required_fields}")
else:
    print(f"❌ Data validation failed: {error_msg}")


✅ Data validation passed
Required fields present: ['SessionTime', 'DriverID', 'Speed']


In [7]:
# Export to CSV (Parquet is optional - CSV is sufficient for the pipeline)
data_dir = "../data"
os.makedirs(data_dir, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base_filename = f"{year}_{grand_prix}_{session}_{timestamp}"

# Export to CSV (required)
csv_path = os.path.join(data_dir, f"{base_filename}.csv")
if export_to_csv(df, csv_path):
    print(f"✅ CSV exported: {csv_path}")
else:
    print("❌ Failed to export CSV")

# Export to Parquet (optional - skip if it fails)
parquet_path = os.path.join(data_dir, f"{base_filename}.parquet")
parquet_success = False
try:
    if export_to_parquet(df, parquet_path):
        print(f"✅ Parquet exported: {parquet_path}")
        parquet_success = True
    else:
        print("⚠️ Parquet export failed (optional - CSV is sufficient)")
except Exception as e:
    print(f"⚠️ Parquet export skipped due to error: {e}")
    print("   Note: CSV file is sufficient for the pipeline")

# Display summary
print(f"\n📊 Data Summary:")
print(f"  Total records: {len(df)}")
print(f"  Drivers: {df['DriverID'].unique().tolist() if 'DriverID' in df.columns else 'N/A'}")
print(f"  Frequency: {frequency:.2f} Hz" if 'frequency' in locals() else "  Frequency: N/A")
print(f"  CSV file: {csv_path} ✅")
if parquet_success:
    print(f"  Parquet file: {parquet_path} ✅")
else:
    print(f"  Parquet file: Skipped (optional)")


INFO:src.data_preparation:Exported 982330 rows, 65 columns to CSV: ../data/2023_Monaco_Race_20251119_182219.csv


✅ CSV exported: ../data/2023_Monaco_Race_20251119_182219.csv


INFO:src.data_preparation:Exported 982330 rows to Parquet: ../data/2023_Monaco_Race_20251119_182219.parquet


✅ Parquet exported: ../data/2023_Monaco_Race_20251119_182219.parquet

📊 Data Summary:
  Total records: 982330
  Drivers: ['VER', 'ALO', 'LEC', 'SAI', 'HUL', 'STR', 'TSU', 'HAM', 'DEV', 'MAG', 'ZHO', 'BOT', 'PER', 'OCO', 'ALB', 'PIA', 'SAR', 'GAS', 'NOR', 'RUS']
  Frequency: 7.62 Hz
  CSV file: ../data/2023_Monaco_Race_20251119_182219.csv ✅
  Parquet file: ../data/2023_Monaco_Race_20251119_182219.parquet ✅


In [8]:
# Validate exported files can be loaded
print("Validating exported files...")

# Load CSV (required)
df_csv = pd.read_csv(csv_path)
print(f"✅ CSV loaded: {len(df_csv)} rows, {len(df_csv.columns)} columns")

# Verify CSV data integrity
if len(df_csv) == len(df):
    print("✅ CSV data integrity verified: Row count matches original data")
else:
    print(f"⚠️ Warning: Row count mismatch (CSV: {len(df_csv)}, Original: {len(df)})")

# Try to load Parquet if it exists (optional validation)
parquet_path_exists = os.path.exists(parquet_path) and os.path.getsize(parquet_path) > 0
if parquet_path_exists:
    try:
        df_parquet = pd.read_parquet(parquet_path, engine='pyarrow')
        print(f"✅ Parquet loaded: {len(df_parquet)} rows, {len(df_parquet.columns)} columns")
        
        # Verify Parquet data integrity
        if len(df_parquet) == len(df):
            print("✅ Parquet data integrity verified: Row count matches original data")
        else:
            print(f"⚠️ Warning: Parquet row count mismatch (Parquet: {len(df_parquet)}, Original: {len(df)})")
    except Exception as e:
        print(f"⚠️ Parquet file exists but could not be loaded: {e}")
        print("   Note: This is optional - CSV file is sufficient for the pipeline")
else:
    print("ℹ️  Parquet file not available (optional - CSV is sufficient)")

print("\n✅ Step 1 Complete: Data preparation successful!")
print("   CSV file is ready for use in subsequent steps (cache setup, Kafka producer, etc.)")


Validating exported files...


  df_csv = pd.read_csv(csv_path)


✅ CSV loaded: 982330 rows, 65 columns
✅ CSV data integrity verified: Row count matches original data
✅ Parquet loaded: 982330 rows, 65 columns
✅ Parquet data integrity verified: Row count matches original data

✅ Step 1 Complete: Data preparation successful!
   CSV file is ready for use in subsequent steps (cache setup, Kafka producer, etc.)
