## Imports

In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

## Config

In [None]:
# Directory holding the raw Race 1 input files (relative to the notebook).
RAW_BASE = "../datasets/circuit-of-the-americas/COTA/Race 1"

# Absolute output directory; created up front so later cells can write into it.
OUTPUT_ROOT = os.path.abspath("../analysis_by_driver_vec")
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Full paths of every input file, all located directly under RAW_BASE.
(
    ANALYSIS_FILE,
    WEATHER_FILE,
    TELEMETRY_FILE,
    LAP_START_FILE,
    LAP_END_FILE,
    LAP_TIME_FILE,
) = (
    os.path.join(RAW_BASE, file_name)
    for file_name in (
        "23_AnalysisEnduranceWithSections_Race 1_Anonymized.csv",
        "26_Weather_Race 1_Anonymized.csv",
        "R1_cota_telemetry_data.csv",
        "COTA_lap_start_time_R1.csv",
        "COTA_lap_end_time_R1.csv",
        "COTA_lap_time_R1.csv",
    )
)

## Utility

In [None]:
def time_to_seconds(t):
    """Convert a time string to seconds (float).

    Accepted layouts are plain seconds ('SS' / 'SS.mmm'), 'M:SS.mmm' / 'MM:SS'
    and 'H:MM:SS.mmm'. A comma used as the decimal separator is treated as a
    dot, and surrounding whitespace is ignored.

    Args:
        t: The value to convert; anything other than a missing value is passed
            through ``str()`` first.

    Returns:
        The duration in seconds, or NaN when the input is missing, blank,
        contains a non-numeric field, or has more than three ':'-separated
        fields.
    """
    if pd.isna(t):
        return np.nan
    # Normalise comma decimal separators before parsing.
    s = str(t).strip().replace(',', '.')
    if not s:
        return np.nan
    try:
        fields = [float(part) for part in s.split(':')]
    except ValueError:
        # At least one field is not a number.
        return np.nan
    if len(fields) == 1:
        return fields[0]
    if len(fields) == 2:
        m, sec = fields
        return m * 60.0 + sec
    if len(fields) == 3:
        h, m, sec = fields
        return h * 3600.0 + m * 60.0 + sec
    # More than three fields is not a supported layout; return NaN explicitly
    # instead of falling off the end of the function and returning None.
    return np.nan

def clean_column_names(df):
    """Return a copy of ``df`` whose column labels have surrounding whitespace removed.

    The input frame is left untouched.
    """
    cleaned = df.copy()
    stripped_labels = []
    for label in cleaned.columns:
        stripped_labels.append(label.strip())
    cleaned.columns = stripped_labels
    return cleaned

## Cleaning Time

In [None]:
# ---------------------- 1) Load & clean "analysis with sections" ----------------------
# Everything is read as text first (dtype=str); numeric and time columns are
# converted explicitly below so bad cells become NaN instead of raising.
print("Loading analysis file:", ANALYSIS_FILE)
analysis_df = clean_column_names(pd.read_csv(ANALYSIS_FILE, sep=';', dtype=str))
print("Raw columns:", analysis_df.columns.tolist())

# Columns without a single value carry no information.
analysis_df = analysis_df.dropna(how='all', axis=1)

# Columns we do not want in the Excel-friendly output.
unwanted_cols = [c for c in ('CLASS', 'GROUP', 'MANUFACTURER') if c in analysis_df.columns]
for unwanted in unwanted_cols:
    analysis_df = analysis_df.drop(columns=[unwanted])

# Plain numeric columns.
numeric_cols = [
    c for c in ('NUMBER', 'DRIVER_NUMBER', 'LAP_NUMBER', 'KPH', 'TOP_SPEED', 'PIT_TIME')
    if c in analysis_df.columns
]
for numeric_col in numeric_cols:
    analysis_df[numeric_col] = pd.to_numeric(analysis_df[numeric_col], errors='coerce')

# Textual lap/sector times get a parallel "<name>_SEC" column in seconds ...
for text_col in ('LAP_TIME', 'S1', 'S2', 'S3'):
    if text_col not in analysis_df.columns:
        continue
    analysis_df[text_col + '_SEC'] = analysis_df[text_col].apply(time_to_seconds)

# ... while explicit "<sector>_SECONDS" columns are simply made numeric.
for seconds_col in ('S1_SECONDS', 'S2_SECONDS', 'S3_SECONDS'):
    if seconds_col not in analysis_df.columns:
        continue
    analysis_df[seconds_col] = pd.to_numeric(analysis_df[seconds_col], errors='coerce')

# When only the explicit seconds column exists for a sector, use it to fill
# the "<sector>_SEC" column that the rest of the notebook reads.
for sector in ('S1', 'S2', 'S3'):
    explicit_col = sector + '_SECONDS'
    derived_col = sector + '_SEC'
    has_explicit = explicit_col in analysis_df.columns
    has_derived = derived_col in analysis_df.columns
    if has_explicit and not has_derived:
        analysis_df[derived_col] = pd.to_numeric(analysis_df[explicit_col], errors='coerce')

# Flag values are compared as stripped strings later on.
if 'FLAG_AT_FL' in analysis_df.columns:
    flag_text = analysis_df['FLAG_AT_FL'].astype(str)
    analysis_df['FLAG_AT_FL'] = flag_text.str.strip()

# Fallback: derive LAP_TIME_SEC from LAP_TIME if it is still missing.
if 'LAP_TIME' in analysis_df.columns and 'LAP_TIME_SEC' not in analysis_df.columns:
    analysis_df['LAP_TIME_SEC'] = analysis_df['LAP_TIME'].apply(time_to_seconds)

print("Loaded analysis rows:", len(analysis_df))

# ---------------------- 2) Filter out unwanted laps ----------------------
# Remove laps whose finish-line flag is FCY, SC or YELLOW, laps that crossed the
# finish line in the pit, and laps with an implausible lap time.
bad_flags = {'FCY', 'SC', 'YELLOW'}
if 'FLAG_AT_FL' in analysis_df.columns:
    before = len(analysis_df)
    # .copy() turns the filtered result into an independent frame, so the column
    # assignments further down do not write into a slice of the previous frame
    # (which raises SettingWithCopyWarning and may silently not stick).
    analysis_df = analysis_df[~analysis_df['FLAG_AT_FL'].isin(bad_flags)].copy()
    print(f"Filtered by flag: {before} -> {len(analysis_df)}")

# Proper pit filtering: 'B' marks a pit entry lap, '0' means normal lap
if 'CROSSING_FINISH_LINE_IN_PIT' in analysis_df.columns:
    before = len(analysis_df)
    analysis_df['CROSSING_FINISH_LINE_IN_PIT'] = analysis_df['CROSSING_FINISH_LINE_IN_PIT'].astype(str).str.strip()
    analysis_df = analysis_df[analysis_df['CROSSING_FINISH_LINE_IN_PIT'] != 'B'].copy()
    print(f"Filtered pit-crossing laps: {before} -> {len(analysis_df)}")

# Remove clearly invalid lap times
if 'LAP_TIME_SEC' in analysis_df.columns:
    before = len(analysis_df)
    lap_sec = pd.to_numeric(analysis_df['LAP_TIME_SEC'], errors='coerce')
    # Keep 5 s < lap < 3600 s: drops zeros / tiny values and applies a sanity
    # upper bound. NaN lap times fail both comparisons and are dropped as well.
    analysis_df = analysis_df[(lap_sec > 5.0) & (lap_sec < 3600.0)].copy()
    print(f"Filtered by lap time sanity: {before} -> {len(analysis_df)}")

# ---------------------- 3) Save cleaned per-driver CSVs (compact columns) ----------------------
# Select a useful subset of columns for driver-coaching CSVs; columns missing
# from the analysis frame are skipped.
default_cols = [
    'NUMBER', 'DRIVER_NUMBER', 'LAP_NUMBER', 'LAP_TIME', 'LAP_TIME_SEC',
    'S1', 'S1_SEC', 'S2', 'S2_SEC', 'S3', 'S3_SEC',
    'KPH', 'TOP_SPEED', 'ELAPSED', 'HOUR', 'FLAG_AT_FL',
]
cols_to_save = [c for c in default_cols if c in analysis_df.columns]
print('Columns saved per driver:', cols_to_save)

# Ensure NUMBER is numeric for filename safety. assign() returns a new frame,
# so this never writes into a slice left over from an earlier filter.
if 'NUMBER' in analysis_df.columns:
    analysis_df = analysis_df.assign(
        NUMBER=pd.to_numeric(analysis_df['NUMBER'], errors='coerce')
    )

# dropna=False keeps laps whose NUMBER is NaN as their own group. With the
# default (dropna=True) groupby discards those rows, so the 'unknown' filename
# below could never be reached and the laps were silently lost.
for num, grp in analysis_df.groupby('NUMBER', dropna=False):
    safe_num = 'unknown' if pd.isna(num) else int(num)
    out_path = os.path.join(OUTPUT_ROOT, f"driver_{safe_num}.csv")
    grp[cols_to_save].to_csv(out_path, index=False, sep=',')
    print(f"Saved driver file: {out_path} ({len(grp)} laps)")

# ---------------------- 4) Load telemetry (if present) and pivot into wide form ----------------------
# The telemetry file is long-format: one row per reading, with the channel in
# 'telemetry_name' and the reading in 'telemetry_value'. The pivot turns each
# channel into its own column.
if not os.path.exists(TELEMETRY_FILE):
    print('No telemetry file found at', TELEMETRY_FILE)
else:
    print('Loading telemetry (this may be large):', TELEMETRY_FILE)
    tel_df = clean_column_names(pd.read_csv(TELEMETRY_FILE))

    # Time columns become timezone-aware (UTC) datetimes; bad cells turn into NaT.
    time_cols = [name for name in ('meta_time', 'timestamp') if name in tel_df.columns]
    for time_col in time_cols:
        tel_df[time_col] = pd.to_datetime(tel_df[time_col], errors='coerce', utc=True)

    # Readings that are not numbers become NaN.
    tel_df['telemetry_value'] = pd.to_numeric(tel_df['telemetry_value'], errors='coerce')

    # Row key of the wide table: whichever of these identifying columns exist.
    index_cols = [
        name
        for name in ('vehicle_id', 'meta_time', 'timestamp', 'lap')
        if name in tel_df.columns
    ]
    print('Telemetry index cols:', index_cols)

    # aggfunc='first' keeps the first reading when a key/channel pair repeats.
    tel_pivot = tel_df.pivot_table(
        index=index_cols,
        columns='telemetry_name',
        values='telemetry_value',
        aggfunc='first',
    )
    tel_wide = tel_pivot.reset_index()
    print('Telemetry wide shape:', tel_wide.shape)

    # The wide table is only written out when it carries a vehicle_id column.
    if 'vehicle_id' in tel_wide.columns:
        telem_out = os.path.join(OUTPUT_ROOT, 'telemetry_per_timestamp.csv')
        tel_wide.to_csv(telem_out, index=False)
        print('Saved telemetry per-timestamp wide file to:', telem_out)

# ---------------------- 5) Compute per-lap telemetry summaries (vectorized) ----------------------
# Reads the long-format telemetry file again, optionally restricts it to laps
# that are still present in analysis_df, and writes one summary row per
# (vehicle_id, lap) to per_lap_telemetry_summary.csv.
print("Generating per-lap and per-sector telemetry summary for MAD-filtered laps...")

if os.path.exists(TELEMETRY_FILE):
    tel_df = pd.read_csv(TELEMETRY_FILE)
    tel_df = clean_column_names(tel_df)
    tel_df['timestamp'] = pd.to_datetime(tel_df['timestamp'], errors='coerce', utc=True)
    tel_df['vehicle_id'] = tel_df['vehicle_id'].astype(str)
    # Coerce BEFORE dropping so non-numeric readings (which become NaN here)
    # are removed together with the readings that were missing to begin with.
    tel_df['telemetry_value'] = pd.to_numeric(tel_df['telemetry_value'], errors='coerce')
    tel_df = tel_df.dropna(subset=['timestamp', 'telemetry_value'])

    # Keep only laps of cars listed in driver_session_stats.csv (if that file
    # exists from a previous run) that are also present in analysis_df.
    # NOTE(review): the stats file is per driver, not per lap, so this does not
    # by itself apply the MAD lap filter computed later in the notebook.
    driver_stats_path = os.path.join(OUTPUT_ROOT, "driver_session_stats.csv")
    if os.path.exists(driver_stats_path):
        valid_laps_df = pd.read_csv(driver_stats_path)
        valid_pairs = analysis_df.merge(
            valid_laps_df[['DriverNumber']],
            left_on='NUMBER', right_on='DriverNumber',
            how='inner'
        )
        # Compare numeric (car number, lap) keys. The car number is taken from
        # the last '-'-separated token of vehicle_id, the same rule the
        # telemetry-filter cell of this notebook uses. Comparing the whole
        # vehicle_id string against str(NUMBER) (e.g. '12.0') never matches.
        valid_keys = (
            valid_pairs[['NUMBER', 'LAP_NUMBER']]
            .apply(pd.to_numeric, errors='coerce')
            .dropna()
            .drop_duplicates()
            .rename(columns={'NUMBER': '_car', 'LAP_NUMBER': '_lap'})
            .astype(float)
        )
        tel_keys = pd.DataFrame({
            '_car': pd.to_numeric(tel_df['vehicle_id'].str.split('-').str[-1], errors='coerce'),
            '_lap': pd.to_numeric(tel_df['lap'], errors='coerce'),
        }).astype(float)
        # A left merge keeps one output row per telemetry row, in order
        # (valid_keys is de-duplicated), so the indicator column can be used
        # directly as a positional mask. This replaces a row-wise apply().
        matched = tel_keys.merge(valid_keys, on=['_car', '_lap'], how='left', indicator=True)
        tel_df = tel_df[(matched['_merge'] == 'both').to_numpy()]

    # Channel names treated as throttle / brake signals.
    throttle_names = ['ath', 'aps', 'throttle']
    brake_names = ['pbrake_f', 'pbrake_r', 'brake']

    # Compute per-lap, per-sector metrics
    per_lap_list = []
    for (vid, lap), group in tel_df.groupby(['vehicle_id', 'lap']):
        lap_metrics = {'vehicle_id': vid, 'lap': lap}
        names = group['telemetry_name']
        values = group['telemetry_value']
        is_throttle = names.isin(throttle_names)
        is_brake = names.isin(brake_names)
        is_steering = names == 'Steering_Angle'

        # Whole-lap metrics
        lap_metrics['mean_throttle'] = values[is_throttle].mean()
        lap_metrics['mean_brake'] = values[is_brake].mean()
        # Mean absolute change between consecutive steering samples, in the
        # row order of the file (rows are not re-sorted by time here).
        sa = values[is_steering]
        lap_metrics['steering_smoothness'] = sa.diff().abs().mean() if not sa.empty else np.nan

        # Optional: sector metrics if sector telemetry columns exist
        # NOTE(review): a row's telemetry_name cannot both equal a throttle/
        # brake/steering channel and contain 'S1'/'S2'/'S3', so these sector
        # values look like they will always be NaN -- confirm the intended
        # source of sector information.
        for sector in ['S1', 'S2', 'S3']:
            # na=False: rows without a channel name simply do not match.
            sector_mask = names.str.contains(sector, case=False, na=False)
            if sector_mask.any():
                lap_metrics[f'{sector}_mean_throttle'] = values[is_throttle & sector_mask].mean()
                lap_metrics[f'{sector}_mean_brake'] = values[is_brake & sector_mask].mean()
                sa_sector = values[sector_mask & is_steering]
                lap_metrics[f'{sector}_steering_smoothness'] = sa_sector.diff().abs().mean() if not sa_sector.empty else np.nan

        per_lap_list.append(lap_metrics)

    if per_lap_list:
        per_lap_df = pd.DataFrame(per_lap_list)
        per_lap_path = os.path.join(OUTPUT_ROOT, 'per_lap_telemetry_summary.csv')
        per_lap_df.to_csv(per_lap_path, index=False)
        print('✅ Saved per-lap telemetry summary to:', per_lap_path)
    else:
        # This warning belongs to the "nothing left to summarise" case; it used
        # to hang off the file-exists check and so never fired here.
        print('⚠️ No telemetry data matched any valid MAD-filtered laps.')
else:
    print('No telemetry file found at', TELEMETRY_FILE)

# ---------------------- 6) Quick driver session statistics (filter extreme laps) ----------------------
# Per driver (NUMBER): drop slow outlier laps with a median + 3*MAD cutoff, then
# aggregate lap count, best/mean/std lap time and best sector times, and write
# the result to driver_session_stats.csv.
print("\nComputing driver session statistics with sanity filtering...")

# Ensure numeric
for col in ['LAP_TIME_SEC', 'S1_SEC', 'S2_SEC', 'S3_SEC']:
    if col in analysis_df.columns:
        analysis_df[col] = pd.to_numeric(analysis_df[col], errors='coerce')

# Filter extreme laps: anything slower than the driver's median lap time plus
# three median absolute deviations (MAD) is treated as an outlier. The filter is
# one-sided; unusually fast laps are kept.
if 'NUMBER' in analysis_df.columns:
    filtered_df_list = []
    for driver, grp in analysis_df.groupby('NUMBER'):
        median = grp['LAP_TIME_SEC'].median()
        mad = (grp['LAP_TIME_SEC'] - median).abs().median()
        cutoff = median + 3 * mad  # 3 MADs above median

        # Debug info
        print(f"\nDriver {driver}:")
        print(f"  Number of laps before filtering: {len(grp)}")
        print(f"  Median lap time: {median:.3f} sec")
        print(f"  MAD (Median Abs Deviation): {mad:.3f} sec")
        print(f"  Cutoff for valid laps: {cutoff:.3f} sec")

        # Apply filtering (laps with a NaN lap time fail the comparison and
        # are dropped too)
        filtered_grp = grp[grp['LAP_TIME_SEC'] <= cutoff]
        print(f"  Number of laps after filtering: {len(filtered_grp)}")
        if len(filtered_grp) < len(grp):
            removed = len(grp) - len(filtered_grp)
            print(f"  → Removed {removed} lap(s) as outliers.")

        filtered_df_list.append(filtered_grp)

    valid_laps_df = pd.concat(filtered_df_list, ignore_index=True)
else:
    valid_laps_df = analysis_df.copy()

# Compute per-driver stats
agg_dict = {
    'LAP_TIME_SEC': ['count', 'min', 'mean', 'std'],
    'S1_SEC': 'min', 'S2_SEC': 'min', 'S3_SEC': 'min'
}
driver_stats_df = valid_laps_df.groupby('NUMBER', as_index=False).agg(agg_dict)

# Flatten columns: the aggregation yields the group key plus one column per
# (source column, statistic) pair, in the order listed in agg_dict.
driver_stats_df.columns = [
    'DriverNumber', 'Laps', 'BestLap(s)', 'AvgLap(s)', 'StdDev(s)',
    'S1Best', 'S2Best', 'S3Best'
]
# min_count=3: the theoretical best only exists when all three sector bests are
# known. A plain sum skips NaN and would report a partial sum (or 0.0) as if it
# were a complete lap.
driver_stats_df['TheoreticalBest(s)'] = (
    driver_stats_df[['S1Best', 'S2Best', 'S3Best']].sum(axis=1, min_count=3)
)

# Round
driver_stats_df[['BestLap(s)', 'AvgLap(s)', 'StdDev(s)', 'TheoreticalBest(s)']] = \
    driver_stats_df[['BestLap(s)', 'AvgLap(s)', 'StdDev(s)', 'TheoreticalBest(s)']].round(3)

# Save CSV
driver_stats_path = os.path.join(OUTPUT_ROOT, "driver_session_stats.csv")
driver_stats_df.to_csv(driver_stats_path, index=False)
print(f"✅ Saved driver session stats to: {driver_stats_path}")
display(driver_stats_df)

In [None]:
# ---------------------- Filter extreme laps using MAD per driver ----------------------
# Per driver (NUMBER): drop laps slower than median + 3*MAD, save the surviving
# laps to driver_<n>.csv, collect a filtering summary, then recompute and save
# the session statistics from the surviving laps only.
# NOTE(review): this repeats the MAD filter and statistics of step 6 and
# overwrites driver_session_stats.csv; the driver_<n>.csv files written here
# contain every column of analysis_df and replace the compact files written by
# step 3 -- confirm that both overwrites are intended.
if 'NUMBER' in analysis_df.columns:
    filtered_df_list = []
    driver_filter_summary = []  # keep record for debugging/inspection

    for driver, grp in analysis_df.groupby('NUMBER'):
        median = grp['LAP_TIME_SEC'].median()
        mad = (grp['LAP_TIME_SEC'] - median).abs().median()
        cutoff = median + 3 * mad  # 3 MADs above median

        # Filter outliers (one-sided: only slow laps are removed; laps with a
        # NaN lap time fail the comparison and are removed as well)
        filtered_grp = grp[grp['LAP_TIME_SEC'] <= cutoff]

        # Collect summary info
        driver_filter_summary.append({
            'DriverNumber': driver,
            'TotalLaps': len(grp),
            'ValidLaps': len(filtered_grp),
            'RemovedLaps': len(grp) - len(filtered_grp),
            'MedianLap(s)': median,
            'MAD(s)': mad,
            'Cutoff(s)': cutoff
        })

        print(f"\nDriver {driver}:")
        print(f"  Total laps: {len(grp)} | Valid: {len(filtered_grp)} | Removed: {len(grp) - len(filtered_grp)}")
        print(f"  Median: {median:.3f}s | MAD: {mad:.3f}s | Cutoff: {cutoff:.3f}s")

        # Save filtered laps to individual CSV
        safe_num = int(driver) if not pd.isna(driver) else 'unknown'
        out_path = os.path.join(OUTPUT_ROOT, f"driver_{safe_num}.csv")
        filtered_grp.to_csv(out_path, index=False)
        print(f"  ✅ Saved filtered driver file: {out_path} ({len(filtered_grp)} valid laps)")

        filtered_df_list.append(filtered_grp)

    # Combine all valid laps for session-level stats
    valid_laps_df = pd.concat(filtered_df_list, ignore_index=True)

    # Optional summary overview
    filter_summary_df = pd.DataFrame(driver_filter_summary)
    print("\nSummary of filtering across drivers:")
    display(filter_summary_df)
else:
    valid_laps_df = analysis_df.copy()

# ---------------------- Compute per-driver statistics (only valid laps) ----------------------
agg_dict = {
    'LAP_TIME_SEC': ['count', 'min', 'mean', 'std'],
    'S1_SEC': 'min', 'S2_SEC': 'min', 'S3_SEC': 'min'
}

driver_stats_df = valid_laps_df.groupby('NUMBER', as_index=False).agg(agg_dict)
# The aggregation yields the group key plus one column per (source column,
# statistic) pair, in agg_dict order; give them flat, readable names.
driver_stats_df.columns = [
    'DriverNumber', 'Laps', 'BestLap(s)', 'AvgLap(s)', 'StdDev(s)',
    'S1Best', 'S2Best', 'S3Best'
]
# min_count=3: the theoretical best only exists when all three sector bests are
# known. A plain sum skips NaN and would report a partial sum (or 0.0) as if it
# were a complete lap.
driver_stats_df['TheoreticalBest(s)'] = (
    driver_stats_df[['S1Best', 'S2Best', 'S3Best']].sum(axis=1, min_count=3)
)

# Round for readability
driver_stats_df[['BestLap(s)', 'AvgLap(s)', 'StdDev(s)', 'TheoreticalBest(s)']] = \
    driver_stats_df[['BestLap(s)', 'AvgLap(s)', 'StdDev(s)', 'TheoreticalBest(s)']].round(3)

# Save updated session-level stats
driver_stats_path = os.path.join(OUTPUT_ROOT, "driver_session_stats.csv")
driver_stats_df.to_csv(driver_stats_path, index=False)
print(f"\n✅ Saved filtered driver session stats to: {driver_stats_path}")
display(driver_stats_df)


In [None]:
# Drop low-throttle laps from the per-lap telemetry summary, per driver, using
# a median - 3*MAD cutoff on mean_throttle, and write the result back to the
# same CSV.
# NOTE(review): the summary file is overwritten in place, so running this cell
# again filters the already-filtered data and may remove further laps.

# Built from OUTPUT_ROOT so it always points at the file the summary step
# wrote, independent of the current working directory.
telemetry_file = os.path.join(OUTPUT_ROOT, "per_lap_telemetry_summary.csv")

# Load the telemetry summary
telemetry_df = pd.read_csv(telemetry_file)

# Extract driver number from vehicle_id (last number after dash).
# astype(str) first: read_csv may infer a numeric dtype for vehicle_id, and the
# .str accessor only works on string data.
telemetry_df['NUMBER'] = (
    telemetry_df['vehicle_id'].astype(str).str.split('-').str[-1].astype(int)
)

filtered_list = []

for driver, grp in telemetry_df.groupby('NUMBER'):
    median = grp['mean_throttle'].median()
    mad = (grp['mean_throttle'] - median).abs().median()
    # Define low-throttle cutoff (e.g., median - 3*MAD)
    cutoff = median - 3 * mad
    # Laps whose mean_throttle is NaN fail the comparison and are removed too.
    grp_filtered = grp[grp['mean_throttle'] >= cutoff]
    filtered_list.append(grp_filtered)
    print(f"Driver {driver}: median={median:.2f}, MAD={mad:.2f}, cutoff={cutoff:.2f}, removed={len(grp)-len(grp_filtered)} laps")

# Concatenate filtered groups; pd.concat raises on an empty list, so an empty
# summary is passed through as an empty frame with the same columns.
if filtered_list:
    filtered_telemetry_df = pd.concat(filtered_list, ignore_index=True)
else:
    filtered_telemetry_df = telemetry_df.iloc[0:0].copy()

# Overwrite the original CSV
filtered_telemetry_df.to_csv(telemetry_file, index=False)

print(f"Filtered telemetry saved. Remaining rows: {len(filtered_telemetry_df)}")