## Imports

In [1]:
import os
import re
import pandas as pd

## Helper

In [2]:
def extract_driver_number(vehicle_id):
    """Extract numeric driver number from a vehicle_id like 'GR86-005-07'."""
    m = re.search(r"-(\d+)$", str(vehicle_id))
    return int(m.group(1)) if m else None

## Trimming Time

In [11]:
# ---------------------- Setup ----------------------
base_dirs = ["../datasets_clean/vir1", "../datasets_clean/vir2"]
output_dir = "../datasets_trimmed/vir_top10"
os.makedirs(output_dir, exist_ok=True)

# ---------------------- Step 1: Collect all lap times ----------------------
all_laps = []

print("üîç Checking for laps with existing telemetry data...")

# Preload telemetry data per folder to filter valid laps later
telemetry_valid = {}

for folder in base_dirs:
    folder_name = os.path.basename(folder)
    telemetry_path = os.path.join(folder, "telemetry_per_timestamp.csv")
    if os.path.exists(telemetry_path):
        tdf = pd.read_csv(telemetry_path)
        # Extract numeric driver number and lap
        tdf["NUMBER"] = pd.to_numeric(
            tdf["vehicle_id"].apply(lambda x: re.search(r"-(\d+)$", x).group(1)),
            errors="coerce"
        )
        tdf["lap"] = pd.to_numeric(tdf["lap"], errors="coerce")
        telemetry_valid[folder_name] = set(zip(tdf["NUMBER"], tdf["lap"]))
        print(f"‚úÖ Loaded telemetry from {folder_name}: {len(tdf)} rows")
    else:
        telemetry_valid[folder_name] = set()
        print(f"‚ö†Ô∏è No telemetry file in {folder_name}")

# Now collect lap times only if telemetry exists
for folder in base_dirs:
    folder_name = os.path.basename(folder)
    for file in os.listdir(folder):
        if not file.startswith("driver_") or not file.endswith(".csv"):
            continue
        path = os.path.join(folder, file)
        df = pd.read_csv(path)
        if "LAP_TIME_SEC" not in df.columns or "LAP_NUMBER" not in df.columns:
            continue

        driver_num_match = re.search(r"driver_(\d+)\.csv", file)
        if not driver_num_match:
            continue
        driver_num = int(driver_num_match.group(1))

        df = df.dropna(subset=["LAP_TIME_SEC"])
        df["SOURCE_DIR"] = folder_name
        df["NUMBER"] = driver_num

        # ‚úÖ Keep only laps that exist in telemetry data for this folder
        valid_laps = telemetry_valid[folder_name]
        df = df[df.apply(lambda r: (r["NUMBER"], r["LAP_NUMBER"]) in valid_laps, axis=1)]

        if not df.empty:
            all_laps.append(df[["NUMBER", "LAP_NUMBER", "LAP_TIME_SEC", "SOURCE_DIR"]])

lap_df = pd.concat(all_laps, ignore_index=True)
lap_df.sort_values("LAP_TIME_SEC", inplace=True)

top10 = lap_df.head(10).reset_index(drop=True)
print("üèÅ Top 10 fastest laps (with existing telemetry):")
print(top10)

# Save combined top 10 lap info
top10_path = os.path.join(output_dir, "top10_lap_times.csv")
top10.to_csv(top10_path, index=False)
print(f"‚úÖ Saved: {top10_path}")

# ---------------------- Step 2: Filter CSVs ----------------------
def filter_csv(folder, filename, top10_only=False):
    path = os.path.join(folder, filename)
    if not os.path.exists(path):
        print(f"‚ö†Ô∏è {filename} missing in {folder}")
        return None

    df = pd.read_csv(path)
    folder_name = os.path.basename(folder)

    if "DriverNumber" in df.columns:
        df.rename(columns={"DriverNumber": "NUMBER"}, inplace=True)

    if "NUMBER" not in df.columns:
        print(f"‚ö†Ô∏è No NUMBER column in {filename}, skipping.")
        return None

    if top10_only:
        df["NUMBER"] = pd.to_numeric(df["NUMBER"], errors="coerce")
        df["lap"] = pd.to_numeric(df["lap"], errors="coerce")

        valid_set = set(zip(
            pd.to_numeric(top10["NUMBER"], errors="coerce"),
            pd.to_numeric(top10["LAP_NUMBER"], errors="coerce")
        ))

        for i, row in top10.iterrows():
            driver, lap_num = row["NUMBER"], row["LAP_NUMBER"]
            match_count = df[(df["NUMBER"] == driver) & (df["lap"] == lap_num)].shape[0]
            print(f"‚úÖ Match found for driver {driver}, lap {lap_num}, folder {folder_name} ‚Üí {match_count} rows")

        df = df[df.apply(lambda r: (r["NUMBER"], r["lap"]) in valid_set, axis=1)]

    df["SOURCE_DIR"] = folder_name
    print(f"‚úÖ {filename} ({folder_name}): {len(df)} rows after filtering")
    return df

# ---------------------- Step 3: Process files ----------------------
files_to_process = ["telemetry_per_timestamp.csv", "driver_session_stats.csv"]

for fname in files_to_process:
    combined = []
    top10_only = (fname == "telemetry_per_timestamp.csv")

    for folder in base_dirs:
        df_filtered = filter_csv(folder, fname, top10_only=top10_only)
        if df_filtered is not None and not df_filtered.empty:
            combined.append(df_filtered)

    if combined:
        final_df = pd.concat(combined, ignore_index=True)
        out_path = os.path.join(output_dir, f"top10_{fname}" if top10_only else fname)
        final_df.to_csv(out_path, index=False)
        print(f"üíæ Saved {fname} ‚Üí {out_path}")

print("\nüéØ All CSVs processed successfully!")

üîç Checking for laps with existing telemetry data...
‚úÖ Loaded telemetry from vir1: 859241 rows
‚úÖ Loaded telemetry from vir2: 1135421 rows
üèÅ Top 10 fastest laps (with existing telemetry):
   NUMBER  LAP_NUMBER  LAP_TIME_SEC SOURCE_DIR
0      13          10       128.432       vir1
1      13          11       128.485       vir1
2      55           9       128.497       vir1
3      13           5       128.501       vir2
4      46           7       128.501       vir2
5      72          10       128.511       vir1
6      46           9       128.542       vir1
7      13           6       128.584       vir1
8      46          10       128.600       vir1
9      72           9       128.610       vir1
‚úÖ Saved: ../datasets_trimmed/vir_top10\top10_lap_times.csv
‚úÖ Match found for driver 13, lap 10, folder vir1 ‚Üí 3071 rows
‚úÖ Match found for driver 13, lap 11, folder vir1 ‚Üí 3092 rows
‚úÖ Match found for driver 55, lap 9, folder vir1 ‚Üí 3054 rows
‚úÖ Match found for driver 13, l