# Data Overview

This notebook explores the Munic telematics CSV exports to understand available signals, missingness patterns, and sampling properties. Findings will inform preprocessing rules for the modeling pipeline.


In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Directory holding the CSV exports, resolved to an absolute path
# (relative to the current working directory).
DATA_DIR = Path("../fuel_data").resolve()
# Sorted so that FILES[0] and the preview below are stable across runs.
FILES = sorted(DATA_DIR.glob("*.csv"))
print(f"CSV files detected: {len(FILES)}")
# Bare expression: shown as the cell output (first five paths) in a notebook.
FILES[:5]


In [None]:
def load_sample(file_path: Path, nrows: int = 5) -> pd.DataFrame:
    """Read the first rows of a CSV file and tidy its header names.

    Args:
        file_path: CSV file to preview.
        nrows: Maximum number of data rows to read.

    Returns:
        The preview rows, with leading/trailing whitespace removed from
        every column name.
    """
    # ``rename`` maps each header through ``str.strip``, dropping padding
    # around the column names without touching the cell values.
    return pd.read_csv(file_path, nrows=nrows).rename(columns=str.strip)

# Preview the first 10 rows of the first detected CSV file.
sample_df = load_sample(FILES[0], nrows=10)
# Bare expression: shown as the cell output in a notebook.
sample_df


In [None]:
# Print the column names, dtypes and non-null counts of the preview sample.
sample_df.info()


In [None]:
def summarize_missingness(file_path: Path, max_rows: int = 100000) -> pd.DataFrame:
    """Report, per column, how much of a CSV file is missing.

    Args:
        file_path: CSV file to inspect.
        max_rows: Maximum number of data rows to read.

    Returns:
        A frame indexed by (whitespace-stripped) column name with two
        columns: ``missing_ratio``, the fraction of NaN cells among the rows
        read, and ``non_null_count``, the number of non-NaN cells.
    """
    frame = pd.read_csv(file_path, nrows=max_rows)
    frame.columns = list(map(str.strip, frame.columns))

    # The mean of the boolean NaN mask is the per-column share of missing cells.
    missing_ratio = frame.isna().mean()
    non_null_count = frame.notna().sum()

    report = missing_ratio.to_frame("missing_ratio")
    report["non_null_count"] = non_null_count
    return report

# Missingness report for the first detected CSV file (default row cap).
missing_summary = summarize_missingness(FILES[0])
# Bare expression: shown as the cell output in a notebook.
missing_summary


In [None]:
def compute_sampling_stats(file_path: Path, max_rows: int = 200000) -> pd.Series:
    """Summarise the spacing between consecutive ``time`` stamps of a CSV file.

    Args:
        file_path: CSV file with a ``time`` column. Whitespace around the
            header name is ignored, matching the other loaders in this
            notebook, which strip header names.
        max_rows: Maximum number of data rows to read.

    Returns:
        A Series with the median, mean, standard deviation, minimum and
        maximum of the time deltas in seconds (``*_dt`` keys) and the number
        of rows read (``n_samples``). The delta statistics are NaN when no
        delta can be computed (fewer than two valid timestamps).

    Raises:
        ValueError: If the file has no ``time`` column.
    """
    # A callable ``usecols`` matches the header regardless of padding; the
    # literal list ``["time"]`` would reject a header such as `` time``.
    df = pd.read_csv(
        file_path,
        usecols=lambda name: str(name).strip() == "time",
        nrows=max_rows,
    )
    df.columns = [c.strip() for c in df.columns]
    if "time" not in df.columns:
        # Same exception type pandas raises for a missing ``usecols`` entry.
        raise ValueError("Usecols do not match columns: 'time' column not found")

    df["time"] = pd.to_datetime(df["time"], utc=True)
    # Rows are not assumed to be stored chronologically, so order them before
    # differencing.
    df = df.sort_values("time")
    deltas = df["time"].diff().dropna().dt.total_seconds()
    return pd.Series(
        {
            "median_dt": deltas.median(),
            "mean_dt": deltas.mean(),
            "std_dt": deltas.std(),
            "min_dt": deltas.min(),
            "max_dt": deltas.max(),
            "n_samples": len(df),
        }
    )

# Sampling-interval statistics for the first detected CSV file (default row cap).
sampling_stats = compute_sampling_stats(FILES[0])
# Bare expression: shown as the cell output in a notebook.
sampling_stats


In [None]:
def get_available_columns(file_paths, sample_size: int = 10) -> pd.Series:
    """Count how many of the sampled files contain each column name.

    Args:
        file_paths: Sequence of CSV files; only the first ``sample_size``
            entries are inspected.
        sample_size: Number of files to read headers from.

    Returns:
        A Series indexed by whitespace-stripped column name, holding the
        number of header occurrences across the sampled files, sorted from
        most to least frequent.
    """
    presence = {}
    for csv_path in file_paths[:sample_size]:
        # One data row is enough: only the header names are used.
        header_frame = pd.read_csv(csv_path, nrows=1)
        for raw_name in header_frame.columns:
            name = raw_name.strip()
            presence.setdefault(name, 0)
            presence[name] += 1

    counts = pd.Series(presence)
    return counts.sort_values(ascending=False)

# Header presence counts across up to the first 50 detected CSV files.
column_presence = get_available_columns(FILES, sample_size=50)
# Bare expression: shown as the cell output in a notebook.
column_presence


## Cleaning Decisions

- Parse `time` and `received_at` as UTC timestamps; keep `time` as primary ordering key.
- Use `TRACKS.MUNIC.GPS_SPEED (km/h)` as the canonical speed signal; fallback to `TRACKS.MUNIC.MDI_OBD_SPEED (km/h)` when GPS is missing but OBD is present.
- Treat fuel metrics as cumulative (monotonic) counters; derive instantaneous consumption by differentiating after smoothing and clipping non-positive changes.
- Remove sensor bursts with implausible values (negative speeds, fuel jumps >99th percentile) and forward-fill small gaps (≤5 s) before feature engineering.
- Resample trajectories to a uniform cadence (default 1 Hz) after interpolation to support downstream modeling and frequency studies.
