In [None]:
### After downloading the data, run the following code to summarize the data

In [None]:
### AADT calculation
import os
import pandas as pd
import json
from tqdm import tqdm
import holidays

# Survey year to summarise. The per-sensor CSV files are read from a
# directory named after the year (e.g. "2017").
year = 2017
csv_dir = f"{year}"

# Holiday calendar for that year, handed to process_csv_file (which only
# needs it if its optional weekday-only filter is enabled).
uk_holidays = holidays.country_holidays("UK", years=year)

def process_csv_file(file_path, sensor_id, uk_holidays):
    """
    Compute AADT-like stats from 15-min data:
    - keep only days with 96 intervals (complete days)
    - no weekday/holiday filter (all days)
    - return median, mean, sd, num_valid_days per volume column

    Readings that cannot be parsed as numbers (e.g. a text placeholder) are
    treated as missing, so the affected day is dropped as incomplete instead
    of failing the whole file.

    Args:
        file_path: Path of the CSV to read. Rows whose "Timestamp" cannot be
            parsed are discarded.
        sensor_id: Identifier echoed back in error results.
        uk_holidays: Holiday calendar; only referenced by the optional
            weekday-only filter that is commented out below.

    Returns:
        - ``{column: {"median", "mean", "sd", "num_valid_days"}}`` for every
          known volume column present in the file, or
        - ``{"sensor_id": ..., "error": ...}`` if none of the known volume
          columns exist or any exception is raised while processing, or
        - ``None`` if the file has fewer than two complete days.
    """
    try:
        df = pd.read_csv(file_path)

        # Parse timestamps
        df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
        df = df.dropna(subset=["Timestamp"])

        df["Date"] = df["Timestamp"].dt.date
        df["Weekday"] = df["Timestamp"].dt.weekday

        # If you want weekday-only, uncomment:
        # df = df[(df["Weekday"] < 5) & (~df["Date"].isin(uk_holidays))]

        volume_cols = ["0 - 520 cm", "521 - 660 cm", "661 - 1160 cm",
                       "1160+ cm", "Total Volume"]
        available_cols = [c for c in volume_cols if c in df.columns]
        if not available_cols:
            return {"sensor_id": sensor_id, "error": "no valid columns"}

        # Non-numeric readings become NaN so that count() below sees them as
        # missing intervals rather than as valid (string) values.
        df[available_cols] = df[available_cols].apply(
            pd.to_numeric, errors="coerce"
        )

        # find complete days: all available columns have 96 intervals
        counts = df.groupby("Date")[available_cols].count().min(axis=1)
        complete_days = counts[counts == 96].index

        df = df[df["Date"].isin(complete_days)]
        if df.empty:
            return None

        # Daily totals for every column in a single groupby pass.
        daily_totals = df.groupby("Date")[available_cols].sum()

        summary = {}
        for col in available_cols:
            daily_sum = daily_totals[col].dropna()
            # The sample standard deviation needs at least two days.
            if daily_sum.count() < 2:
                return None
            summary[col] = {
                "median": float(daily_sum.median()),
                "mean":   float(daily_sum.mean()),
                "sd":     float(daily_sum.std()),
                "num_valid_days": int(daily_sum.count())
            }

        return summary

    except Exception as e:
        # Best-effort batch processing: report the failure for this sensor
        # instead of aborting the whole run.
        return {"sensor_id": sensor_id, "error": str(e)}

# Summarise every "<sensor_id>.csv" in csv_dir and write the results to JSON.
sensor_summaries = {}

# os.listdir returns entries in arbitrary order; sorting keeps the key order
# of the output file the same from run to run.
csv_files = sorted(name for name in os.listdir(csv_dir) if name.endswith(".csv"))

for file in tqdm(csv_files, desc="Processing CSV files"):
    try:
        sensor_id = int(os.path.splitext(file)[0])
    except ValueError:
        # A stray CSV whose name is not a sensor id must not abort the whole
        # batch; report it (without breaking the progress bar) and move on.
        tqdm.write(f"Skipping {file}: file name is not an integer sensor id")
        continue
    file_path = os.path.join(csv_dir, file)
    result = process_csv_file(file_path, sensor_id, uk_holidays)
    # None means the sensor had too few complete days; error records are kept
    # so failed sensors remain visible in the output.
    if result:
        sensor_summaries[sensor_id] = result

output_json_file = f"AADT_{year}.json"
with open(output_json_file, "w", encoding="utf-8") as f:
    json.dump(sensor_summaries, f, indent=2)

print(f"Summary data saved to {output_json_file}")

Processing CSV files:  69%|██████▉   | 6262/9104 [04:26<01:57, 24.19it/s]