# In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# --------------- CONFIGURATION ---------------

# Road corridors to simulate: numbered A-roads first, then the named routes.
roads = [
    'A1', 'A2', 'A3', 'A4', 'A10', 'A12', 'A13',
    'A20', 'A21', 'A23', 'A24', 'A40', 'A41', 'A316',
] + [
    'North Circular (A406)',
    'South Circular (A205)',
    'Bishopsgate Cross Route',
    'Blackwall Tunnel',
    'City Route',
    'Farringdon Cross Route',
    'Inner Ring',
    'Southern River Route',
    'Western Cross Route',
]

# Severity code (0, 1, 2) -> (status label, description).
status_map = dict(enumerate([
    ("Good", "No Exceptional Delays"),
    ("Minor", "Minor Delays"),
    ("Serious", "Serious Delays"),
]))

# date range
start_date = datetime(year=2025, month=3, day=10)
end_date = datetime(year=2025, month=5, day=20)

# --------------- FUNCTION TO SIMULATE ONE DAY ---------------

# Roads whose probability mass is shifted away from "Good" towards delays.
_CONGESTED_ROADS = frozenset(
    {'A13', 'A40', 'North Circular (A406)', 'City Route'}
)

# Number of 30-minute observation slots in one day.
_SLOTS_PER_DAY = 24 * 60 // 30


def _base_probabilities(hour, is_weekday):
    """Return [good, minor, serious] probabilities for an ordinary road."""
    if is_weekday:
        if 7 <= hour <= 9 or 16 <= hour <= 18:
            return [0.6, 0.25, 0.15]
        if 6 <= hour <= 22:
            return [0.8, 0.15, 0.05]
        return [0.95, 0.05, 0.0]
    if 12 <= hour <= 18:
        return [0.75, 0.20, 0.05]
    return [0.9, 0.08, 0.02]


def _congested_probabilities(base):
    """Shift 0.1 of the "good" mass onto the delay states and renormalise."""
    good, minor, serious = base
    shifted = [
        good - 0.1 if good > 0.1 else 0,
        minor + 0.07,
        serious + 0.03,
    ]
    total = sum(shifted)
    return [p / total for p in shifted]


def generate_day_data(date, seed=None):
    """Simulate one day of road-status observations.

    A random minute offset in [0, 29] is drawn once, then every road in the
    module-level ``roads`` list gets one observation per 30-minute slot
    (48 slots) starting at ``00:<offset>`` on the given day.

    Args:
        date: Object exposing ``year``, ``month`` and ``day`` (for example a
            ``datetime``); only those three attributes are read.
        seed: Optional integer in ``[0, 2**32)``. When given, the day is
            generated from private generators seeded with it, so the same
            ``(date, seed)`` pair always yields the same rows. Reusing one
            seed for several days repeats the same draw sequence; pass a
            distinct seed per day if that is not wanted. When ``None``
            (default) the global ``random`` / ``np.random`` state is used,
            exactly as before.

    Returns:
        A list of dicts with keys ``road``, ``status``, ``description`` and
        ``timestamp`` (ISO-8601 string), ordered by slot and then by the
        order of ``roads``.
    """
    if seed is None:
        draw_minute = random.randint
        draw_severity = np.random.choice
    else:
        draw_minute = random.Random(seed).randint
        draw_severity = np.random.RandomState(seed).choice

    first_minute = draw_minute(0, 29)
    first_time = datetime(date.year, date.month, date.day, 0, first_minute)

    # The last slot is at most 23:59, so every slot shares this weekday.
    is_weekday = first_time.weekday() < 5
    severities = [0, 1, 2]

    day_data = []
    for slot in range(_SLOTS_PER_DAY):
        timestamp = first_time + timedelta(minutes=30 * slot)
        stamp_text = timestamp.isoformat()

        # Both vectors depend only on the slot, so build them once per slot
        # instead of once per road.
        normal_probs = _base_probabilities(timestamp.hour, is_weekday)
        congested_probs = _congested_probabilities(normal_probs)

        for road in roads:
            probs = congested_probs if road in _CONGESTED_ROADS else normal_probs
            sev = draw_severity(severities, p=probs)
            status, description = status_map[sev]
            day_data.append({
                "road": road,
                "status": status,
                "description": description,
                "timestamp": stamp_text
            })
    return day_data

# --------------- GENERATE FULL DATASET ---------------

# Generate every day from start_date to end_date (both inclusive).
all_data = []
current_date = start_date
while current_date <= end_date:
    all_data.extend(generate_day_data(current_date))
    current_date += timedelta(days=1)

df = pd.DataFrame(all_data)
# Every timestamp is shared by all roads, so the sort key has many ties.
# The default quicksort is not stable and may reorder those ties; mergesort
# is stable and keeps rows with equal timestamps in generation order.
df = df.sort_values(by='timestamp', kind='mergesort').reset_index(drop=True)

# SAVE
df.to_csv("../data/synthetic_tfl_data.csv", index=False)
print("Synthetic dataset successfully generated.")


# Output: Synthetic dataset successfully generated.
