In [5]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os
import zipfile

# ---------------------------- CONFIG ----------------------------
# Seed both random sources so repeated runs generate the same dataset.
np.random.seed(100)
random.seed(100)

# Window from which project start dates are drawn, and how many projects to build.
start_date_range = datetime(year=1997, month=1, day=1)
end_date_range = datetime(year=2025, month=1, day=1)
total_projects = 500

# One tuple per phase: (phase name, share of project cost, share of project hours).
phase_templates = [
    ("Site Prep", 0.10, 0.10),
    ("Foundation", 0.22, 0.20),
    ("Structural", 0.28, 0.27),
    ("Enclosure", 0.18, 0.20),
    ("Interiors", 0.20, 0.23),
]

# Lookup lists sampled while generating labor, equipment, material and
# change-order rows.
trades = [
    "Excavation",
    "Concrete",
    "Carpentry",
    "Steel Erection",
    "Electrical",
    "Plumbing",
    "HVAC",
    "Finishes",
]
equipment_types = [
    "Excavator",
    "Crane",
    "Loader",
    "Bulldozer",
    "Forklift",
    "Generator",
]
suppliers = [
    "NorthStar Materials",
    "RapidSupply",
    "MetroBuild",
    "Cascade Steel",
]
co_reasons = [
    "Scope change",
    "Unforeseen conditions",
    "Client request",
    "Design revision",
]

# ---------------------------- HELPERS ----------------------------
def clamp(val, lo, hi):
    """Return ``val`` limited to the interval ``[lo, hi]``.

    The upper bound is applied first and the lower bound second, so if the
    bounds are inverted (``lo > hi``) the result is ``lo``.
    """
    # Cap at the upper bound; on a tie keep ``hi`` itself.
    capped = val if val < hi else hi
    # Then raise to the lower bound; on a tie keep ``lo`` itself.
    return capped if capped > lo else lo

def daterange(start, end, step_days=1):
    """Yield values from ``start`` through ``end`` inclusive, ``step_days`` apart.

    Only whole days of ``end - start`` are considered. Nothing is yielded when
    ``end`` is more than a day before ``start``.
    """
    span_days = (end - start).days
    # +1 so that an ``end`` falling exactly on a step is included.
    offsets = range(0, span_days + 1, step_days)
    yield from (start + timedelta(days=offset) for offset in offsets)

# ---------------------------- DATA CONTAINERS ----------------------------
# Row accumulators: one independent list of dicts per output table.
projects = []
phases = []
labor = []
materials = []
equipment = []
weather = []
change_orders = []
scenarios = []

# Fixed "as of" date used to derive each project's progress.
today = datetime(year=2025, month=8, day=16)

# ---------------------------- DATA GENERATION ----------------------------
# Build one project row per id, plus its dependent phase, labor, material,
# equipment, weather, change-order and scenario rows.
#
# NOTE: every random draw below comes from the seeded `random` / `np.random`
# streams, so the ORDER of the calls determines the generated values.
# Reordering statements (or dict entries that contain a draw) changes the data.
#
# All date-valued fields are emitted as plain dates (no time component) so
# that every date column in the CSVs shares the same YYYY-MM-DD format.
for pid in range(1000, 1000 + total_projects):
    # ---- project-level schedule and budget ----
    start_days_offset = np.random.randint(0, (end_date_range - start_date_range).days)
    start_date = start_date_range + timedelta(days=int(start_days_offset))
    duration_days = np.random.randint(240, 720)  # upper bound is exclusive
    planned_end = start_date + timedelta(days=duration_days)
    planned_budget = int(np.random.uniform(1.5e6, 15e6))

    # Progress is the elapsed fraction of the planned duration as of `today`,
    # limited to [0, 1]; a project counts as complete once it reaches 1.
    progress = clamp((today - start_date).days / duration_days, 0, 1)
    complete = progress >= 1.0
    cost_variance = np.random.normal(0.05, 0.03)
    actual_cost = int(planned_budget * (progress + cost_variance))
    # Only completed projects get an actual end date (5-39 days after the plan);
    # the slip is drawn only in that case. Stored as a date, like planned_end.
    actual_end = (planned_end + timedelta(days=np.random.randint(5, 40))).date() if complete else ""
    completion_pct = 100 if complete else int(progress * 100)

    projects.append({
        "project_id": pid,
        "project_name": f"Project {pid}",
        "start_date": start_date.date(),
        "planned_end": planned_end.date(),
        "actual_end": actual_end,
        "planned_budget": planned_budget,
        "actual_cost": actual_cost,
        "completion_pct": completion_pct
    })

    # ---- phases: laid out back to back, starting at the project start ----
    phase_start = start_date
    for phase_id, (pname, cost_share, hour_share) in enumerate(phase_templates, 1):
        # Each phase lasts 15-25% of the project duration, drawn independently,
        # so the phases do not necessarily add up to duration_days.
        phase_duration = int(duration_days * (0.15 + np.random.rand() * 0.1))
        planned_end_phase = phase_start + timedelta(days=phase_duration)
        planned_cost = int(planned_budget * cost_share)
        planned_hours = int((planned_budget / 1000) * hour_share)
        progress_ratio = progress if not complete else 1.0
        actual_cost_p = int(planned_cost * (progress_ratio + np.random.normal(0.03, 0.03)))
        actual_hours_p = int(planned_hours * (progress_ratio + np.random.normal(0.02, 0.03)))
        # A phase gets an actual end only when the whole project is >95% done;
        # the slip (-5..14 days) is drawn only in that case. Stored as a date.
        actual_end_phase = (
            (planned_end_phase + timedelta(days=np.random.randint(-5, 15))).date()
            if progress_ratio > 0.95 else ""
        )

        phases.append({
            "project_id": pid, "phase_id": phase_id, "phase_name": pname,
            "planned_start": phase_start.date(), "planned_end": planned_end_phase.date(),
            "actual_start": phase_start.date(), "actual_end": actual_end_phase,
            "planned_cost": planned_cost, "actual_cost": actual_cost_p,
            "planned_hours": planned_hours, "actual_hours": actual_hours_p,
            "weather_delay_days": max(0, int(np.random.normal(2, 2)))
        })

        # Two distinct trades per phase, each estimated at a third of the
        # phase's planned hours.
        for trade in random.sample(trades, 2):
            est = int(planned_hours / 3)
            labor.append({
                "project_id": pid, "phase_id": phase_id, "trade": trade,
                "subcontractor": f"{trade} Co", "est_hours": est,
                "act_hours": int(est * np.random.uniform(0.9, 1.1)),
                "hourly_rate": np.random.randint(50, 110),
                "safety_incidents": np.random.poisson(0.3)
            })

        # Two material deliveries per phase, dated at the phase start.
        for _ in range(2):
            materials.append({
                "project_id": pid, "phase_id": phase_id,
                "supplier": random.choice(suppliers),
                "item": random.choice(["Concrete", "Steel", "Lumber"]),
                "qty": np.random.randint(20, 100),
                "planned_cost": np.random.randint(8000, 25000),
                "actual_cost": np.random.randint(8500, 28000),
                "delivery_date": phase_start.date(),
                "delivery_delay_days": np.random.randint(0, 6)
            })

        # One piece of equipment per phase; the id is the first two letters of
        # the type, upper-cased, plus a random three-digit number.
        for eq in random.sample(equipment_types, 1):
            equipment.append({
                "project_id": pid, "phase_id": phase_id,
                "equipment_id": f"{eq[:2].upper()}-{random.randint(100,999)}",
                "type": eq, "planned_hours": np.random.randint(80, 160),
                "actual_hours": np.random.randint(70, 180),
                "downtime_hours": np.random.randint(0, 20)
            })

        # One weather sample every 30 days across the planned phase window.
        for d in daterange(phase_start, planned_end_phase, step_days=30):
            weather.append({
                "project_id": pid, "date": d.date(),
                "weather_delay_hours": np.random.poisson(1.5)
            })

        # Poisson-distributed number of change orders (mean 0.8) per phase,
        # each dated 10-29 days after the phase start.
        for _ in range(np.random.poisson(0.8)):
            change_orders.append({
                "project_id": pid, "phase_id": phase_id,
                "co_id": f"CO-{pid}-{phase_id}-{random.randint(100,999)}",
                "co_cost": np.random.randint(10000, 50000),
                "co_reason": random.choice(co_reasons),
                "date": (phase_start + timedelta(days=np.random.randint(10, 30))).date()
            })

        # The next phase starts the day after this one's planned end.
        phase_start = planned_end_phase + timedelta(days=1)

    # ---- what-if scenarios, only for projects that are still in progress ----
    if completion_pct < 100:
        for sc in ["A_StatusQuo", "B_SecondShift", "C_SupplierSwitch"]:
            scenarios.append({
                "project_id": pid, "scenario": sc,
                # Date-only string, matching the other date columns.
                "expected_finish_date": str((planned_end + timedelta(days=np.random.randint(0, 30))).date()),
                "expected_total_cost": actual_cost + np.random.randint(150000, 300000),
                "risk_score": np.random.randint(30, 70),
                "notes": "Simulated scenario for demo purposes"
            })

# ---------------------------- SAVE TO FILES ----------------------------
# Destination folder for the generated CSVs; created if it does not exist.
output_dir = "construction_25yr"
os.makedirs(output_dir, exist_ok=True)

# Pair each output file name with the list of row dicts collected for it.
_table_rows = (
    ("projects.csv", projects),
    ("phases.csv", phases),
    ("labor.csv", labor),
    ("materials.csv", materials),
    ("equipment.csv", equipment),
    ("weather.csv", weather),
    ("change_orders.csv", change_orders),
    ("scenarios.csv", scenarios),
)
datasets = {csv_name: pd.DataFrame(rows) for csv_name, rows in _table_rows}

# Write each table without the DataFrame index column.
for fname, df in datasets.items():
    target_path = os.path.join(output_dir, fname)
    df.to_csv(target_path, index=False)

print("✅ Synthetic 25-year dataset generated and saved.")


✅ Synthetic 25-year dataset generated and saved.
