In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global generator so every random draw below is repeatable.
np.random.seed(42)

# Simulation size and the first/last calendar year covered.
CONFIG = dict(
    n_vehicles=100_000,
    start_year=2010,
    end_year=2025,
)

# Every simulated calendar year; the "+ 1" makes end_year inclusive.
years = [*range(CONFIG["start_year"], CONFIG["end_year"] + 1)]


In [2]:
# Markets covered by the simulation. The order is significant: the regional
# profiles are drawn by iterating this list and vehicles are assigned with
# np.random.choice, so reordering changes the seeded output.
countries = [
    "USA", "Germany", "Japan", "India", "China",
    "UK", "France", "Brazil", "South Korea", "Canada",
]

In [4]:
# Manufacturers a vehicle can be assigned to. The order is significant:
# np.random.choice picks by position, so reordering changes the seeded output.
brands = [
    "Toyota",
    "Honda",
    "Mazda",
    "Nissan",
    "Volkswagen",
    "BMW",
    "Mercedes",
    "Audi",
    "Ford",
    "Chevrolet",
    "Hyundai",
    "Kia",
    "Tata",
    "Mahindra",
    "BYD",
    "Geely",
    "Renault",
    "Peugeot",
    "Volvo",
    "Tesla",
    "Suzuki",
]


In [5]:
# Build one environment row per (country, year). Each country gets a single
# random profile that is repeated unchanged for every simulated year.
regional_rows = []

for country in countries:
    # The three draws happen in this fixed order (risk, maintenance, traffic)
    # so the seeded random stream assigns the same profile to each country.
    profile = {
        "regional_risk_index": np.random.uniform(0.3, 0.8),
        "maintenance_culture": np.random.uniform(0.4, 0.9),
        "traffic_density": np.random.uniform(0.3, 0.9),
    }
    regional_rows.extend(
        {"country": country, "year": year, **profile}
        for year in years
    )

regional_environment_df = pd.DataFrame(regional_rows)


In [6]:
# Static attributes for every simulated vehicle, one dict per vehicle.
# The dict values are evaluated top to bottom, which keeps the random draws
# in a fixed order: brand, country, year, fuel type, class, then the four
# initial reliabilities.
first_year = CONFIG["start_year"]
year_upper_bound = CONFIG["end_year"] + 1  # randint excludes its upper bound

vehicle_master_rows = [
    {
        "vehicle_id": vid,
        "brand": np.random.choice(brands),
        "initial_country": np.random.choice(countries),
        "manufacturing_year": np.random.randint(first_year, year_upper_bound),
        "fuel_type": np.random.choice(["ICE", "EV"], p=[0.85, 0.15]),
        "vehicle_class": np.random.choice(["A", "B", "C"]),
        "power_rel_init": np.random.uniform(0.7, 1.0),
        "motion_rel_init": np.random.uniform(0.7, 1.0),
        "control_rel_init": np.random.uniform(0.7, 1.0),
        "struct_rel_init": np.random.uniform(0.7, 1.0),
    }
    for vid in range(CONFIG["n_vehicles"])
]

vehicle_master_df = pd.DataFrame(vehicle_master_rows)


In [7]:
def failure_probability(health, stress, age, maintenance):
    """Return the yearly probability that a subsystem fails, within [0, 0.5].

    Parameters
    ----------
    health : float
        Current subsystem health; ``1 - health`` is treated as wear.
    stress : float
        Regional stress factor (weight 0.3).
    age : int or float
        Vehicle age in years (adds 0.01 per year).
    maintenance : float
        Maintenance factor; it *reduces* the probability (weight 0.2).

    Returns
    -------
    float
        The weighted score clamped to the range [0, 0.5].

    Notes
    -----
    The maintenance term is subtracted, so for healthy, well-maintained
    vehicles the raw score can drop below zero. It is clamped at 0 so the
    function always returns a valid probability.
    """
    wear = 1 - health
    raw_score = wear * 0.4 + stress * 0.3 + age * 0.01 - maintenance * 0.2
    # Lower bound guards against a negative "probability"; upper bound is the
    # original 0.5 cap.
    return min(0.5, max(0.0, raw_score))

def accident_probability(regional_risk, traffic_density, age):
    """Return the yearly accident probability, capped at 0.25.

    Starts from a 0.02 baseline and adds 0.15 per unit of regional risk,
    0.10 per unit of traffic density and 0.01 per year of vehicle age.
    """
    ceiling = 0.25
    baseline = 0.02
    estimate = baseline + regional_risk * 0.15 + traffic_density * 0.10 + age * 0.01
    return estimate if estimate < ceiling else ceiling

def accident_severity():
    """Draw an accident severity level: 1 (70%), 2 (20%) or 3 (10%).

    Uses NumPy's global random state, so results depend on the active seed.
    """
    levels = [1, 2, 3]
    weights = [0.70, 0.20, 0.10]
    return np.random.choice(levels, p=weights)

def under_warranty(manufacture_year, current_year, cumulative_mileage,
                   std_years=3, std_km=100000):
    """Tell whether a vehicle is still covered by the standard warranty.

    Coverage requires both limits to hold, and both bounds are inclusive:
    at most ``std_years`` calendar years since manufacture and at most
    ``std_km`` of accumulated mileage.
    """
    elapsed_years = current_year - manufacture_year
    within_term = elapsed_years <= std_years
    if not within_term:
        # Time limit exceeded; the mileage limit is not consulted.
        return within_term
    return cumulative_mileage <= std_km


In [8]:
# Simulate every vehicle year by year, from its manufacturing year through
# CONFIG["end_year"] (inclusive), or until a severity-3 accident removes it.
# Each simulated year appends one record to lifecycle_rows; each accident
# appends one record to accident_rows.
#
# The order of random draws per vehicle-year is fixed (mileage, then the four
# subsystem failure checks in SUBSYSTEMS order, then the accident check and,
# if needed, the severity draw) so results are reproducible under the seed.
lifecycle_rows = []
accident_rows = []

# Market value assigned to every vehicle before its first simulated year.
INITIAL_MARKET_VALUE = 30000

# (health key, health lost per unit of regional stress each year, repair cost)
SUBSYSTEMS = (
    ("power", 0.02, 2000),
    ("motion", 0.015, 1500),
    ("control", 0.01, 1000),
    ("struct", 0.01, 1200),
)

# severity -> (insurance claim, fraction of market value retained, removed?)
ACCIDENT_OUTCOMES = {
    1: (1000, 0.97, False),
    2: (4000, 0.90, False),
    3: (10000, 0.50, True),
}

# Index the regional table once by (country, year). Filtering the DataFrame
# with a boolean mask for every vehicle-year repeats a full scan each time.
# setdefault keeps the first row for a key, matching the previous `.iloc[0]`.
region_lookup = {}
for region_row in regional_environment_df.itertuples(index=False):
    region_lookup.setdefault((region_row.country, region_row.year), region_row)

for vehicle in vehicle_master_df.itertuples(index=False):

    vehicle_id = vehicle.vehicle_id
    manufacture_year = vehicle.manufacturing_year

    # Current health of each subsystem, keyed as in SUBSYSTEMS.
    health = {
        "power": vehicle.power_rel_init,
        "motion": vehicle.motion_rel_init,
        "control": vehicle.control_rel_init,
        "struct": vehicle.struct_rel_init,
    }

    current_value = INITIAL_MARKET_VALUE
    cumulative_mileage = 0

    for year in range(manufacture_year, CONFIG["end_year"] + 1):

        age = year - manufacture_year

        # The vehicle is always evaluated against its initial country.
        region = region_lookup[(vehicle.initial_country, year)]
        stress = region.regional_risk_index
        maintenance = region.maintenance_culture

        # Mileage: normal draw with a floor of 5000 per year.
        annual_mileage = max(5000, np.random.normal(15000, 3000))
        cumulative_mileage += annual_mileage

        # Degradation: each subsystem loses health in proportion to stress,
        # never dropping below zero.
        for name, wear_rate, repair_price in SUBSYSTEMS:
            health[name] = max(0, health[name] - wear_rate * stress)

        # Failures: every repair is charged to the OEM while the vehicle is
        # under warranty and to the customer otherwise, so that
        # oem_cost + customer_cost == repair_cost for every row. (Previously
        # only power failures were allocated; the other three subsystems
        # added to repair_cost but to neither payer.)
        repair_cost = 0
        oem_cost = 0
        customer_cost = 0
        in_warranty = under_warranty(manufacture_year, year, cumulative_mileage)

        for name, wear_rate, repair_price in SUBSYSTEMS:
            if np.random.rand() < failure_probability(health[name], stress, age, maintenance):
                repair_cost += repair_price
                if in_warranty:
                    oem_cost += repair_price
                else:
                    customer_cost += repair_price

        # Accident: at most one per year; severity 3 removes the vehicle
        # after this year's row has been recorded.
        insurance_claim = 0
        removed = False
        if np.random.rand() < accident_probability(stress, region.traffic_density, age):
            severity = int(accident_severity())
            insurance_claim, value_retained, removed = ACCIDENT_OUTCOMES[severity]
            current_value *= value_retained

            accident_rows.append({
                "vehicle_id": vehicle_id,
                "year": year,
                "severity": severity,
                "insurance_claim": insurance_claim
            })

        # Depreciation: 8% base plus up to 20% more as mean health declines.
        mean_health = np.mean([
            health["power"],
            health["motion"],
            health["control"],
            health["struct"]
        ])
        depreciation_rate = 0.08 + (1 - mean_health) * 0.2
        current_value *= (1 - depreciation_rate)

        lifecycle_rows.append({
            "vehicle_id": vehicle_id,
            "year": year,
            "age": age,
            "power_health": health["power"],
            "motion_health": health["motion"],
            "control_health": health["control"],
            "struct_health": health["struct"],
            "repair_cost": repair_cost,
            "oem_cost": oem_cost,
            "customer_cost": customer_cost,
            "insurance_claim": insurance_claim,
            "cumulative_mileage": cumulative_mileage,
            "market_value": current_value
        })

        if removed:
            break


In [9]:
# Turn the simulation logs into DataFrames.
accident_events_df = pd.DataFrame(accident_rows)

# failure_this_year is 1 when any repair cost was incurred in that year.
vehicle_lifecycle_df = pd.DataFrame(lifecycle_rows).assign(
    failure_this_year=lambda frame: (frame["repair_cost"] > 0).astype(int)
)

# Static per-vehicle attributes to attach to every yearly record.
vehicle_attributes = vehicle_master_df[
    ["vehicle_id", "brand", "initial_country",
     "manufacturing_year", "fuel_type", "vehicle_class"]
]

# A left join keeps every lifecycle row; the country column is then exposed
# under the shorter name "country".
final_df = (
    vehicle_lifecycle_df
    .merge(vehicle_attributes, on="vehicle_id", how="left")
    .rename(columns={"initial_country": "country"})
)


In [10]:
# Write the merged yearly dataset to disk (without the index column) and
# report its dimensions.
final_df.to_csv("../data/vehicle_yearly_v1.csv", index=False)

row_count, column_count = final_df.shape
print("Rows:", row_count)
print("Columns:", column_count)


Rows: 776664
Columns: 19


In [11]:
# Count missing values per column as a sanity check on the merged dataset.
final_df.isna().sum()

vehicle_id            0
year                  0
age                   0
power_health          0
motion_health         0
control_health        0
struct_health         0
repair_cost           0
oem_cost              0
customer_cost         0
insurance_claim       0
cumulative_mileage    0
market_value          0
failure_this_year     0
brand                 0
country               0
manufacturing_year    0
fuel_type             0
vehicle_class         0
dtype: int64