<a href="https://colab.research.google.com/github/thomasdanielchiwai/Data-Science-Ass-2/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Read the accident records from the fixed CSV location.
csv_path = "/content/synthetic_accidents.csv"
df = pd.read_csv(csv_path)

# Quick sanity check: table dimensions first, then the leading rows.
n_rows_cols = df.shape
print("Rows, cols:", n_rows_cols)
print(df.head())

# Build the numeric target column `severity_num`.
# Text labels are translated through `severity_map`; a label that is not a key
# of the map comes out as NaN. A `severity` column that is already numeric is
# reused unchanged.
severity_map = {"Slight": 0, "Serious": 1, "Fatal": 2}
# Test for "numeric" rather than `dtype == 'object'`: text columns are not
# always stored as object dtype (e.g. pandas' dedicated string dtype), and
# such columns would otherwise bypass the label mapping entirely.
if pd.api.types.is_numeric_dtype(df['severity']):
    df['severity_num'] = df['severity']
else:
    df['severity_num'] = df['severity'].map(severity_map)

# Candidate predictor columns, one per line, followed by the name of the
# numeric target column.
# NOTE(review): the `df.head()` output captured with this cell lists different
# column names (e.g. `vehicle_speed`, `driver_age`, `weather`); only
# `vehicle_type` appears in both -- confirm these names against the CSV.
features = [
    'vehicle_speed_limit',
    'road_type',
    'weather_condition',
    'light_condition',
    'junction_detail',
    'vehicle_type',
    'age_of_driver',
    'alcohol_involved',
    'num_of_vehicles',
    'road_surface_condition',
]
target = 'severity_num'

# Rows whose target is missing are removed before any further cleaning.
df = df.dropna(subset=[target])

# Column groups that decide how each feature is cleaned below.
numeric_features = ['vehicle_speed_limit', 'age_of_driver', 'num_of_vehicles']
categorical_features = ['road_type', 'weather_condition', 'light_condition',
                        'junction_detail', 'vehicle_type', 'alcohol_involved',
                        'road_surface_condition']

# Numeric columns: entries that cannot be parsed as numbers become NaN, then
# every NaN is replaced with the column median. Columns absent from the data
# are skipped.
for col in numeric_features:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Assign the filled column back instead of calling
        # fillna(..., inplace=True) on `df[col]`: the in-place call acts on an
        # intermediate object and is not guaranteed to update `df` itself.
        df[col] = df[col].fillna(df[col].median())

# Categorical columns: missing values become the label "Unknown" and every
# value is stored as text. Columns absent from the data are skipped.
for col in categorical_features:
    if col in df.columns:
        # Fill before casting: astype(str) can turn a missing value into the
        # literal string "nan", leaving fillna with nothing to replace.
        df[col] = df[col].fillna("Unknown").astype(str)

# Keep only the configured columns that exist in the data, but say which ones
# were skipped: a silent filter can leave the model training on far fewer
# inputs than intended without any visible sign.
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    print("Warning: skipping feature columns missing from the data:", missing_features)

features = [f for f in features if f in df.columns]
numeric_features = [f for f in numeric_features if f in df.columns]
categorical_features = [f for f in categorical_features if f in df.columns]

# Stop with a clear message instead of attempting to fit on zero columns.
if not features:
    raise ValueError(
        "None of the configured feature columns exist in the data; "
        "check the column names against the CSV header."
    )

# Design matrix and target; the target is cast to float for the regressor.
X = df[features]
y = df[target].astype(float)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for the numeric columns: a single scaling step.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Preprocessing for the categorical columns: dense one-hot encoding, with
# handle_unknown='ignore' set for categories that were not seen during fit.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its transformer; any column not listed in either
# group is dropped.
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='drop')

# Full model: preprocessing followed by a linear regression.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit the preprocessing steps and the regressor on the training split.
model_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split: RMSE and R^2.
y_pred = model_pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R^2: {r2:.4f}")

# Persist the whole fitted pipeline (preprocessing + model) to a single file
# in the current working directory.
joblib.dump(model_pipeline, "accident_severity_linear_model.joblib")
print("Saved model to accident_severity_linear_model.joblib")

Rows, cols: (5000, 12)
   vehicle_speed  driver_age  alcohol_level traffic_density weather  \
0      67.450712   29.914884       0.077502          medium   clear   
1      57.926035   29.559031       0.021744          medium   clear   
2      69.715328   16.000000       0.014903             low    snow   
3      82.845448   31.038918       0.019804          medium    rain   
4      56.487699   43.793949       0.174871          medium   clear   

       road_light vehicle_type  seatbelt road_surface road_alignment  hour  \
0        daylight          car         1          dry       straight    19   
1        daylight          car         1          dry       straight     7   
2        daylight          car         1          dry       straight     8   
3  dark_lights_on          car         0          dry       straight    13   
4        daylight          car         1          dry          curve    11   

    severity  
0  42.478089  
1  47.221452  
2  67.225206  
3  56.528118  
4  34.