In [1]:
# Import necessary libraries
import pandas as pd
import os

# # Load the dataset
# Relative path to the CSV file
file_path = "../Dataset/Synthetic_Data_For_Students.csv"

# Read the dataset
data = pd.read_csv(file_path)

# Display the first few rows
data.head(3)

Unnamed: 0,SettlementValue,AccidentType,Injury_Prognosis,SpecialHealthExpenses,SpecialReduction,SpecialOverage,GeneralRest,SpecialAdditionalInjury,SpecialEarningsLoss,SpecialUsageLoss,...,Accident Date,Claim Date,Vehicle Age,Driver Age,Number of Passengers,Accident Description,Injury Description,Police Report Filed,Witness Present,Gender
0,520.0,Rear end,E. 5 months,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2023-11-10 11:22:24.508901,2024-06-11 11:22:24.508901,13.0,33.0,4.0,Side collision at an intersection.,Whiplash and minor bruises.,Yes,Yes,Male
1,870.0,Rear end,B. 2 months,0.0,0.0,0.0,520.0,0.0,0.0,90.0,...,2023-06-25 00:55:01.140228,2024-01-09 00:55:01.140228,4.0,45.0,2.0,Side collision at an intersection.,Minor cuts and scrapes.,Yes,Yes,Female
2,2140.0,Other side pulled out of side road,G. 7 months,0.0,0.0,0.0,1400.0,0.0,0.0,0.0,...,2020-02-23 17:43:47.805561,2020-03-01 17:43:47.805561,9.0,45.0,4.0,Lost control on a snowy road.,Whiplash and minor bruises.,Yes,No,Female


## Data Preprocessing

In [2]:
# 1. Data Preprocessing

# Drop datetime columns if they exist
datetime_columns = ["Accident Date", "Claim Date"]
data = data.drop(columns=[col for col in datetime_columns if col in data.columns])

# Convert categorical columns to numerical
categorical_columns = data.select_dtypes(include=['object']).columns
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Fill missing values if any
data = data.fillna(0)

# Scale features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features = data.drop(columns=["SettlementValue"])
features_scaled = scaler.fit_transform(features)

# Prepare final data
X = pd.DataFrame(features_scaled, columns=features.columns)
y = data["SettlementValue"]

print("Data preprocessing complete.")



Data preprocessing complete.


## Train Test Split

In [3]:
# 2. Train-Test Split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")



Training set shape: (4000, 79), Testing set shape: (1000, 79)


## Train Random Forest 

In [4]:
# 3. Train Random Forest Regressor (Separate Model)

from sklearn.ensemble import RandomForestRegressor

# Define and train Random Forest
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

print("Random Forest training complete.")


Random Forest training complete.


## Evaluate Random Forest Regressor 

In [5]:
# 4. Evaluate Random Forest Regressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Predict
y_pred_rf = rf_model.predict(X_test)

# Calculate metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mape_rf = (abs(y_test - y_pred_rf) / y_test).mean() * 100
rmse_rf = mse_rf**0.5

# Print evaluation
print("--- Random Forest Regressor ---")
print(f"Mean Absolute Error (MAE): £{mae_rf:,.2f}")
print(f"Root Mean Squared Error (RMSE): £{rmse_rf:,.2f}")
print(f"R² Score: {r2_rf:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape_rf:.2f}%")


--- Random Forest Regressor ---
Mean Absolute Error (MAE): £174.40
Root Mean Squared Error (RMSE): £434.93
R² Score: 0.7649
Mean Absolute Percentage Error (MAPE): inf%


##  Define Base Models for Stacking

In [6]:
# 5. Define Base Models for Stacking

from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import StackingRegressor

# Base models
base_models = [
    ('rf', RandomForestRegressor(random_state=42, n_estimators=100)),
    ('svr', SVR()),
    ('lasso', Lasso(alpha=0.1))
]

# Final estimator
final_estimator = Ridge(alpha=1.0)


## Build Stacking Regressor 

In [7]:
# 6. Build Stacking Regressor

stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=final_estimator,
    cv=5
)

print("Stacking Regressor built successfully.")


Stacking Regressor built successfully.


## Train Stacking Regressor

In [8]:
# 7. Train Stacking Regressor

stacking_model.fit(X_train, y_train)

print("Stacking Regressor training complete.")


Stacking Regressor training complete.


## Evaluate Stacking Regressor

In [9]:
# 8. Evaluate Stacking Regressor

# Predict
y_pred_stack = stacking_model.predict(X_test)

# Calculate metrics
mse_stack = mean_squared_error(y_test, y_pred_stack)
r2_stack = r2_score(y_test, y_pred_stack)
mae_stack = mean_absolute_error(y_test, y_pred_stack)
mape_stack = (abs(y_test - y_pred_stack) / y_test).mean() * 100
rmse_stack = mse_stack**0.5

# Print evaluation
print("--- Stacking Regressor ---")
print(f"Mean Absolute Error (MAE): £{mae_stack:,.2f}")
print(f"Root Mean Squared Error (RMSE): £{rmse_stack:,.2f}")
print(f"R² Score: {r2_stack:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape_stack:.2f}%")


--- Stacking Regressor ---
Mean Absolute Error (MAE): £176.92
Root Mean Squared Error (RMSE): £416.71
R² Score: 0.7842
Mean Absolute Percentage Error (MAPE): inf%


## Save Models and Results 

In [10]:
# 9. Save Models and Results

import os
import pickle
import pandas as pd

os.makedirs('New_Models', exist_ok=True)

# Save Random Forest model
with open('New_Models/random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

# Save Stacking model
with open('New_Models/stacking_model.pkl', 'wb') as f:
    pickle.dump(stacking_model, f)

# Save predictions
results = pd.DataFrame({
    'Actual': y_test,
    'RandomForest_Predicted': y_pred_rf,
    'Stacking_Predicted': y_pred_stack
})
results.to_csv('New_Models/model_predictions.csv', index=False)

# Save evaluation metrics
metrics = {
    'Random Forest': {
        'MAE (£)': round(mae_rf, 2),
        'RMSE (£)': round(rmse_rf, 2),
        'R² Score': round(r2_rf, 4),
        'MAPE (%)': round(mape_rf, 2)
    },
    'Stacking': {
        'MAE (£)': round(mae_stack, 2),
        'RMSE (£)': round(rmse_stack, 2),
        'R² Score': round(r2_stack, 4),
        'MAPE (%)': round(mape_stack, 2)
    }
}
with open('New_Models/evaluation_metrics.pkl', 'wb') as f:
    pickle.dump(metrics, f)

print("Models and results saved successfully.")


Models and results saved successfully.
