# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import shap
import matplotlib.pyplot as plt

# Read the dataset from a CSV file into a DataFrame.
# NOTE: this is an absolute, machine-specific path; point it at your own copy of the file.
csv_path = 'C:\\Users\\Toshiba\\Documents\\Kifiya\\week 3\\Data\\converted_data.csv'
df = pd.read_csv(csv_path)

# ======================================
# Step 1: Data Preparation
# ======================================

# Fill missing values in two passes:
#   1. numeric columns receive their own column median;
#   2. every remaining (non-numeric) column receives its most frequent value.
numeric_medians = df.select_dtypes(include='number').median()
df.fillna(numeric_medians, inplace=True)

# mode() may return several rows when values tie; row 0 holds the first mode per column.
most_frequent_values = df.select_dtypes(exclude='number').mode().iloc[0]
df.fillna(most_frequent_values, inplace=True)

# Parse TransactionMonth into datetimes once, then expose its calendar parts
# (year, month, day) as separate columns.
transaction_dates = pd.to_datetime(df['TransactionMonth'])
df['TransactionMonth'] = transaction_dates
for new_column, date_part in (
    ('TransactionYear', 'year'),
    ('TransactionMonthOnly', 'month'),
    ('TransactionDay', 'day'),
):
    df[new_column] = getattr(transaction_dates.dt, date_part)

# Discard the VehicleIntroDate column.
df = df.drop(columns=['VehicleIntroDate'])



# Feature Engineering: ClaimsRatio = TotalClaims / TotalPremium.
# A zero TotalPremium yields +/-inf (or NaN for 0/0). ClaimsRatio stays in the feature
# matrix, and scikit-learn estimators reject non-finite inputs, so such ratios are set to 0.
raw_claims_ratio = df['TotalClaims'] / df['TotalPremium']
df['ClaimsRatio'] = raw_claims_ratio.replace([np.inf, -np.inf], np.nan).fillna(0.0)


# Limit the number of unique categories for the high-cardinality PostalCode feature:
# the top_n most frequent codes keep their own category, all others become 'Other'.
top_n = 100  # How many of the most frequent postal codes to keep
# Build the lookup once; recomputing value_counts() for every row made this step
# quadratic in the number of rows.
top_postal_codes = set(df['PostalCode'].value_counts().index[:top_n])
df['PostalCode'] = df['PostalCode'].apply(
    lambda code: code if code in top_postal_codes else 'Other'
)

# Encoding categorical data: one-hot encode Province, Gender and PostalCode.
df_encoded = pd.get_dummies(df, columns=['Province', 'Gender', 'PostalCode'])

# Splitting features and labels
# Target variable: TotalPremium (you can also change to TotalClaims)
y = df_encoded['TotalPremium']
X = df_encoded.drop(columns=['TotalPremium', 'TotalClaims'])

# The estimators need a purely numeric matrix, but at this point X still contains the
# datetime TransactionMonth column (and any categorical column that was not one-hot
# encoded above). Keep only numeric/boolean columns and report what was left out.
numeric_features = X.select_dtypes(include=['number', 'bool'])
excluded_columns = [column for column in X.columns if column not in numeric_features.columns]
if excluded_columns:
    print(f"Excluding non-numeric feature columns: {excluded_columns}")
X = numeric_features

# NOTE(review): ClaimsRatio is computed from TotalPremium, i.e. from the target itself,
# so keeping it in X leaks target information. It is retained here only because the
# SHAP dependence plot later in this file refers to it - confirm whether it should stay.

# Train-Test Split (80% training and 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ======================================
# Step 2: Model Building
# ======================================

# Instantiate the three regressors that will be compared:
# ordinary least squares, a 100-tree random forest and gradient-boosted trees.
lr_model = LinearRegression()
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Fit every model on the same training split, in the order they were declared.
for estimator in (lr_model, rf_model, xgb_model):
    estimator.fit(X_train, y_train)

# ======================================
# Step 3: Model Evaluation
# ======================================

# Function to evaluate models
def evaluate_model(model, X_test, y_test, model_name):
    """Print MSE, RMSE and R-squared of ``model`` on a held-out set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True target values the predictions are compared against.
        model_name: Label used in the header of the printed report.

    Returns:
        None. The metrics are written to standard output.
    """
    predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    report_lines = [
        f"{model_name} Performance:",
        f"Mean Squared Error (MSE): {squared_error:.4f}",
        f"Root Mean Squared Error (RMSE): {np.sqrt(squared_error):.4f}",
        f"R-squared (R2): {r2_score(y_test, predictions):.4f}\n",
    ]
    print("\n".join(report_lines))

# Report test-set metrics for each fitted model:
# Linear Regression first, then Random Forest, then XGBoost.
for fitted_model, display_name in (
    (lr_model, "Linear Regression"),
    (rf_model, "Random Forest"),
    (xgb_model, "XGBoost"),
):
    evaluate_model(fitted_model, X_test, y_test, display_name)

# ======================================
# Step 4: Feature Importance Analysis
# ======================================

# Feature Importance for Random Forest, sorted from most to least important
rf_importances = rf_model.feature_importances_
rf_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_importances})
rf_importance_df = rf_importance_df.sort_values(by='Importance', ascending=False)
print("\nRandom Forest Feature Importances:\n", rf_importance_df)

# Feature Importance for XGBoost, sorted from most to least important
xgb_importances = xgb_model.feature_importances_
xgb_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': xgb_importances})
xgb_importance_df = xgb_importance_df.sort_values(by='Importance', ascending=False)
print("\nXGBoost Feature Importances:\n", xgb_importance_df)

# Plotting Feature Importance for XGBoost
# One-hot encoding produces far more columns than fit legibly on a single axis, so only
# the strongest features are drawn; the complete ranking remains in xgb_importance_df.
max_plotted_features = 20
xgb_top_features = xgb_importance_df.head(max_plotted_features)
plt.figure(figsize=(10, 6))
plt.barh(xgb_top_features['Feature'], xgb_top_features['Importance'], color='skyblue')
# barh draws the first row at the bottom; flip the axis so the most important is on top.
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title(f'XGBoost Feature Importance (Top {len(xgb_top_features)})')
plt.tight_layout()  # keep long feature names from being clipped
plt.show()

# ======================================
# Step 5: Model Interpretability using SHAP
# ======================================

# Initialize SHAP for XGBoost and compute SHAP values for the test set
explainer = shap.Explainer(xgb_model)
shap_values = explainer.shap_values(X_test)

# SHAP Summary Plot (bar chart of mean absolute SHAP value per feature)
shap.summary_plot(shap_values, X_test, plot_type="bar", feature_names=X_test.columns)

# SHAP Dependence Plot for a specific feature (example: 'ClaimsRatio')
shap.dependence_plot('ClaimsRatio', shap_values, X_test, feature_names=X_test.columns)

# SHAP Force Plot for a single instance (the first test row).
# By default force_plot returns a JavaScript visualisation that only renders in a
# notebook after shap.initjs(); when run as a script the result is discarded and
# nothing is shown. matplotlib=True draws a static figure that works in both cases.
shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],
    X_test.iloc[0, :],
    feature_names=X_test.columns,
    matplotlib=True,
)

