In [3]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost
import shap
import lime
import lime.lime_tabular




file_path = '../data/MachineLearningRating_v3.txt'

df = pd.read_csv(file_path, sep='|')

# Explore the dataset
print(df.head())


  df = pd.read_csv(file_path, sep='|')


   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...                    ExcessSelected CoverCategory  \
0  Current account  ...             Mobility - 

In [4]:
# Define target and features
X = df.drop(['TotalPremium', 'TotalClaims'], axis=1)
y = df[['TotalPremium', 'TotalClaims']]

# Handling missing data and feature engineering
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing for numeric data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [9]:
# Ensure consistent data types in columns
for col in X.columns:
    if X[col].dtype == 'object':
        # Check if there are numeric values in categorical columns
        try:
            X[col] = X[col].astype(float)  # Convert to float if possible
        except ValueError:
            pass  # If conversion fails, it remains as a string (categorical)

# Alternatively, you can convert any mixed-type column explicitly to a string
X[categorical_features] = X[categorical_features].astype(str)

# Now the rest of the preprocessing pipeline should work as expected


In [8]:
import xgboost as xgb

# Define target and features
X = df.drop(['TotalPremium', 'TotalClaims'], axis=1)  # Feature variables
y = df[['TotalPremium', 'TotalClaims']]               # Target variables

# Ensure consistent data types in columns (prevent mixed types)
for col in X.columns:
    if X[col].dtype == 'object':
        # Attempt to convert numeric-like strings to floats, otherwise keep as categorical
        try:
            X[col] = X[col].astype(float)
        except ValueError:
            pass  # Keep as string if conversion fails

# Make sure all categorical features are treated as strings
X[categorical_features] = X[categorical_features].astype(str)

# Handling missing data and feature engineering
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing for numeric data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),    # Impute missing numeric values with median
    ('scaler', StandardScaler())                     # Standardize numeric features
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Impute missing categorical values with 'missing'
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
])

# Combine preprocessing steps into one ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define models and create pipelines for each
models = {
    'Linear Regression': Pipeline(steps=[('preprocessor', preprocessor),
                                         ('regressor', LinearRegression())]),
    
    'Random Forest': Pipeline(steps=[('preprocessor', preprocessor),
                                     ('regressor', RandomForestRegressor(n_estimators=100))]),
    
    'XGBoost': Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', xgb.XGBRegressor(objective='reg:squarederror'))])
}

# Train models and evaluate
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    results[name] = score
    print(f'{name} R^2 score: {score:.4f}')

# Display final model results
print("\nModel Results:", results)



TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['float', 'int', 'str']

In [6]:
print(X_train.dtypes)

UnderwrittenCoverID           int64
PolicyID                      int64
TransactionMonth             object
IsVATRegistered                bool
Citizenship                  object
LegalType                    object
Title                        object
Language                     object
Bank                         object
AccountType                  object
MaritalStatus                object
Gender                       object
Country                      object
Province                     object
PostalCode                    int64
MainCrestaZone               object
SubCrestaZone                object
ItemType                     object
mmcode                      float64
VehicleType                  object
RegistrationYear              int64
make                         object
Model                        object
Cylinders                   float64
cubiccapacity               float64
kilowatts                   float64
bodytype                     object
NumberOfDoors               

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate models
for name, model in models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'{name} Mean Squared Error: {mse:.4f}')


In [None]:
# Feature importance for RandomForest and XGBoost
importances = {}
for name, model in models.items():
    if name in ['Random Forest', 'XGBoost']:
        # Extract feature importances
        model.fit(X_train, y_train)  # Re-fit to ensure feature importance is available
        importances[name] = model.named_steps['regressor'].feature_importances_

# For SHAP values (Example with XGBoost)
explainer = shap.Explainer(models['XGBoost'].named_steps['regressor'])
shap_values = explainer(X_test)

# Plot SHAP values
shap.summary_plot(shap_values, X_test)


In [None]:
# Report comparison between models
import matplotlib.pyplot as plt

# Plotting feature importances
plt.figure(figsize=(10, 6))
for name, importance in importances.items():
    plt.plot(importance, label=name)
plt.title('Feature Importances')
plt.xlabel('Feature Index')
plt.ylabel('Importance')
plt.legend()
plt.show()

# Print overall comparison
print("Model Comparison Report:")
for name, score in results.items():
    print(f"{name}: R^2 score: {score:.4f}")
