# Import the necessary libraries

In [8]:
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../scripts')))
from load_data import Load_Data
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from lime.lime_tabular import LimeTabularExplainer
from sklearn.linear_model import LinearRegression


# Load data

In [9]:
# Create an instance of CSVReader
csv_reader = Load_Data('../data/cleanded_data_set.csv')

# Load the data
csv_reader.load_data()

# Get the loaded data
df = csv_reader.get_data()

# Now, you can use raw_data as needed

  self.data = pd.read_csv(self.file_path)


Data successfully loaded from ../data/cleanded_data_set.csv


In [97]:
print(df.isnull().sum())

UnderwrittenCoverID               0
PolicyID                          0
TransactionMonth                  0
IsVATRegistered                   0
Citizenship                       0
LegalType                         0
Title                             0
Language                          0
Bank                              0
AccountType                       0
MaritalStatus                     0
Gender                            0
Country                           0
Province                          0
PostalCode                        0
MainCrestaZone                    0
SubCrestaZone                     0
ItemType                          0
mmcode                            0
VehicleType                       0
RegistrationYear                  0
make                              0
Model                             0
Cylinders                         0
cubiccapacity                     0
kilowatts                         0
bodytype                          0
NumberOfDoors               

# Categorical and Numerical columns

In [10]:
# Separate categorical and numerical columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
numerical_columns = df.select_dtypes(include=['number']).columns

# Display the lists of categorical and numerical columns
print("Categorical columns:", categorical_columns)
print("Numerical columns:", numerical_columns)

total_columns = df.shape[1]

# Print the result
print(f'Total number of columns: {total_columns}')


Categorical columns: Index(['TransactionMonth', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'VehicleType', 'make',
       'Model', 'bodytype', 'VehicleIntroDate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
       'Rebuilt', 'Converted', 'CrossBorder', 'TermFrequency',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType'],
      dtype='object')
Numerical columns: Index(['UnderwrittenCoverID', 'PolicyID', 'PostalCode', 'mmcode',
       'RegistrationYear', 'Cylinders', 'cubiccapacity', 'kilowatts',
       'NumberOfDoors', 'CustomValueEstimate', 'NumberOfVehiclesInFleet',
       'SumInsured', 'CalculatedPremiumPerTerm', 'TotalPremium', 'TotalClaims',
       'Margin', 'IsProfitable'],
      dtype='object')
Total number of

In [11]:
# Drop the unwanted columns
df = df.drop(columns=['WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet'])

# Confirm the columns are removed
print(df.columns)


Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'SumInsured',
       'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected',
       'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product',
       'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims',
       'Margin', 'IsProfitable'],
      dtype='object')


# Encoding Categorical data

In [12]:
from sklearn.preprocessing import LabelEncoder

# List of columns to encode
label_enc_cols = ['TransactionMonth', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType',
                  'MaritalStatus', 'Gender', 'Country', 'Province', 'MainCrestaZone', 'SubCrestaZone', 'ItemType',
                  'VehicleType', 'make', 'Model', 'bodytype', 'VehicleIntroDate', 'AlarmImmobiliser',
                  'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'TermFrequency', 'ExcessSelected',
                  'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType']

# Iterate over the columns to perform label encoding
for col in label_enc_cols:
    # Convert all data in the column to strings to avoid mixed data types
    df[col] = df[col].astype(str)
    
    # Handle missing or undefined values (optional but recommended)
    df[col] = df[col].fillna('unknown')
    
    # Apply label encoding
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Now your categorical columns are uniformly encoded.
print(df.head())


   UnderwrittenCoverID  PolicyID  TransactionMonth  IsVATRegistered  \
0               145249     12827                17             True   
1               145249     12827                19             True   
2               145249     12827                21             True   
3               145255     12827                19             True   
4               145255     12827                21             True   

   Citizenship  LegalType  Title  Language  Bank  AccountType  ...  CoverType  \
0            0          0      2         0     2            0  ...         21   
1            0          0      2         0     2            0  ...         21   
2            0          0      2         0     2            0  ...         21   
3            0          0      2         0     2            0  ...         13   
4            0          0      2         0     2            0  ...         13   

   CoverGroup  Section  Product  StatutoryClass  StatutoryRiskType  \
0           5   

In [13]:
# Check for non-numeric columns in X_train
non_numeric_columns = X_train.select_dtypes(include=['object']).columns

print(f"Non-numeric columns: {non_numeric_columns}")




Non-numeric columns: Index(['TransactionMonth', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'VehicleType', 'make',
       'Model', 'bodytype', 'VehicleIntroDate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
       'Rebuilt', 'Converted', 'CrossBorder', 'TermFrequency',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType'],
      dtype='object')


# Splitting data 

In [14]:

# Assuming you've calculated 'Margin' as:
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# Select features (X) and target (y)
# Features should include all encoded categorical and numerical columns except the target columns
# We'll drop only the target columns (TotalPremium, TotalClaims, Margin) from the features
X = df.drop(['TotalPremium', 'TotalClaims', 'Margin'], axis=1)  # Features include encoded categorical and numerical data
y = df['Margin']  # Target

# Split the data (using 80% for training and 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes to verify the split
print(f"Training data: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Testing data: X_test: {X_test.shape}, y_test: {y_test.shape}")




Training data: X_train: (800078, 46), y_train: (800078,)
Testing data: X_test: (200020, 46), y_test: (200020,)


 # Model Building - Linear Regression

In [17]:
linear_model = LinearRegression()

# Train the model using X_train and y_train
linear_model.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = linear_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-Squared: {r2}")


Mean Squared Error: 4828022.6916774465
R-Squared: 0.0044567883495806315


# Model Building - Random Forest

In [16]:
# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions using the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest Mean Squared Error: {mse_rf}")
print(f"Random Forest R-Squared: {r2_rf}")

Random Forest Mean Squared Error: 4837494.983934639
Random Forest R-Squared: 0.002503592008628708


# Model Building - XGBoost

In [19]:
import xgboost as xgb

# Initialize the XGBoost model
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=100, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions using the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost Mean Squared Error: {mse_xgb}")
print(f"XGBoost R-Squared: {r2_xgb}")


XGBoost Mean Squared Error: 5039523.989361433
XGBoost R-Squared: -0.039154995316537766


# Decision tree

In [31]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1. Define and Train the Decision Tree Model
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)  # Train the Decision Tree model

# 2. Make predictions using the testing set
y_pred = dt_reg.predict(X_test)

# 3. Evaluate the model
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R-Squared

# 4. Print the evaluation results
print(f"Decision Tree Mean Squared Error: {mse}")
print(f"Decision Tree R-Squared: {r2}")


Decision Tree Mean Squared Error: 9349048.072277302
Decision Tree R-Squared: -0.9277832641079313


# Model Comparison


In [32]:
print(f"Linear Regression MSE: {mse}, R²: {r2}")
print(f"Random Forest MSE: {mse_rf}, R²: {r2_rf}")
print(f"XGBoost MSE: {mse_xgb}, R²: {r2_xgb}")
print(f"XGBoost MSE: {mse_xgb}, R²: {r2_xgb}")


Linear Regression MSE: 9349048.072277302, R²: -0.9277832641079313
Random Forest MSE: 4837494.983934639, R²: 0.002503592008628708
XGBoost MSE: 5039523.989361433, R²: -0.039154995316537766
XGBoost MSE: 5039523.989361433, R²: -0.039154995316537766
