In [1]:
import pandas as pd 
import numpy as np 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 


import arff
data_freq = arff.load('freMTPL2freq.arff') 
df_freq = pd.DataFrame(data_freq, columns=["IDpol", "ClaimNb", "Exposure", "Area", "VehPower", "VehAge","DrivAge", "BonusMalus", "VehBrand", "VehGas", "Density", "Region"])
df_freq.head()
df_freq.info()
df_freq.describe()

duplicates = df_freq[df_freq.duplicated(subset=['IDpol'], keep=False)]
print(duplicates.head())

data_sev = arff.load('freMTPL2sev.arff') 
df_sev = pd.DataFrame(data_sev, columns=["IDpol", "PurePremium"])
df_sev.head()
df_sev.info()
df_sev.describe()


duplicates = df_sev[df_sev.duplicated(subset=['IDpol'], keep=False)]
print(duplicates.head())
duplicates.info()
## I see 3068 rows of duplicates

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 678013 entries, 0 to 678012
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   IDpol       678013 non-null  float64
 1   ClaimNb     678013 non-null  float64
 2   Exposure    678013 non-null  float64
 3   Area        678013 non-null  object 
 4   VehPower    678013 non-null  float64
 5   VehAge      678013 non-null  float64
 6   DrivAge     678013 non-null  float64
 7   BonusMalus  678013 non-null  float64
 8   VehBrand    678013 non-null  object 
 9   VehGas      678013 non-null  object 
 10  Density     678013 non-null  float64
 11  Region      678013 non-null  object 
dtypes: float64(8), object(4)
memory usage: 62.1+ MB
Empty DataFrame
Columns: [IDpol, ClaimNb, Exposure, Area, VehPower, VehAge, DrivAge, BonusMalus, VehBrand, VehGas, Density, Region]
Index: []
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26639 entries, 0 to 26638
Data columns (total 2 columns)

In [2]:
df_sev_updated = df_sev.groupby('IDpol')['PurePremium'].sum().reset_index()
# 24950 rows × 2 columns

# Perform an inner Join to merge the two dataframes
inner_join_df = pd.merge(df_freq, df_sev_updated, on='IDpol', how='inner')

combined_df = inner_join_df.copy()

print(combined_df.info())
combined_df.head()
combined_df.info()
combined_df.describe()


# Create the new column "PurePremium" by dividing "PurePremium" by "Exposure"
combined_df['PurePremium'] = combined_df['PurePremium'] / combined_df['Exposure']

# Create the new column "Frequency" by dividing "ClaimNb" by "Exposure"
combined_df['Frequency'] = combined_df['ClaimNb'] / combined_df['Exposure']

# Remove the columns 
combined_df.pop('IDpol')
combined_df.pop('ClaimNb')
combined_df.pop('Exposure')
combined_df.info()


y_premium_column = combined_df.pop('PurePremium')
combined_df['PurePremium'] = y_premium_column


<class 'pandas.core.frame.DataFrame'>
Int64Index: 24944 entries, 0 to 24943
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   IDpol        24944 non-null  float64
 1   ClaimNb      24944 non-null  float64
 2   Exposure     24944 non-null  float64
 3   Area         24944 non-null  object 
 4   VehPower     24944 non-null  float64
 5   VehAge       24944 non-null  float64
 6   DrivAge      24944 non-null  float64
 7   BonusMalus   24944 non-null  float64
 8   VehBrand     24944 non-null  object 
 9   VehGas       24944 non-null  object 
 10  Density      24944 non-null  float64
 11  Region       24944 non-null  object 
 12  PurePremium  24944 non-null  float64
dtypes: float64(9), object(4)
memory usage: 2.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24944 entries, 0 to 24943
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   IDpol 

In [3]:

import numpy as np

# Calculate the Z-score for 'PurePremium' column
z_scores = np.abs((combined_df['PurePremium'] - combined_df['PurePremium'].mean()) / combined_df['PurePremium'].std())

# Define the threshold for outliers (e.g., Z-score greater than 3)
threshold = 3

# Find the indices of outliers
outlier_indices = np.where(z_scores > threshold)[0]

# Create a DataFrame containing only the outliers
df_outliers = combined_df.iloc[outlier_indices]

df_scaled_cleaned = combined_df.drop(outlier_indices)

combined_df = df_scaled_cleaned.copy()
combined_df.info()

# Log transform the 'PurePremium' column
combined_df['PurePremium'] = np.log(combined_df['PurePremium'])

print(combined_df.describe())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24899 entries, 0 to 24943
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Area         24899 non-null  object 
 1   VehPower     24899 non-null  float64
 2   VehAge       24899 non-null  float64
 3   DrivAge      24899 non-null  float64
 4   BonusMalus   24899 non-null  float64
 5   VehBrand     24899 non-null  object 
 6   VehGas       24899 non-null  object 
 7   Density      24899 non-null  float64
 8   Region       24899 non-null  object 
 9   Frequency    24899 non-null  float64
 10  PurePremium  24899 non-null  float64
dtypes: float64(7), object(4)
memory usage: 2.3+ MB
           VehPower        VehAge       DrivAge    BonusMalus       Density  \
count  24899.000000  24899.000000  24899.000000  24899.000000  24899.000000   
mean       6.469577      7.384554     45.150649     64.917708   1983.996546   
std        2.013247      5.173428     14.651704     19.8

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Features to be Normalised
numerical_features = ['Frequency', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']

# Features to be encoded
columns_to_encode = ['Area', 'VehBrand', 'VehGas', 'Region']

# Copy the DataFrame to avoid modifying the original
data = combined_df.copy()

# Define the pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('scaler', StandardScaler())                     # Scale the features
])

# Apply the pipeline to numerical features
data[numerical_features] = pipeline.fit_transform(data[numerical_features])

import pickle

# Save the scaler used during training
with open('scaler.pkl', 'wb') as file:
    pickle.dump(pipeline.named_steps['scaler'], file)

# Apply one-hot encoding
data = pd.get_dummies(data, columns=columns_to_encode, drop_first=True)


y_premium_column = data.pop('PurePremium')
data['PurePremium'] = y_premium_column

df = data.copy()
df.info()
df.describe()
print("Missing values:\n", df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24899 entries, 0 to 24943
Data columns (total 44 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   VehPower        24899 non-null  float64
 1   VehAge          24899 non-null  float64
 2   DrivAge         24899 non-null  float64
 3   BonusMalus      24899 non-null  float64
 4   Density         24899 non-null  float64
 5   Frequency       24899 non-null  float64
 6   Area_'B'        24899 non-null  uint8  
 7   Area_'C'        24899 non-null  uint8  
 8   Area_'D'        24899 non-null  uint8  
 9   Area_'E'        24899 non-null  uint8  
 10  Area_'F'        24899 non-null  uint8  
 11  VehBrand_'B10'  24899 non-null  uint8  
 12  VehBrand_'B11'  24899 non-null  uint8  
 13  VehBrand_'B12'  24899 non-null  uint8  
 14  VehBrand_'B13'  24899 non-null  uint8  
 15  VehBrand_'B14'  24899 non-null  uint8  
 16  VehBrand_'B2'   24899 non-null  uint8  
 17  VehBrand_'B3'   24899 non-null 

In [6]:
# Removing Outliers from numerical features
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming df is your DataFrame containing the data
columns_of_interest = ['Frequency', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']

# Step 2: Z-Score Method
from scipy import stats

z_scores = np.abs(stats.zscore(df[columns_of_interest]))
threshold = 3
outliers_z = np.where(z_scores > threshold)

# Step 3: IQR Method
Q1 = df[columns_of_interest].quantile(0.25)
Q3 = df[columns_of_interest].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr = ((df[columns_of_interest] < lower_bound) | (df[columns_of_interest] > upper_bound)).any(axis=1)

# Combine results
outliers = set(outliers_z[0]) | set(outliers_iqr.index[outliers_iqr])

# Remove outliers identified by IQR method
df_no_outliers_iqr = df[~outliers_iqr]
df = df_no_outliers_iqr.copy()



In [22]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error, explained_variance_score

# Assuming X contains features and y contains the target variable 'PurePremium'
X = df.drop(columns=['PurePremium'])
y = df['PurePremium']

# Splitting the data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Define hyperparameters for XGBoost
param_grid = {
    'n_estimators': [10, 20, 30, 40],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
}

# Initialize XGBoost regressor
xgb_reg = XGBRegressor(random_state=0)

# Perform random search with cross-validation
random_search = RandomizedSearchCV(estimator=xgb_reg, param_distributions=param_grid, 
                                   n_iter=50, scoring='neg_mean_squared_error', cv=5, 
                                   verbose=2, random_state=42, n_jobs=-1)

# Fit the random search to the training data
random_search.fit(X_train, y_train)

# Get the best estimator and its hyperparameters
best_estimator = random_search.best_estimator_
best_params = random_search.best_params_

print("Best Hyperparameters:", best_params)

import pickle
# Save the trained model to a pickle file
with open('xgb_model.pkl', 'wb') as file:
    pickle.dump(best_estimator, file)

# Predictions on the validation set
y_pred_val = np.exp(best_estimator.predict(X_val))  # Perform inverse log transformation

# Calculate evaluation metrics for validation set
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = np.sqrt(mse_val)
r2_val = r2_score(y_val, y_pred_val)
medae_val = median_absolute_error(y_val, y_pred_val)
explained_var_val = explained_variance_score(y_val, y_pred_val)

# Predictions on the test set
y_pred_test = np.exp(best_estimator.predict(X_test))  # Perform inverse log transformation

# Calculate evaluation metrics for test set
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred_test)
medae_test = median_absolute_error(y_test, y_pred_test)
explained_var_test = explained_variance_score(y_test, y_pred_test)

# Predictions on the training set
y_pred_train = np.exp(best_estimator.predict(X_train))  # Perform inverse log transformation

# Calculate evaluation metrics for training set
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_pred_train)
medae_train = median_absolute_error(y_train, y_pred_train)
explained_var_train = explained_variance_score(y_train, y_pred_train)



Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'subsample': 0.9, 'n_estimators': 30, 'max_depth': 3, 'learning_rate': 0.2, 'colsample_bytree': 0.8}


## Predicting the Premium for Input Data as follows: Frequency calculated separately for ClaimNb = 1	Exposure = 0.17
#### 'Frequency': [5.88],
#### 'Area': ['C'],
####  'VehPower': [5],
####  'VehAge': [5],
####  'DrivAge': [41],
####  'BonusMalus': [68],
####  'VehBrand': ['B4'],
####  'VehGas': ['Diesel'],
####  'Density': [160],
####  'Region': ['R23']



In [20]:
import pandas as pd
import pickle

# Load the trained model from the pickle file
with open('xgb_model.pkl', 'rb') as file:
    trained_model = pickle.load(file)

# Load the trained model from the pickle file
with open('xgb_model.pkl', 'rb') as file:
    trained_model = pickle.load(file)

# Load the scaler used during training
with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

# Define the new data
new_data = pd.DataFrame({
    'Frequency': [5.88],
    'Area': ['C'],
    'VehPower': [5],
    'VehAge': [5],
    'DrivAge': [41],
    'BonusMalus': [68],
    'VehBrand': ['B4'],
    'VehGas': ['Diesel'],
    'Density': [160],
    'Region': ['R23']
})

# Encode categorical variables
new_data = pd.get_dummies(new_data, columns=['Area', 'VehBrand', 'VehGas', 'Region'])

# Scale numerical features using the same StandardScaler instance from training
numerical_features = ['Frequency', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']
new_data[numerical_features] = scaler.transform(new_data[numerical_features])

# Rearrange columns to match the order of the columns in the training data
new_data = new_data.reindex(columns=df.columns[:-1], fill_value=0)

# Predict PurePremium using the trained model
predicted_pure_premium = trained_model.predict(new_data)

# Perform inverse log transformation
predicted_pure_premium = np.exp(predicted_pure_premium)

# Print the predicted PurePremium
print("\nPredicted PurePremium:", predicted_pure_premium)


Predicted PurePremium: [5802.4556]




## Predicted Premium to be Paid: 5802.4556