In [36]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

In [37]:
data = pd.read_csv('data/IR_raw_data.csv')
data = data.sample(10000, random_state=13)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 549982 to 247927
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10000 non-null  int64  
 1   Age                   9850 non-null   float64
 2   Gender                10000 non-null  object 
 3   Annual Income         9607 non-null   float64
 4   Marital Status        9838 non-null   object 
 5   Number of Dependents  9097 non-null   float64
 6   Education Level       10000 non-null  object 
 7   Occupation            7002 non-null   object 
 8   Health Score          9384 non-null   float64
 9   Location              10000 non-null  object 
 10  Policy Type           10000 non-null  object 
 11  Previous Claims       6968 non-null   float64
 12  Vehicle Age           10000 non-null  float64
 13  Credit Score          8874 non-null   float64
 14  Insurance Duration    10000 non-null  float64
 15  Policy Start Date 

In [38]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [39]:
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

In [40]:
numerical_cols = data.select_dtypes(exclude='object').drop(['Premium Amount','id'], axis = 1).columns
categorical_cols = data.select_dtypes(include = 'object').columns

In [41]:
data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

In [42]:
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

In [43]:
colsfordummies = data.select_dtypes(include='object').drop(['Policy Start Date'], axis=1).columns
data = pd.get_dummies(data,columns=colsfordummies, drop_first=True)
bool_cols = data.select_dtypes(include='bool').columns
data[bool_cols] = data[bool_cols].astype('float32')


In [None]:
data['Policy Start Date'] = pd.to_datetime(data['Policy Start Date'])
data['Policy Age'] = (pd.to_datetime('2025-01-01') - data['Policy Start Date']).dt.days
data.drop(['Policy Start Date','id'], axis = 1, inplace=True)

In [45]:
data.head(5)

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount,Gender_Male,...,Policy Type_Premium,Customer Feedback_Good,Customer Feedback_Poor,Smoking Status_Yes,Exercise Frequency_Monthly,Exercise Frequency_Rarely,Exercise Frequency_Weekly,Property Type_Condo,Property Type_House,Policy Age
549982,0.130142,0.010664,-1.473089,-0.50239,-1.211843,1.102336,0.240305,0.752139,29.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,508
661271,0.204007,1.52575,0.005619,-1.518146,1.200985,-0.962769,-0.97107,0.368042,40.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1179
95378,-0.608505,-0.278691,0.005619,0.878374,1.200985,0.241875,1.752773,1.520333,3138.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,987
109118,-0.386911,3.383639,-1.473089,-0.794018,-0.005429,-0.274401,-1.062098,0.368042,1513.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,299
205036,1.459708,1.207539,-0.733735,1.380249,-1.211843,-1.479046,0.016235,-1.168347,264.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1818


In [None]:
X = data.drop(columns=['Premium Amount'], axis=1)
y = data['Premium Amount']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=13)

BatchNormalisation normalises the output from a given layer of neurons so that the weights that they learn tend to be on the same scale of things. Helps against overfitting and lets models train faster as they would need less no of epochs to train for the same accuracy as without the normalization


Drop out is for randomly zeroing the   scores learnt from a given neuraon with the input probablity so that there is less overfitting chances as the less dominant  neurons also get to get their weights trained

In [47]:
class LinearRegressionforInsurance(nn.Module):
    def __init__(self, input_dim):
        super(LinearRegressionforInsurance, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(input_dim, 1024),  # output size 1024
            nn.LeakyReLU(),
            nn.BatchNorm1d(1024),  # BatchNorm with 1024
            nn.Dropout(0.4)
        )
        self.layer2 = nn.Sequential(
            nn.Linear(1024, 512),  # output size 512
            nn.LeakyReLU(),
            nn.BatchNorm1d(512),  # BatchNorm with 512
            nn.Dropout(0.3)
        )
        self.layer3 = nn.Sequential(
            nn.Linear(512, 256),  # output size 256
            nn.LeakyReLU(),
            nn.BatchNorm1d(256),  # BatchNorm with 256
            nn.Dropout(0.2)
        )
        self.layer4 = nn.Sequential(
            nn.Linear(256, 128),  # output size 128
            nn.LeakyReLU(),
            nn.BatchNorm1d(128),  # BatchNorm with 128
            nn.Dropout(0.1)
        )
        self.layer5 = nn.Sequential(
            nn.Linear(128, 1)  # output size 1 (final prediction)
        )
    
    def forward(self, X):
        X = self.layer1(X)
        X = self.layer2(X)
        X = self.layer3(X)
        X = self.layer4(X)
        return self.layer5(X)

In [48]:
from torch.utils.data import DataLoader,TensorDataset
import numpy as np
X_train_t = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_t = torch.tensor(y_train.to_numpy(), dtype=torch.float32).unsqueeze(1)

X_test_t = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test_t = torch.tensor(y_test.to_numpy(), dtype=torch.float32).unsqueeze(1)

y_train_t = torch.log(y_train_t + 1)  
y_test_t = torch.log(y_test_t + 1)


#train_dataset = TensorDataset(X_train, y_train)
#test_dataset = TensorDataset(X_test,y_test)

#train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
#test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle = False)

In [49]:
print(X_test_t.shape[1])
print(y_test_t.shape)

29
torch.Size([3000, 1])


In [50]:
model0 = LinearRegressionforInsurance(X_train.shape[1])
loss_function = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=model0.parameters(), lr = 1e-3, weight_decay=1e-2)
# weight decay is L2 regularization 
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=125, gamma=0.5)
# the scheduler changes the learning  rate mid training

In [51]:
from torch.nn.utils.clip_grad import clip_grad_norm_

In [52]:
epochs = 500
max_grad_norm = 1.0
for epoch in range(epochs):
    model0.train()
    y_pred = model0(X_train_t)
    loss = loss_function(y_pred, y_train_t)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()
    clip_grad_norm_(model0.parameters(), max_grad_norm)
    with torch.no_grad():
        model0.eval()
        y_test_pred = model0(X_test_t)
        test_loss = loss_function(y_test_pred, y_test_t)
    if  epoch%25==0:
        print(f'EPOCH : {epoch + 1},  TRAINLOSS: {loss.item():.4f} , TESTLOSS: {test_loss.item():.4f}')

EPOCH : 1,  TRAINLOSS: 45.1829 , TESTLOSS: 56.2716
EPOCH : 26,  TRAINLOSS: 42.5894 , TESTLOSS: 42.3973
EPOCH : 51,  TRAINLOSS: 38.0598 , TESTLOSS: 37.5154
EPOCH : 76,  TRAINLOSS: 30.1734 , TESTLOSS: 29.7810
EPOCH : 101,  TRAINLOSS: 20.2751 , TESTLOSS: 20.1164
EPOCH : 126,  TRAINLOSS: 11.0875 , TESTLOSS: 10.8068
EPOCH : 151,  TRAINLOSS: 7.7115 , TESTLOSS: 6.9103
EPOCH : 176,  TRAINLOSS: 5.2461 , TESTLOSS: 4.8026
EPOCH : 201,  TRAINLOSS: 3.6210 , TESTLOSS: 2.9622
EPOCH : 226,  TRAINLOSS: 2.6521 , TESTLOSS: 2.0615
EPOCH : 251,  TRAINLOSS: 2.1651 , TESTLOSS: 1.5738
EPOCH : 276,  TRAINLOSS: 2.0167 , TESTLOSS: 1.4709
EPOCH : 301,  TRAINLOSS: 1.8937 , TESTLOSS: 1.3589
EPOCH : 326,  TRAINLOSS: 1.8463 , TESTLOSS: 1.2967
EPOCH : 351,  TRAINLOSS: 1.8266 , TESTLOSS: 1.2652
EPOCH : 376,  TRAINLOSS: 1.8145 , TESTLOSS: 1.2391
EPOCH : 401,  TRAINLOSS: 1.7634 , TESTLOSS: 1.2324
EPOCH : 426,  TRAINLOSS: 1.7212 , TESTLOSS: 1.2264
EPOCH : 451,  TRAINLOSS: 1.7484 , TESTLOSS: 1.2224
EPOCH : 476,  TRAINLOSS:

In [None]:
from sklearn.metrics import r2_score, root_mean_squared_log_error, root_mean_squared_error, mean_absolute_error

y_test_pred = model0(X_test_t)
y_test_pred_actual = torch.exp(y_test_pred) - 1
y_test_actual = torch.exp(y_test_t) - 1

# Detach the tensors and convert them to NumPy arrays
y_test_pred_actual = y_test_pred_actual.detach().numpy()
y_test_actual = y_test_actual.detach().numpy()

# During the evaluation phase:

rmsle = root_mean_squared_log_error(y_test_actual, y_test_pred_actual)
rmse = root_mean_squared_error(y_test_actual, y_test_pred_actual)
mae = mean_absolute_error(y_test_actual, y_test_pred_actual)

print(f' RMSLE: {rmsle:.4f}    | RMSE: {rmse:.4f}   | MAE {mae:.4f}')


R-squared: -0.2708   | RMSLE: 1.1016    | RMSE: 976.0803   | MAE 678.5445


In [54]:
import xgboost as xgb
y_Train_TT = np.log(y_train  + 1)
y_Test_TT = np.log(y_test  + 1)
boosting_reg = xgb.XGBRegressor()
boosting_reg.fit(X_train, y_Train_TT)
y_predXGB = boosting_reg.predict(X_test)

In [55]:
y_test_NL = np.exp(y_Test_TT) - 1
y_train_NL = np.exp(y_Train_TT) - 1 # NL IS FOR NON_LOG
error_XGB = root_mean_squared_log_error(y_Test_TT, y_predXGB)
print(error_XGB)

0.16963050893562365
