In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data_path = "insurance dataset.csv"
data = pd.read_csv(data_path)
data.set_index("id", inplace=True)

In [3]:
data = data.iloc[:10000,:]

In [4]:
data.shape

(10000, 20)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [6]:
numerical_pipeline = Pipeline(steps=[
    ("Impute values", SimpleImputer(strategy="median")),
    ("Standardizing the scale", StandardScaler())
     ])

categorical_pipeline = Pipeline(steps = [
    ('Impute Values', SimpleImputer(strategy="most_frequent")),
    ("Encoding data", OneHotEncoder())
])

In [7]:
X = data.drop(columns = ['Premium Amount'])
y = data['Premium Amount']
X.columns

Index(['Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Policy Start Date',
       'Customer Feedback', 'Smoking Status', 'Exercise Frequency',
       'Property Type'],
      dtype='object')

In [8]:
preprocessor = ColumnTransformer(
    transformers = [
        ("numerical", numerical_pipeline, X.select_dtypes("number").columns),
        ("categorical", categorical_pipeline, X.select_dtypes("object").columns)
    ]
)

PreProcessing_pipeline = Pipeline(steps=[("Preprocessing", preprocessor)])
print(PreProcessing_pipeline)

Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('Impute '
                                                                   'values',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('Standardizing '
                                                                   'the scale',
                                                                   StandardScaler())]),
                                                  Index(['Age', 'Annual Income', 'Number of Dependents', 'Health Score',
       'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration'],
      dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('Impute '
                            

In [9]:
X = PreProcessing_pipeline.fit_transform(X)

In [10]:
X_dense = X.toarray()

In [11]:
import torch
X_tensor = torch.tensor(X_dense)
y_tensor = torch.tensor(y)

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_tensor,y_tensor, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([8000, 9310]),
 torch.Size([8000]),
 torch.Size([2000, 9310]),
 torch.Size([2000]))

In [13]:
import torch.nn as nn
import torch.optim as optim

class RegNN(nn.Module):
    def __init__(self, input_dim) :
        super(RegNN, self).__init__()
        self.l1 = nn.Linear(input_dim, 5000)
        self.l2 = nn.Linear(5000,1000)
        self.l3 = nn.Linear(1000,100)
        self.l4 = nn.Linear(100,10)
        self.l5 = nn.Linear(10,1)
       
    def forward(self,x):
         X = self.l1(x)
         X = torch.relu(X)  # Apply ReLU activation after each layer
         X = self.l2(X)
         X = torch.relu(X)
         X = self.l3(X)
         X = torch.relu(X)
         X = self.l4(X)
         X = torch.relu(X)
         X = self.l5(X)
         return X

In [14]:
input_dim = X_train.shape[1]
model = RegNN(input_dim = input_dim)

In [15]:
loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.01)

In [16]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([8000, 9310]),
 torch.Size([8000]),
 torch.Size([2000, 9310]),
 torch.Size([2000]))

In [17]:
X_train = X_train.float()
y_train = y_train.float()
X_test = X_test.float()
y_test = y_test.float()

In [18]:
y_train = y_train.unsqueeze(1)  # Reshape from (8000,) to (8000, 1)
y_test = y_test.unsqueeze(1) 

In [19]:
epochs = 400

for epoch in range(epochs):
    model.train()
    y_pred = model(X_train)
    loss_value = loss(y_pred, y_train)
    
    optimizer.zero_grad()
    loss_value.backward()
    optimizer.step()
    
    if (epoch +1) % 20 == 0:
        print(f' training epoch {epoch + 1}. running loss is {loss_value.item():.4f}')

 training epoch 20. running loss is 770849.7500
 training epoch 40. running loss is 91050.0469
 training epoch 60. running loss is 38607.2148
 training epoch 80. running loss is 26541.7656
 training epoch 100. running loss is 22003.6719
 training epoch 120. running loss is 17640.8477
 training epoch 140. running loss is 13182.3408
 training epoch 160. running loss is 9236.7109
 training epoch 180. running loss is 6113.2881
 training epoch 200. running loss is 3877.8792
 training epoch 220. running loss is 2410.7974
 training epoch 240. running loss is 1504.9418
 training epoch 260. running loss is 954.4460
 training epoch 280. running loss is 623.3469
 training epoch 300. running loss is 415.1583
 training epoch 320. running loss is 277.9904
 training epoch 340. running loss is 185.3734
 training epoch 360. running loss is 119.9586
 training epoch 380. running loss is 77.3193
 training epoch 400. running loss is 48.7549


In [25]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X_test, y_test, loss_fn):
    model.eval()  # Set model to evaluation mode
    
    with torch.no_grad():  # Disable gradient computation during evaluation
        y_test_pred = model(X_test)  # Forward pass on test data
        
        # Calculate loss
        loss_value = loss(y_test_pred, y_test)
        
        # Calculate additional metrics (optional)
        y_test_pred_np = y_test_pred.detach().numpy()  # Convert predictions to numpy array
        y_test_np = y_test.numpy()  # Convert true values to numpy array
        
        # Mean Squared Error (MSE)
        mse = mean_squared_error(y_test_np, y_test_pred_np)
        
        # Mean Absolute Error (MAE)
        mae = mean_absolute_error(y_test_np, y_test_pred_np)
        
        # R-squared (R2)
        r2 = r2_score(y_test_np, y_test_pred_np)
        
        return loss_value.item(), mse, mae, r2

test_loss, test_mse, test_mae, test_r2 = evaluate_model(model, X_test, y_test, loss)

In [26]:
print(f"Test Loss: {test_loss:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test R2: {test_r2:.4f}")

Test Loss: 923203.5000
Test MSE: 923203.5625
Test MAE: 727.4914
Test R2: -0.3174


In [27]:
torch.save(model.state_dict(),'model_weights.pth')

In [28]:
import lightgbm as lgb

lgbregression = lgb.LGBMRegressor()

x_train, x_test, Y_train, Y_test = train_test_split(X,y, train_size=0.8, random_state=13)

In [31]:
lgbregression.fit(x_train,Y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 912
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 38
[LightGBM] [Info] Start training from score 1096.238500


In [32]:
Y_lgb_pred = lgbregression.predict(x_test)

In [33]:
MAE_lgb = mean_absolute_error(Y_test, Y_lgb_pred)
MSE_lgb = mean_squared_error(Y_test, Y_lgb_pred)
r2_lgb = r2_score(Y_test, Y_lgb_pred)

In [34]:
print(f"Test MSE: {MSE_lgb:.4f}")
print(f"Test MAE: {MAE_lgb:.4f}")
print(f"Test R2: {r2_lgb:.4f}")

Test MSE: 754350.1771
Test MAE: 671.5582
Test R2: 0.0050


In [35]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'num_leaves': [30, 40, 50,60],
    'max_depth': [-1, 10, 20, 30],
    'learning_rate': [0.05, 0.1, 0.15, 0.01],
    'n_estimators': [50, 100, 150, 200]
}

In [36]:
grid_search = GridSearchCV(estimator=lgbregression, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(x_train, Y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 912
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 38
[LightGBM] [Info] Start training from score 1096.238500


In [37]:
grid_search.best_params_

{'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 30}

In [38]:
best_model = grid_search.best_estimator_
y_grid = best_model.predict(x_test)

MAE_grid = mean_absolute_error(Y_test, y_grid)
MSE_grid = mean_squared_error(Y_test, y_grid)
r2_grid = r2_score(Y_test, y_grid)
print(f"Test MSE: {MSE_lgb:.4f}")
print(f"Test MAE: {MAE_lgb:.4f}")
print(f"Test R2: {r2_lgb:.4f}")


Test MSE: 754350.1771
Test MAE: 671.5582
Test R2: 0.0050
