## 5305-Final Project

In [2]:
import torch 
import pandas as pd
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import seaborn as sns 
import matplotlib.pyplot as plt
import torch.optim as optim

In [14]:
# Load the raw insurance premium dataset from a CSV file located next to the notebook.
df = pd.read_csv('./Insurance Premium Prediction Dataset.csv')

In [16]:
# Preview the first rows to get a feel for the columns and their raw values.
df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,56.0,Male,99990.0,Married,1.0,Master's,,31.074627,Urban,Comprehensive,,13,320.0,5,308.0,2022-12-10 15:21:39.078837,Poor,Yes,Daily,Condo
1,46.0,Male,2867.0,Single,1.0,Bachelor's,,50.271335,Urban,Comprehensive,,3,694.0,4,517.0,2023-01-31 15:21:39.078837,Good,Yes,Monthly,House
2,32.0,Female,30154.0,Divorced,3.0,Bachelor's,,14.714909,Suburban,Comprehensive,2.0,16,652.0,8,849.0,2023-11-26 15:21:39.078837,Poor,No,Monthly,House
3,60.0,Female,48371.0,Divorced,0.0,PhD,Self-Employed,25.346926,Rural,Comprehensive,1.0,11,330.0,7,927.0,2023-02-27 15:21:39.078837,Poor,No,Rarely,Condo
4,25.0,Female,54174.0,Divorced,0.0,High School,Self-Employed,6.659499,Urban,Comprehensive,,9,,8,303.0,2020-11-25 15:21:39.078837,Poor,No,Rarely,Condo


In [17]:
# For every column, report whether it contains at least one missing value.
print(df.isnull().any())

Age                      True
Gender                  False
Annual Income            True
Marital Status           True
Number of Dependents     True
Education Level         False
Occupation               True
Health Score             True
Location                False
Policy Type             False
Previous Claims          True
Vehicle Age             False
Credit Score             True
Insurance Duration      False
Premium Amount           True
Policy Start Date       False
Customer Feedback        True
Smoking Status          False
Exercise Frequency      False
Property Type           False
dtype: bool


In [18]:
# Summarize column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278860 entries, 0 to 278859
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Age                   274175 non-null  float64
 1   Gender                278860 non-null  object 
 2   Annual Income         264905 non-null  float64
 3   Marital Status        273841 non-null  object 
 4   Number of Dependents  250974 non-null  float64
 5   Education Level       278860 non-null  object 
 6   Occupation            197572 non-null  object 
 7   Health Score          268263 non-null  float64
 8   Location              278860 non-null  object 
 9   Policy Type           278860 non-null  object 
 10  Previous Claims       197572 non-null  float64
 11  Vehicle Age           278860 non-null  int64  
 12  Credit Score          250974 non-null  float64
 13  Insurance Duration    278860 non-null  int64  
 14  Premium Amount        277019 non-null  float64
 15  

In [23]:
# Per-column count of missing values together with its share of all rows (in percent).
missing_summary = (
    df.isnull()
    .sum()
    .rename_axis('Column')
    .reset_index(name='Missing Values')
)
missing_summary['% Missing'] = missing_summary['Missing Values'].div(len(df)).mul(100)
print(missing_summary)

                  Column  Missing Values  % Missing
0                    Age            4685   1.680055
1                 Gender               0   0.000000
2          Annual Income           13955   5.004303
3         Marital Status            5019   1.799828
4   Number of Dependents           27886  10.000000
5        Education Level               0   0.000000
6             Occupation           81288  29.150111
7           Health Score           10597   3.800115
8               Location               0   0.000000
9            Policy Type               0   0.000000
10       Previous Claims           81288  29.150111
11           Vehicle Age               0   0.000000
12          Credit Score           27886  10.000000
13    Insurance Duration               0   0.000000
14        Premium Amount            1841   0.660188
15     Policy Start Date               0   0.000000
16     Customer Feedback           18349   6.580004
17        Smoking Status               0   0.000000
18    Exerci

## Defining a custom `InsuranceDataset` class that inherits from PyTorch's `Dataset` class

In [3]:
class InsuranceDataset(Dataset):
    """Tabular regression dataset read from a CSV file.

    Every column except ``target_column`` is treated as a feature. All values
    are converted to ``float32`` tensors once, at construction time, so that
    ``__getitem__`` is a cheap tensor index instead of a pandas row lookup
    followed by a fresh tensor allocation for every sample.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the imputed
            insurance data file used by this notebook.
        target_column: Name of the column holding the regression target.

    Attributes:
        data: The full DataFrame as read from ``csv_path``.
        X: DataFrame with the feature columns (``data`` without the target).
        y: Series with the target column.

    Raises:
        KeyError: If ``target_column`` is not a column of the CSV (raised by pandas).
        ValueError: If a column cannot be converted to ``float32``.
    """

    def __init__(self, csv_path='./insurance_data_imputed.csv', target_column='premium_amount'):
        self.data = pd.read_csv(csv_path)
        self.X = self.data.drop(target_column, axis=1)
        self.y = self.data[target_column]

        # Convert once up front; per-item `.iloc` access plus `torch.tensor(...)`
        # dominates the cost of an epoch on a dataset of this size.
        self._features = torch.tensor(self.X.to_numpy(dtype='float32'), dtype=torch.float32)
        self._targets = torch.tensor(self.y.to_numpy(dtype='float32'), dtype=torch.float32)

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for row ``idx`` as ``float32`` tensors.

        The returned tensors are views into the dataset's internal storage;
        callers that want to modify them in place should ``clone()`` first.
        """
        return self._features[idx], self._targets[idx]

# Build the dataset used for the train/test/validation split further down.
dataset = InsuranceDataset()

In [4]:
# List the feature columns, i.e. every column except the regression target.
dataset.X.columns


Index(['age', 'gender', 'annual_income', 'marital_status',
       'number_of_dependents', 'education_level', 'health_score', 'location',
       'policy_type', 'previous_claims', 'credit_score', 'insurance_duration',
       'smoking_status', 'exercise_frequency', 'occupation_employed',
       'occupation_self_employed', 'occupation_unemployed'],
      dtype='object')

In [5]:
# Feedforward neural network with two hidden layers (64 and 32 units by default).

class FeedForwardNN(nn.Module):
    """Fully connected regressor: input -> hidden 1 -> hidden 2 -> one output.

    Each hidden layer is followed by ReLU and dropout. The output layer is
    linear, so the network emits one unbounded value per input row.

    Args:
        input_dim: Number of input features. Defaults to 17, the number of
            feature columns of the dataset used in this notebook.
        hidden_dim1: Width of the first hidden layer.
        hidden_dim2: Width of the second hidden layer.
        dropout_p: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim=17, hidden_dim1=64, hidden_dim2=32, dropout_p=0.5):
        super().__init__()
        # Attribute names (fc1, fc2, output) are kept stable so that previously
        # saved state dicts still load.
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.output = nn.Linear(hidden_dim2, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` predictions."""
        # Use the registered activation module rather than the functional
        # torch.relu, so the module declared in __init__ is the one applied.
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))

        return self.output(x)

model = FeedForwardNN()

In [6]:
# Split the dataset 70% / 15% / 15% into training, test and validation subsets.
train_size = int(0.7 * len(dataset))
test_size = int(0.15 * len(dataset))
# The validation set takes the remainder so the three sizes always sum to len(dataset).
val_size = len(dataset) - train_size - test_size

# Seed the split explicitly: without a fixed generator the membership of each
# subset changes on every run, which makes reported metrics irreproducible.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size, val_size], generator=split_generator
)

# Only the training loader needs shuffling; evaluation metrics do not depend
# on sample order, so the test/validation loaders iterate deterministically.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [7]:
# Mean absolute error as the training objective; Adam with a small L2 penalty.
criteria = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

# Training the model
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0   # sum of per-sample losses seen in this epoch
    samples_seen = 0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.float()  # Convert inputs to float
        labels = labels.float()  # Convert labels to float

        # Forward pass
        # squeeze(-1) drops only the trailing feature axis: (batch, 1) -> (batch,).
        # A bare squeeze() would also collapse the batch axis of a size-1 batch,
        # leaving a 0-dim output that no longer matches the (1,) label shape.
        outputs = model(inputs).squeeze(-1)
        loss = criteria(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is a batch mean; weight it by the batch size so the epoch
        # average stays correct even when the last batch is smaller.
        batch_size = inputs.size(0)
        epoch_loss += loss.item() * batch_size
        samples_seen += batch_size

        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

    # Per-epoch summary; single-batch losses are too noisy to judge convergence.
    print(f'Epoch [{epoch+1}/{num_epochs}], Mean training loss: {epoch_loss / max(samples_seen, 1):.4f}')

Epoch [1/100], Step [100/4091], Loss: 1048.6968
Epoch [1/100], Step [200/4091], Loss: 647.5167
Epoch [1/100], Step [300/4091], Loss: 520.4061
Epoch [1/100], Step [400/4091], Loss: 701.0190
Epoch [1/100], Step [500/4091], Loss: 524.4326
Epoch [1/100], Step [600/4091], Loss: 782.1561
Epoch [1/100], Step [700/4091], Loss: 781.2236
Epoch [1/100], Step [800/4091], Loss: 568.8615
Epoch [1/100], Step [900/4091], Loss: 587.7375
Epoch [1/100], Step [1000/4091], Loss: 858.5614
Epoch [1/100], Step [1100/4091], Loss: 858.4792
Epoch [1/100], Step [1200/4091], Loss: 903.1886
Epoch [1/100], Step [1300/4091], Loss: 627.6864
Epoch [1/100], Step [1400/4091], Loss: 761.5317
Epoch [1/100], Step [1500/4091], Loss: 584.5656
Epoch [1/100], Step [1600/4091], Loss: 786.5484
Epoch [1/100], Step [1700/4091], Loss: 797.1525
Epoch [1/100], Step [1800/4091], Loss: 686.7686
Epoch [1/100], Step [1900/4091], Loss: 793.7996
Epoch [1/100], Step [2000/4091], Loss: 532.8842
Epoch [1/100], Step [2100/4091], Loss: 490.3727


In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Function to evaluate metrics
def evaluate_metrics(model, data_loader):
    """Compute MSE, MAE and R² of ``model`` over all batches of ``data_loader``.

    The model is switched to evaluation mode and run without gradient
    tracking. Predictions and labels of every batch are flattened to one
    dimension and concatenated before the metrics are computed, so batches of
    any size (including a trailing batch with a single sample) are handled.

    Args:
        model: Module mapping a batch of inputs to one prediction per row.
        data_loader: Iterable yielding ``(inputs, labels)`` tensor pairs.

    Returns:
        Tuple ``(mse, mae, r2)``.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()  # Set model to evaluation mode
    batch_predictions = []
    batch_labels = []

    with torch.no_grad():  # Disable gradient computation
        for inputs, labels in data_loader:
            outputs = model(inputs.float())
            # reshape(-1) always yields a 1-D tensor; squeeze() would turn a
            # single-sample batch into a 0-dim tensor that cannot be extended
            # into a list or concatenated with the other batches.
            batch_predictions.append(outputs.reshape(-1))
            batch_labels.append(labels.float().reshape(-1))

    if not batch_predictions:
        raise ValueError("data_loader yielded no batches; cannot compute metrics")

    # One concatenation instead of growing Python lists element by element.
    y_true = torch.cat(batch_labels).cpu().numpy()
    y_pred = torch.cat(batch_predictions).cpu().numpy()

    # Calculate metrics
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    return mse, mae, r2

# Evaluate the trained model on the held-out test loader and print the three
# regression metrics, each rounded to four decimal places.
mse, mae, r2 = evaluate_metrics(model, test_loader)
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R²): {r2:.4f}")

Mean Squared Error (MSE): 896230.0000
Mean Absolute Error (MAE): 657.5516
R-squared (R²): -0.1003
