# Binary Classification of Insurance Cross Selling 

In [1]:
import polars as pl
import torch

device = "cuda" if torch.cuda.is_available else "cpu"
torch.set_default_device(device)
print(torch.cuda.get_device_name(torch.cuda.current_device()))

NVIDIA GeForce GTX 1650 SUPER


## Data Formatting

In [2]:
df = pl.read_csv("train.csv")

In [3]:
df.head()

id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
i64,str,i64,i64,f64,i64,str,str,f64,f64,i64,i64
0,"""Male""",21,1,35.0,0,"""1-2 Year""","""Yes""",65101.0,124.0,187,0
1,"""Male""",43,1,28.0,0,"""> 2 Years""","""Yes""",58911.0,26.0,288,1
2,"""Female""",25,1,14.0,1,"""< 1 Year""","""No""",38043.0,152.0,254,0
3,"""Female""",35,1,1.0,0,"""1-2 Year""","""Yes""",2630.0,156.0,76,0
4,"""Female""",36,1,15.0,1,"""1-2 Year""","""No""",31951.0,152.0,294,0


### Fields that are fine as is
We'll just have them return themselves


In [4]:
def identity(col) -> pl.Series: return col

### Functions for values that need to be converted to binary/ternary values: <ul>
<li>Gender</li>
<li>Vehicle Age</li>
<li>Vehicle Damage</li>
</ul>

In [5]:
def make_gender_binary(col: pl.Series) -> pl.Series:
    return col.replace({
        "Male": 0,
        "Female": 1
    })

In [6]:
def string_age_to_int(age):
    if age == "> 2 Years": return 2
    
    elif age == "1-2 Year": return 1

    else: return 0

def make_vehicle_age_binary(col: pl.Series) -> pl.Series:
    return col.replace({
        "> 2 Years": 2,
        "1-2 Year": 1,
        "< 1 Year": 0
    })

In [7]:
def make_vehicle_damage_binary(col: pl.Series) -> pl.Series:
    return col.replace({
        "Yes": 1,
        "No": 0
    })

### Standardizing Numerical Cols
To be standardized:
<ul>
<li>Age</li>
<li>Annual Premium</li>
<li>Vintage</li>
</ul>

In [8]:
def standardize(col: pl.Series) -> pl.Series:
    return (col - col.mean()) / col.std()

### Numeric Values Without Ordering
<i>i.e. the columns that are numbers, but don't have any intrinsic meaning to those numbers </i>
<ul>
<li>Region Code</li>
<li>Sales Channel</li>
</ul>

In [9]:
def non_ordered_numeric_values(col: pl.Series, reasonable_arg_count = 20) -> pl.DataFrame:
    return col.to_dummies()

### Formatting It All
(Gender,	Age,	Driving_License,	Region_Code,	Previously_Insured,	Vehicle_Age,	Vehicle_Damage,	Annual_Premium,	Policy_Sales_Channel,	Vintage	Response)

In [10]:
default_features = {
    "Gender": make_gender_binary,
    "Age": standardize,
    "Driving_License": identity,
    "Region_Code": "dummies",
    "Previously_Insured": identity,
    "Vehicle_Age": make_vehicle_age_binary,
    "Vehicle_Damage": make_vehicle_damage_binary,
    "Annual_Premium": standardize,
    "Policy_Sales_Channel": "dummies",
    "Vintage": standardize
}

def format_df(df: pl.DataFrame, features: dict = default_features) -> pl.DataFrame:
    df_copy = df.__copy__()

    offset = 0
    n = len(df.columns)
    for i in range(n):
        col = df.columns[i]

        if col in features.keys():
            if features[col] == "dummies":
                df_copy = df_copy.to_dummies(col)
                offset = len(df_copy.columns) - n
            else: 
                replace: pl.Series = features[col](df[col])
                replace = replace.rename(replace.name + "_new")
                df_copy = df_copy.replace_column(i + offset, replace)
    
    for non_used in set(df.columns) - set(features.keys()):
        df_copy = df_copy.drop(non_used)
    print(df_copy.columns)
    df_copy = df_copy.cast(pl.Float64)
    return df_copy

## Making The Model

In [11]:
train_X = format_df(df)
train_y = df["Response"]

print(train_X)
print(train_y)

['Gender_new', 'Age_new', 'Driving_License_new', 'Region_Code_0.0', 'Region_Code_1.0', 'Region_Code_10.0', 'Region_Code_11.0', 'Region_Code_12.0', 'Region_Code_13.0', 'Region_Code_14.0', 'Region_Code_15.0', 'Region_Code_16.0', 'Region_Code_17.0', 'Region_Code_18.0', 'Region_Code_19.0', 'Region_Code_2.0', 'Region_Code_20.0', 'Region_Code_21.0', 'Region_Code_22.0', 'Region_Code_23.0', 'Region_Code_24.0', 'Region_Code_25.0', 'Region_Code_26.0', 'Region_Code_27.0', 'Region_Code_28.0', 'Region_Code_29.0', 'Region_Code_3.0', 'Region_Code_30.0', 'Region_Code_31.0', 'Region_Code_32.0', 'Region_Code_33.0', 'Region_Code_34.0', 'Region_Code_35.0', 'Region_Code_36.0', 'Region_Code_37.0', 'Region_Code_38.0', 'Region_Code_39.0', 'Region_Code_39.2', 'Region_Code_4.0', 'Region_Code_40.0', 'Region_Code_41.0', 'Region_Code_42.0', 'Region_Code_43.0', 'Region_Code_44.0', 'Region_Code_45.0', 'Region_Code_46.0', 'Region_Code_47.0', 'Region_Code_48.0', 'Region_Code_49.0', 'Region_Code_5.0', 'Region_Code_50.0

### Layers

In [12]:
import torch.nn as nn

model = nn.Sequential(
    nn.Linear(in_features=len(train_X.columns), out_features=100),
    nn.ReLU(),
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, 50),
    nn.ReLU(),
    nn.Linear(50, 25),
    nn.ReLU(),
    nn.Linear(25, 10),
    nn.ReLU(),
    nn.Linear(10, 2),
    nn.Softmax(dim = 0)
)

loss_fn = nn.MSELoss()

In [13]:
torch_train_X = train_X.to_torch()
torch_train_Y = train_y.to_torch()

In [14]:
import torch.utils.data as data

class UserData(data.Dataset):
    def __init__(self, X: torch.Tensor, y: torch.Tensor):
        self.X: torch.Tensor = X
        self.y: torch.Tensor = y
    
    def __len__(self):
        return self.y.shape[0]
    
    def __getitem__(self, index):
        params = self.X[index]
        label = self.y[index]
        
        return params, label

In [17]:
n = torch_train_Y.shape[0]

split = int(n * .95 // 1)

training_d = UserData(torch_train_X[:split], torch_train_Y[:split])
validation_d = UserData(torch_train_X[split:], torch_train_Y[split:])

training_data_loader = data.DataLoader(dataset=training_d, shuffle=True, generator=torch.Generator(device=device), pin_memory=True, pin_memory_device="cuda:0")
validation_data_loader = data.DataLoader(dataset=validation_d, pin_memory=True, pin_memory_device="cuda:0")

In [18]:
epochs = 1
learning_rate = 0.05

for e in range(epochs):
    t = 1
    interval_loss_sum = 0
    interval_length = 1000

    for batch, labels in training_data_loader:
        batch = batch.to(device).to(torch.float32); labels = labels.to(device).to(torch.torch.float32)
        
        y_pred = model(batch)
        loss = loss_fn(input = y_pred, target = labels)
        
        interval_loss_sum += loss
        if t % interval_length == 0:
            print(interval_loss_sum)
            interval_loss_sum = 0

        model.zero_grad()
        loss.backward()

        with torch.no_grad():
            for param in model.parameters():
                param -= learning_rate * param.grad
            
        t += 1

  return func(*args, **kwargs)


tensor(887., device='cuda:0', grad_fn=<AddBackward0>)
tensor(884., device='cuda:0', grad_fn=<AddBackward0>)
tensor(867., device='cuda:0', grad_fn=<AddBackward0>)
tensor(884., device='cuda:0', grad_fn=<AddBackward0>)
tensor(870., device='cuda:0', grad_fn=<AddBackward0>)
tensor(868., device='cuda:0', grad_fn=<AddBackward0>)
tensor(879., device='cuda:0', grad_fn=<AddBackward0>)
tensor(869., device='cuda:0', grad_fn=<AddBackward0>)
tensor(856., device='cuda:0', grad_fn=<AddBackward0>)
tensor(867., device='cuda:0', grad_fn=<AddBackward0>)
tensor(858., device='cuda:0', grad_fn=<AddBackward0>)
tensor(879., device='cuda:0', grad_fn=<AddBackward0>)
tensor(864., device='cuda:0', grad_fn=<AddBackward0>)
tensor(879., device='cuda:0', grad_fn=<AddBackward0>)
tensor(891., device='cuda:0', grad_fn=<AddBackward0>)
tensor(886., device='cuda:0', grad_fn=<AddBackward0>)
tensor(846., device='cuda:0', grad_fn=<AddBackward0>)
tensor(878., device='cuda:0', grad_fn=<AddBackward0>)
tensor(882., device='cuda:0'

KeyboardInterrupt: 