In [3]:
import pandas as pd
from pathlib import Path

# Load the historical fleet-deal records from a parquet file located in the
# current working directory, using the fastparquet engine.
df = pd.read_parquet(Path.cwd() / 'deals_fleet_vehicles_historical_data.parq', engine='fastparquet')

In [4]:
# Column labels of the loaded frame.
df.columns

Index(['type_insurance_offered', 'fleet_size', 'utilization_rate_per_car',
       'avg_age_of_vehicles', 'avg_driver_behaviour', 'fleet_location',
       'expected_loss'],
      dtype='object')

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,fleet_size,utilization_rate_per_car,avg_age_of_vehicles,expected_loss
count,100000.0,100000.0,100000.0,100000.0
mean,498.89038,49.02306,4.49642,239046.2
std,287.734625,28.582276,2.870002,207400.8
min,2.0,0.0,0.0,0.0
25%,249.0,24.0,2.0,78741.0
50%,497.5,49.0,4.0,183874.0
75%,748.0,74.0,7.0,342381.5
max,999.0,98.0,9.0,1200870.0


In [10]:
# Distinct values present in the insurance-type column.
list(df['type_insurance_offered'].unique())

['full', 'partial']

In [11]:

# Distinct values present in the driver-behaviour column.
list(df['avg_driver_behaviour'].unique())

['conservative', 'aggressive', 'regular']

In [12]:

# Distinct values present in the fleet-location column.
list(df['fleet_location'].unique())

['Drusselstein', 'Neutralia', 'Apollonia', 'Krakozhia', 'Peaceland']

In [19]:
# Per-column dtypes; the object columns are the ones that need encoding before modelling.
df.dtypes

type_insurance_offered       object
fleet_size                    int64
utilization_rate_per_car      int64
avg_age_of_vehicles           int64
avg_driver_behaviour         object
fleet_location               object
expected_loss               float64
dtype: object

In [22]:
# Work on a copy so the raw frame `df` stays untouched.
df_copy = df.copy()

# Replace every object (string) column with integer codes. The labels behind
# the codes are kept in `category_levels` so the encoding can be decoded or
# re-applied to new rows later: code i in column c stands for
# category_levels[c][i].
category_levels = {}
for col in df.select_dtypes(include=['object']).columns:
    codes, levels = df_copy[col].factorize()
    df_copy[col] = codes
    category_levels[col] = levels

# Show the first encoded rows as the cell output.
df_copy.head()


Unnamed: 0,type_insurance_offered,fleet_size,utilization_rate_per_car,avg_age_of_vehicles,avg_driver_behaviour,fleet_location,expected_loss
0,0,27,94,3,0,0,25407.0
1,0,694,94,1,1,1,793242.0
2,0,97,27,8,1,2,48500.0
3,0,735,83,6,0,1,619605.0
4,1,830,0,6,2,3,92960.0


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, Tensor
import numpy as np

class RegressionDataset(Dataset):
    """In-memory dataset pairing the non-target columns of a frame with its target.

    All values are converted to float64 tensors at construction time:
    ``x`` holds one row per frame row and one column per non-target column,
    ``y`` holds the target as a single-column tensor.
    """

    def __init__(self, df: pd.DataFrame, target_column: str='expected_loss'):
        # Every column other than the target is treated as a model input.
        feature_names = [name for name in df.columns.values if name != target_column]

        target_values = df[target_column].values.astype(np.float64)
        feature_values = df[feature_names].values.astype(np.float64)

        self.n = len(df)
        # Targets are stored as a column vector, i.e. shape (n, 1).
        self.y: Tensor = torch.from_numpy(target_values.reshape(-1, 1))
        self.x: Tensor = torch.from_numpy(feature_values)

    def __len__(self):
        """Number of rows in the source frame."""
        return self.n

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        sample_features = self.x[idx]
        sample_target = self.y[idx]
        return sample_features, sample_target


class LinearRegression(nn.Module):
    """Single affine layer mapping ``input_dim`` float64 features to one output."""

    def __init__(self, input_dim: int):
        super().__init__()
        # Parameters are float64 so float64 inputs can be fed in without casting.
        self.linear = nn.Linear(in_features=input_dim, out_features=1, dtype=torch.float64)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output."""
        prediction = self.linear(x)
        return prediction

def train_model(df: pd.DataFrame, target_column: str, epochs=30, learning_rate=0.01,
                batch_size: int = 16, weight_decay: float = 0.01):
    """Fit a ``LinearRegression`` to ``df`` with mini-batch SGD on an L1 objective.

    Args:
        df: Frame holding the target column plus the feature columns; every
            non-target column is used as a feature.
        target_column: Name of the column to predict.
        epochs: Number of full passes over the data.
        learning_rate: SGD step size.
        batch_size: Rows per mini-batch (default keeps the previously
            hard-coded 16).
        weight_decay: Weight-decay coefficient handed to the optimizer
            (default keeps the previously hard-coded 0.01).

    Returns:
        The trained ``LinearRegression`` module.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if df.shape[0] == 0:
        raise ValueError("train_model requires a non-empty DataFrame")

    dataset = RegressionDataset(df, target_column)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # One input per non-target column.
    model = LinearRegression(df.shape[1] - 1)
    criterion = nn.L1Loss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a short final batch does not skew the mean.
            running_loss += loss.item() * inputs.shape[0]

        # Report the mean loss over the whole epoch rather than the last batch only.
        print(f"{epoch} {running_loss / len(dataset)}")

    return model

# Train on the integer-encoded frame, using train_model's default epochs and learning rate.
model = train_model(df_copy, 'expected_loss')


In [142]:
# Persist only the learned parameters (the state_dict) to model.pt in the current working directory.
torch.save(model.state_dict(), Path.cwd() / 'model.pt')

In [None]:
# Fixed seed so the same 100 rows are drawn on every run.
# NOTE(review): df_copy is also the frame passed to train_model, so the error
# measured on this sample is in-sample rather than held-out.
df_sample = df_copy.sample(100, random_state=0)

from time import process_time_ns

def evaluate_model(df, model, target_column):
    """Score ``model`` on ``df`` and time the computation.

    The target column becomes a float64 column vector and every other column
    becomes a float64 feature matrix. The model is run without gradient
    tracking, and its predictions are compared to the targets with the mean
    absolute (L1) error.

    Returns:
        A ``(l1_error, elapsed_ns)`` tuple: the L1 error as a Python float and
        the process time, in nanoseconds, spent building the tensors, running
        the model and computing the error.
    """
    started_at = process_time_ns()

    targets: Tensor = torch.from_numpy(
        df[target_column].values.astype(np.float64).reshape(-1, 1)
    )

    # Every column other than the target is fed to the model.
    feature_names = [name for name in df.columns.values if name != target_column]
    features: Tensor = torch.from_numpy(df[feature_names].values.astype(np.float64))

    with torch.no_grad():
        predictions = model(features)

    l1_error = torch.nn.functional.l1_loss(predictions, targets)
    elapsed_ns = process_time_ns() - started_at

    return l1_error.item(), elapsed_ns

# Yields (mean absolute error, elapsed process time in ns) for the sampled rows.
evaluate_model(df_sample, model, 'expected_loss')