In [1]:
# Essentials
import math
import numpy as np
import pandas as pd
import os
import csv
import random

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

# utils
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.utils.class_weight import compute_class_weight

# Preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# train & valid split
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


def dimension_pca(data,config):
  scaler = RobustScaler()
  data = scaler.fit_transform(data)
  pca = PCA(n_components=config["n_component"])
  pca.fit(data)
  X_pca = pca.transform(data)
  return X_pca

# some functions
def same_seed(seed): 
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def predict(test_loader, model, device):
    # Set your model to evaluation mode.
    model.eval() 
    preds = []
    for x, y in test_loader:
        x = x.to(device)                        
        with torch.no_grad():                   
            pred = model(x)                     
            preds.append(pred.detach().cpu())   
    preds = torch.cat(preds, dim=0).numpy()  
    return preds

In [2]:
raw_data = pd.read_csv('/content/drive/Shareddrives/共用/大三上/MDS/Claim.csv').drop(['ID'], axis=1)
display(raw_data.head())
print('The dimension of train dataset: ', raw_data.shape)
# print(raw_data['area_cluster'].unique())
raw_data['OUTCOME'].value_counts()

Unnamed: 0,AGE,GENDER,RACE,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,VEHICLE_TYPE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,OUTCOME
0,65+,female,majority,0-9y,high school,upper class,0.629027,1.0,after 2015,0.0,1.0,10238,12000.0,sedan,0,0,0,0.0
1,16-25,male,majority,0-9y,none,poverty,0.357757,0.0,before 2015,0.0,0.0,10238,16000.0,sedan,0,0,0,1.0
2,16-25,female,majority,0-9y,high school,working class,0.493146,1.0,before 2015,0.0,0.0,10238,11000.0,sedan,0,0,0,0.0
3,16-25,male,majority,0-9y,university,working class,0.206013,1.0,before 2015,0.0,1.0,32765,11000.0,sedan,0,0,0,0.0
4,26-39,male,majority,10-19y,none,working class,0.388366,1.0,before 2015,0.0,0.0,32765,12000.0,sedan,2,0,1,1.0


The dimension of train dataset:  (10000, 18)


0.0    6867
1.0    3133
Name: OUTCOME, dtype: int64

In [3]:
df = raw_data
# df['make'].replace([1,2,3,4,5],['C1','C2','C3','C4','C5'], inplace=True)
# print(df['max_torque'].unique())
df.replace(['Yes', 'No'], [1,0], inplace=True)
df = pd.get_dummies(df, drop_first=True)
display(df.head())

Unnamed: 0,CREDIT_SCORE,VEHICLE_OWNERSHIP,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,OUTCOME,...,DRIVING_EXPERIENCE_10-19y,DRIVING_EXPERIENCE_20-29y,DRIVING_EXPERIENCE_30y+,EDUCATION_none,EDUCATION_university,INCOME_poverty,INCOME_upper class,INCOME_working class,VEHICLE_YEAR_before 2015,VEHICLE_TYPE_sports car
0,0.629027,1.0,0.0,1.0,10238,12000.0,0,0,0,0.0,...,0,0,0,0,0,0,1,0,0,0
1,0.357757,0.0,0.0,0.0,10238,16000.0,0,0,0,1.0,...,0,0,0,1,0,1,0,0,1,0
2,0.493146,1.0,0.0,0.0,10238,11000.0,0,0,0,0.0,...,0,0,0,0,0,0,0,1,1,0
3,0.206013,1.0,0.0,1.0,32765,11000.0,0,0,0,0.0,...,0,0,0,0,1,0,0,1,1,0
4,0.388366,1.0,0.0,0.0,32765,12000.0,2,0,1,1.0,...,1,0,0,1,0,0,0,1,1,0


In [4]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(raw_data.mean(),inplace=True)
percentage_result = df.isnull().sum()/raw_data.shape[0]
percentage_result = pd.DataFrame({"missing": percentage_result}) #"columns": percentage_result.keys(), 
print(percentage_result.sort_values(by='missing', ascending=False).head())

                          missing
CREDIT_SCORE                  0.0
GENDER_male                   0.0
VEHICLE_YEAR_before 2015      0.0
INCOME_working class          0.0
INCOME_upper class            0.0


  df.fillna(raw_data.mean(),inplace=True)


In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
X = df.drop('OUTCOME',axis=1)
Y = df['OUTCOME']
# print(Y)
# print(X.columns)

In [6]:
# # He initialization
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.kaiming_uniform_(m.weight)

# subclassing nn.Module to build a network
class Network(nn.Module):
    def __init__(self, input_dim):
        super(Network, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 40),
            nn.Dropout(0.25),
            # activation functions
            nn.ReLU(),
            nn.Linear(40, 25),
            # Dropout to solve overfitting
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(25, 6),
            # Dropout to solve overfitting
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(6,2),
            nn.Softmax(dim=1)
        )
        self.layers.apply(init_weights)

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1)
        return x

# Choose a device to run the model
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
config = {
    'seed': 507,
    'valid_ratio': 0.2,
    'test_ratio': 0.1,   
    'n_epochs': 200,             
    'batch_size': 64, 
    'learning_rate': 4e-3,              
    'early_stop': 50,
    'n_component': 0.99,      
    'save_path': './model.ckpt'  
}


# PCA
# X_pca = X
X_pca = dimension_pca(X, config)
print(X_pca.shape[1])

model = Network(input_dim = X_pca.shape[1]).to(device)
# Loss Function
class_weights = np.array([1, 1])
class_weights=torch.tensor(class_weights,dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight = class_weights)

# optimzer
optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay = 3e-7) 
# Set seed for reproducibility
same_seed(config['seed'])

Using cpu device
22


In [7]:
from torch.utils.data.sampler import WeightedRandomSampler
#Dataset
class Dataset(Dataset):
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.LongTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)


# DataLoader
train_x, test_x, train_y, test_y = train_test_split(X_pca, Y.values, test_size = config['test_ratio'], random_state = config['seed'])
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size = config['valid_ratio'], random_state = config['seed'])


train_dataset, valid_dataset, test_dataset = Dataset(train_x, train_y), Dataset(valid_x, valid_y), Dataset(test_x, test_y)
print(np.count_nonzero(test_y == 0))

# Sampler
class_count = np.array(np.unique(train_y, return_counts=True))[1]
print(np.array(np.unique(train_y, return_counts=True)))
class_weights = 1.0/class_count
print(class_weights)
sample_weights = np.array([class_weights[int(i)] for i in train_y] ,dtype=np.float16)
sample_weights = torch.from_numpy(sample_weights)
sampler = WeightedRandomSampler(sample_weights.type('torch.DoubleTensor'),len(sample_weights))

# Pytorch data loader loads pytorch dataset into batches.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True, sampler = sampler)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

674
[[0.000e+00 1.000e+00]
 [4.971e+03 2.229e+03]]
[0.00020117 0.00044863]


In [8]:

# Train the model
def trainer(train_loader, valid_loader, model, config, device):

    n_epochs = config['n_epochs']
    best_loss, step, early_stop_count = math.inf, 0, 0

    for epoch in range(n_epochs):
        model.train()
        loss_record = []

        for x, y in train_loader:
            # gradient = zero.
            optimizer.zero_grad()               
            x, y = x.to(device), y.to(device)   
            pred = model(x)             
            loss = criterion(pred, y)
            # L2 norm
            l2_lambda = 0.01
            l2_reg = torch.tensor(0.)
            l2_reg = l2_reg.to(device)
            for param in model.parameters():
              l2_reg += torch.norm(param)
            loss += l2_lambda * l2_reg            
            loss.backward()                    
            optimizer.step()                    
            step += 1
            loss_record.append(loss.detach().item())

        mean_train_loss = sum(loss_record)/len(loss_record)

        # model evaluation, no dropout
        model.eval() 
        loss_record = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)

            loss_record.append(loss.item())
            
        mean_valid_loss = sum(loss_record)/len(loss_record)
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')


        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) 
            print('Saving model with loss {:.3f}'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\n Model is not improving, stop training session.')
            return

trainer(train_loader, valid_loader, model, config, device)



Epoch [1/200]: Train loss: 0.7847, Valid loss: 0.5034
Saving model with loss 0.503
Epoch [2/200]: Train loss: 0.6754, Valid loss: 0.4993
Saving model with loss 0.499
Epoch [3/200]: Train loss: 0.6292, Valid loss: 0.4825
Saving model with loss 0.482
Epoch [4/200]: Train loss: 0.6134, Valid loss: 0.4898
Epoch [5/200]: Train loss: 0.5992, Valid loss: 0.4893
Epoch [6/200]: Train loss: 0.5947, Valid loss: 0.4919
Epoch [7/200]: Train loss: 0.5856, Valid loss: 0.4871
Epoch [8/200]: Train loss: 0.5833, Valid loss: 0.4817
Saving model with loss 0.482
Epoch [9/200]: Train loss: 0.5837, Valid loss: 0.4833
Epoch [10/200]: Train loss: 0.5826, Valid loss: 0.4938
Epoch [11/200]: Train loss: 0.5819, Valid loss: 0.4825
Epoch [12/200]: Train loss: 0.5797, Valid loss: 0.4925
Epoch [13/200]: Train loss: 0.5790, Valid loss: 0.5000
Epoch [14/200]: Train loss: 0.5755, Valid loss: 0.4774
Saving model with loss 0.477
Epoch [15/200]: Train loss: 0.5764, Valid loss: 0.4970
Epoch [16/200]: Train loss: 0.5764, Val

In [9]:
model.load_state_dict(torch.load(config['save_path']))
preds = predict(test_loader, model, device) 

In [10]:
def test(model, test_loader):
    model.eval()
    test_loss = []
    correct = 0
    y_pred = []
    y_true = []
    for x, y in test_loader:
      with torch.no_grad():
        # Prediction
        x,y = x.to(device), y.to(device)
        output = model(x)
        loss = criterion(output, y)
        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(y.view_as(pred)).sum().item() # how many predictions in this batch are correct
        y_pred.extend(pred.view(-1).detach().cpu().numpy())
        # print(y_pred.count(1))
        y_true.extend(y.view(-1).detach().cpu().numpy())
      test_loss.append(loss.item())
    cf_matrix = confusion_matrix(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print('Confusion Matrix: \n',cf_matrix,'\nF1 score: ',f1,'\n')
    mean_test_loss = sum(test_loss)/len(test_loss)

    print(f'\nTest set: Average loss: {mean_test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n')
    print(classification_report(y_true, y_pred, target_names = ['not claim','is claim']))
test(model, test_loader)

Confusion Matrix: 
 [[561 113]
 [ 50 276]] 
F1 score:  0.772027972027972 


Test set: Average loss: 0.4729, Accuracy: 837/1000 (84%)

              precision    recall  f1-score   support

   not claim       0.92      0.83      0.87       674
    is claim       0.71      0.85      0.77       326

    accuracy                           0.84      1000
   macro avg       0.81      0.84      0.82      1000
weighted avg       0.85      0.84      0.84      1000



In [11]:
with open('pred.csv', 'w') as fp:
  writer = csv.writer(fp)
  writer.writerow(['INDEX', 'Is_claim'])
  for i, p in enumerate(preds):
    writer.writerow([i+1, p])     