In [35]:
import random
import pandas as pd
import numpy as np

import plotly
import plotly.graph_objects as go
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import (f1_score, roc_auc_score,accuracy_score,confusion_matrix,
                             precision_recall_curve, auc, roc_curve, recall_score,classification_report)

from plot_func import plot_classification_report,plot_roc,plot_confusion_matrix

import plotly.express as px

import pickle

In [14]:
# Load the cross-sell CSV, then keep only the leading 1/20th of its rows
# (integer division, so any remainder is discarded).
original_data = pd.read_csv('../Health_Insurance_Cross_Sell_Prediction.csv')
original_data = original_data.iloc[: len(original_data) // 20]

In [15]:
# Preview the first rows of the subsampled frame to sanity-check the load.
original_data.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [16]:
# Keep a handle on the column labels of the loaded frame.
columns = original_data.columns

In [17]:
# Show the column labels of the loaded frame.
columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [18]:
# Hold out 25% of the rows as `test`; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    original_data,
    test_size=0.25,
    random_state=0,
)

### Data Preprocessing



In [19]:
# Columns that get standardised by the preprocessing cells.
num_feat = [
    'Age',
    'Vintage',
]

# Columns that the preprocessing cells cast to strings (treated as categorical).
# The Vehicle_* names are the renamed one-hot columns produced during preprocessing.
cat_feat = [
    'Gender',
    'Driving_License',
    'Previously_Insured',
    'Vehicle_Age_lt_1_Year',
    'Vehicle_Age_gt_2_Years',
    'Vehicle_Damage_Yes',
    'Region_Code',
    'Policy_Sales_Channel',
]

In [20]:
# Encode gender as 0/1, one-hot encode the remaining text columns (dropping the
# first level of each), and give the Vehicle_Age dummies identifier-safe names.
train['Gender'] = train['Gender'].map({'Female': 0, 'Male': 1}).astype(int)
train = pd.get_dummies(train, drop_first=True).rename(
    columns={
        "Vehicle_Age_< 1 Year": "Vehicle_Age_lt_1_Year",
        "Vehicle_Age_> 2 Years": "Vehicle_Age_gt_2_Years",
    }
)
train = train.astype({
    'Vehicle_Age_lt_1_Year': 'int',
    'Vehicle_Age_gt_2_Years': 'int',
    'Vehicle_Damage_Yes': 'int',
})

# Standardise the numeric features; `ss` stays bound to the fitted scaler.
ss = StandardScaler().fit(train[num_feat])
train[num_feat] = ss.transform(train[num_feat])

# Rescale the premium with a min-max scaler; `mm` stays bound to the fitted scaler.
mm = MinMaxScaler().fit(train[['Annual_Premium']])
train[['Annual_Premium']] = mm.transform(train[['Annual_Premium']])

# Drop the row identifier and cast the categorical features to strings.
train = train.drop(columns='id').astype({column: 'str' for column in cat_feat})

In [21]:
# Apply the same encoding steps to the held-out split as were applied to train.
test['Gender'] = test['Gender'].map({'Female': 0, 'Male': 1}).astype(int)
test = pd.get_dummies(test, drop_first=True)
test = test.rename(columns={"Vehicle_Age_< 1 Year": "Vehicle_Age_lt_1_Year", "Vehicle_Age_> 2 Years": "Vehicle_Age_gt_2_Years"})
test['Vehicle_Age_lt_1_Year'] = test['Vehicle_Age_lt_1_Year'].astype('int')
test['Vehicle_Age_gt_2_Years'] = test['Vehicle_Age_gt_2_Years'].astype('int')
test['Vehicle_Damage_Yes'] = test['Vehicle_Damage_Yes'].astype('int')

# Reuse the scalers already fitted on the training split (`ss`, `mm`) instead of
# fitting new ones here: re-fitting on the test split would scale its features
# with different parameters than train and leak test statistics into evaluation.
test[num_feat] = ss.transform(test[num_feat])
test[['Annual_Premium']] = mm.transform(test[['Annual_Premium']])

# NOTE(review): the train cell drops 'id' but this cell keeps it — confirm
# whether that asymmetry is intended before feeding `test` to a model.
for column in cat_feat:
    test[column] = test[column].astype('str')


In [22]:
# Separate the label column from the features, then split the training frame
# again (default split size, fixed seed) into fit and evaluation subsets.
train_target = train['Response']
train = train.drop(columns=['Response'])
x_train, x_test, y_train, y_test = train_test_split(
    train,
    train_target,
    random_state=0,
)


### Using saved models

In [23]:
# Load the saved AdaBoost model. The context manager closes the file handle,
# which the bare open(...) call never did.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open('models/AdaBoost_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [46]:
# Display names of the saved models, one per line for easier diffing.
model_names = [
    "Nearest Neighbors",
    "SVC",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]


In [28]:
# Map each model's display name to the path of its pickled file:
# 'models/<name>_model.sav'. Insertion order follows the tuple below.
pickle_file_map = {
    name: 'models/' + name + '_model.sav'
    for name in (
        "Nearest Neighbors",
        "SVC",
        "Decision Tree",
        "Random Forest",
        "Neural Net",
        "AdaBoost",
        "Naive Bayes",
        "QDA",
    )
}

In [31]:
# Print the repr of every saved model. Each file is opened in a context manager
# so its handle is closed before the next one is loaded.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
for model_path in pickle_file_map.values():
    with open(model_path, 'rb') as model_file:
        print(pickle.load(model_file))

RandomizedSearchCV(cv=4, estimator=KNeighborsClassifier(), n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'leaf_size': [20, 30, 40],
                                        'metric': ['minkowski'], 'n_jobs': [-1],
                                        'n_neighbors': [4, 5, 6, 7, 8],
                                        'p': [2, 3],
                                        'weights': ['uniform', 'distance']},
                   random_state=101, verbose=1)
RandomizedSearchCV(cv=4, estimator=SVC(), n_jobs=-1,
                   param_distributions={'C': [1.0, 2.0], 'degree': [2, 3, 4],
                                        'gamma': ['scale'],
                                        'kernel': ['poly', 'rbf'],
                                        'probability': [True]},
                   random_state=101, verbose=1)
RandomizedSe

In [34]:
# Tabulate the model-name -> pickle-path mapping for a quick visual check.
pd.DataFrame(pd.Series(pickle_file_map))

Unnamed: 0,0
Nearest Neighbors,models/Nearest Neighbors_model.sav
SVC,models/SVC_model.sav
Decision Tree,models/Decision Tree_model.sav
Random Forest,models/Random Forest_model.sav
Neural Net,models/Neural Net_model.sav
AdaBoost,models/AdaBoost_model.sav
Naive Bayes,models/Naive Bayes_model.sav
QDA,models/QDA_model.sav


In [42]:
def get_ClassificationReport_test(model_name):
    """Return the classification report of a saved model on the evaluation subset.

    The model is unpickled from the path registered under ``model_name`` in the
    notebook-level ``pickle_file_map``, asked to predict on the notebook-level
    ``x_test``, and scored against ``y_test``.

    Parameters
    ----------
    model_name : str
        A key of ``pickle_file_map``.

    Returns
    -------
    The value returned by ``classification_report`` (called with
    ``zero_division=0``, so undefined metrics are reported as 0).

    Raises
    ------
    KeyError
        If ``model_name`` is not a key of ``pickle_file_map``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load model files you trust.
    # The context manager closes the file handle once the model is loaded.
    with open(pickle_file_map[model_name], 'rb') as model_file:
        model = pickle.load(model_file)
    y_pred = model.predict(x_test)
    return classification_report(y_test, y_pred, zero_division=0)

In [44]:
# print() is used so the report's line breaks render as a table.
print(get_ClassificationReport_test('Nearest Neighbors'))

              precision    recall  f1-score   support

           0       0.89      0.98      0.93      3127
           1       0.52      0.14      0.22       446

    accuracy                           0.88      3573
   macro avg       0.70      0.56      0.57      3573
weighted avg       0.84      0.88      0.84      3573



In [47]:
# List the available model names as a single-column table.
pd.DataFrame(model_names)

Unnamed: 0,0
0,Nearest Neighbors
1,SVC
2,Decision Tree
3,Random Forest
4,Neural Net
5,AdaBoost
6,Naive Bayes
7,QDA


In [56]:
# print() is used so the report's line breaks render as a table.
print(get_ClassificationReport_test('QDA'))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90      3127
           1       0.33      0.33      0.33       446

    accuracy                           0.83      3573
   macro avg       0.62      0.62      0.62      3573
weighted avg       0.83      0.83      0.83      3573

