In [32]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [131]:
training_data = "./insurance/insurance_training.csv"
testing_data_file = "./insurance/insurance_predict.csv"
Y_testing_data_file = "./insurance/insurance_sample.csv"

In [132]:
data = pd.read_csv(training_data)
testing_data = pd.read_csv(testing_data)

In [133]:
columns = testing_data.columns.values

In [134]:
le = LabelEncoder()

In [136]:
for column in columns[1:-1]:
    temp_data = data[column].append(testing_data[column])
    print(column, temp_data.unique())
    le.fit(temp_data.unique())
    data[column] = le.transform(data[column])
    testing_data[column] = le.transform(testing_data[column])
    
# 0/1 for the column that is to be predicted
# this is done seperately so as to avoid the error with NaN values for labelEncoder
data[columns[-1]] = data[columns[-1]].map(dict(Yes=1, No=0 ))

Sex [1 0]
Age.Class [3 0 1 4 2 5]
Education [3 0 2 1]
Marital.Status [1 4 0 5 2 3]
Income.Group [0 2 3 1]
Need.help.making.financial.decisions [2 3 0 4 1]
Do.not.like.to.be.in.debt.at.anytime [2 4 1 0 3]
Feel.satistifed.when.I.get.a.really.good.deal [4 2 0 1 3]
Like.to.buy.products.with.prestigious.brand.names [0 4 3 2 1]
Always.try.to.buy.things.on.sale [4 1 0 2 3]
Once.find.brand.that.satistifies..don.t.experiment [0 4 3 2 1]
Confident.will.have.enough.money.to.retire [4 0 1 2 3]


In [127]:
# print(data.head())
print(testing_data.head())

  Occupation.Class  Sex  Age.Class  Education  Marital.Status  Income.Group  \
0      Blue Collar    0          2          3               1             2   
1      Blue Collar    1          1          3               4             2   
2     White Collar    1          3          0               1             2   
3              NaN    1          0          3               4             0   
4              NaN    1          5          2               1             0   

   Need.help.making.financial.decisions  Do.not.like.to.be.in.debt.at.anytime  \
0                                     0                                     1   
1                                     0                                     1   
2                                     0                                     0   
3                                     4                                     0   
4                                     3                                     1   

   Feel.satistifed.when.I.get.a.really

In [126]:
# data.head()

In [42]:
X_data = data[columns[1:-1]]
Y_data = data[columns[-1]]
# X_data, Y_data

In [43]:
# X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2)

In [158]:
from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [72]:
# Logistic Regression
l = LogisticRegression()
# l.fit(X_train,Y_train)
# Y_pred = l.predict_proba(X_test)
# print(log_loss(Y_test,Y_pred))
score = cross_val_score(l, X_data, Y_data, scoring='neg_log_loss', cv=5)  
score.mean()



-0.6615584106474699

In [73]:
# Random Forest
r = RandomForestClassifier(n_estimators=50,max_depth=5)
# r.fit(X_train,Y_train)
# Y_pred = r.predict_proba(X_test)
# print(log_loss(Y_test,Y_pred))
score = cross_val_score(r, X_data, Y_data, scoring='neg_log_loss', cv=5)  
score.mean()

-0.6414113726530675

In [98]:
# AdaBoost Algorithm
ab = AdaBoostClassifier(n_estimators=100, learning_rate=0.01)
# n_estimators : 50 (default value) 
# base_estimator : DecisionTreeClassifier (default value)
# ab.fit(X_train,Y_train)
# Y_pred = ab.predict(X_test)
# print(log_loss(Y_test,Y_pred))
score = cross_val_score(ab, X_data, Y_data, scoring='neg_log_loss', cv=5)  
score.mean()

-0.669829090021354

In [123]:
# AdaBoost Algorithm with Support Vector Classifier
svcl=SVC(probability=True, kernel='linear')
abc = AdaBoostClassifier(n_estimators=5, base_estimator=svcl, learning_rate=0.1)
abc.fit(X_train, Y_train)
Y_pred = abc.predict(X_test)
print(log_loss(Y_test,Y_pred))

12.232613241113272


In [94]:
# Gradient Boosting 
gb = GradientBoostingClassifier(n_estimators=50, learning_rate=0.01, max_depth=15, random_state=0)
score = cross_val_score(gb, X_data, Y_data, scoring='neg_log_loss', cv=5)  
score.mean()

-0.7135096807841211

In [109]:
# Decision Tree 
dt = DecisionTreeClassifier(max_depth=3)
score = cross_val_score(dt, X_data, Y_data, scoring='neg_log_loss', cv=5)  
score.mean()

-0.6654351959948451

In [122]:
# KNN
knn = KNeighborsClassifier(n_neighbors=200)
score = cross_val_score(knn, X_data, Y_data, scoring='neg_log_loss', cv=5)  
score.mean()

-0.6570058728645133

In [124]:
# Support Vector Classifier
svcr = SVC(gamma='scale', kernel='rbf', probability=True)
score = cross_val_score(svcr, X_data, Y_data, scoring='neg_log_loss', cv=5)  
score.mean()


-0.6757158209965047

In [125]:
vc = VotingClassifier(estimators=[('dt', dt), ('knn', knn), ('svcr', svcr), ('gb', gb), ('ab', ab), ('l', l), ('r', r)],
                      voting='soft', weights=[1, 1, 1, 1, 1, 1, 1])
score = cross_val_score(vc, X_data, Y_data, scoring='neg_log_loss', cv=5)  
score.mean()



-0.6538049362949223

In [150]:
# testing_data.head()
X_final_test = testing_data[columns[1:-1]]

In [151]:
X_final_test.head()

Unnamed: 0,Sex,Age.Class,Education,Marital.Status,Income.Group,Need.help.making.financial.decisions,Do.not.like.to.be.in.debt.at.anytime,Feel.satistifed.when.I.get.a.really.good.deal,Like.to.buy.products.with.prestigious.brand.names,Always.try.to.buy.things.on.sale,Once.find.brand.that.satistifies..don.t.experiment,Confident.will.have.enough.money.to.retire
0,0,2,3,1,2,0,1,1,1,1,1,2
1,1,1,3,4,2,0,1,1,0,0,1,0
2,1,3,0,1,2,0,0,1,1,1,1,1
3,1,0,3,4,0,4,0,2,2,2,4,0
4,1,5,2,1,0,3,1,3,1,3,1,1


In [152]:
Y_final_test = pd.read_csv(Y_testing_data_file)
Y_final_test[columns[-1]] = Y_final_test[columns[-1]].map(dict(Yes=1, No=0))

In [153]:
Y_final_test.head()

Unnamed: 0,Have.life.insurance
0,1
1,0
2,1
3,1
4,1


In [154]:
vc.fit(X_data, Y_data)
Y_final_pred = vc.predict(X_final_test)
print(log_loss(Y_final_test,Y_final_pred))



16.66508954512681


In [162]:
Y_final_pred.sum(), Y_final_test.sum(), Y_final_test.count()

(119, Have.life.insurance    182
 dtype: int64, Have.life.insurance    400
 dtype: int64)

In [159]:
print(f1_score(Y_final_test, Y_final_pred))

0.3588039867109635
