In [3]:
#importing library all the classification models
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
import warnings
warnings.filterwarnings(action='ignore')


In [5]:
#loading the data
df=pd.read_csv('../input/carinsurance/carInsurance_train.csv')
df

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance
0,1,32,management,single,tertiary,0,1218,1,0,telephone,28,jan,2,-1,0,,13:45:20,13:46:30,0
1,2,32,blue-collar,married,primary,0,1156,1,0,,26,may,5,-1,0,,14:49:03,14:52:08,0
2,3,29,management,single,tertiary,0,637,1,0,cellular,3,jun,1,119,1,failure,16:30:24,16:36:04,1
3,4,25,student,single,primary,0,373,1,0,cellular,11,may,2,-1,0,,12:06:43,12:20:22,1
4,5,30,management,married,tertiary,0,2694,0,0,cellular,3,jun,1,-1,0,,14:35:44,14:38:56,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3996,28,technician,single,tertiary,0,0,1,0,cellular,25,may,1,40,2,failure,17:46:28,17:50:57,1
3996,3997,49,admin.,divorced,secondary,0,124,1,1,cellular,29,jul,19,-1,0,,14:49:16,14:51:21,0
3997,3998,27,admin.,single,secondary,0,-400,0,1,cellular,8,jul,1,-1,0,,12:19:03,12:23:53,0
3998,3999,36,entrepreneur,single,tertiary,0,658,1,0,cellular,29,jan,1,227,3,failure,11:27:35,11:29:14,0


In [6]:
#getting information about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                4000 non-null   int64 
 1   Age               4000 non-null   int64 
 2   Job               3981 non-null   object
 3   Marital           4000 non-null   object
 4   Education         3831 non-null   object
 5   Default           4000 non-null   int64 
 6   Balance           4000 non-null   int64 
 7   HHInsurance       4000 non-null   int64 
 8   CarLoan           4000 non-null   int64 
 9   Communication     3098 non-null   object
 10  LastContactDay    4000 non-null   int64 
 11  LastContactMonth  4000 non-null   object
 12  NoOfContacts      4000 non-null   int64 
 13  DaysPassed        4000 non-null   int64 
 14  PrevAttempts      4000 non-null   int64 
 15  Outcome           958 non-null    object
 16  CallStart         4000 non-null   object
 17  CallEnd       

# Preprocessing

In [89]:
def preprocess_inputs(df):
    df=df.copy()
    #dropping the id  and Outcome columns
    df.drop(['Id','Outcome'],axis=1,inplace=True)
     #filling categorical missing values with column mode
    for column in ['Job','Education','Communication']:
        df[column]=df[column].fillna(df[column].mode()[0])
    #Encode duration feature
    df['CallDuration']=(pd.to_datetime(df['CallEnd'])-pd.to_datetime(df['CallStart'])).apply(lambda x:x.seconds)
    #dropping the callstart and callend
    df.drop(['CallStart','CallEnd'],axis=1,inplace=True)
    df['Communication']=df['Communication'].replace({'telephone':0,'cellular':1})
    #ordinal encoding
    df['Education']=df['Education'].replace({'primary':0,'secondary':1,'tertiary':2})
    df['LastContactMonth']=df['LastContactMonth'].replace({'jan':0, 'may':4, 'jun':5, 'mar':2, 'nov':10, 'jul':6, 'aug':7, 'sep':8, 'apr':3,
       'feb':1, 'oct':9, 'dec':11})
    #One-hot encoding
    for column in ['Job','Marital']:
        df=onehot_encode(df,column)
    #Split the data into target and feature 
    y=df['CarInsurance']
    x=df.drop('CarInsurance',axis=1)
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=1)
    scaler=StandardScaler()
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),columns=x_train.columns,index=x_train.index)
    x_test= pd.DataFrame(scaler.transform(x_test),columns=x_test.columns,index=x_test.index)
    #saling x,columns=x_train.column,index=x_train.index)

    return x_train,x_test,y_train,y_test

In [64]:
df['LastContactMonth'].unique()

array(['jan', 'may', 'jun', 'mar', 'nov', 'jul', 'aug', 'sep', 'apr',
       'feb', 'oct', 'dec'], dtype=object)

In [90]:
x_train,x_test,y_train,y_test=preprocess_inputs(df)
x_train

Unnamed: 0,Age,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,...,Job_management,Job_retired,Job_self-employed,Job_services,Job_student,Job_technician,Job_unemployed,Marital_divorced,Marital_married,Marital_single
1850,0.819191,-1.794479,-0.115721,-0.232386,1.015114,-0.389602,0.269047,1.674151,-0.485851,-0.196439,...,-0.539368,3.738083,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
2293,-0.980287,1.262741,-0.115721,-0.467176,1.015114,2.566725,0.269047,-0.812400,0.297780,-0.196439,...,-0.539368,-0.267517,5.196152,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
576,0.305054,-0.265869,-0.115721,-0.424115,1.015114,2.566725,0.269047,1.555744,-0.485851,0.120764,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
2731,0.990570,1.262741,-0.115721,-0.285504,-0.985111,-0.389602,0.269047,0.134857,1.865043,-0.513642,...,1.854023,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
1051,-0.637530,1.262741,-0.115721,-0.305620,-0.985111,-0.389602,0.269047,-0.575586,0.689596,0.120764,...,1.854023,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3839,0.047986,-1.794479,-0.115721,-0.428516,-0.985111,-0.389602,0.269047,-1.167622,0.689596,-0.196439,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
1096,-0.380461,-0.265869,-0.115721,-0.436373,1.015114,-0.389602,0.269047,-0.457179,-0.485851,-0.513642,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438
3980,0.647812,-1.794479,-0.115721,-0.441402,1.015114,-0.389602,-3.716829,-0.220364,-0.485851,-0.196439,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,-1.163160,1.521055
235,-0.123393,1.262741,-0.115721,-0.446745,-0.985111,-0.389602,0.269047,0.490079,1.865043,-0.513642,...,-0.539368,-0.267517,-0.192450,-0.294884,-0.176939,-0.44798,-0.18638,-0.374873,0.859727,-0.657438


In [91]:
y_train

1850    0
2293    0
576     0
2731    1
1051    0
       ..
3839    0
1096    0
3980    0
235     0
1061    0
Name: CarInsurance, Length: 2800, dtype: int64

In [81]:
#creating function to one hot encode
def onehot_encode(df,column):
    df=df.copy()
    dummies=pd.get_dummies(df[column],prefix=column)
    df=pd.concat([df,dummies],axis=1)
    df=df.drop(column,axis=1)
    return df

# Training the Model

In [95]:
models={'Logistic Regression':LogisticRegression(),
'K-Nearest Neighbors':KNeighborsClassifier(),
'Decision Tree': DecisionTreeClassifier(),
'Support Vector Machine':LinearSVC(),
'Support Vector Machine(RBF Kernal)':SVC(),
'Neutral Network':MLPClassifier(),
'Random Forest':RandomForestClassifier(),
'Gradient Boosting':GradientBoostingClassifier()}


In [96]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name+'trained.')

Logistic Regressiontrained.
K-Nearest Neighborstrained.
Decision Treetrained.
Support Vector Machinetrained.
Support Vector Machine(RBF Kernal)trained.
Neutral Networktrained.
Random Foresttrained.
Gradient Boostingtrained.


# Results

In [98]:
#getting score for each model
for name,model in models.items():
    print(name+':{:.2f}%'.format(model.score(x_test,y_test)*100))

Logistic Regression:78.58%
K-Nearest Neighbors:73.17%
Decision Tree:77.08%
Support Vector Machine:78.50%
Support Vector Machine(RBF Kernal):78.75%
Neutral Network:78.83%
Random Forest:83.25%
Gradient Boosting:83.42%


In [36]:
#getting percentage of missing values in each column
x.isna().mean()

Age                 0.0
Job                 0.0
Marital             0.0
Education           0.0
Default             0.0
Balance             0.0
HHInsurance         0.0
CarLoan             0.0
Communication       0.0
LastContactDay      0.0
LastContactMonth    0.0
NoOfContacts        0.0
DaysPassed          0.0
PrevAttempts        0.0
CallStart           0.0
CallEnd             0.0
CarInsurance        0.0
dtype: float64

In [38]:
{column:len(x[column].unique()) for column in x.select_dtypes('object').columns}

{'Job': 11,
 'Marital': 3,
 'Education': 3,
 'Communication': 2,
 'LastContactMonth': 12,
 'CallStart': 3777,
 'CallEnd': 3764}

In [47]:
(pd.to_datetime(df['CallEnd'])-pd.to_datetime(df['CallStart'])).apply(lambda x:x.seconds)

0        70
1       185
2       340
3       819
4       192
       ... 
3995    269
3996    125
3997    290
3998     99
3999    274
Length: 4000, dtype: int64

In [52]:
df['Communication'].unique()

array(['telephone', nan, 'cellular'], dtype=object)