In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv('Caravan.csv')
df.head()

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,Purchase
0,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,No
1,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,No
2,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,No
3,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,No
4,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,No


In [4]:
df.pivot_table(index = ['PVRAAUT'], values = ['Purchase'],aggfunc = 'count')

Unnamed: 0_level_0,Purchase
PVRAAUT,Unnamed: 1_level_1
0,5813
4,1
6,7
9,1


In [5]:
df.pivot_table(index = ['AVRAAUT'], values = ['Purchase'],aggfunc = 'count')

Unnamed: 0_level_0,Purchase
AVRAAUT,Unnamed: 1_level_1
0,5813
1,6
2,2
3,1


In [6]:
df1 = df.drop(['PVRAAUT','AVRAAUT'], axis = 1)
df1[:5]

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,Purchase
0,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,No
1,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,No
2,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,No
3,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,No
4,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,No


In [7]:
#PVRAAUT and AVRAAUT are dropped due to highly unbalanced. 

Fit a random forest model with 500 trees and max_features = 29 to the training set 
<br>Response: Purchase
<br>Predictors: Other variables

In [8]:
y = df1.Purchase
X = df1.drop('Purchase', axis = 1)

In [9]:
X_train = X[0:1000]
X_test = X[1000:]
y_train = y[0:1000]
y_test = y[1000:]

In [10]:
X_train.shape, X_test.shape

((1000, 83), (4822, 83))

In [11]:
forest = RandomForestClassifier(max_features = 29,
                                n_estimators = 500,
                                random_state = 1)
forest.fit(X_train,y_train)
forest.score(X_test, y_test)

0.9317710493571132

In [12]:
Importance = pd.DataFrame({'Importance':forest.feature_importances_*100},
                          index = X_train.columns)
df_imp = Importance.sort_values(by = 'Importance',axis = 0,
                             ascending = False)
df_imp.index[0]

'PPERSAUT'

In [None]:
#PPERSAUT is the most important predictors with 0.932 test accuracy rate.

Fit a boosting model to the training set with max_depth = 4  
Response: Purchase
<br>Predictors: Other variables

In [13]:
boost = GradientBoostingClassifier(n_estimators = 1000, 
                                    learning_rate = 0.01,
                                    max_depth = 4, 
                                    random_state =1)
boost.fit(X_train,y_train)
boost.score(X_test, y_test)

0.9332227291580257

In [14]:
Importance2 = boost.feature_importances_*100
Importance2 = pd.DataFrame({'Importance':Importance2},index = X.columns)
df_imp2 = Importance2.sort_values(by = 'Importance',axis = 0,
                             ascending = False)
df_imp2.index[0]

'PPERSAUT'

In [15]:
#PPERSAUT is the most important predictors with 0.933 test accuracy rate.

Use KNN to predict Purchase using 5-fold cross validation

In [16]:
kfold = StratifiedKFold(n_splits = 5,shuffle = True,random_state=1)

In [20]:
acc_rate = []
for k in range(1,15):
    scaler = MinMaxScaler()
    model1 =  KNeighborsClassifier(n_neighbors = k)
    pipe1 = Pipeline([('transformer1', scaler), ('estimator1', model1)])
    scores = cross_val_score(pipe1,X_test,y_test,cv=kfold)
    acc_rate.append(scores.mean())

In [18]:
max(acc_rate)

0.9400664330402254

In [19]:
acc_rate.index(max(acc_rate))

11

Use Logistic Regression to predict Purchase using 5-fold cross validation

In [23]:
model1 = LogisticRegression(max_iter = 1000, random_state = 1)
model1.fit(X_train,y_train)
yhat = model1.predict(X_test)
model1.score(X_test,y_test)

0.9346744089589382

In [24]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
Logi_purchase = LogisticRegression(solver = 'lbfgs',max_iter = 5000,
                                   random_state = 1)
Logi_purchase.fit(X_train_scaled,y_train)
ypred = Logi_purchase.predict(X_test_scaled)

In [26]:
pd.crosstab(y_test,ypred,rownames=['y_test'],colnames=['ypred'])

ypred,No,Yes
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4532,1
Yes,288,1


In [28]:
#Test Accuracy Rate

In [27]:
Logi_purchase.score(X_test_scaled,y_test)

0.9400663625051846