# Predictive Modeling Part2

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [38]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [40]:
caravan = pd.read_csv('Caravan.csv')
caravan.shape

(5822, 86)

In [41]:
caravan.iloc[:5,-13:]

Unnamed: 0,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,Purchase
0,0,0,0,0,0,0,1,0,0,0,0,0,No
1,0,0,0,0,0,0,1,0,0,0,0,0,No
2,0,0,0,0,0,0,1,0,0,0,0,0,No
3,0,0,0,0,0,0,1,0,0,0,0,0,No
4,0,0,0,0,0,0,1,0,0,0,0,0,No


### PVRAAUT and AVRAAUT are highly unbalanced

In [42]:
caravan['PVRAAUT'].value_counts()

0    5813
6       7
4       1
9       1
Name: PVRAAUT, dtype: int64

In [43]:
caravan.pivot_table(values='Purchase',index='PVRAAUT',aggfunc='count')

Unnamed: 0_level_0,Purchase
PVRAAUT,Unnamed: 1_level_1
0,5813
4,1
6,7
9,1


In [44]:
caravan['AVRAAUT'].value_counts()

0    5813
1       6
2       2
3       1
Name: AVRAAUT, dtype: int64

In [45]:
caravan_1 = caravan.drop(['PVRAAUT','AVRAAUT'],axis=1)

In [46]:
caravan['Purchase'].value_counts()/5822

No     0.940227
Yes    0.059773
Name: Purchase, dtype: float64

In [47]:
# This is an imbalanced dataset

### Split train, test sets

In [48]:
train = caravan_1[:1000]
test = caravan_1[1000:]

In [49]:
X_train = train.drop('Purchase',axis=1)
y_train = train['Purchase']
X_test = test.drop('Purchase',axis=1)
y_test = test['Purchase']

In [50]:
y_train.value_counts()/1000

No     0.941
Yes    0.059
Name: Purchase, dtype: float64

### Random Forest Classifier

In [51]:
random_forest_purchase = RandomForestClassifier(max_features = 29,
                                            n_estimators = 500,
                                            random_state = 1)
random_forest_purchase.fit(X_train,y_train)
ypred = random_forest_purchase.predict(X_test)

In [52]:
# 5 Most Important predictors

In [53]:
importance = random_forest_purchase.feature_importances_*100
random_forest_purchase_imp = pd.DataFrame({'Importance':importance},
                                          index = X_train.columns)
random_forest_purchase_imp.sort_values(by='Importance',
                                       ascending = False)[:9]

Unnamed: 0,Importance
PPERSAUT,3.426791
MOSTYPE,3.333823
MGODGE,3.302695
MOPLHOOG,3.024537
PBRAND,2.804189
MGODPR,2.65909
MKOOPKLA,2.606417
ABRAND,2.326212
MBERMIDD,2.311773


In [54]:
# Test accuracy rate

In [55]:
ypred = random_forest_purchase.predict(X_test)
pd.crosstab(y_test,ypred,rownames=['y_test'],colnames=['ypred'])

ypred,No,Yes
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4481,52
Yes,277,12


In [56]:
# Test accuracy rate

In [57]:
random_forest_purchase.score(X_test,y_test)

0.9317710493571132

In [59]:
# Gridsearching is needed

### Gradient Boosting Classifier

In [60]:
boosted_purchase = GradientBoostingClassifier(n_estimators = 1000, 
                                              learning_rate = 0.01,
                                              random_state =1)
boosted_purchase.fit(X_train,y_train)
ypred = boosted_purchase.predict(X_test)

In [61]:
# Importance

In [62]:
importance = boosted_purchase.feature_importances_*100
boosted_purchase_imp = pd.DataFrame({'Importance':importance},
                                    index = X_train.columns)
boosted_purchase_imp.sort_values(by='Importance',ascending = False)[:9]

Unnamed: 0,Importance
PPERSAUT,7.616748
MOSTYPE,6.318294
ABRAND,5.767607
MGODGE,5.263508
MKOOPKLA,4.817801
MOPLHOOG,4.774052
MGODPR,3.446326
MBERMIDD,3.438051
PPLEZIER,3.298615


In [63]:
pd.crosstab(y_test,ypred,rownames=['y_test'],colnames=['ypred'])

ypred,No,Yes
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4491,42
Yes,274,15


In [64]:
# Test accuracy rate

In [65]:
boosted_purchase.score(X_test,y_test)

0.9344670261302365

In [66]:
# Gridsearching is needed

### KNeighbors Classifier

In [89]:
kfold = StratifiedKFold (n_splits = 5, random_state=1, shuffle=True)

In [90]:
# Scaling the data

In [91]:
arates = []
for k in range(1,15):
    scaler = MinMaxScaler()
    model1 = KNeighborsClassifier(n_neighbors=k)
    pipe1 = Pipeline([('transformer1', scaler),('estimator1', model1)])
    scores = cross_val_score(pipe1,X_train,y_train,cv=kfold)
    arates.append(scores.mean())

In [92]:
max_rate = max(arates)
max_rate

0.9410000000000001

In [93]:
arates.index(max(arates))

5

In [94]:
arates[:9]

[0.8960000000000001,
 0.93,
 0.9259999999999999,
 0.9359999999999999,
 0.9369999999999999,
 0.9410000000000001,
 0.9410000000000001,
 0.9410000000000001,
 0.9410000000000001]

In [95]:
# K-Fold cv selects k=6 neighbors

In [96]:
knn_purchase = KNeighborsClassifier(n_neighbors = 6)
knn_purchase.fit(X_train,y_train)
ypred = knn_purchase.predict(X_test)
pd.crosstab(y_test,ypred,rownames=['y_test'],colnames=['ypred'])

ypred,No,Yes
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4525,8
Yes,287,2


In [97]:
# Test accuracy rate

In [98]:
knn_purchase.score(X_test,y_test)

0.9388220655329739

In [99]:
# with GridSearchCV and scaling

In [100]:
from sklearn.model_selection import GridSearchCV

In [101]:
scaler = MinMaxScaler()
params = {'estimator__n_neighbors': list(range(1,12))}
model2 = KNeighborsClassifier()
pipe2 = Pipeline([('transformer', scaler), ('estimator', model2)])
grid2 = GridSearchCV(pipe2,param_grid = params, cv = kfold)
grid2.fit(X_train,y_train)
grid2.score(X_test,y_test)

0.9400663625051846

In [102]:
grid2.best_params_

{'estimator__n_neighbors': 6}

In [103]:
best_model = grid2.best_estimator_
ypred = best_model.predict(X_test)
pd.crosstab(y_test,ypred,rownames=['y_test'],colnames=['ypred'])

ypred,No,Yes
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4532,1
Yes,288,1


In [104]:
4533/4822

0.9400663625051846

### Logistic Regression

By hot including C in LogisticRegression( ) we
use the default C=1.0 which implies some degree of regularization

In [105]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [106]:
Logi_purchase = LogisticRegression(solver = 'lbfgs',max_iter = 5000,
                                   random_state = 1)
Logi_purchase.fit(X_train_scaled,y_train)
ypred = Logi_purchase.predict(X_test_scaled)

In [107]:
pd.crosstab(y_test,ypred,rownames=['y_test'],colnames=['ypred'])

ypred,No,Yes
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4532,1
Yes,288,1


In [108]:
# Test accuracy rate

In [109]:
Logi_purchase.score(X_test_scaled,y_test)

0.9400663625051846