In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression

In [2]:
# Read the CSV at datasets/bank_data_processed.csv into a DataFrame
bank_data = pd.read_csv(filepath_or_buffer='datasets/bank_data_processed.csv')

# Preview the first five rows
bank_data.head(n=5)

Unnamed: 0,Age,Income,Family,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,34,180,1,3,0,0,0,0,0
1,38,130,4,3,134,0,0,0,0
2,46,193,2,3,0,0,0,0,0
3,38,119,1,2,0,0,1,1,1
4,42,141,3,3,0,1,1,1,0


In [3]:
# Target: the 'CreditCard' column
Y = bank_data['CreditCard']

# Features: every column except the target
X = bank_data.drop(columns='CreditCard')

#### Splitting the dataset into 3 parts (train, hold-out and test)

In [5]:
# Positional 70% / 20% / 10% split into train, hold-out and test parts.
# Plain iloc slices replace np.split, which on pandas objects goes through a
# deprecated pandas code path; the resulting rows are the same.
# NOTE(review): rows are taken in file order with no shuffling - confirm the
# CSV is not sorted in a way that biases the three parts.
train_end = int(.7 * len(X))
hold_out_end = int(.9 * len(X))

x_train = X.iloc[:train_end]
x_hold_out = X.iloc[train_end:hold_out_end]
x_test = X.iloc[hold_out_end:]

y_train = Y.iloc[:train_end]
y_hold_out = Y.iloc[train_end:hold_out_end]
y_test = Y.iloc[hold_out_end:]

# The three parts must partition the data exactly
assert len(x_train) + len(x_hold_out) + len(x_test) == len(X)
assert len(y_train) + len(y_hold_out) + len(y_test) == len(Y)

In [6]:
# Shapes of the three feature parts, in train / hold-out / test order
tuple(part.shape for part in (x_train, x_hold_out, x_test))

((336, 8), (96, 8), (48, 8))

In [7]:
# Shapes of the three target parts, in train / hold-out / test order
tuple(part.shape for part in (y_train, y_hold_out, y_test))

((336,), (96,), (48,))

#### In the first layer we use three predictors

In [8]:
# First-layer (base) predictors; each one is fitted on the training part only.
clf1 = KNeighborsClassifier(n_neighbors=10)
# Fixed seed so the forest, and every score derived from it, is the same on
# each Restart & Run All.
clf2 = RandomForestClassifier(n_estimators=50, random_state=42)
clf3 = GaussianNB()

for clf in (clf1, clf2, clf3):
    clf.fit(x_train, y_train)

#### Creating a DataFrame in which each column holds the predictions of one first-layer predictor

In [9]:
def get_predictions(x, y, classifiers=None):
    """Collect the first-layer predictions for ``x`` in one DataFrame.

    Every classifier's ``predict(x)`` result becomes one column, named
    ``y_pred_1``, ``y_pred_2``, ... in classifier order. As a side effect the
    class name and the accuracy against ``y`` are printed for each classifier.

    Parameters
    ----------
    x : features, passed unchanged to each classifier's ``predict``.
    y : true labels, used only for the printed accuracy.
    classifiers : iterable of classifiers, optional
        Objects exposing ``predict``. Defaults to the notebook-level
        ``(clf1, clf2, clf3)``, looked up when the function is called.

    Returns
    -------
    pandas.DataFrame
        One column of predictions per classifier.
    """
    if classifiers is None:
        classifiers = (clf1, clf2, clf3)

    predictions = {}
    for position, clf in enumerate(classifiers, start=1):
        y_pred = clf.predict(x)

        print(clf.__class__.__name__, accuracy_score(y, y_pred))

        predictions['y_pred_' + str(position)] = y_pred

    # Build the frame once instead of inserting columns one at a time
    return pd.DataFrame(predictions)

In [10]:
# First-layer predictions on the hold-out part (also prints each accuracy)
pred_result = get_predictions(x=x_hold_out, y=y_hold_out)

KNeighborsClassifier 0.7083333333333334
RandomForestClassifier 0.8125
GaussianNB 0.8229166666666666


In [11]:
# Preview the first five rows of the stacked predictions
pred_result.head(n=5)

Unnamed: 0,y_pred_1,y_pred_2,y_pred_3
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


Now, for each instance in the hold-out set, there are three predicted values (one per first-layer predictor). These will serve as the input features for training the blender.

In [12]:
# Blender training data: hold-out predictions as features, hold-out labels as target
x_stack_train, y_stack_train = pred_result, y_hold_out

In [13]:
# Five randomly chosen rows of the blender's input features
x_stack_train.sample(n=5)

Unnamed: 0,y_pred_1,y_pred_2,y_pred_3
1,0,0,0
85,0,0,0
23,0,0,0
69,0,0,0
47,0,0,0


In [14]:
# Five randomly chosen labels from the blender's training target
y_stack_train.sample(n=5)

397    1
415    0
404    1
378    1
365    0
Name: CreditCard, dtype: int64

### Use a LogisticRegression classifier as a blender

In [15]:
# Second-layer model (the blender)
clf_stack = LogisticRegression(
    C=1,
    solver='lbfgs',
    max_iter=200,
)

# Fit on the stacked hold-out predictions; kept as the last expression so the
# fitted estimator is displayed as the cell output
clf_stack.fit(X=x_stack_train, y=y_stack_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

We have trained the blender; now we will use it for prediction. First we pass the test set through the first-layer predictors to obtain three predicted values per instance, and then we feed those values to the blender as features to get the final predictions.

In [16]:
# First-layer predictions on the test part (also prints each accuracy)
pred_result_test = get_predictions(x=x_test, y=y_test)

KNeighborsClassifier 0.6041666666666666
RandomForestClassifier 0.8333333333333334
GaussianNB 0.8541666666666666


In [17]:
# The first-layer predictions for the test part are the blender's input features
x_stack_test = pred_result_test

# Final (second-layer) predictions
y_stack_pred = clf_stack.predict(X=x_stack_test)

In [18]:
# Accuracy of the blender on the test part. Arguments are given in
# accuracy_score's (y_true, y_pred) order, matching the call in get_predictions.
accuracy_score(y_test, y_stack_pred)

0.8333333333333334