### 0. Some libraries

In [1]:
from time import time
start = time()

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
import tracemalloc
tracemalloc.start()

### 1. Data

In [3]:

bank_data = pd.read_csv('bank-additional-full.csv', sep=';')
bank_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


### 2. Data Preprocessing and Encoding

In [4]:
## Removing Unknowns from data

for col in bank_data.columns:
    bank_data = bank_data[bank_data[col] != 'unknown']

In [5]:
## Encoding similar to Assignment 1
## Binary and Ordinal Encoding to categorical features

bank_data["y"] = bank_data["y"].map({'yes':1, 'no':0})

bank_data["education"] = bank_data["education"].map({'basic.4y':1, 'high.school':4, 'basic.6y':2, 'basic.9y':3,
                                         'professional.course':6, 'university.degree':5,
                                         'illiterate':0})

yes_no = {'yes':0, 'no':1}

bank_data["housing"] = bank_data["housing"].map(yes_no)
bank_data["loan"] = bank_data["loan"].map(yes_no)
bank_data["default"] = bank_data["default"].map(yes_no)

bank_data["marital"] = bank_data["marital"].map({'married':2, 'divorced':0, 'single':1})
bank_data["contact"] = bank_data["contact"].map({'telephone':1, 'cellular':2})

In [6]:
X = bank_data.drop(['job', 'month', 'day_of_week', 'poutcome',
                    'default', 'pdays', 'duration', 'y'], axis = 'columns')


## Frequncy/bin Category encoding to "technical" features


for col in ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']:
    uniq = len(X[col].unique())

    sturges = 1 + round(np.log2(len(X)))

    if uniq < sturges:
        X[col] = pd.cut(X[col], bins=uniq, labels=[j for j in range(uniq)])
    else:
        X[col] = pd.cut(X[col], bins=sturges, labels=[j for j in range(sturges)])

y = bank_data['y']

### 3. Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

In [8]:
## Stratified test-train split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=88)

In [9]:
## Parameter grid for grid search

param_grid = {
    #'criterion': ['gini', 'entropy'],
    'n_estimators': [j for j in range(350,451,50)],
    'max_depth': [j for j in range(8,13)],
    'min_samples_split': [j for j in range(4,9)]
}

In [10]:
grid = GridSearchCV(RandomForestClassifier(), 
                    param_grid, verbose=1, n_jobs=-1, 
                    refit=True, scoring='average_precision',
                    cv = 5)

In [11]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


In [12]:
print('Best avg precision:{:.3f}'.format(grid.best_score_))
print('Best parameters:', grid.best_params_)

print('Test avg precision: {:.3f}'.format(grid.score(X_test, y_test)))

Best avg precision:0.431
Best parameters: {'max_depth': 9, 'min_samples_split': 7, 'n_estimators': 450}
Test avg precision: 0.439


In [27]:
## final tree based on grid search

finaltree = RandomForestClassifier(max_depth=9, min_samples_split=7, n_estimators=450)
finaltree.fit(X_train, y_train)

### 4. Adaboost Classifier

In [14]:
from sklearn.metrics import classification_report, average_precision_score
from sklearn.ensemble import AdaBoostClassifier

In [15]:
param_grid = {
    'n_estimators': [j for j in range(150,251,50)],
    'learning_rate': [j/10 for j in range(10,21)]
}

In [16]:
grid = GridSearchCV(AdaBoostClassifier(), 
                    param_grid, verbose=1, n_jobs=-1, 
                    refit=True, scoring='average_precision',
                    cv = 5)

In [17]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 33 candidates, totalling 165 fits


In [18]:
print('Best avg precision:{:.3f}'.format(grid.best_score_))
print('Best parameters:', grid.best_params_)

print('Test avg precision: {:.3f}'.format(grid.score(X_test, y_test)))

Best avg precision:0.405
Best parameters: {'learning_rate': 1.7, 'n_estimators': 250}
Test avg precision: 0.425


In [19]:
boost = AdaBoostClassifier(learning_rate=1.7, n_estimators=250)
boost.fit(X_train, y_train)

### 5. Results

In [28]:
print('AdaBoostClassifier: {:.3f}'.format(average_precision_score(y_test, boost.predict(X_test))))
print('RandomForestClassifier: {:.3f}'.format(average_precision_score(y_test, finaltree.predict(X_test))))


AdaBoostClassifier: 0.218
RandomForestClassifier: 0.240


In [29]:
## Classification report for best estimator

print('RandomForestClassifier')
print(classification_report(y_test, finaltree.predict(X_test)))

RandomForestClassifier
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      6657
           1       0.59      0.24      0.34       965

    accuracy                           0.88      7622
   macro avg       0.75      0.61      0.64      7622
weighted avg       0.86      0.88      0.86      7622



### 6. Space and Time Complexity

In [22]:
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()

print("Memory used")
print("Current [MB]", round(current/(1024*1024), 2))
print("Peak [MB]", round(peak//(1024*1024), 2))

Memory used
Current [MB] 28.17
Peak [MB] 32


In [24]:
print("Time elapsed [s] ", round(time() - start, 2))

Time elapsed [s]  322.48
