In [1]:
import pandas as pd
import numpy as np
from scipy import sparse

%matplotlib notebook
import matplotlib.pyplot as plt

import seaborn as sns

from tqdm import tqdm_notebook

import os
import itertools

import warnings
warnings.filterwarnings('ignore')

from pandas_profiling import ProfileReport

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import classification_report

from sklearn.decomposition import PCA, TruncatedSVD

import category_encoders

## Utils

In [3]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    
    """
    This function plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#         print("Normalized confusion matrix")
#     else:
#         print('Confusion matrix, without normalization')

#     print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes) #, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center", 
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.grid('off')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

## Load data

In [4]:
data = pd.read_csv('data/bank-additional-full.csv.gz', engine='python', sep=';')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## EDA

Let's check if there are missing values in target

In [5]:
data.y.isnull().sum()

0

Check classes balance. Classes are pretty imbalanced.

In [6]:
data.y.value_counts()

no     36548
yes     4640
Name: y, dtype: int64

In [7]:
data.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


Ok, now we could explore data

In [8]:
profile_report = ProfileReport(data)
profile_report

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

0,1
Number of variables,21
Number of observations,41188
Total Missing (%),0.0%
Total size in memory,6.6 MiB
Average record size in memory,168.0 B

0,1
Numeric,8
Categorical,11
Boolean,0
Date,0
Text (Unique),0
Rejected,2
Unsupported,0

0,1
Distinct count,78
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,40.024
Minimum,17
Maximum,98
Zeros (%),0.0%

0,1
Minimum,17
5-th percentile,26
Q1,32
Median,38
Q3,47
95-th percentile,58
Maximum,98
Range,81
Interquartile range,15

0,1
Standard deviation,10.421
Coef of variation,0.26037
Kurtosis,0.79131
Mean,40.024
MAD,8.4615
Skewness,0.7847
Sum,1648511
Variance,108.6
Memory size,321.9 KiB

Value,Count,Frequency (%),Unnamed: 3
31,1947,4.7%,
32,1846,4.5%,
33,1833,4.5%,
36,1780,4.3%,
35,1759,4.3%,
34,1745,4.2%,
30,1714,4.2%,
37,1475,3.6%,
29,1453,3.5%,
39,1432,3.5%,

Value,Count,Frequency (%),Unnamed: 3
17,5,0.0%,
18,28,0.1%,
19,42,0.1%,
20,65,0.2%,
21,102,0.2%,

Value,Count,Frequency (%),Unnamed: 3
91,2,0.0%,
92,4,0.0%,
94,1,0.0%,
95,1,0.0%,
98,2,0.0%,

0,1
Distinct count,42
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2.5676
Minimum,1
Maximum,56
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,1
Median,2
Q3,3
95-th percentile,7
Maximum,56
Range,55
Interquartile range,2

0,1
Standard deviation,2.77
Coef of variation,1.0788
Kurtosis,36.98
Mean,2.5676
MAD,1.6342
Skewness,4.7625
Sum,105754
Variance,7.673
Memory size,321.9 KiB

Value,Count,Frequency (%),Unnamed: 3
1,17642,42.8%,
2,10570,25.7%,
3,5341,13.0%,
4,2651,6.4%,
5,1599,3.9%,
6,979,2.4%,
7,629,1.5%,
8,400,1.0%,
9,283,0.7%,
10,225,0.5%,

Value,Count,Frequency (%),Unnamed: 3
1,17642,42.8%,
2,10570,25.7%,
3,5341,13.0%,
4,2651,6.4%,
5,1599,3.9%,

Value,Count,Frequency (%),Unnamed: 3
40,2,0.0%,
41,1,0.0%,
42,2,0.0%,
43,2,0.0%,
56,1,0.0%,

0,1
Distinct count,26
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-40.503
Minimum,-50.8
Maximum,-26.9
Zeros (%),0.0%

0,1
Minimum,-50.8
5-th percentile,-47.1
Q1,-42.7
Median,-41.8
Q3,-36.4
95-th percentile,-33.6
Maximum,-26.9
Range,23.9
Interquartile range,6.3

0,1
Standard deviation,4.6282
Coef of variation,-0.11427
Kurtosis,-0.35856
Mean,-40.503
MAD,3.9383
Skewness,0.30318
Sum,-1668200
Variance,21.42
Memory size,321.9 KiB

Value,Count,Frequency (%),Unnamed: 3
-36.4,7763,18.8%,
-42.7,6685,16.2%,
-46.2,5794,14.1%,
-36.1,5175,12.6%,
-41.8,4374,10.6%,
-42.0,3616,8.8%,
-47.1,2458,6.0%,
-31.4,770,1.9%,
-40.8,715,1.7%,
-26.9,447,1.1%,

Value,Count,Frequency (%),Unnamed: 3
-50.8,128,0.3%,
-50.0,282,0.7%,
-49.5,204,0.5%,
-47.1,2458,6.0%,
-46.2,5794,14.1%,

Value,Count,Frequency (%),Unnamed: 3
-33.0,172,0.4%,
-31.4,770,1.9%,
-30.1,357,0.9%,
-29.8,267,0.6%,
-26.9,447,1.1%,

0,1
Distinct count,26
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,93.576
Minimum,92.201
Maximum,94.767
Zeros (%),0.0%

0,1
Minimum,92.201
5-th percentile,92.713
Q1,93.075
Median,93.749
Q3,93.994
95-th percentile,94.465
Maximum,94.767
Range,2.566
Interquartile range,0.919

0,1
Standard deviation,0.57884
Coef of variation,0.0061858
Kurtosis,-0.82981
Mean,93.576
MAD,0.50981
Skewness,-0.23089
Sum,3854200
Variance,0.33506
Memory size,321.9 KiB

Value,Count,Frequency (%),Unnamed: 3
93.994,7763,18.8%,
93.91799999999999,6685,16.2%,
92.89299999999999,5794,14.1%,
93.444,5175,12.6%,
94.465,4374,10.6%,
93.2,3616,8.8%,
93.075,2458,6.0%,
92.20100000000001,770,1.9%,
92.963,715,1.7%,
92.431,447,1.1%,

Value,Count,Frequency (%),Unnamed: 3
92.201,770,1.9%,
92.379,267,0.6%,
92.431,447,1.1%,
92.469,178,0.4%,
92.649,357,0.9%,

Value,Count,Frequency (%),Unnamed: 3
94.199,303,0.7%,
94.215,311,0.8%,
94.465,4374,10.6%,
94.601,204,0.5%,
94.767,128,0.3%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
cellular,26144
telephone,15044

Value,Count,Frequency (%),Unnamed: 3
cellular,26144,63.5%,
telephone,15044,36.5%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
thu,8623
mon,8514
wed,8134
Other values (2),15917

Value,Count,Frequency (%),Unnamed: 3
thu,8623,20.9%,
mon,8514,20.7%,
wed,8134,19.7%,
tue,8090,19.6%,
fri,7827,19.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
no,32588
unknown,8597
yes,3

Value,Count,Frequency (%),Unnamed: 3
no,32588,79.1%,
unknown,8597,20.9%,
yes,3,0.0%,

0,1
Distinct count,1544
Unique (%),3.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,258.29
Minimum,0
Maximum,4918
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,36.0
Q1,102.0
Median,180.0
Q3,319.0
95-th percentile,752.65
Maximum,4918.0
Range,4918.0
Interquartile range,217.0

0,1
Standard deviation,259.28
Coef of variation,1.0038
Kurtosis,20.248
Mean,258.29
MAD,171.67
Skewness,3.2631
Sum,10638243
Variance,67226
Memory size,321.9 KiB

Value,Count,Frequency (%),Unnamed: 3
85,170,0.4%,
90,170,0.4%,
136,168,0.4%,
73,167,0.4%,
124,164,0.4%,
87,162,0.4%,
72,161,0.4%,
104,161,0.4%,
111,160,0.4%,
106,159,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0,4,0.0%,
1,3,0.0%,
2,1,0.0%,
3,3,0.0%,
4,12,0.0%,

Value,Count,Frequency (%),Unnamed: 3
3631,1,0.0%,
3643,1,0.0%,
3785,1,0.0%,
4199,1,0.0%,
4918,1,0.0%,

0,1
Distinct count,8
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
university.degree,12168
high.school,9515
basic.9y,6045
Other values (5),13460

Value,Count,Frequency (%),Unnamed: 3
university.degree,12168,29.5%,
high.school,9515,23.1%,
basic.9y,6045,14.7%,
professional.course,5243,12.7%,
basic.4y,4176,10.1%,
basic.6y,2292,5.6%,
unknown,1731,4.2%,
illiterate,18,0.0%,

0,1
Distinct count,10
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.081886
Minimum,-3.4
Maximum,1.4
Zeros (%),0.0%

0,1
Minimum,-3.4
5-th percentile,-2.9
Q1,-1.8
Median,1.1
Q3,1.4
95-th percentile,1.4
Maximum,1.4
Range,4.8
Interquartile range,3.2

0,1
Standard deviation,1.571
Coef of variation,19.185
Kurtosis,-1.0626
Mean,0.081886
MAD,1.4228
Skewness,-0.7241
Sum,3372.7
Variance,2.4679
Memory size,321.9 KiB

Value,Count,Frequency (%),Unnamed: 3
1.4,16234,39.4%,
-1.8,9184,22.3%,
1.1,7763,18.8%,
-0.1,3683,8.9%,
-2.9,1663,4.0%,
-3.4,1071,2.6%,
-1.7,773,1.9%,
-1.1,635,1.5%,
-3.0,172,0.4%,
-0.2,10,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-3.4,1071,2.6%,
-3.0,172,0.4%,
-2.9,1663,4.0%,
-1.8,9184,22.3%,
-1.7,773,1.9%,

Value,Count,Frequency (%),Unnamed: 3
-1.1,635,1.5%,
-0.2,10,0.0%,
-0.1,3683,8.9%,
1.1,7763,18.8%,
1.4,16234,39.4%,

0,1
Correlation,0.97224

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
yes,21576
no,18622
unknown,990

Value,Count,Frequency (%),Unnamed: 3
yes,21576,52.4%,
no,18622,45.2%,
unknown,990,2.4%,

0,1
Distinct count,12
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
admin.,10422
blue-collar,9254
technician,6743
Other values (9),14769

Value,Count,Frequency (%),Unnamed: 3
admin.,10422,25.3%,
blue-collar,9254,22.5%,
technician,6743,16.4%,
services,3969,9.6%,
management,2924,7.1%,
retired,1720,4.2%,
entrepreneur,1456,3.5%,
self-employed,1421,3.5%,
housemaid,1060,2.6%,
unemployed,1014,2.5%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
no,33950
yes,6248
unknown,990

Value,Count,Frequency (%),Unnamed: 3
no,33950,82.4%,
yes,6248,15.2%,
unknown,990,2.4%,

0,1
Distinct count,4
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
married,24928
single,11568
divorced,4612

Value,Count,Frequency (%),Unnamed: 3
married,24928,60.5%,
single,11568,28.1%,
divorced,4612,11.2%,
unknown,80,0.2%,

0,1
Distinct count,10
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
may,13769
jul,7174
aug,6178
Other values (7),14067

Value,Count,Frequency (%),Unnamed: 3
may,13769,33.4%,
jul,7174,17.4%,
aug,6178,15.0%,
jun,5318,12.9%,
nov,4101,10.0%,
apr,2632,6.4%,
oct,718,1.7%,
sep,570,1.4%,
mar,546,1.3%,
dec,182,0.4%,

0,1
Correlation,0.94515

0,1
Distinct count,27
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,962.48
Minimum,0
Maximum,999
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,999
Q1,999
Median,999
Q3,999
95-th percentile,999
Maximum,999
Range,999
Interquartile range,0

0,1
Standard deviation,186.91
Coef of variation,0.1942
Kurtosis,22.229
Mean,962.48
MAD,70.362
Skewness,-4.9222
Sum,39642439
Variance,34936
Memory size,321.9 KiB

Value,Count,Frequency (%),Unnamed: 3
999,39673,96.3%,
3,439,1.1%,
6,412,1.0%,
4,118,0.3%,
9,64,0.2%,
2,61,0.1%,
7,60,0.1%,
12,58,0.1%,
10,52,0.1%,
5,46,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0,15,0.0%,
1,26,0.1%,
2,61,0.1%,
3,439,1.1%,
4,118,0.3%,

Value,Count,Frequency (%),Unnamed: 3
22,3,0.0%,
25,1,0.0%,
26,1,0.0%,
27,1,0.0%,
999,39673,96.3%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
nonexistent,35563
failure,4252
success,1373

Value,Count,Frequency (%),Unnamed: 3
nonexistent,35563,86.3%,
failure,4252,10.3%,
success,1373,3.3%,

0,1
Distinct count,8
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.17296
Minimum,0
Maximum,7
Zeros (%),86.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,7
Range,7
Interquartile range,0

0,1
Standard deviation,0.4949
Coef of variation,2.8613
Kurtosis,20.109
Mean,0.17296
MAD,0.29868
Skewness,3.832
Sum,7124
Variance,0.24493
Memory size,321.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,35563,86.3%,
1,4561,11.1%,
2,754,1.8%,
3,216,0.5%,
4,70,0.2%,
5,18,0.0%,
6,5,0.0%,
7,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,35563,86.3%,
1,4561,11.1%,
2,754,1.8%,
3,216,0.5%,
4,70,0.2%,

Value,Count,Frequency (%),Unnamed: 3
3,216,0.5%,
4,70,0.2%,
5,18,0.0%,
6,5,0.0%,
7,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
no,36548
yes,4640

Value,Count,Frequency (%),Unnamed: 3
no,36548,88.7%,
yes,4640,11.3%,

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [9]:
data.drop_duplicates(inplace=True)

In [10]:
continuous_features = profile_report.get_description()['variables'][profile_report.get_description()['variables'].type == 'NUM'].index.values.tolist()

In [11]:
continuous_features

['age',
 'campaign',
 'cons.conf.idx',
 'cons.price.idx',
 'duration',
 'emp.var.rate',
 'pdays',
 'previous']

In [12]:
g = sns.pairplot(data[continuous_features + ['y']], hue='y')
handles = g._legend_data.values()
labels = g._legend_data.keys()
g.fig.legend(handles=handles, labels=labels, loc='lower center', ncol=2)
g

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x7f12d5d36438>

In [13]:
for column in continuous_features:
    plt.figure(figsize=(10,5))
    sns.boxplot(x = "y", y = column, data = data)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
discrete_features = list(set(profile_report.get_description()['variables'][profile_report.get_description()['variables'].type == 'CAT'].index.values) - {'y'})
discrete_features

['marital',
 'poutcome',
 'housing',
 'education',
 'month',
 'loan',
 'default',
 'day_of_week',
 'contact',
 'job']

In [15]:
for column in discrete_features:
    plt.figure(figsize=(10, 5))
    sns.countplot(x='y', hue=column, data=data)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Great, brief data exploration has been done. let's drop duration feature as it leak from future according to data description https://archive.ics.uci.edu/ml/datasets/Bank+Marketing and go to data processing

In [16]:
data.drop('duration', axis=1, inplace=True)
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [17]:
continuous_features = list(set(continuous_features) - {'duration'})

## Data processing

### Train-test creation

In [18]:
X = data.drop('y', axis=1)
X.head().T

Unnamed: 0,0,1,2,3,4
age,56,57,37,40,56
job,housemaid,services,services,admin.,services
marital,married,married,married,married,married
education,basic.4y,high.school,high.school,basic.6y,high.school
default,no,unknown,no,no,no
housing,no,no,yes,no,no
loan,no,no,no,no,yes
contact,telephone,telephone,telephone,telephone,telephone
month,may,may,may,may,may
day_of_week,mon,mon,mon,mon,mon


In [19]:
y = data['y']
y.head()

0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object

In [20]:
X.shape, y.shape

((41176, 19), (41176,))

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17, stratify=y)
X_train.shape

(28823, 19)

In [22]:
y_train.shape

(28823,)

### Features preprocessing

Features categorisation

In [23]:
class Quantizer():
    
    def __init__(self, quantiles_num):
        self.quantiles_num = quantiles_num
        
    def fit(self, feature_df):
        self.boundaries = pd.qcut(feature_df.drop_duplicates(), q=self.quantiles_num)
        
    def transform(self, feature_df):
        q_df = pd.cut(feature_df, self.boundaries.cat.categories)
        q_df = q_df.cat.rename_categories(list(range(0, self.quantiles_num)))
        return q_df.astype(str)
    
    def fit_transform(self, feature_df):
        self.fit(feature_df)
        q_df = self.transform(feature_df)
        return q_df

In [24]:
quantizer = Quantizer(5)

In [25]:
q_pdays_train = quantizer.fit_transform(X_train['pdays'])
q_pdays_train.head()

23917    4
5604     4
30721    4
13336    4
30805    4
Name: pdays, dtype: object

In [26]:
q_pdays_test = quantizer.transform(X_test['pdays'])
q_pdays_test.head()

13107    4
25072    4
21982    4
25292    4
27761    4
Name: pdays, dtype: object

One-Hot-encoding of categorical features

In [27]:
pipeline = make_pipeline(category_encoders.OneHotEncoder(handle_unknown="ignore"))

X_train_1_1 = pipeline.fit_transform(pd.concat([X_train[discrete_features], q_pdays_train], axis=1))
X_test_1_1 = pipeline.transform(pd.concat([X_test[discrete_features], q_pdays_test], axis=1))

print( X_train_1_1.shape, X_test_1_1.shape)

(28823, 58) (12353, 58)


In [28]:
X_train_1_1.head()

Unnamed: 0,marital_1,marital_2,marital_3,marital_4,poutcome_1,poutcome_2,poutcome_3,housing_1,housing_2,housing_3,...,job_8,job_9,job_10,job_11,job_12,pdays_1,pdays_2,pdays_3,pdays_4,pdays_5
23917,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5604,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
30721,1,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
13336,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
30805,1,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [29]:
X_train_1_1.index = list(range(X_train_1_1.shape[0]))
X_test_1_1.index = list(range(X_test_1_1.shape[0]))

Scaling

In [30]:
sc = StandardScaler()
X_train_1_2 = pd.DataFrame(sc.fit_transform(X_train[list(set(continuous_features) - {'pdays'})]), columns=list(set(continuous_features) - {'pdays'}))
X_test_1_2 = pd.DataFrame(sc.transform(X_test[list(set(continuous_features) - {'pdays'})]), columns=list(set(continuous_features) - {'pdays'}))
X_train_1_2.head()

Unnamed: 0,cons.price.idx,age,previous,campaign,cons.conf.idx,emp.var.rate
0,-0.224528,0.091889,-0.350333,1.597641,0.954882,0.84292
1,0.725049,-1.53921,-0.350333,-0.565104,0.890065,0.651906
2,-1.17583,0.859465,1.645249,-0.565104,-1.227291,-1.194565
3,0.593834,-1.251369,-0.350333,1.237184,-0.471092,0.84292
4,-1.17583,0.763518,-0.350333,-0.565104,-1.227291,-1.194565


In [31]:
X_train_1_2.shape

(28823, 6)

In [61]:
X_train_1 = pd.concat([X_train_1_1, X_train_1_2], axis=1)
X_train_1.shape

(28823, 64)

In [62]:
X_test_1 = pd.concat([X_test_1_1, X_test_1_2], axis=1)
X_test_1.shape

(12353, 64)

In [34]:
y_train = y_train.replace({'yes': 1, 'no': 0})
y_test = y_test.replace({'yes': 1, 'no': 0})
y_train.head()

23917    0
5604     0
30721    1
13336    0
30805    0
Name: y, dtype: int64

## Logistic regression

In [35]:
my_scorer = make_scorer(f1_score, average='macro')
skv = StratifiedKFold(5)

In [36]:
%%time

tuned_parameters = [{'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50]}]

clf = GridSearchCV(LogisticRegression(random_state=17, class_weight='balanced'), tuned_parameters,
                   scoring=my_scorer, cv=skv, verbose=2, n_jobs=-1)

clf.fit(X_train_1.values, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   0.2s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   0.3s
[CV] C=0.05 ..........................................................
[CV] ........................................... C=0.01, total=   0.3s
[CV] C=0.05 ..........................................................
[CV] ........................................... C=0.01, total=   0.4s
[CV] C=0.05 ..........................................................
[CV] ........................................... C=0.01, total=   0.3s
[CV] C=0.05 .....

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.0s


[CV] ............................................. C=50, total=   0.4s
[CV] ............................................. C=10, total=   0.7s
[CV] C=50 ............................................................
[CV] C=50 ............................................................
[CV] ............................................. C=50, total=   0.8s
[CV] ............................................. C=50, total=   0.8s
[CV] ............................................. C=50, total=   0.7s
[CV] ............................................. C=50, total=   0.6s


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    6.1s finished


CPU times: user 1.91 s, sys: 70.2 ms, total: 1.98 s
Wall time: 6.8 s


In [37]:
clf.best_estimator_

LogisticRegression(C=5, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=17,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [38]:
y_true, y_pred = y_test, clf.predict(X_test_1.values)
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

          0       0.95      0.85      0.90     10961
          1       0.35      0.62      0.45      1392

avg / total       0.88      0.83      0.85     12353



In [39]:
class_names = [0, 1]

In [40]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=(5, 4))
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure(figsize=(5, 4))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## KNN Classifier

In [41]:
%%time

tuned_parameters = [{'n_neighbors': [ 3, 5, 7], 
                     'weights': ['uniform', 'distance']}]

clf = GridSearchCV( KNeighborsClassifier(), tuned_parameters,
                   scoring=my_scorer, cv=skv, verbose=2, n_jobs=-1)
clf.fit( X_train_1, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] ................... weights=uniform, n_neighbors=3, total=   8.8s
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] ................... weights=uniform, n_neighbors=3, total=   8.9s
[CV] weights=distance, n_neighbors=3 .................................
[CV] ................... weights=uniform, n_neighbors=3, total=   8.8s
[CV] weights=distance, n_neighbors=3 .................................
[CV] ................... weights=uniform, n_neighbors=3, total=   9.0s
[CV] weights=distance, n_neighbors=3 .................................
[CV] ................... weights=uniform, n_neighbors=3, total=   9.3s
[CV] ............

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  5.9min finished


CPU times: user 1.96 s, sys: 270 ms, total: 2.23 s
Wall time: 5min 54s


In [43]:
%%time
print(clf.best_estimator_)

y_true, y_pred = y_test, clf.predict(X_test_1)
print(classification_report(y_true, y_pred))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
             precision    recall  f1-score   support

          0       0.91      0.97      0.94     10961
          1       0.52      0.28      0.36      1392

avg / total       0.87      0.89      0.87     12353

CPU times: user 12.2 s, sys: 0 ns, total: 12.2 s
Wall time: 12.2 s


In [44]:
class_names = [0, 1]

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=(5, 4))
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure(figsize=(5, 4))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## KNN with reduced feature space

In [48]:
tSVD = TruncatedSVD( n_components=X_train_1.shape[1] - 1,  random_state=17).fit(X_train_1.values)

plt.figure(figsize=(10,7))
plt.plot(np.cumsum(tSVD.explained_variance_ratio_), color='k', lw=2)
plt.xlabel('Number of components')
plt.ylabel('Total explained variance')
plt.xlim(0, X_train_1.shape[1])
plt.yticks(np.arange(0, 1.1, 0.1))
plt.axhline(0.9, c='r')
plt.show()

<IPython.core.display.Javascript object>

In [49]:
tSVD = TruncatedSVD(n_components=21, random_state=17)
X_train_2 = tSVD.fit_transform(X_train_1.values)
X_test_2 = tSVD.transform(X_test_1.values)

In [50]:
%%time

tuned_parameters = [{'n_neighbors': [ 3, 5, 7], 
                     'weights': ['uniform', 'distance']}]

clf = GridSearchCV( KNeighborsClassifier(), tuned_parameters,
                   scoring=my_scorer, cv=skv, verbose=2, n_jobs=-1)
clf.fit( X_train_2, y_train)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:32815)
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/py4j-0.10.6-py3.5.egg/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/py4j-0.10.6-py3.5.egg/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] ................... weights=uniform, n_neighbors=3, total=   1.7s
[CV] weights=uniform, n_neighbors=3 ..................................
[CV] ................... weights=uniform, n_neighbors=3, total=   1.7s
[CV] weights=distance, n_neighbors=3 .................................
[CV] ................... weights=uniform, n_neighbors=3, total=   1.7s
[CV] weights=distance, n_neighbors=3 .................................
[CV] ................... weights=uniform, n_neighbors=3, total=   1.7s
[CV] weights=distance, n_neighbors=3 .................................
[CV] ................... weights=uniform, n_neighbors=3, total=   1.7s
[CV] weights=dist

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.2min finished


In [52]:
%%time
print(clf.best_estimator_)

y_true, y_pred = y_test, clf.predict(X_test_2)
print(classification_report(y_true, y_pred))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
             precision    recall  f1-score   support

          0       0.91      0.96      0.93     10961
          1       0.43      0.26      0.33      1392

avg / total       0.86      0.88      0.86     12353

CPU times: user 2.34 s, sys: 0 ns, total: 2.34 s
Wall time: 2.34 s


In [53]:
class_names = [0, 1]

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=(5, 4))
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure(figsize=(5, 4))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Feature engineering

In [77]:
X_train_3 = X_train_1.copy()
X_test_3 = X_test_1.copy()
print(X_train_3.shape, X_test_3.shape)

(28823, 64) (12353, 64)


In [78]:
numeric_cols = list(X_train_3.select_dtypes(include=['float64']))
numeric_cols

['cons.price.idx',
 'age',
 'previous',
 'campaign',
 'cons.conf.idx',
 'emp.var.rate']

In [79]:
def brute_feat_gen(df, num_col_names):
    for name1 in tqdm_notebook(num_col_names):
        for name2 in num_col_names:
            df[name1 + "_X_" + name2] =  df[name1] * df[name2]
            df[name1 + "_powers3" ] =  df[name1] * df[name1] * df[name1]
    print( df.shape)
    return df

In [80]:
X_train_3_fg = brute_feat_gen(X_train_3, numeric_cols)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


(28823, 106)


In [81]:
X_test_3_fg = brute_feat_gen(X_test_3, numeric_cols)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


(12353, 106)


In [83]:
%%time

tuned_parameters = [{'C':[0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50]}]

clf = GridSearchCV(LogisticRegression(random_state=17, class_weight='balanced'), tuned_parameters,
                   scoring=my_scorer, cv=skv, verbose=2, n_jobs=-1)
clf.fit(X_train_3_fg.values, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   1.5s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   1.5s
[CV] C=0.05 ..........................................................
[CV] ........................................... C=0.01, total=   1.6s
[CV] C=0.05 ..........................................................
[CV] ........................................... C=0.01, total=   1.7s
[CV] C=0.05 ..........................................................
[CV] ........................................... C=0.01, total=   1.6s
[CV] C=0.05 .....

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   35.8s


[CV] ............................................. C=10, total=   7.6s
[CV] C=50 ............................................................
[CV] ............................................. C=10, total=   7.9s
[CV] C=50 ............................................................
[CV] ............................................. C=50, total=   5.4s
[CV] C=50 ............................................................
[CV] ............................................. C=50, total=   8.5s
[CV] ............................................. C=50, total=   4.8s
[CV] ............................................. C=50, total=   6.2s
[CV] ............................................. C=50, total=   8.0s


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   48.2s finished


CPU times: user 3.74 s, sys: 175 ms, total: 3.92 s
Wall time: 49.9 s


In [84]:
print(clf.best_estimator_)

y_true, y_pred = y_test, clf.predict(X_test_3_fg.values)
print(classification_report(y_true, y_pred))

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=17,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
             precision    recall  f1-score   support

          0       0.95      0.85      0.90     10961
          1       0.35      0.63      0.45      1392

avg / total       0.88      0.83      0.85     12353



Previous run of logistic regression

precision    recall  f1-score   support

          0       0.95      0.85      0.90     10961
          1       0.35      0.62      0.45      1392
          /       0.88      0.83      0.85     12353

In [85]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=(8, 6))
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure(figsize=(8, 6))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Try to create your own features