In [None]:
import pandas as pd
import numpy as np
from scipy import sparse

%matplotlib notebook
import matplotlib.pyplot as plt

import seaborn as sns

from tqdm import tqdm_notebook

import os
import itertools

import warnings
warnings.filterwarnings('ignore')

from pandas_profiling import ProfileReport

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, Imputer
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import classification_report

from sklearn.decomposition import PCA, TruncatedSVD

import category_encoders

## Utils

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn on the y axis ("True label") and
        columns on the x axis ("Predicted label").
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        When True every row is divided by its row sum before plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class without any true sample has a zero row total; keep such
        # rows at 0 instead of producing NaN and a runtime warning.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; anything else (normalised or float
    # input) gets two decimals, since the 'd' format rejects floats.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # A boolean is required here: the string 'off' is truthy and would
    # switch the grid on in current matplotlib releases.
    plt.grid(False)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

## Load data

In [None]:
# Read the gzip-compressed, semicolon-delimited table and preview its first rows.
data = pd.read_csv(
    'data/bank-additional-full.csv.gz',
    sep=';',
    engine='python',
)
data.head()

## EDA

Let's check if there are missing values in target

In [None]:
# Number of rows whose target value is missing.
data['y'].isna().sum()

Check classes balance. Classes are pretty imbalanced.

In [None]:
data.y.value_counts()

In [None]:
data.describe()

Ok, now we could explore data

In [None]:
profile_report = ProfileReport(data)
profile_report

In [None]:
data.drop_duplicates(inplace=True)

In [None]:
continuous_features = profile_report.get_description()['variables'][profile_report.get_description()['variables'].type == 'NUM'].index.values.tolist()

In [None]:
continuous_features

In [None]:
g = sns.pairplot(data[continuous_features + ['y']], hue='y')
handles = g._legend_data.values()
labels = g._legend_data.keys()
g.fig.legend(handles=handles, labels=labels, loc='lower center', ncol=2)
g

In [None]:
for column in continuous_features:
    plt.figure(figsize=(10,5))
    sns.boxplot(x = "y", y = column, data = data)

In [None]:
# Fetch the per-variable description once, then keep every variable the report
# typed as categorical ('CAT') except the target itself.
variables_description = profile_report.get_description()['variables']
categorical_columns = variables_description[variables_description.type == 'CAT'].index.values
# sorted() instead of list(): set iteration order depends on the string hash
# seed, so the feature (and later one-hot column) order would otherwise change
# between kernel restarts.
discrete_features = sorted(set(categorical_columns) - {'y'})
discrete_features

In [None]:
# One count plot per categorical feature: target on the x axis, feature as hue.
for column in discrete_features:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=data, x='y', hue=column, ax=ax)

Great, the brief data exploration is done. Let's drop the `duration` feature, since it leaks information from the future according to the data description (https://archive.ics.uci.edu/ml/datasets/Bank+Marketing), and move on to data processing.

In [None]:
data.drop('duration', axis=1, inplace=True)
data.head()

In [None]:
continuous_features = list(set(continuous_features) - {'duration'})

## Data processing

### Train-test creation

In [None]:
X = data.drop('y', axis=1)
X.head().T

In [None]:
y = data['y']
y.head()

In [None]:
X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17, stratify=y)
X_train.shape

In [None]:
y_train.shape

### Features preprocessing

Features categorisation

In [None]:
class Quantizer():
    """
    Quantile-based discretiser for a single numeric feature.

    ``fit`` learns the bins with ``pd.qcut`` on the distinct values of the
    feature; ``transform`` assigns values to those bins with ``pd.cut`` and
    returns the bin ordinals (0, 1, ...) converted to strings.

    Parameters
    ----------
    quantiles_num : int or sequence of float
        Passed to ``pd.qcut`` as ``q``: either the number of quantiles or an
        explicit list of quantile edges.
    """

    def __init__(self, quantiles_num):
        self.quantiles_num = quantiles_num

    def fit(self, feature_df):
        """Learn the bins from the distinct values of ``feature_df``; returns ``self``."""
        # Duplicates are dropped first, so quantiles are taken over the distinct values.
        self.boundaries = pd.qcut(feature_df.drop_duplicates(), q=self.quantiles_num)
        # Returning self allows ``Quantizer(k).fit(train).transform(test)``.
        return self

    def transform(self, feature_df):
        """
        Map ``feature_df`` onto the learned bins.

        Returns the bin ordinals as strings. Values outside every learned bin
        are left missing by ``pd.cut`` before the string conversion.
        """
        bins = self.boundaries.cat.categories
        q_df = pd.cut(feature_df, bins)
        # Label by the actual number of bins so that a sequence-valued
        # ``quantiles_num`` works as well as an integer one.
        q_df = q_df.cat.rename_categories(list(range(len(bins))))
        return q_df.astype(str)

    def fit_transform(self, feature_df):
        """Fit on ``feature_df`` and return its transformed values."""
        return self.fit(feature_df).transform(feature_df)

In [None]:
quantizer = Quantizer(5)

In [None]:
q_pdays_train = quantizer.fit_transform(X_train['pdays'])
q_pdays_train.head()

In [None]:
q_pdays_test = quantizer.transform(X_test['pdays'])
q_pdays_test.head()

One-Hot-encoding of categorical features

In [None]:
pipeline = make_pipeline(category_encoders.OneHotEncoder(handle_unknown="ignore"))

X_train_1_1 = pipeline.fit_transform(pd.concat([X_train[discrete_features], q_pdays_train], axis=1))
X_test_1_1 = pipeline.transform(pd.concat([X_test[discrete_features], q_pdays_test], axis=1))

print( X_train_1_1.shape, X_test_1_1.shape)

In [None]:
X_train_1_1.head()

In [None]:
X_train_1_1.index = list(range(X_train_1_1.shape[0]))
X_test_1_1.index = list(range(X_test_1_1.shape[0]))

Scaling

In [None]:
# Columns to standardise: every continuous feature except `pdays`.
# Built once and sorted: the same list selects the inputs and names the
# outputs, and the column order no longer depends on set iteration order.
scaled_features = sorted(set(continuous_features) - {'pdays'})

# Scaling statistics come from the training split only.
sc = StandardScaler()
X_train_1_2 = pd.DataFrame(sc.fit_transform(X_train[scaled_features]), columns=scaled_features)
X_test_1_2 = pd.DataFrame(sc.transform(X_test[scaled_features]), columns=scaled_features)
X_train_1_2.head()

In [None]:
X_train_1_2.shape

In [None]:
X_train_1 = pd.concat([X_train_1_1, X_train_1_2], axis=1)
X_train_1.shape

In [None]:
X_test_1 = pd.concat([X_test_1_1, X_test_1_2], axis=1)
X_test_1.shape

In [None]:
# Encode the string target as 1 ('yes') / 0 ('no') on both splits.
target_codes = {'yes': 1, 'no': 0}
y_train = y_train.replace(target_codes)
y_test = y_test.replace(target_codes)
y_train.head()

## Logistic regression

In [None]:
my_scorer = make_scorer(f1_score, average='macro')
skv = StratifiedKFold(5)

In [None]:
%%time

tuned_parameters = [{'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50]}]

clf = GridSearchCV(LogisticRegression(random_state=17, class_weight='balanced'), tuned_parameters,
                   scoring=my_scorer, cv=skv, verbose=2, n_jobs=-1)

clf.fit(X_train_1.values, y_train)

In [None]:
clf.best_estimator_

In [None]:
y_true, y_pred = y_test, clf.predict(X_test_1.values)
print(classification_report(y_true, y_pred))

In [None]:
class_names = [0, 1]

In [None]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=(5, 4))
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure(figsize=(5, 4))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

## KNN Classifier

In [None]:
%%time

tuned_parameters = [{'n_neighbors': [ 3, 5, 7], 
                     'weights': ['uniform', 'distance']}]

clf = GridSearchCV( KNeighborsClassifier(), tuned_parameters,
                   scoring=my_scorer, cv=skv, verbose=2, n_jobs=-1)
clf.fit( X_train_1, y_train)

In [None]:
%%time
print(clf.best_estimator_)

y_true, y_pred = y_test, clf.predict(X_test_1)
print(classification_report(y_true, y_pred))

In [None]:
class_names = [0, 1]

# Confusion matrix of the latest predictions.
cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Two figures: raw counts first, then the row-normalised version.
for normalize_rows, plot_title in (
        (False, 'Confusion matrix, without normalization'),
        (True, 'Normalized confusion matrix')):
    plt.figure(figsize=(5, 4))
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          normalize=normalize_rows, title=plot_title)

plt.show()

## KNN with reduced feature space

In [None]:
tSVD = TruncatedSVD( n_components=X_train_1.shape[1] - 1,  random_state=17).fit(X_train_1.values)

plt.figure(figsize=(10,7))
plt.plot(np.cumsum(tSVD.explained_variance_ratio_), color='k', lw=2)
plt.xlabel('Number of components')
plt.ylabel('Total explained variance')
plt.xlim(0, X_train_1.shape[1])
plt.yticks(np.arange(0, 1.1, 0.1))
plt.axhline(0.9, c='r')
plt.show()

In [None]:
tSVD = TruncatedSVD(n_components=21, random_state=17)
X_train_2 = tSVD.fit_transform(X_train_1.values)
X_test_2 = tSVD.transform(X_test_1.values)

In [None]:
%%time

tuned_parameters = [{'n_neighbors': [ 3, 5, 7], 
                     'weights': ['uniform', 'distance']}]

clf = GridSearchCV( KNeighborsClassifier(), tuned_parameters,
                   scoring=my_scorer, cv=skv, verbose=2, n_jobs=-1)
clf.fit( X_train_2, y_train)

In [None]:
%%time
print(clf.best_estimator_)

y_true, y_pred = y_test, clf.predict(X_test_2)
print(classification_report(y_true, y_pred))

In [None]:
class_names = [0, 1]

# Confusion matrix of the latest predictions.
cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Two figures: raw counts first, then the row-normalised version.
for normalize_rows, plot_title in (
        (False, 'Confusion matrix, without normalization'),
        (True, 'Normalized confusion matrix')):
    plt.figure(figsize=(5, 4))
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          normalize=normalize_rows, title=plot_title)

plt.show()

## Feature engineering

In [None]:
X_train_3 = X_train_1.copy()
X_test_3 = X_test_1.copy()
print(X_train_3.shape, X_test_3.shape)

In [None]:
numeric_cols = list(X_train_3.select_dtypes(include=['float64']))
numeric_cols

In [None]:
def brute_feat_gen(df, num_col_names):
    """
    Add pairwise-product and cube features for the given columns.

    For every ordered pair (a, b) of names in ``num_col_names`` a column
    ``"<a>_X_<b>"`` holding ``df[a] * df[b]`` is added, and for every name
    ``a`` a column ``"<a>_powers3"`` holding ``df[a]`` cubed.

    The frame is modified in place; the same object is returned. The shape
    of the resulting frame is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``num_col_names``.
    num_col_names : sequence of str
        Names of the columns to combine.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the new columns appended.
    """
    for name1 in tqdm_notebook(num_col_names):
        cube_name = name1 + "_powers3"
        for position, name2 in enumerate(num_col_names):
            df[name1 + "_X_" + name2] = df[name1] * df[name2]
            # The cube does not depend on name2: compute it once, right after
            # the first product so the resulting column order is unchanged.
            if position == 0:
                df[cube_name] = df[name1] * df[name1] * df[name1]
    print(df.shape)
    return df

In [None]:
X_train_3_fg = brute_feat_gen(X_train_3, numeric_cols)

In [None]:
X_test_3_fg = brute_feat_gen(X_test_3, numeric_cols)

In [None]:
%%time

tuned_parameters = [{'C':[0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50]}]

clf = GridSearchCV(LogisticRegression(random_state=17, class_weight='balanced'), tuned_parameters,
                   scoring=my_scorer, cv=skv, verbose=2, n_jobs=-1)
clf.fit(X_train_3_fg.values, y_train)

In [None]:
print(clf.best_estimator_)

y_true, y_pred = y_test, clf.predict(X_test_3_fg.values)
print(classification_report(y_true, y_pred))

Previous run of logistic regression

             precision    recall  f1-score   support

          0       0.95      0.85      0.90     10961
          1       0.35      0.62      0.45      1392

avg / total       0.88      0.83      0.85     12353

In [None]:
# Confusion matrix of the latest predictions.
cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Two figures: raw counts first, then the row-normalised version.
for normalize_rows, plot_title in (
        (False, 'Confusion matrix, without normalization'),
        (True, 'Normalized confusion matrix')):
    plt.figure(figsize=(8, 6))
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          normalize=normalize_rows, title=plot_title)

plt.show()

## Try to create your own features