In [4]:
import fcalc

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score

Reading data

In [5]:
# Load the raw table, discard the columns that are not used for modelling
# and any row that still has a missing value.
all_data = pd.read_csv('Bank Marketing.csv')
all_data.drop(
    columns=['contact', 'day_of_week', 'month', 'duration', 'pdays', 'previous', 'poutcome'],
    inplace=True,
)
all_data.dropna(inplace=True)

# Encode the target: 'no' -> 0, 'yes' -> 1, then store it as a real integer column.
for label, code in (('no', 0), ('yes', 1)):
    all_data.loc[all_data['y'] == label, 'y'] = code
all_data['y'] = all_data['y'].astype('int64')

In [6]:
# Keep a stratified 2.5% sample of the rows; the remainder ends up in `tmp`.
# NOTE: this overwrites `all_data`, so re-running only this cell samples the
# already-reduced frame again.
all_data, tmp = train_test_split(
    all_data,
    train_size=0.025,
    stratify=all_data['y'],
    shuffle=True,
    random_state=42,
)

Binarizing data

In [7]:
def max_qcut(series, max_cut=None):
    """Return the largest number of quantile bins ``pd.qcut`` accepts for ``series``.

    Bin counts 2, 3, 4, ... are tried in order; the search stops at the first
    count for which ``pd.qcut`` raises ``ValueError`` (non-unique bin edges)
    and the last accepted count is returned. ``1`` means even two bins failed.

    Parameters
    ----------
    series : pandas.Series or 1-D array-like
        Values to be discretised.
    max_cut : int, optional
        Upper bound for the search. Callers that clip the result anyway can
        pass their own limit here to skip pointless ``pd.qcut`` calls.
        When omitted, a bound is only applied if the data would otherwise make
        the search endless (see below).

    Returns
    -------
    int
        The highest accepted number of bins, never above ``max_cut`` when it
        is given (and never below 1).
    """
    if max_cut is None:
        observed = pd.Series(series).dropna()
        # Without tied values the interpolated quantile edges stay distinct
        # for every bin count, so pd.qcut would never raise and the search
        # would never end. One bin per observation is the natural ceiling.
        # With ties the edges collide once the bins get fine enough, so the
        # search ends on its own and no bound is imposed.
        if observed.is_unique:
            max_cut = len(observed)

    num_cut = 1  # highest bin count accepted so far
    while max_cut is None or num_cut < max_cut:
        try:
            pd.qcut(series, num_cut + 1)
        except ValueError:
            break
        num_cut += 1
    return num_cut

In [9]:
# Upper limit on the number of quantile bins per integer column.
qcut = 8

# Separate the target from the features; copies keep `all_data` untouched.
y = all_data['y'].copy()
X = all_data.drop(columns='y').copy()

# Only int64 columns are discretised here.
qcut_cols = X.select_dtypes(['int64']).columns

for column in qcut_cols:
    # Use as many bins as the column supports, but never more than `qcut`.
    n_bins = min(qcut, max_qcut(X[column]))
    X[column] = pd.qcut(X[column], n_bins)

In [31]:
# One-hot encode: pd.get_dummies expands object/category columns (including
# the quantile-bin categories produced by pd.qcut) into indicator columns.
X = pd.get_dummies(X)

In [32]:
# Sanity check: (rows, columns) of the encoded feature matrix.
X.shape

(1079, 41)

Assessing performance

In [33]:
def cross_val_score(X, y, n_splits, scoring):
    """Evaluate the fcalc binarized classifier with stratified K-fold CV.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Labels aligned with ``X``. The abstention handling below flips labels
        with ``abs(label - 1)``, i.e. it assumes 0/1 labels.
    n_splits : int
        Number of folds (shuffled, fixed ``random_state=42``).
    scoring : sequence of callables
        Metrics called as ``metric(y_true, y_pred)``.

    Returns
    -------
    numpy.ndarray
        One value per metric, in the order given: the mean over the folds,
        rounded to three decimals.
    """
    splitter = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    fold_scores = np.zeros((len(scoring), splitter.get_n_splits()))

    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, y)):
        train_features, train_labels = X.iloc[train_idx].values, y.iloc[train_idx].values
        test_features, true_labels = X.iloc[test_idx].values, y.iloc[test_idx].values

        classifier = fcalc.classifier.BinarizedBinaryClassifier(train_features, train_labels)
        classifier.predict(test_features)

        # Entries equal to -1 are treated as "no prediction" and always count
        # as errors: they are replaced by the opposite of the true label.
        predicted = np.copy(classifier.predictions)
        abstained = predicted == -1
        predicted[abstained] = np.abs(true_labels[abstained] - 1)

        # Fill this fold's column with one value per metric.
        fold_scores[:, fold] = [metric(true_labels, predicted) for metric in scoring]

    return fold_scores.mean(1).round(3)


In [34]:
# 5-fold evaluation; the result lists mean accuracy first, mean F1 second.
cross_val_score(X, y, n_splits=5, scoring=[accuracy_score, f1_score])

                                                 

array([0.844, 0.171])