In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix

import warnings


warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_columns', 30)
font = {'size'   : 18}
plt.rc('font', **font)
plt.rc('xtick', labelsize=18)    
plt.rc('ytick', labelsize=18)

In [3]:
# Read the semicolon-separated file and discard the 'duration' column.
data = pd.read_csv('bank-full.csv', sep=';').drop(columns='duration')

In [4]:
# Re-type every object-dtype column of `data` as a pandas categorical.
object_cols = data.select_dtypes('object').columns
data = data.astype({name: 'category' for name in object_cols})

In [5]:
# Build a `year` column from the order of the rows in `data.month`: the counter
# starts at 2008 and is bumped once each time a fresh run of 'mar' rows begins.
# NOTE(review): this assumes the rows are stored in chronological order, and
# 'jan'/'feb' rows keep the previous counter value because the bump happens on
# 'mar' -- confirm both are intended.
year_cur = 2008
year_col = []
flag = 0  # 1 once the current 'mar' run has been counted; cleared by an 'apr' row
for x in data.month:
    if x == 'apr':
        flag = 0
    elif x == 'mar' and flag != 1:
        year_cur += 1
        flag = 1
    year_col.append(year_cur)

data['year'] = year_col

In [6]:
# Summary statistics of `data`, rendered as this cell's output.
data.describe()

Unnamed: 0,age,balance,day,campaign,pdays,previous,year
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,2.763841,40.197828,0.580323,2008.355002
std,10.618762,3044.765829,8.322476,3.098021,100.128746,2.303441,0.565001
min,18.0,-8019.0,1.0,1.0,-1.0,0.0,2008.0
25%,33.0,72.0,8.0,1.0,-1.0,0.0,2008.0
50%,39.0,448.0,16.0,2.0,-1.0,0.0,2008.0
75%,48.0,1428.0,21.0,3.0,-1.0,0.0,2009.0
max,95.0,102127.0,31.0,63.0,871.0,275.0,2010.0


In [7]:
def df_categories_to_dummies(df, exception = 'y'):
    """Replace the categorical columns of ``df`` with dummy (indicator) columns.

    Every column of dtype ``category`` except ``exception`` is expanded with
    ``pd.get_dummies(..., drop_first=True)``, so the first category level of
    each column is dropped. The new columns are named ``'<column>.<level>'``
    and are appended after the untouched columns, in the order of the source
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.
    exception : str, default 'y'
        Name of a column to leave as is. It is ignored when ``df`` has no
        categorical column of that name.

    Returns
    -------
    pandas.DataFrame
        A new frame with the categorical columns replaced by their dummies.
    """
    # Inspect the frame that was passed in (not a notebook-level global), so
    # the function works for any DataFrame.
    col_list = [
        col for col in df.select_dtypes('category').columns if col != exception
    ]
    if not col_list:
        # Nothing to encode: hand back an independent copy.
        return df.copy()
    return pd.get_dummies(df, columns=col_list, prefix_sep='.', drop_first=True)

In [8]:
# Dummy-encode the categorical predictors, then swap the raw `pdays` column
# for an integer bin code.
tl_numer = df_categories_to_dummies(data)
pdays_bins = pd.cut(data['pdays'], bins=[0, 7, 14, 31, 1000])
# .cat.codes is the position of the bin; values that fall in no bin are NaN
# after pd.cut and are coded as -1.
tl_numer['pdays_cat'] = pdays_bins.cat.codes
tl_numer = tl_numer.drop(columns='pdays')

In [9]:

def test_train_split(condition, df):
    
    y = np.array([1 if x == 'yes' else 0 for x in df.y])
    return y[condition], df[condition], y[~condition], df[~condition]

def drop_cols(df_array, col_array = ['y', 'year']):
    """Remove the columns listed in ``col_array`` from every frame in ``df_array``.

    Each frame is modified in place; nothing is returned.
    """
    for frame in df_array:
        frame.drop(columns=col_array, inplace=True)

In [10]:
# Hold out every row whose year is 2010 as the test set; the remaining rows
# form the combined train + validation set.
test_set_cond = tl_numer['year'] == 2010
y_test, x_test, y_train_val, tl_train_val = test_train_split(test_set_cond, tl_numer)
# Share of all rows that landed in each part.
len(x_test) / len(tl_numer), len(tl_train_val) / len(tl_numer)

(0.04512176240295503, 0.9548782375970449)

In [11]:
# Carve the validation set out of the train + validation rows: rows whose
# month dummy is set for May through December and whose year is 2009.
val_months = ['dec', 'nov', 'oct', 'sep', 'aug', 'jul', 'jun', 'may']
in_val_month = (tl_train_val[[f'month.{m}' for m in val_months]] == 1).any(axis=1)
val_set_cond = in_val_month & (tl_train_val['year'] == 2009)
y_val, x_val, y_train, x_train = test_train_split(val_set_cond, tl_train_val)
# Share of the train + validation rows that landed in each part.
len(x_val) / len(tl_train_val), len(x_train) / len(tl_train_val)

(0.1948993537328299, 0.80510064626717)

In [12]:
# Positive / negative label counts for the train, validation and test targets.
for labels in (y_train, y_val, y_test):
    print((labels == 1).sum(), (labels == 0).sum())

2560 32197
1667 6747
1062 978
