In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import requests, zipfile

from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

Getting the data

In [2]:
# The UCI archive is a zip that contains a nested 'bank.zip', which in turn
# holds the semicolon-separated 'bank-full.csv'.
r = requests.get('https://archive.ics.uci.edu/static/public/222/bank+marketing.zip', timeout=60)
r.raise_for_status()  # fail loudly on an HTTP error instead of handing an error page to ZipFile
with zipfile.ZipFile(BytesIO(r.content)) as z:
    with zipfile.ZipFile(BytesIO(z.read('bank.zip'))) as zz:
        df = pd.read_csv(BytesIO(zz.read('bank-full.csv')), sep=';')
df.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no


Removing the 'default' column

In [3]:
# Re-bind rather than mutate in place, and tolerate the column already being
# absent so that re-running this cell does not raise a KeyError.
df = df.drop(columns='default', errors='ignore')
df.head(3)

Unnamed: 0,age,job,marital,education,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no


Checking for missing values (NA)

In [4]:
# Number of missing values per column.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### Question 1
What is the most frequent observation (mode) for the column education?

In [5]:
# Frequency table of education levels, most frequent first (the top row is the mode).
df['education'].value_counts(ascending=False)

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

### Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

In [6]:
# Keep only the int64-typed columns and compute their pairwise correlations.
int_cols = df.select_dtypes(include=[np.int64]).columns
c = df.loc[:, int_cols].corr(numeric_only=True)
c

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [7]:
# Rank feature pairs by absolute correlation.
# The diagonal (each feature with itself) is zeroed on a derived frame rather
# than on `c` itself, so the correlation matrix shown above is left intact.
is_diagonal = np.eye(len(c), dtype=bool)
s = c.abs().mask(is_diagonal, 0).unstack()
so = s.sort_values(kind="quicksort", ascending=False)
so.head(5)

pdays     previous    0.454820
previous  pdays       0.454820
day       campaign    0.162490
campaign  day         0.162490
age       balance     0.097783
dtype: float64

#### Target encoding
* Now we want to encode the y variable.
* Let's replace the values yes/no with 1/0

In [8]:
# Class balance of the raw target before encoding.
df.loc[:, 'y'].value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [9]:
# Encode the target as 1 for 'yes' and 0 otherwise.
# A plain comparison is used to avoid the pandas FutureWarning (presumably the
# downcasting warning raised by Series.replace).
# The dtype guard keeps the cell safe to re-run: without it a second run would
# compare the already-encoded integers with 'yes' and turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype('int64')
df['y'].value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

### Split the data
* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value y is not in your dataframe.


In [10]:
# 80/20 split into full-train/test, then 75/25 of full-train into train/val,
# which gives 60%/20%/20% of the original data overall.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Discard the shuffled original index in every split.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Targets as plain numpy arrays.
y_train, y_val, y_test = (part['y'].values for part in (df_train, df_val, df_test))

# Feature frames: every column except the target.
X_train, X_val, X_test = (
    part.drop(columns='y') for part in (df_train, df_val, df_test)
)

# The intermediate frames still contain `y`; remove them so they cannot be used by mistake.
del df_train, df_val, df_test

# Sanity check: fraction of the full dataset in each split.
for d in (X_train, X_val, X_test):
    print(round(len(d) / len(df), 2))

0.6
0.2
0.2


### Question 3
* Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?

In [11]:
# Names of the object-typed (string) feature columns.
categorical = X_train.select_dtypes(include=[object]).columns
categorical

Index(['job', 'marital', 'education', 'housing', 'loan', 'contact', 'month',
       'poutcome'],
      dtype='object')

In [12]:
# Mutual information between each categorical column and the target,
# computed on the training split only, listed from highest to lowest.
mi = pd.Series(
    {col: mutual_info_score(X_train[col], y_train) for col in categorical}
)
mi.sort_values(ascending=False)

poutcome     0.029533
month        0.025090
contact      0.013356
housing      0.010343
job          0.007316
loan         0.002714
education    0.002697
marital      0.002050
dtype: float64

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    * To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    * model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

In [13]:
# One-hot encode via DictVectorizer: fit on the training records only,
# then apply the same fitted mapping to the validation records.
dv = DictVectorizer(sparse=False)

train_dict, val_dict = (
    frame.to_dict(orient='records') for frame in (X_train, X_val)
)

X_train_1hot = dv.fit_transform(train_dict)
X_val_1hot = dv.transform(val_dict)


In [14]:
# Baseline logistic regression with the parameters prescribed in the task;
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train_1hot, y_train)
model

In [15]:
# Predicted probability of the positive class for each validation row.
y_pred = model.predict_proba(X_val_1hot)[:, 1]

# Hard 0/1 decision at the 0.5 threshold.
y_decision = np.where(y_pred >= 0.5, 1, 0)

# Accuracy = share of validation rows whose decision matches the label.
acc_original = (y_decision == y_val).mean()
round(acc_original, 2)

0.9

### Question 5

* Let's find the least useful feature using the feature elimination technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of the following features has the smallest difference?

In [16]:
# Leave-one-feature-out: for every feature, retrain without it and record the
# absolute change in validation accuracy relative to the full model (acc_original).
acc_diff = {}

for feature in X_train.columns:
    # Use a fresh vectorizer per iteration. Refitting the shared `dv` here would
    # leave it fitted on a reduced feature set, out of sync with the
    # X_train_1hot / X_val_1hot matrices built from it earlier.
    dv_f = DictVectorizer(sparse=False)

    train_dict_f = X_train.drop(columns=feature).to_dict(orient='records')
    X_train_1hot_f = dv_f.fit_transform(train_dict_f)

    val_dict_f = X_val.drop(columns=feature).to_dict(orient='records')
    X_val_1hot_f = dv_f.transform(val_dict_f)

    model_f = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_f.fit(X_train_1hot_f, y_train)

    y_decision_f = (model_f.predict_proba(X_val_1hot_f)[:, 1] >= 0.5).astype(int)

    acc_diff[feature] = round(abs(acc_original - np.mean(y_val == y_decision_f)), 4)

# Largest accuracy change first; the least useful feature is at the bottom.
dict(sorted(acc_diff.items(), key=lambda item: item[1], reverse=True))

{'duration': 0.0112,
 'poutcome': 0.0058,
 'month': 0.0011,
 'loan': 0.0008,
 'marital': 0.0007,
 'pdays': 0.0007,
 'age': 0.0004,
 'contact': 0.0003,
 'balance': 0.0002,
 'day': 0.0002,
 'campaign': 0.0002,
 'job': 0.0001,
 'education': 0.0001,
 'previous': 0.0001,
 'housing': 0.0}

### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

In [17]:
# Validation accuracy for several inverse-regularization strengths C,
# reusing the one-hot matrices built for Question 4.
accs = {}

# The loop variable is deliberately not called `c`: that name holds the
# correlation matrix from Question 2, and overwriting it would break a re-run
# of the cells that use it.
for reg_c in [0.01, 0.1, 1, 10, 100]:
    model_c = LogisticRegression(solver='liblinear', C=reg_c, max_iter=1000, random_state=42)
    model_c.fit(X_train_1hot, y_train)

    y_pred_c = model_c.predict_proba(X_val_1hot)[:, 1]
    y_decision_c = (y_pred_c >= 0.5).astype(int)

    accs[reg_c] = round(np.mean(y_val == y_decision_c), 3)

# Best accuracy first; sorted() is stable, so ties keep ascending-C order.
for k, v in sorted(accs.items(), key=lambda item: item[1], reverse=True):
    print(k, '\t', v)

0.1 	 0.9
1 	 0.9
10 	 0.9
100 	 0.9
0.01 	 0.898
