In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

### Dataset
In this homework, we will use the Bank Marketing dataset. Download it from [here](https://archive.ics.uci.edu/static/public/222/bank+marketing.zip).

Or you can do it with wget:

`wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip`

We need the `bank/bank-full.csv` file from the downloaded zip archive.

In this dataset, the target for our classification task is the `y` variable: whether the client has subscribed to a term deposit or not.

### Features
For the rest of the homework, you'll need to use only these columns:

`age,
job,
marital,
education,
balance,
housing,
contact,
day,
month,
duration,
campaign,
pdays,
previous,
poutcome,
y`

### Data preparation
Select only the features from above.

Check whether there are any missing values in the features.


In [2]:
# Show every column when a DataFrame is rendered (no truncation with "...").
pd.options.display.max_columns = None

In [3]:
# Relative path of the CSV file that is loaded into `df` below.
bank_full = "data/bank-full.csv"

In [4]:
# Load only the columns the homework asks for; the file is semicolon-separated.
columns = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'y',
]
df = pd.read_csv(bank_full, sep=";", usecols=columns)
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# Number of missing entries per column.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### Question 1
What is the most frequent observation (mode) for the column `education`?

* unknown
* primary
* secondary
* tertiary

In [6]:
# Frequency of each education level, most common first; the top row is the mode.
education_counts = df['education'].value_counts()
education_counts

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

### Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

* `age` and `balance`
* `day` and `campaign`
* `day` and `pdays`
* `pdays` and `previous`

In [7]:
# Pairwise correlation between the numerical features.
# Select by the generic 'number' dtype rather than the literal 'int64', so a
# numeric column stored as float or as a narrower integer is not silently
# left out of the matrix.
df.select_dtypes(include='number').corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


* `age` and `balance`: 0.097783
* `day` and `campaign`: 0.162490
* `day` and `pdays`: -0.093044
* `pdays` and `previous`: **0.454820**

### Target encoding
Now we want to encode the `y` variable.
Let's replace the values `yes/no` with `1/0`.

### Split the data
Split your data in train/val/test sets with 60%/20%/20% distribution.

Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.

Make sure that the target value y is not in your dataframe.

In [8]:
# Distinct labels present in the target column.
pd.unique(df['y'])

array(['no', 'yes'], dtype=object)

In [9]:
# Encode the target in place: "yes" -> 1, anything else -> 0.
# The dtype guard makes this cell safe to re-run: once `y` is numeric it is
# left alone, whereas re-applying the comparison to 0/1 values would turn
# every label into 0 (because 1 != "yes").
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'] == "yes").astype('int64')

In [10]:
# Separate the target vector (as an independent copy) from the feature matrix.
y = df['y'].copy()
X = df.drop(columns=['y'])
print(X.shape, y.shape)

(45211, 14) (45211,)


In [11]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (= 20% of the whole dataset) as the validation set.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

In [12]:
# Shapes of the six splits, in the order: X train/val/test, then y train/val/test.
tuple(part.shape for part in (X_train, X_val, X_test, y_train, y_val, y_test))

((27126, 14), (9042, 14), (9043, 14), (27126,), (9042,), (9043,))

In [13]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
20326,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown
24301,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown
38618,49,blue-collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown
18909,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown
23081,31,self-employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown


In [14]:
# Preview the first five training labels.
y_train.head(n=5)

20326    0
24301    0
38618    0
18909    0
23081    0
Name: y, dtype: int64

### Question 3
Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.

Round the scores to 2 decimals using `round(score, 2)`.
Which of these variables has the biggest mutual information score?

* contact
* education
* housing
* poutcome

In [15]:
# Mutual information between the target and each candidate categorical
# feature, computed on the training split only and rounded to 2 decimals.
candidates = ('contact', 'education', 'housing', 'poutcome')
for col in candidates:
    score = round(mutual_info_score(y_train, X_train[col]), 2)
    print(f'Mutual info between y and {col}: {score}')

Mutual info between y and contact: 0.01
Mutual info between y and education: 0.0
Mutual info between y and housing: 0.01
Mutual info between y and poutcome: 0.03


### Question 4
Now let's train a logistic regression.

Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.

Fit the model on the training dataset.

To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
`model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

* 0.6
* 0.7
* 0.8
* 0.9

In [16]:
# One-hot encode the training rows; the vectorizer learns its feature layout
# from the training split and is reused for validation later.
X_train_dict = X_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv_train = dv.fit_transform(X_train_dict)

In [17]:
# Hyperparameters prescribed by the homework, collected in one place.
lr_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**lr_params)
model.fit(dv_train, y_train)

In [18]:
# Encode the validation rows with the vectorizer fitted on the training split
# (transform only -- no refit).
X_val_dict = X_val.to_dict(
    orient='records'
)
dv_val = dv.transform(
    X_val_dict
)

In [19]:
# Threshold the positive-class probability at 0.5 and report the share of
# validation rows predicted correctly, rounded to 2 decimals.
positive_proba = model.predict_proba(dv_val)[:, 1]
y_pred = positive_proba > 0.5
val_accuracy = (y_pred == y_val).mean()
round(val_accuracy, 2)

0.9

### Question 5
Let's find the least useful feature using the **feature elimination** technique.

Train a model with all these features (using the same parameters as in Q4).

Now exclude each feature from this set and train a model without it. Record the accuracy for each model.

For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of the following features has the smallest difference?

* age
* balance
* marital
* previous

Note: The difference doesn't have to be positive.

In [20]:
def eval_model(X_train, y_train, X_val, y_val, drop_column=None, C=1.0):
    """Train a logistic regression on the training split and score it on validation.

    Features are one-hot encoded with a ``DictVectorizer`` that is fitted on
    the training split only; the validation split is encoded with that same
    fitted vectorizer.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Feature frames for the training and validation splits.
    y_train, y_val : array-like
        Target labels aligned row-by-row with ``X_train`` / ``X_val``.
    drop_column : str or list of str, optional
        Column label(s) removed from both splits before encoding. Any falsy
        value (``None``, ``""``, ``[]``) keeps all columns.
    C : float, default 1.0
        Passed through as the ``C`` argument of ``LogisticRegression``.

    Returns
    -------
    float
        Unrounded fraction of validation rows whose prediction (positive-class
        probability > 0.5) equals ``y_val``. Rounding is left to the caller.
    """
    if drop_column:
        # `columns=` accepts a single label or a list of labels. drop() returns
        # new frames, so the caller's DataFrames are not modified.
        X_train = X_train.drop(columns=drop_column)
        X_val = X_val.drop(columns=drop_column)

    # One-hot encode: learn the feature layout from train, reuse it for val.
    dv = DictVectorizer(sparse=False)
    train_matrix = dv.fit_transform(X_train.to_dict(orient='records'))
    val_matrix = dv.transform(X_val.to_dict(orient='records'))

    # Fixed solver / max_iter / seed; only C varies between calls.
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(train_matrix, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred = model.predict_proba(val_matrix)[:, 1] > 0.5
    return (y_pred == y_val).mean()

In [21]:
# Baseline: validation accuracy with every feature included.
features = ['age', 'balance', 'marital', 'previous']
original_accuracy = eval_model(X_train, y_train, X_val, y_val)

# Absolute change in accuracy when each candidate feature is left out.
diff = [
    abs(original_accuracy - eval_model(X_train, y_train, X_val, y_val, feature))
    for feature in features
]

In [22]:
# Pair each feature with its accuracy change: {feature: difference}
dict_diff = {name: delta for name, delta in zip(features, diff)}

# Print the feature whose removal changed the accuracy the least
# (on ties, the feature listed first in `features` wins).
print(min(dict_diff, key=dict_diff.get))

previous


### Question 6
Now let's train a regularized logistic regression.

Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].

Train models using all the features as in Q4.

Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these values of C leads to the best accuracy on the validation set?

* 0.01
* 0.1
* 1
* 10
* 100

Note: If there are multiple options, select the smallest C.

In [23]:
# Validation accuracy (rounded to 3 decimals) for each regularization strength,
# using all features.
Cs = [0.01, 0.1, 1, 10, 100]
accuracies = [
    round(eval_model(X_train, y_train, X_val, y_val, C=c_value), 3)
    for c_value in Cs
]

In [24]:
# Map every C to its accuracy: {C: accuracy}
dict_accu = {c_value: acc for c_value, acc in zip(Cs, accuracies)}
# Rank best-first by accuracy; the sort is stable, so tied entries keep the
# ascending-C order of `Cs`.
ranking = sorted(dict_accu.items(), key=lambda pair: pair[1], reverse=True)
print(ranking)

[(100, 0.902), (0.1, 0.901), (1, 0.901), (10, 0.901), (0.01, 0.898)]
