In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Dataset

In [10]:
import os
import urllib.request

url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'
zip_file_name = 'data/data.zip'

# urlretrieve writes straight to `zip_file_name` and does not create missing
# parent directories, so make sure `data/` exists on a fresh checkout.
os.makedirs(os.path.dirname(zip_file_name), exist_ok=True)

# NOTE(review): this only saves the archive. The loading cell reads
# 'data/bank-full.csv', so the zip still has to be unpacked into data/ - confirm
# where that step is supposed to happen.
urllib.request.urlretrieve(url, zip_file_name)

('data/data.zip', <http.client.HTTPMessage at 0x1bb60a3bc20>)

# Preparing the dataset

In [18]:
# Fields in this file are separated by semicolons, not commas.
df = pd.read_csv(
    'data/bank-full.csv',
    sep=';',
)
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [19]:
# Show the dtype pandas inferred for every column of the raw frame.
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

For the rest of the homework, you'll need to use only these columns:

- age,
- job,
- marital,
- education,
- balance,
- housing,
- contact,
- day,
- month,
- duration,
- campaign,
- pdays,
- previous,
- poutcome,
- y

In [20]:
# Keep exactly the columns the homework lists. Selecting by name (instead of
# dropping 'default' and 'loan') keeps the cell safe to re-run: a second `drop`
# would raise KeyError because those columns are already gone.
selected_columns = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y',
]
# .copy() gives an independent frame, so the later assignment to `y` does not
# act on a slice of the original data.
df = df[selected_columns].copy()
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


- Check whether any of the features contain missing values

In [21]:
# Number of missing entries per column (`isna` is the same check as `isnull`).
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### Question 1

What is the most frequent observation (mode) for the column `education`?

- `unknown`
- `primary`
- `secondary`
- `tertiary`

In [81]:
# `mode()` returns a Series of the most frequent value(s); take the first entry.
df['education'].mode().iloc[0]

'secondary'

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- `age` and `balance`
- `day` and `campaign`
- `day` and `pdays`
- `pdays` and `previous`

In [27]:
# Pairwise Pearson correlation between the numeric columns only.
(
    df
    .select_dtypes(include='number')
    .corr(method='pearson')
)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [31]:
# Spot-check of the `age` / `balance` answer option from Question 2.
df[['age']].corrwith(df['balance'])

age    0.097783
dtype: float64

In [32]:
# Spot-check of the `day` / `campaign` answer option from Question 2.
df[['day']].corrwith(df['campaign'])

day    0.16249
dtype: float64

In [33]:
# Spot-check of the `day` / `pdays` answer option from Question 2.
df[['day']].corrwith(df['pdays'])

day   -0.093044
dtype: float64

In [34]:
# Spot-check of the `pdays` / `previous` answer option from Question 2.
df[['pdays']].corrwith(df['previous'])

pdays    0.45482
dtype: float64

### Target encoding

* Now we want to encode the `y` variable.
* Let's replace the values `yes`/`no` with `1`/`0`.

In [38]:
# Encode the target as 1 ('yes') / 0 (anything else).
# Guarded so the cell is idempotent: once `y` is numeric, comparing it with
# 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value `y` is not in your dataframe.

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_full_train.shape[0], df_test.shape[0]

(36168, 9043)

In [43]:
# An integer `test_size` is an absolute row count, so the validation set gets
# exactly as many rows as the test set.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=len(df_test),
    random_state=42,
)
df_train.shape[0], df_val.shape[0], df_test.shape[0]

(27125, 9043, 9043)

In [44]:
# Renumber the rows of each split from 0 (the shuffled original index is discarded).
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# `pop` hands back the target column and removes it from the feature frame in
# one step, so `y` cannot leak into the features.
y_train = df_train.pop('y').values
y_val = df_val.pop('y').values
y_test = df_test.pop('y').values

In [73]:
# Same treatment for the combined train+validation frame: fresh index, then
# move the target out of the features.
df_full_train = df_full_train.reset_index(drop=True)
y_full_train = df_full_train.pop('y').values

### Question 3

* Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
  
- `contact`
- `education`
- `housing`
- `poutcome`

In [47]:
from sklearn.metrics import mutual_info_score


def mutual_info_churn_score(series):
    """Return the mutual information between `series` and the training target.

    The target is the notebook-level `y_train`, looked up when the function is
    called, so `series` is expected to be a column of the training frame.
    """
    mi_value = mutual_info_score(labels_true=series, labels_pred=y_train)
    return mi_value

In [78]:
# Score every string (object) column of the training frame against the target.
mi = (
    df_train
    .select_dtypes(include=['object'])
    .apply(mutual_info_churn_score)
)
# Highest score first, rounded to two decimals as the question asks.
mi.sort_values(ascending=False).round(2)

poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9

In [49]:
from sklearn.feature_extraction import DictVectorizer

In [51]:
# One dict per training row ({column: value}), the input format DictVectorizer takes.
train_dicts = df_train.to_dict(orient='records')
# Preview the first ten records.
train_dicts[:10]

[{'age': 38,
  'job': 'entrepreneur',
  'marital': 'married',
  'education': 'secondary',
  'balance': 0,
  'housing': 'yes',
  'contact': 'cellular',
  'day': 17,
  'month': 'nov',
  'duration': 258,
  'campaign': 1,
  'pdays': -1,
  'previous': 0,
  'poutcome': 'unknown'},
 {'age': 49,
  'job': 'blue-collar',
  'marital': 'married',
  'education': 'secondary',
  'balance': 3309,
  'housing': 'yes',
  'contact': 'cellular',
  'day': 15,
  'month': 'may',
  'duration': 349,
  'campaign': 2,
  'pdays': -1,
  'previous': 0,
  'poutcome': 'unknown'},
 {'age': 37,
  'job': 'housemaid',
  'marital': 'married',
  'education': 'primary',
  'balance': 2410,
  'housing': 'no',
  'contact': 'cellular',
  'day': 4,
  'month': 'aug',
  'duration': 315,
  'campaign': 1,
  'pdays': -1,
  'previous': 0,
  'poutcome': 'unknown'},
 {'age': 31,
  'job': 'self-employed',
  'marital': 'married',
  'education': 'tertiary',
  'balance': 3220,
  'housing': 'no',
  'contact': 'cellular',
  'day': 26,
  'month

In [52]:
# sparse=False: return dense NumPy arrays rather than SciPy sparse matrices.
dv = DictVectorizer(sparse=False)

In [53]:
# Learn the feature mapping from the training records and encode them in one call.
X_train = dv.fit_transform(train_dicts)
X_train

array([[3.800e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [4.900e+01, 3.309e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.700e+01, 2.410e+03, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [5.400e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.500e+01, 2.311e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.000e+01, 1.500e+01, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [54]:
# Encode the validation rows with the vectorizer already fitted on the training
# records: `transform` only, no refit.
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
X_val

array([[ 3.800e+01, -1.000e+01,  2.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 4.200e+01,  1.146e+03,  2.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 4.300e+01,  1.490e+02,  2.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       ...,
       [ 4.000e+01,  1.530e+02,  2.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 4.500e+01,  7.000e+00,  1.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 3.200e+01,  1.100e+03,  1.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00]])

In [55]:
from sklearn.linear_model import LogisticRegression

In [56]:
# Hyperparameters are the ones prescribed in the Question 4 text.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [58]:
# Predicted class labels for the validation rows.
y_pred = model.predict(X_val)

In [61]:
# Accuracy = share of validation rows whose predicted label equals the true one.
score = np.mean(y_pred == y_val)
score

0.9010284197721995

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of the following features has the smallest difference?

- `age`
- `balance`
- `marital`
- `previous`

> **Note**: The difference doesn't have to be positive.

In [66]:
from sklearn.metrics import accuracy_score

In [77]:
# Feature elimination: retrain the Question 4 model once per feature with that
# feature left out, and print how far validation accuracy moves from the
# full-model `score`.
#
# - Iterate over df_train's own columns (the frame actually being dropped from)
#   so the loop does not depend on `y` having been removed from df_full_train.
# - Use *_small names for the vectorizer and model so the notebook-level `dv`
#   and `model` fitted on all features are not overwritten.
for col in df_train.columns:
    df_train_small = df_train.drop(columns=[col])
    df_val_small = df_val.drop(columns=[col])

    dv_small = DictVectorizer(sparse=False)
    train_small_dicts = df_train_small.to_dict(orient='records')
    X_train_small = dv_small.fit_transform(train_small_dicts)
    val_small_dicts = df_val_small.to_dict(orient='records')
    X_val_small = dv_small.transform(val_small_dicts)

    model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_small.fit(X_train_small, y_train)

    y_pred_small = model_small.predict(X_val_small)
    # abs(): only the size of the change is reported, not its direction.
    print(col, abs(score - accuracy_score(y_val, y_pred_small)))

age 0.0006634966272254239
job 0.00033174831361271195
marital 0.0005529138560211866
education 0.00044233108481694927
balance 0.00011058277120423732
housing 0.00033174831361271195
contact 0.0005529138560211866
day 0.0005529138560211866
month 0.0011058277120424842
duration 0.010947694349220383
campaign 0.00044233108481694927
pdays 0.0
previous 0.00033174831361271195
poutcome 0.007409045670684566


### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

> **Note**: If there are multiple options, select the smallest `C`.

In [79]:
# Try each regularization strength on the full feature matrix and print the
# validation accuracy rounded to 3 decimals.
# Loop-specific names (`model_c`, `y_pred_c`) keep the Question 4 `model` and
# `y_pred` intact, so re-running the earlier accuracy cell still reports C=1.0.
for c in [0.01, 0.1, 1, 10, 100]:
    model_c = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model_c.fit(X_train, y_train)
    y_pred_c = model_c.predict(X_val)
    print(c, round(accuracy_score(y_val, y_pred_c), 3))

0.01 0.898
0.1 0.901
1 0.901
10 0.901
100 0.901
