In [1]:
#!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('bank-full.csv',sep=";",)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Data preparation

In [4]:
features = [ "age","job","marital","education","balance","housing","contact","day","month","duration","campaign","pdays","previous","poutcome","y"]
df = df[features]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
missing_columns = df.isnull().sum()
columns_with_missing_values = missing_columns[missing_columns > 0]
columns_with_missing_values

Series([], dtype: int64)

### Question 1

What is the most frequent observation (mode) for the column education?

In [6]:
df["education"].mode()[0]

'secondary'

### Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

#### Target encoding

In [7]:
df.y = (df.y == 'yes').astype(int)
df.housing = (df.housing == 'yes').astype(int)
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,1,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,1,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,1,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,1,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,0,unknown,5,may,198,1,-1,0,unknown,0


#### Split the data

- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value y is not in your dataframe.

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
seed = 42
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [10]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [11]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

In [12]:
from sklearn.metrics import mutual_info_score

In [13]:
def getCorrelation(col1,col2):
    return mutual_info_score(df_full_train[col1], df_full_train[col2])

age and balance

In [14]:
age_balance = getCorrelation("age","balance")
age_balance

1.5914475511949997

day and campaign

In [15]:
day_campaign= getCorrelation("day","campaign")
day_campaign

0.04550264278576513

day and pdays

In [16]:
day_pdays= getCorrelation("day","pdays")
day_pdays

0.2454079135156819

pdays and previous

In [17]:
pdays_previous= getCorrelation("pdays","previous")
pdays_previous

0.5458577044542354

### Question 3


- Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?

In [18]:
flist = ["contact","education","housing","poutcome"]

In [19]:
def mutual_info_y_score(series):
    return mutual_info_score(series, y_train)

In [20]:
mi = df_train[flist].apply(mutual_info_y_score)
mi

contact      0.013356
education    0.002697
housing      0.010343
poutcome     0.029533
dtype: float64

### Question4

- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [23]:
from sklearn.feature_extraction import DictVectorizer

In [24]:
dv = DictVectorizer(sparse=False)

# train_dict = df_train.to_dict(orient='records')
# X_train = dv.fit_transform(train_dict)

# val_dict = df_val.to_dict(orient='records')
# X_val = dv.transform(val_dict)

In [34]:
def getPred(df_train, df_val):
    train_dict = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    decision = (y_pred >= 0.5)
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = decision.astype(int)
    df_pred['actual'] = y_val
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    return df_pred

In [35]:
# model.fit(X_train, y_train)

In [36]:
# y_pred = model.predict_proba(X_val)[:, 1]

In [37]:
# decision = (y_pred >= 0.5)

In [38]:
# df_pred = pd.DataFrame()
# df_pred['probability'] = y_pred
# df_pred['prediction'] = decision.astype(int)
# df_pred['actual'] = y_val
# df_pred['correct'] = df_pred.prediction == df_pred.actual

In [54]:
df_pred = getPred(df_train, df_val)
original_accuracy = df_pred.correct.mean()
round(original_accuracy,2)

0.9

### Question 5

- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [42]:
def dropAndPred(col):
    return getPred(df_train.drop(columns=[col]), df_val.drop(columns=[col]))

In [43]:
# dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

In [49]:
age_df_pred = dropAndPred('age')
age_df_pred.correct.mean()-original_accuracy 

-0.0011059500110595089

In [50]:
balance_df_pred = dropAndPred('balance')
balance_df_pred.correct.mean()-original_accuracy 

-0.0006635700066357497

In [51]:
marital_df_pred = dropAndPred('marital')
marital_df_pred.correct.mean()-original_accuracy 

-0.00044238000442375913

In [52]:
previous_df_pred = dropAndPred('previous')
previous_df_pred.correct.mean()-original_accuracy 

-0.0009953550099535136

### Question 6

- Now let's train a regularized logistic regression.
- Let's try the following values of the parameter C: [0, 0.01, 0.1, 1, 10].
- Train models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

In [53]:
C=  [0, 0.01, 0.1, 1, 10]

In [56]:
def getPredbyModel(model, df_train, df_val):
    train_dict = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    decision = (y_pred >= 0.5)
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = decision.astype(int)
    df_pred['actual'] = y_val
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    return df_pred

In [58]:
for c in C:
    model = LogisticRegression(solver='liblinear', C=float(c), max_iter=1000, random_state=42)
    df_pred = getPredbyModel(model,df_train, df_val)
    accuracy = df_pred.correct.mean()
    print(c,"\t>>",accuracy)

InvalidParameterError: The 'C' parameter of LogisticRegression must be a float in the range (0.0, inf]. Got 0.0 instead.