In [1]:
import pandas as pd
import numpy as np

In [2]:
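# One-time data download (uncomment to run). The next cell reads 'bank-full.csv',
# which presumably has to be extracted from this archive into the working directory.
# !wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip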

In [3]:
# Read the semicolon-delimited CSV into a DataFrame and preview the first rows.
df = pd.read_csv("bank-full.csv", delimiter=";")
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Keep only the columns used in the rest of the notebook
# (this drops 'default' and 'loan' from the loaded frame).
df = df.loc[
    :,
    [
        "age", "job", "marital", "education", "balance",
        "housing", "contact", "day", "month", "duration",
        "campaign", "pdays", "previous", "poutcome", "y",
    ],
]
df.head(n=5)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# Count missing values per column.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

## Q1

In [6]:
# Most frequent value(s) of the 'education' column.
df["education"].mode()

0    secondary
Name: education, dtype: object

## Q2

In [7]:
# Preview the frame again before the correlation analysis.
df.head(n=5)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
# Numeric columns whose pairwise correlations are inspected.
numerical = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
]
df.loc[:, numerical].corr(method='pearson')

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [9]:
# Candidate pairs, values read from the correlation matrix above:
# age and balance     : 0.097783
# day and campaign    : 0.162490
# day and pdays       : -0.093044
# pdays and previous  : 0.454820   <- largest of the four

In [10]:
# Encode the target as 1 for 'yes' and 0 for anything else.
# The dtype guard makes this cell idempotent: once 'y' holds integers, a second
# run would compare ints against 'yes' and silently turn every label into 0.
# `assign` builds a new frame instead of setting a column through attribute
# access on a frame that was itself produced by a column selection.
if not pd.api.types.is_integer_dtype(df['y']):
    df = df.assign(y=(df['y'] == 'yes').astype(int))

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for test, then 25% of the remainder for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Give every partition a fresh 0..n-1 index.
df_full_train, df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_full_train, df_train, df_val, df_test)
)

# Pull the target out of each partition: `pop` returns the 'y' column and
# removes it from the frame, so the feature frames no longer contain the label.
y_full_train = df_full_train.pop('y').values
y_train = df_train.pop('y').values
y_val = df_val.pop('y').values
y_test = df_test.pop('y').values

In [12]:
# Row counts of the train / validation / test partitions.
tuple(len(part) for part in (df_train, df_val, df_test))

(27126, 9042, 9043)

## Q3

In [13]:
from sklearn.metrics import mutual_info_score

categorical = ["job", "marital", "education", "housing", "contact", "month", "poutcome"]

def mutual_info_churn_score(series, ndigits=2):
    """Mutual information between one feature column and the training target.

    Despite the name, the score is computed against ``y_train``.

    Parameters
    ----------
    series : pandas.Series
        A column of ``df_train`` (row-aligned with ``y_train``).
    ndigits : int or None, default 2
        Number of decimals to round to. ``None`` returns the unrounded score.

    Returns
    -------
    float
        The (optionally rounded) mutual information score.
    """
    score = mutual_info_score(series, y_train)
    return score if ndigits is None else round(score, ndigits)

# Rank on full-precision scores: rounding first collapses close values into
# ties (e.g. two features both shown as 0.03) whose relative order is arbitrary.
mi_exact = df_train[categorical].apply(mutual_info_churn_score, ndigits=None)

# `mi` keeps its previous meaning: per-feature scores rounded to 2 decimals.
mi = mi_exact.round(2)

# Display: sorted by the exact score, rounded only for presentation.
mi_exact.sort_values(ascending=False).round(2)

month        0.03
poutcome     0.03
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

## Q4

In [14]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Convert each partition to a list of per-row dicts for DictVectorizer.
dicts_train, dicts_val, dicts_test = (
    frame.to_dict(orient='records')
    for frame in (df_train, df_val, df_test)
)

# Fit the encoder on the training rows only, then apply it to validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts_train)
X_val = dv.transform(dicts_val)

# `fit` returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

y_pred = model.predict(X_val)

# Validation accuracy, rounded to 2 decimals.
np.round(np.mean(y_pred == y_val), 2)

0.9

## Q5

In [15]:
def _validation_accuracy(train_frame, val_frame):
    """Fit the logistic regression on ``train_frame`` and score it on ``val_frame``.

    Both frames are one-hot encoded with a fresh DictVectorizer fitted on the
    training frame only. Targets are taken from ``y_train`` / ``y_val``.
    All fitted objects stay local, so the notebook-level ``dv``, ``X_train``,
    ``X_val`` and ``model`` are not overwritten by feature-dropped versions.

    Returns
    -------
    float
        Fraction of validation rows predicted correctly.
    """
    vectorizer = DictVectorizer(sparse=False)
    features_train = vectorizer.fit_transform(train_frame.to_dict(orient='records'))
    features_val = vectorizer.transform(val_frame.to_dict(orient='records'))

    classifier = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    classifier.fit(features_train, y_train)

    return (classifier.predict(features_val) == y_val).mean()


# Compute the all-features baseline here rather than reading the global `y_pred`:
# later cells reassign `y_pred`, so re-running this cell afterwards would
# otherwise compare against the wrong model.
original_accuracy = _validation_accuracy(df_train, df_val)

result = []

for feature in df_train.columns:

    # Retrain with exactly one feature removed from both partitions.
    current_accuracy = _validation_accuracy(
        df_train.drop(columns=feature),
        df_val.drop(columns=feature),
    )

    # Absolute change in validation accuracy caused by dropping the feature.
    difference = abs(original_accuracy - current_accuracy)

    result.append([feature, current_accuracy, difference])

df_result = pd.DataFrame(data=result, columns=['excluded_feature', 'accuracy', 'difference'])

In [16]:
# Features whose removal changes validation accuracy the least come first.
df_result.sort_values(by='difference', ascending=True)

Unnamed: 0,excluded_feature,accuracy,difference
12,previous,0.901128,0.0
0,age,0.901239,0.000111
3,education,0.901017,0.000111
11,pdays,0.901017,0.000111
2,marital,0.901349,0.000221
7,day,0.901349,0.000221
1,job,0.900796,0.000332
6,contact,0.900796,0.000332
4,balance,0.900686,0.000442
5,housing,0.900464,0.000664


## Q6

In [17]:
# Collects one [C, accuracy] row per regularisation setting.
result = []

# Re-encode the full feature set for this experiment.
dicts_train, dicts_val, dicts_test = (
    frame.to_dict(orient='records')
    for frame in (df_train, df_val, df_test)
)

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts_train)
X_val = dv.transform(dicts_val)

for c in (0.01, 0.1, 1, 10, 100):

    # `fit` returns the estimator, so the model can be built and trained in one step.
    model = LogisticRegression(
        solver='liblinear', C=c, max_iter=1000, random_state=42
    ).fit(X_train, y_train)

    y_pred = model.predict(X_val)

    # Validation accuracy rounded to 3 decimals.
    accuracy = np.mean(y_pred == y_val).round(3)

    result.append([c, accuracy])

df_result = pd.DataFrame(result, columns=['C', 'accuracy'])

In [18]:
# Best accuracy first. Several C values share the same rounded accuracy, and a
# single-key sort with the default algorithm does not guarantee the order of
# ties, so C (ascending) is used as an explicit tie-breaker: the smallest C
# among equally accurate settings is always listed first.
df_result.sort_values(by=['accuracy', 'C'], ascending=[False, True])

Unnamed: 0,C,accuracy
1,0.1,0.901
2,1.0,0.901
4,100.0,0.901
3,10.0,0.9
0,0.01,0.898
