In [73]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [74]:
# Load the semicolon-separated bank marketing dataset.
df = pd.read_csv('bank-full.csv', sep=';')

In [75]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [76]:
# Columns kept for the analysis; 'y' is the target.
columns = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
    'poutcome', 'y', 'job', 'marital', 'education', 'housing', 'contact', 'month',
]
# Feature columns: everything in `columns` except the target, in the same order.
cols_x = [name for name in columns if name != 'y']
len(cols_x)

14

In [77]:
# Restrict the frame to the analysis columns, then count missing values per column.
df = df.loc[:, columns]
df.isnull().sum()

age          0
balance      0
day          0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
job          0
marital      0
education    0
housing      0
contact      0
month        0
dtype: int64

Q1) secondary

In [78]:
# Most frequent value of the education column.
df["education"].mode()

0    secondary
dtype: object

Q2) pdays and previous

In [79]:
# Absolute pairwise correlation between the numerical features.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns in corr() by default and raises on the string columns.
df.select_dtypes("number").corr().abs()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,0.00912,0.004648,0.00476,0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,0.014578,0.003435,0.016674
day,0.00912,0.004503,1.0,0.030206,0.16249,0.093044,0.05171
duration,0.004648,0.02156,0.030206,1.0,0.08457,0.001565,0.001203
campaign,0.00476,0.014578,0.16249,0.08457,1.0,0.088628,0.032855
pdays,0.023758,0.003435,0.093044,0.001565,0.088628,1.0,0.45482
previous,0.001288,0.016674,0.05171,0.001203,0.032855,0.45482,1.0


In [80]:
# Find the pair of distinct numerical features with the highest absolute correlation.
# Numeric columns are selected explicitly because corr() on string columns
# raises in pandas >= 2.0.
cor = df.select_dtypes("number").corr().abs()

# Flatten the matrix to a Series indexed by (feature_a, feature_b) once, then drop
# the diagonal by label instead of by value so that genuinely perfectly
# correlated pairs are not discarded along with the self-correlations.
pairs = cor.unstack()
is_diagonal = pairs.index.get_level_values(0) == pairs.index.get_level_values(1)
pairs[~is_diagonal].idxmax()

('pdays', 'previous')

In [81]:
# Encode the target as 0/1: 'yes' -> 1, anything else -> 0.
# Matching on both 'yes' and 1 keeps this cell idempotent: re-running it after y
# has already been encoded leaves the column unchanged instead of turning every
# label into 0.
df["y"] = df["y"].isin(["yes", 1]).astype(int)

In [82]:
# Class balance of the encoded target.
df["y"].value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [83]:
# Split column names by dtype; the (already numeric) target 'y' is not a feature,
# so it is removed from the numerical list.
numerical = df.select_dtypes(include="number").columns.tolist()
numerical.remove("y")
categorical = df.select_dtypes(include="object").columns.tolist()

In [84]:
# Hold out 20% of the rows as the test set.
from sklearn.model_selection import train_test_split

y = df["y"]
X = df[numerical + categorical]
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [85]:
# Split the remaining rows again: 25% of the non-test part goes to validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
)

Q3) poutcome

In [86]:
from sklearn.metrics import mutual_info_score

# Mutual information between the target and each selected categorical feature
# (training split only), rounded to 5 decimals.
for col in ("contact", "education", "housing", "poutcome"):
    print(
        round(
            mutual_info_score(labels_true=y_train, labels_pred=X_train[col]),
            5,
        )
    )

0.01336
0.0027
0.01034
0.02953


Q4) 0.9

In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer


In [88]:
# Turn each split into a list of row dictionaries for the DictVectorizer.
dict_train, dict_valid, dict_test = (
    split.to_dict(orient='records') for split in (X_train, X_valid, X_test)
)

# The vectorizer is fitted on the training records only and then reused
# unchanged for the validation and test records.
dv = DictVectorizer(sparse=False)
X_train_2 = dv.fit_transform(dict_train)
X_valid_2 = dv.transform(dict_valid)
X_test_2 = dv.transform(dict_test)

model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)

In [89]:
# Train the baseline model on the vectorized training split.
model.fit(X=X_train_2, y=y_train)

In [90]:
# Validation accuracy: share of validation rows whose predicted label equals the true one.
val_accuracy = np.mean(model.predict(X_valid_2) == y_valid)
val_accuracy

0.9015704490157045

Q5) marital

In [91]:
# Feature elimination: retrain the model with one feature removed at a time and
# compare its validation accuracy against the full-feature baseline (val_accuracy).
cols_q5 = ['age', 'balance', 'marital', 'previous']
result = []

for col in cols_q5:
    # Loop-specific names (*_q5) are used so that the full-feature `dv` and `model`
    # fitted earlier are not silently replaced by the last reduced model, which
    # would no longer match X_valid_2 / X_test_2.
    dicts_train_q5 = X_train.drop(columns=col).to_dict(orient='records')
    dicts_valid_q5 = X_valid.drop(columns=col).to_dict(orient='records')

    dv_q5 = DictVectorizer(sparse=False)
    X_train_new = dv_q5.fit_transform(dicts_train_q5)
    X_valid_new = dv_q5.transform(dicts_valid_q5)

    # Same hyperparameters as the baseline so only the feature set differs.
    model_q5 = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_q5.fit(X_train_new, y_train)

    val_small = (model_q5.predict(X_valid_new) == y_valid).mean()
    diff = val_accuracy - val_small
    # The table stores the absolute difference; the printed value keeps its sign.
    result.append((col, val_small, abs(diff)))
    print(f'Excluding {col}: {diff}')

# Smallest absolute accuracy change first.
pd.DataFrame(result, columns=['feature', 'accuracy', 'diff']).sort_values(by='diff')


Excluding age: 0.00033178500331787486
Excluding balance: 0.00022119000221187957
Excluding marital: 0.0014377350143773837
Excluding previous: 0.00033178500331787486


Unnamed: 0,feature,accuracy,diff
1,balance,0.901349,0.000221
0,age,0.901239,0.000332
3,previous,0.901239,0.000332
2,marital,0.900133,0.001438


Q6) 1

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


# Q6: try several values of the LogisticRegression `C` parameter. Each candidate
# is trained on the training split and scored on the validation split.
dicts_train = X_train.to_dict(orient="records")
dicts_val = X_valid.to_dict(orient="records")

# Fit the vectorizer on the training records only, then apply it to both splits.
dv = DictVectorizer(sparse=False)
dv.fit(dicts_train)

X_train6 = dv.transform(dicts_train)
X_val6 = dv.transform(dicts_val)

results = []

for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train6, y_train)

    y_pred_val = model.predict(X_val6)

    # Built-in round() works whether .mean() yields a Python float or a NumPy scalar.
    accuracy = round((y_pred_val == y_valid).mean(), 3)

    results.append((c, accuracy))

# Build the summary table once, after the loop; constructing it inside the loop
# would recreate the frame on every iteration for no benefit.
df_results = pd.DataFrame(data=results, columns=["C", "accuracy"])

# Best accuracy first; ties are broken in favour of the smallest C so the
# ordering is deterministic.
display(df_results.sort_values(["accuracy", "C"], ascending=[False, True]))
