In [40]:
import pandas as pd
import numpy as np

In [41]:
df = pd.read_csv("bank-full.csv", sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


# Data preparation

In [42]:
df = df[["age","job","marital","education","balance","housing","contact","day","month","duration","campaign","pdays","previous","poutcome","y"]]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


df.isnull().sum()

# Q1. Most frequent observation for the column education

In [43]:
df.education.mode()

0    secondary
Name: education, dtype: object

# Q2. Correlation matrix

In [44]:
correlatoin_matrix = df[["age","balance","day","duration","campaign","pdays","previous"]].corr()
correlatoin_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


#### pdays and previous have the biggest correlation

#### Target encoding

In [45]:
df.y = (df.y == "yes").astype(int)
df.y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

#### Split the data

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
categories = ["age", "job", "marital", "education", "balance", "housing", "contact", "day", "month", "duration", "campaign", "pdays", "previous", "poutcome","y"] 
X = df[categories]
X_full_train, X_test = train_test_split(X, test_size= 0.2, random_state=42)
X_train, X_val = train_test_split(X_full_train, test_size= 0.25, random_state=42)

# Q3. mutual information score

In [48]:
global_y = round(X_train.y.mean(),2)
global_y

np.float64(0.12)

In [49]:
unique_contact = X_train['contact'].unique()
for i in unique_contact:
    print(X_train[X_train.contact == i ].y.mean())

0.14741511687425354
0.04034840527731523
0.12730414746543778


In [50]:
unique_education = X_train['education'].unique()
for i in unique_education:
    print(X_train[X_train.education == i ].y.mean())

0.15013336720436937
0.1038442336495257
0.08501827040194884
0.1251109139307897


In [51]:
unique_housing = X_train['housing'].unique()
for i in unique_housing:
    print(X_train[X_train.housing == i ].y.mean())

0.07432387753752562
0.1669582604348913


In [52]:
unique_poutcome = X_train['poutcome'].unique()
for i in unique_poutcome:
    print(X_train[X_train.poutcome == i ].y.mean())

0.09007510005846113
0.6460176991150443
0.16620752984389348
0.12430939226519337


In [53]:
from IPython.display import display

In [54]:
for c in categories:
    df_group = X_train.groupby(c).y.agg(['mean','count'])
    df_group['diff'] = df_group['mean'] - global_y 
    df_group['risk'] = df_group['mean'] / global_y 
    display(df_group)
    print()
    print()


Unnamed: 0_level_0,mean,count,diff,risk
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18,0.000000,2,-0.120000,0.000000
19,0.375000,16,0.255000,3.125000
20,0.280000,25,0.160000,2.333333
21,0.217391,46,0.097391,1.811594
22,0.300000,80,0.180000,2.500000
...,...,...,...,...
89,0.000000,3,-0.120000,0.000000
90,1.000000,1,0.880000,8.333333
92,1.000000,1,0.880000,8.333333
93,1.000000,1,0.880000,8.333333






Unnamed: 0_level_0,mean,count,diff,risk
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
admin.,0.121076,3122,0.001076,1.008969
blue-collar,0.072614,5784,-0.047386,0.605118
entrepreneur,0.087457,869,-0.032543,0.728807
housemaid,0.091026,780,-0.028974,0.758547
management,0.136534,5603,0.016534,1.137783
retired,0.215054,1395,0.095054,1.792115
self-employed,0.110775,993,-0.009225,0.923129
services,0.09,2500,-0.03,0.75
student,0.270321,529,0.150321,2.252678
technician,0.107903,4606,-0.012097,0.899189






Unnamed: 0_level_0,mean,count,diff,risk
marital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
divorced,0.117759,3159,-0.002241,0.981323
married,0.099841,16306,-0.020159,0.832005
single,0.147239,7661,0.027239,1.226994






Unnamed: 0_level_0,mean,count,diff,risk
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
primary,0.085018,4105,-0.034982,0.708486
secondary,0.103844,14021,-0.016156,0.865369
tertiary,0.150133,7873,0.030133,1.251111
unknown,0.125111,1127,0.005111,1.042591






Unnamed: 0_level_0,mean,count,diff,risk
balance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-8019,0.0,1,-0.12,0.000000
-6847,0.0,1,-0.12,0.000000
-4057,0.0,1,-0.12,0.000000
-3313,0.0,1,-0.12,0.000000
-3058,1.0,1,0.88,8.333333
...,...,...,...,...
59649,0.0,1,-0.12,0.000000
64343,0.0,1,-0.12,0.000000
71188,0.0,1,-0.12,0.000000
81204,1.0,1,0.88,8.333333






Unnamed: 0_level_0,mean,count,diff,risk
housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.166958,12003,0.046958,1.391319
yes,0.074324,15123,-0.045676,0.619366






Unnamed: 0_level_0,mean,count,diff,risk
contact,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cellular,0.147415,17583,0.027415,1.228459
telephone,0.127304,1736,0.007304,1.060868
unknown,0.040348,7807,-0.079652,0.336237






Unnamed: 0_level_0,mean,count,diff,risk
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.254054,185,0.134054,2.117117
2,0.149045,785,0.029045,1.242038
3,0.145963,644,0.025963,1.216356
4,0.156431,863,0.036431,1.303592
5,0.114983,1148,-0.005017,0.958188
6,0.093777,1173,-0.026223,0.781472
7,0.087719,1083,-0.032281,0.730994
8,0.106383,1081,-0.013617,0.886525
9,0.107643,929,-0.012357,0.897022
10,0.239203,301,0.119203,1.993355






Unnamed: 0_level_0,mean,count,diff,risk
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
apr,0.204853,1772,0.084853,1.707111
aug,0.110663,3723,-0.009337,0.922195
dec,0.465649,131,0.345649,3.880407
feb,0.156962,1580,0.036962,1.308017
jan,0.104142,845,-0.015858,0.86785
jul,0.090953,4167,-0.029047,0.757939
jun,0.09833,3234,-0.02167,0.819419
mar,0.521429,280,0.401429,4.345238
may,0.06445,8239,-0.05555,0.53708
nov,0.096801,2376,-0.023199,0.806678






Unnamed: 0_level_0,mean,count,diff,risk
duration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,2,-0.12,0.000000
1,0.0,1,-0.12,0.000000
2,0.0,2,-0.12,0.000000
3,0.0,4,-0.12,0.000000
4,0.0,12,-0.12,0.000000
...,...,...,...,...
3322,0.0,1,-0.12,0.000000
3366,0.0,1,-0.12,0.000000
3785,0.0,1,-0.12,0.000000
3881,1.0,1,0.88,8.333333






Unnamed: 0_level_0,mean,count,diff,risk
campaign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.142019,10569,0.022019,1.183493
2,0.111481,7508,-0.008519,0.929009
3,0.109008,3275,-0.010992,0.908397
4,0.08999,2078,-0.03001,0.74992
5,0.084337,1079,-0.035663,0.702811
6,0.071332,743,-0.048668,0.594437
7,0.075221,452,-0.044779,0.626844
8,0.059006,322,-0.060994,0.491718
9,0.076923,208,-0.043077,0.641026
10,0.040268,149,-0.079732,0.33557






Unnamed: 0_level_0,mean,count,diff,risk
pdays,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,0.090001,22233,-0.029999,0.750011
1,0.500000,10,0.380000,4.166667
2,0.086957,23,-0.033043,0.724638
3,0.000000,1,-0.120000,0.000000
4,0.500000,2,0.380000,4.166667
...,...,...,...,...
828,1.000000,1,0.880000,8.333333
831,0.000000,1,-0.120000,0.000000
850,0.000000,1,-0.120000,0.000000
854,1.000000,1,0.880000,8.333333






Unnamed: 0_level_0,mean,count,diff,risk
previous,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.090001,22233,-0.029999,0.750011
1,0.208209,1681,0.088209,1.735078
2,0.220588,1224,0.100588,1.838235
3,0.250769,650,0.130769,2.089744
4,0.244656,421,0.124656,2.038797
5,0.286765,272,0.166765,2.389706
6,0.297143,175,0.177143,2.47619
7,0.221374,131,0.101374,1.844784
8,0.310811,74,0.190811,2.59009
9,0.216667,60,0.096667,1.805556






Unnamed: 0_level_0,mean,count,diff,risk
poutcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
failure,0.124309,2896,0.004309,1.035912
other,0.166208,1089,0.046208,1.385063
success,0.646018,904,0.526018,5.383481
unknown,0.090075,22237,-0.029925,0.750626






Unnamed: 0_level_0,mean,count,diff,risk
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,23998,-0.12,0.0
1,1.0,3128,0.88,8.333333






In [55]:
from sklearn.metrics import mutual_info_score

In [56]:
categories = ["age", "job", "marital", "education", "balance", "housing", "contact", "day", "month", "duration", "campaign", "pdays", "previous", "poutcome"] 
mutual_scores= []
for c in categories:
    obj = {'c': c, 'score': mutual_info_score(X_train[c], X_train['y'])}
    mutual_scores.append(obj)
mutual_scores = sorted(mutual_scores, key=lambda x: x['score'], reverse=True)
mutual_scores

[{'c': 'balance', 'score': np.float64(0.11661461717469257)},
 {'c': 'duration', 'score': np.float64(0.09987687953086824)},
 {'c': 'pdays', 'score': np.float64(0.037109232365753475)},
 {'c': 'poutcome', 'score': np.float64(0.029532821290436224)},
 {'c': 'month', 'score': np.float64(0.025090033443650246)},
 {'c': 'contact', 'score': np.float64(0.013356062198247219)},
 {'c': 'previous', 'score': np.float64(0.013153078845818044)},
 {'c': 'age', 'score': np.float64(0.012937835538587615)},
 {'c': 'housing', 'score': np.float64(0.010343105891750026)},
 {'c': 'job', 'score': np.float64(0.007316082778474635)},
 {'c': 'day', 'score': np.float64(0.0063998575390331516)},
 {'c': 'campaign', 'score': np.float64(0.004269091323970776)},
 {'c': 'education', 'score': np.float64(0.0026967549991295282)},
 {'c': 'marital', 'score': np.float64(0.0020495925927810216)}]

#### poutcome has the biggest mutual information score

# Q4. 

In [57]:
from sklearn.feature_extraction import DictVectorizer
def prepare_X(X):
    Y = X['y']
    X = X.drop('y', axis =1)
    train_dicts = X.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(train_dicts)
    return Y, X

In [58]:
Y, X_train = prepare_X(X_train)
Y_val, X_val = prepare_X(X_val)

In [59]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, Y)

In [60]:
y_pred_train = model.predict_proba(X_train)[:,1]
churn_decision = (y_pred_train >= 0.5)
(Y == churn_decision).mean()

np.float64(0.9033399690333996)

In [61]:
y_pred = model.predict_proba(X_val)[:,1]

churn_decision = (y_pred >= 0.5)
accuracy = (Y_val == churn_decision).mean()
accuracy

np.float64(0.9010174740101747)

#### 0.9 is the accuracy

# Q5. 

In [62]:
def split_df(categories):
    X = df[categories]
    X_full_train, X_test = train_test_split(X, test_size= 0.2, random_state=42)
    X_train, X_val = train_test_split(X_full_train, test_size= 0.25, random_state=42)
    return X_train, X_val, X_test

In [63]:
categories = ["age", "job", "marital", "education", "balance", "housing", "contact", "day", "month", "duration", "campaign", "pdays", "previous", "poutcome","y"] 
result = []
for i in range(len(categories) -1):
    X_train, X_val, X_test = split_df( categories[:i] + categories[i+1:len(categories) -2] + categories[len(categories) -1:])
    Y, X_train = prepare_X(X_train)
    Y_val, X_val = prepare_X(X_val)
    model.fit(X_train, Y)
    y_pred = model.predict_proba(X_val)[:,1]
    churn_decision = (y_pred >= 0.5)
    score_val = (Y_val == churn_decision).mean()
    result.append([categories[i], score_val, round(accuracy- score_val,5)]) 

In [64]:
result

[['age', np.float64(0.8929440389294404), np.float64(0.00807)],
 ['job', np.float64(0.8949347489493474), np.float64(0.00608)],
 ['marital', np.float64(0.8931652289316523), np.float64(0.00785)],
 ['education', np.float64(0.893607608936076), np.float64(0.00741)],
 ['balance', np.float64(0.893607608936076), np.float64(0.00741)],
 ['housing', np.float64(0.8947135589471356), np.float64(0.0063)],
 ['contact', np.float64(0.8934970139349702), np.float64(0.00752)],
 ['day', np.float64(0.893607608936076), np.float64(0.00741)],
 ['month', np.float64(0.8908427339084274), np.float64(0.01017)],
 ['duration', np.float64(0.8809997788099978), np.float64(0.02002)],
 ['campaign', np.float64(0.8934970139349702), np.float64(0.00752)],
 ['pdays', np.float64(0.8944923689449237), np.float64(0.00653)],
 ['previous', np.float64(0.8946029639460297), np.float64(0.00641)],
 ['poutcome', np.float64(0.8932758239327583), np.float64(0.00774)]]

#### previous is the feature which has the smallest difference

#### Q6.

In [70]:
C = [0.01, 0.1, 1, 10, 100]
X_train, X_val, X_test = split_df(categories)
Y, X_train = prepare_X(X_train)
Y_val, X_val = prepare_X(X_val)
for c in C:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, Y)
    y_pred = model.predict_proba(X_val)[:,1]
    churn_decision = (y_pred >= 0.5)
    accuracy = (Y_val == churn_decision).mean()
    print(f"Accuracy: {accuracy:.3f} C: {c}")
  

Accuracy: 0.899 C: 0.01
Accuracy: 0.900 C: 0.1
Accuracy: 0.901 C: 1
Accuracy: 0.901 C: 10
Accuracy: 0.901 C: 100


#### 1 is the best accuracy