In [40]:
import pandas as pd
import numpy as np
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mutual_info_score

#### Data Preparation

In [3]:
df = pd.read_csv("data/bank-full.csv", sep=";")

In [8]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

#### Attribute information:

   Input variables:
   ###### bank client data:
   1 - age (numeric) 
   
   2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")
   
   3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)
   
   4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")
   
   5 - default: has credit in default? (categorical: "no","yes","unknown")
   
   6 - housing: has housing loan? (categorical: "no","yes","unknown")
   
   7 - loan: has personal loan? (categorical: "no","yes","unknown")
   ###### related with the last contact of the current campaign:
   8 - contact: contact communication type (categorical: "cellular","telephone") 
   
   9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
   
  10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri")
  
  11 - duration: last contact duration, in seconds (numeric). Important note:  this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
   ###### other attributes:
  12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
  
  13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
  
  14 - previous: number of contacts performed before this campaign and for this client (numeric)
  
  15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")

  Output variable (desired target):
  21 - y - has the client subscribed a term deposit? (binary: "yes","no")


In [5]:
df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no


In [6]:
df.tail(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45201,53,management,married,tertiary,no,583,no,no,cellular,17,nov,226,1,184,4,success,yes
45202,34,admin.,single,secondary,no,557,no,no,cellular,17,nov,224,1,-1,0,unknown,yes
45203,23,student,single,tertiary,no,113,no,no,cellular,17,nov,266,1,-1,0,unknown,yes
45204,73,retired,married,secondary,no,2850,no,no,cellular,17,nov,300,1,40,8,failure,yes
45205,25,technician,single,secondary,no,505,no,yes,cellular,17,nov,386,2,-1,0,unknown,yes
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no
45210,37,entrepreneur,married,secondary,no,2971,no,no,cellular,17,nov,361,2,188,11,other,no


In [7]:
df.shape

(45211, 17)

In [9]:
df = df.drop(columns = ['default', 'loan'], axis =1)

In [11]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [13]:
# Question 1. Mode for the column `education`

df['education'].mode()[0]

'secondary'

In [14]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [19]:
numerical = df.select_dtypes(include=['number'])
print(numerical)

       age  balance  day  duration  campaign  pdays  previous
0       58     2143    5       261         1     -1         0
1       44       29    5       151         1     -1         0
2       33        2    5        76         1     -1         0
3       47     1506    5        92         1     -1         0
4       33        1    5       198         1     -1         0
...    ...      ...  ...       ...       ...    ...       ...
45206   51      825   17       977         3     -1         0
45207   71     1729   17       456         2     -1         0
45208   72     5715   17      1127         5    184         3
45209   57      668   17       508         4     -1         0
45210   37     2971   17       361         2    188        11

[45211 rows x 7 columns]


In [22]:
#Question 2. Two features with the biggest correlation

corr_matrix = numerical.corr()
print(corr_matrix)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [23]:
# Find the two features with the largest correlation
corr_pairs = corr_matrix.unstack().sort_values(kind="quicksort", ascending=False)
high_corr_pairs = corr_pairs[(corr_pairs != 1.0)]  # Remove self-correlation (1.0)

# Display the pair with the highest correlation
print(f"The two features with the highest correlation are: {high_corr_pairs.index[0]} with correlation {high_corr_pairs.iloc[0]}")

The two features with the highest correlation are: ('pdays', 'previous') with correlation 0.4548196354805043


#### Validation Framework

In [26]:
#Target Encoding
# Replace 'yes' with 1 and 'no' with 0 in the 'y' column
df['y'] = df['y'].replace({'yes': 1, 'no': 0})

print(df['y'].value_counts())

y
0    39922
1     5289
Name: count, dtype: int64


In [27]:
# Data Splitting
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [28]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [29]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [30]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

#### Feature importance: Mutual information

In [31]:
numerical.columns

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')

In [57]:
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [48]:
categorical = ['job','marital','education','housing','contact','month','poutcome']

In [49]:
def mutual_info_y_score(series):
    return mutual_info_score(series, df_full_train.y)

In [55]:
# Question 3. Variable with the biggest mutual information score

mi = df_full_train[categorical].apply(mutual_info_y_score)
print(round(mi.sort_values(ascending=False),2))

poutcome     0.03
month        0.02
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64


#### Logistic Regression

In [58]:
#One-hot encoding

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [73]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_train, y_train)

In [60]:
y_pred = model.predict_proba(X_val)[:, 1]

In [61]:
y_decision = (y_pred >= 0.5)

In [62]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = y_decision.astype(int)
df_pred['actual'] = y_val

In [66]:
df_pred.head(20)

Unnamed: 0,probability,prediction,actual
0,0.028795,0,0
1,0.022649,0,0
2,0.021287,0,0
3,0.014815,0,0
4,0.027066,0,0
5,0.036933,0,0
6,0.051717,0,0
7,0.034058,0,0
8,0.060787,0,0
9,0.006486,0,0


In [67]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [69]:
# Question 4. Accuracy on the validation dataset

round(df_pred.correct.mean(),1)

np.float64(0.9)

#### Feature Elimination

In [78]:
# Question 5. Feature with the smallest difference

# Baseline accuracy
y_pred_val = model.predict(X_val)
baseline_accuracy = accuracy_score(y_val, y_pred_val)
print(f'Baseline accuracy: {round(baseline_accuracy, 2)}')

# List of feature names
features = categorical + numerical 

# Dictionary to store accuracy differences for each feature
accuracy_diffs = {}

for i, feature in enumerate(features):
    # Step 1: Remove the i-th feature from the training and validation sets
    X_train_reduced = np.delete(X_train, i, axis=1)
    X_val_reduced = np.delete(X_val, i, axis=1)

    # Step 2: Train the model without the i-th feature
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    # Step 3: Predict on the validation set without the i-th feature
    y_pred_val_reduced = model_reduced.predict(X_val_reduced)

    # Step 4: Calculate accuracy without the i-th feature
    accuracy_reduced = accuracy_score(y_val, y_pred_val_reduced)

    # Step 5: Calculate accuracy difference and store it
    accuracy_diff = baseline_accuracy - accuracy_reduced
    accuracy_diffs[feature] = accuracy_diff

    print(f'Removed {feature}: Accuracy Difference = {round(accuracy_diff, 4)}')

# Step 6: Identify the least useful feature
least_useful_feature = min(accuracy_diffs, key=accuracy_diffs.get)
print(f'\nThe least useful feature is: {least_useful_feature}')


Baseline accuracy: 0.9
Removed job: Accuracy Difference = 0.0018
Removed marital: Accuracy Difference = 0.0001
Removed education: Accuracy Difference = 0.0013
Removed housing: Accuracy Difference = 0.0006
Removed contact: Accuracy Difference = 0.0008
Removed month: Accuracy Difference = 0.0006
Removed poutcome: Accuracy Difference = 0.0009
Removed age: Accuracy Difference = 0.0118
Removed balance: Accuracy Difference = 0.0014
Removed day: Accuracy Difference = 0.0004
Removed duration: Accuracy Difference = 0.0001
Removed campaign: Accuracy Difference = 0.0008
Removed pdays: Accuracy Difference = 0.0004
Removed previous: Accuracy Difference = 0.0002

The least useful feature is: marital


In [79]:
# Question 6. Smallest `C` that leads to the best accuracy on the validation set

# Values of C to try
C_values = [0.01, 0.1, 1, 10, 100]

# To store the accuracies for each value of C
accuracies = {}

# Iterate over the list of C values
for C in C_values:
    # Step 1: Train the model using regularized logistic regression
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    # Step 2: Make predictions on the validation set
    y_pred_val = model.predict(X_val)
    
    # Step 3: Calculate the accuracy
    accuracy = accuracy_score(y_val, y_pred_val)
    
    # Step 4: Store the accuracy rounded to 3 decimal places
    accuracies[C] = round(accuracy, 3)
    print(f'Accuracy for C={C}: {round(accuracy, 3)}')

# Step 5: Find the value of C with the best accuracy
best_C = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_C]

print(f"\nThe best value of C is: {best_C} with an accuracy of {best_accuracy}")


Accuracy for C=0.01: 0.895
Accuracy for C=0.1: 0.897
Accuracy for C=1: 0.899
Accuracy for C=10: 0.898
Accuracy for C=100: 0.898

The best value of C is: 1 with an accuracy of 0.899
