In [None]:
# Import necessary libraries and preview dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the semicolon-delimited bank-additional CSV.
DATA_PATH = '/content/bank-additional.csv'

# Read the raw data and show its first five rows.
df = pd.read_csv(DATA_PATH, delimiter=';')
df.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [None]:
# List every column label of the raw frame (the head() preview above elides the middle columns).
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [None]:
# One-hot encode the categorical predictors as 0/1 integer columns.
# Columns not listed here (the numeric features and the raw target `y`) are carried over unchanged.
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome',
]
df2 = pd.get_dummies(df, columns=categorical_cols, dtype=int)
df2.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'mon

In [None]:
# Encode the target as an integer flag: 'yes' -> 1, every other value -> 0
import numpy as np
is_yes = df['y'].eq('yes')
df2['y_binary'] = np.where(is_yes, 1, 0)

In [None]:
# Preview the distribution of the binary target (y_binary): one row per class with its row count
df2['y_binary'].value_counts()

# The target classes are imbalanced: the output below shows far fewer 1s than 0s

Unnamed: 0_level_0,count
y_binary,Unnamed: 1_level_1
0,3668
1,451


## Feature Engineering

In [None]:
# never_contacted: 1 when pdays equals 999 (recoded here as "not contacted before this campaign"), else 0
never_contacted_mask = df2['pdays'].eq(999)
df2['never_contacted'] = never_contacted_mask.astype(int)

In [None]:
# overcontacted: 1 when the client was contacted more than 3 times during the campaign, else 0
df2['overcontacted'] = df2['campaign'].gt(3).astype(int)

In [None]:
# Rows whose pdays is 999 are excluded from both recency flags.
pdays = df2['pdays']
was_contacted = pdays.ne(999)

# whether contacted within 30 days
df2['recent_contact_30'] = (was_contacted & pdays.le(30)).astype(int)

# whether contacted within 90 days
df2['recent_contact_90'] = (was_contacted & pdays.le(90)).astype(int)

# total contacts: sum of the `campaign` and `previous` counts
df2['total_contacts'] = df2['campaign'].add(df2['previous'])

# whether this is the first-time contact: no `previous` contacts and exactly one `campaign` contact
no_previous = df2['previous'].eq(0)
single_campaign_contact = df2['campaign'].eq(1)
df2['first_contact_overall'] = (no_previous & single_campaign_contact).astype(int)

# Addressing Multicollinearity and Potential Data Leakage

In [None]:
# According to our correlation matrices, these variables are highly correlated with one another: cons.price.idx, emp.var.rate, and nr.employed.
# We will leave them in our data to see how the predictions differ between our baseline and the more complex models.

# Duration will be dropped because it is only available after a call ends, so data leakage would occur if we kept it in.

In [None]:
# Predictors: every column except the two encodings of the target and `duration`.
excluded_cols = ['y_binary', 'y', 'duration']
X = df2.drop(columns=excluded_cols)

# Response: the 0/1 target column.
y = df2['y_binary']

## Setting Up Models

## 1. Logistic Regression

In [None]:
# Helper that scores one model's predictions, prints the scores, and returns them
def model_metrics(y_true, y_pred, y_prob, name):
    """Compute, print, and return the evaluation metrics for a single model.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels; used for accuracy, balanced accuracy,
        precision, recall, and F1.
    y_prob : scores handed to ``roc_auc_score`` together with ``y_true``.
    name : label shown in the printed header and stored under ``"name"``.

    Returns
    -------
    dict
        Keys, in order: ``"name"``, ``"accuracy"``, ``"balanced_accuracy"``,
        ``"precision"``, ``"recall"``, ``"f1"``, ``"auc"``.
    """
    # All scores are computed up front, before anything is printed.
    scores = {
        "name": name,
        "accuracy": accuracy_score(y_true, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "auc": roc_auc_score(y_true, y_prob),
    }

    # Printed label -> key in `scores`, in display order.
    report_rows = (
        ("Accuracy:", "accuracy"),
        ("Balanced Accuracy:", "balanced_accuracy"),
        ("Precision:", "precision"),
        ("Recall:", "recall"),
        ("F1 Score:", "f1"),
        ("AUC:", "auc"),
    )

    print(f"\n=== {name} ===")
    for label, key in report_rows:
        print(label, scores[key])

    return scores

In [None]:
# import logistic regression
from sklearn.linear_model import LogisticRegression

# Baseline classifier with balanced class weights.
# max_iter is raised far above scikit-learn's default of 100: fitted on the
# unscaled features, the solver stopped at the iteration limit and emitted a
# convergence warning, so the reported baseline came from a non-converged fit.
# NOTE(review): if the warning still appears, fit this model on standardized
# features instead, as the scikit-learn warning message recommends.
logr = LogisticRegression(class_weight='balanced', max_iter=10000)

# split data: 70% train / 30% test, stratified on y, with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=20
)

# import metrics (used by model_metrics)
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Fit the baseline on the training split, then score the held-out test split
logr.fit(X_train, y_train)
y_prob = logr.predict_proba(X_test)[:, 1]  # column 1 of the predicted probabilities
y_pred = logr.predict(X_test)
log_results = model_metrics(
    y_true=y_test,
    y_pred=y_pred,
    y_prob=y_prob,
    name="Logistic Regression",
)


=== Logistic Regression ===
Accuracy: 0.7346278317152104
Balanced Accuracy: 0.7178120900191745
Precision: 0.24671916010498687
Recall: 0.6962962962962963
F1 Score: 0.3643410852713178
AUC: 0.7473677128536348


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 2. Logistic Regression with Regularization

In [None]:
# Standardize the features for all regularized logistic regression models.
# The scaler is fitted on the training split only and then applied to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 1) Ridge Logistic Regression (L2)
ridge_params = dict(
    penalty='l2',
    C=1,
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)
logit_ridge = LogisticRegression(**ridge_params)

# Train on the scaled training split, then score the scaled test split.
logit_ridge.fit(X_train_scaled, y_train)
y_prob_ridge = logit_ridge.predict_proba(X_test_scaled)[:, 1]
y_pred_ridge = logit_ridge.predict(X_test_scaled)

ridge_results = model_metrics(y_test, y_pred_ridge, y_prob_ridge, name="Ridge")


=== Ridge ===
Accuracy: 0.8033980582524272
Balanced Accuracy: 0.7206680795236653
Precision: 0.3029197080291971
Recall: 0.6148148148148148
F1 Score: 0.4058679706601467
AUC: 0.738594543680829


In [None]:
# 2) Lasso Logistic Regression (L1)
lasso_params = {
    'penalty': 'l1',
    'C': 0.05,
    'solver': 'liblinear',
    'max_iter': 1000,
    'class_weight': 'balanced',
    'random_state': 42,
}
logit_lasso = LogisticRegression(**lasso_params)

# Train on the scaled training split, then score the scaled test split.
logit_lasso.fit(X_train_scaled, y_train)
y_prob_lasso = logit_lasso.predict_proba(X_test_scaled)[:, 1]
y_pred_lasso = logit_lasso.predict(X_test_scaled)

lasso_results = model_metrics(y_test, y_pred_lasso, y_prob_lasso, name="Lasso")


=== Lasso ===
Accuracy: 0.8220064724919094
Balanced Accuracy: 0.7278635583812696
Precision: 0.3293172690763052
Recall: 0.6074074074074074
F1 Score: 0.4270833333333333
AUC: 0.7453560736031217


In [None]:
# 3) Elastic Net Logistic Regression (l1_ratio=0.5)
elastic_params = dict(
    penalty='elasticnet',
    l1_ratio=0.5,
    C=1,
    solver='saga',
    max_iter=2000,
    class_weight='balanced',
    random_state=42,
)
logit_elastic = LogisticRegression(**elastic_params)

# Train on the scaled training split, then score the scaled test split.
logit_elastic.fit(X_train_scaled, y_train)
y_prob_elastic = logit_elastic.predict_proba(X_test_scaled)[:, 1]
y_pred_elastic = logit_elastic.predict(X_test_scaled)

elastic_results = model_metrics(y_test, y_pred_elastic, y_prob_elastic, name="Elastic Net")


=== Elastic Net ===
Accuracy: 0.8042071197411004
Balanced Accuracy: 0.7211222121303865
Precision: 0.304029304029304
Recall: 0.6148148148148148
F1 Score: 0.4068627450980392
AUC: 0.7387694688330474


In [None]:
# Collect the Ridge, Lasso, and Elastic Net metric dicts into one comparison table
regularized_runs = [ridge_results, lasso_results, elastic_results]
results_df = pd.DataFrame(regularized_runs)

# Plain-text rendering without the index column, preceded by a blank line and a heading
print("\nModel Comparison:", results_df.to_string(index=False), sep="\n")


Model Comparison:
       name  accuracy  balanced_accuracy  precision   recall       f1      auc
      Ridge  0.803398           0.720668   0.302920 0.614815 0.405868 0.738595
      Lasso  0.822006           0.727864   0.329317 0.607407 0.427083 0.745356
Elastic Net  0.804207           0.721122   0.304029 0.614815 0.406863 0.738769


## 3. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with balanced class weights, fitted on the unscaled features.
# random_state is fixed so the metrics are reproducible from run to run; the
# value 42 matches the seed used by the regularized logistic regression models.
rforest = RandomForestClassifier(n_estimators = 1000,
                                 max_depth = 10,
                                 min_samples_split = 10,
                                 min_samples_leaf = 10,
                                 max_features='log2',
                                 class_weight = 'balanced',
                                 random_state = 42)

# Train on the training split, then score the held-out test split.
rforest.fit(X_train, y_train)
y_pred = rforest.predict(X_test)
y_prob = rforest.predict_proba(X_test)[:, 1]  # column 1 of the predicted probabilities
rf_results = model_metrics(y_test, y_pred,y_prob,name="Random Forest")


=== Random Forest ===
Accuracy: 0.8535598705501618
Balanced Accuracy: 0.7358260167524473
Precision: 0.3872549019607843
Recall: 0.5851851851851851
F1 Score: 0.46607669616519176
AUC: 0.7471322366871866
