In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils import shuffle

In [6]:
df = pd.read_csv("bank-full.csv", sep=';')   # original file uses semicolon separator

print("First 5 rows:")
print(df.head(), "\n")

print("Dataset Info:")
print(df.info(), "\n")

print("Missing values per column:")
print(df.isnull().sum(), "\n")

print("Target distribution (y):")
print(df['y'].value_counts(), "\n")

# Numerical summary
print("Numeric column summary:")
print(df.describe(), "\n")

# Categorical summary (unique values)
print("Unique values per categorical column:")
for col in df.select_dtypes('object').columns:
    print(f"{col}: {df[col].unique()}")
print("\n")

First 5 rows:
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no   

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data 

In [7]:
# Encode binary yes/no columns
binary_cols = ['default', 'housing', 'loan', 'y']
for col in binary_cols:
    df[col] = df[col].map({'yes': 1, 'no': 0})

# Separate features and target
X = df.drop('y', axis=1)
y = df['y'].values

# One-hot encode categorical columns (non-binary)
cat_cols = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Standardize numeric features
scaler = StandardScaler()
num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
X_encoded[num_cols] = scaler.fit_transform(X_encoded[num_cols])

# Ensure all features are numeric
X_encoded = X_encoded.astype(float)

# Convert to NumPy array explicitly as float
X = np.asarray(X_encoded.values, dtype=float)

# Shuffle data
X, y = shuffle(X, y, random_state=42)


# Train-test split (80:20)
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")
print(f"Number of features: {X_train.shape[1]}\n")


Training samples: 36168, Test samples: 9043
Number of features: 42



In [8]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def compute_cost(X, y, weights):
    n = len(y)
    h = sigmoid(X.dot(weights))
    epsilon = 1e-8
    cost = -(1/n) * np.sum(y*np.log(h+epsilon) + (1-y)*np.log(1-h+epsilon))
    return cost

def gradient_descent(X, y, lr=0.01, epochs=2000):
    n, m = X.shape
    weights = np.zeros(m)
    for i in range(epochs):
        h = sigmoid(X.dot(weights))
        gradient = (1/n) * X.T.dot(h - y)
        weights -= lr * gradient
        if i % 500 == 0:
            loss = compute_cost(X, y, weights)
            print(f"Epoch {i:4d}  Loss: {loss:.6f}")
    return weights

def predict(X, weights, threshold=0.5):
    return (sigmoid(X.dot(weights)) >= threshold).astype(int)

# Add intercept term
X_train_bias = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test_bias = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

# Train model
print("Training Logistic Regression model...\n")
weights = gradient_descent(X_train_bias, y_train, lr=0.05, epochs=3000)

Training Logistic Regression model...

Epoch    0  Loss: 0.667529
Epoch  500  Loss: 0.265251
Epoch 1000  Loss: 0.258293
Epoch 1500  Loss: 0.254255
Epoch 2000  Loss: 0.251468
Epoch 2500  Loss: 0.249413


In [9]:
y_pred = predict(X_test_bias, weights)

print("\nðŸ”¹ Evaluation Metrics (on Test Set):\n")
print(f"Accuracy  : {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision : {precision_score(y_test, y_pred):.4f}")
print(f"Recall    : {recall_score(y_test, y_pred):.4f}")
print(f"F1-score  : {f1_score(y_test, y_pred):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))


ðŸ”¹ Evaluation Metrics (on Test Set):

Accuracy  : 0.9018
Precision : 0.6827
Recall    : 0.3076
F1-score  : 0.4241

Confusion Matrix:
[[7828  152]
 [ 736  327]]

Classification Report:
              precision    recall  f1-score   support

          No       0.91      0.98      0.95      7980
         Yes       0.68      0.31      0.42      1063

    accuracy                           0.90      9043
   macro avg       0.80      0.64      0.69      9043
weighted avg       0.89      0.90      0.88      9043

