In [1]:
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this may also hide deprecation/convergence warnings raised while
# fitting the models below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

# Read the semicolon-separated CSV into the working DataFrame.
df = pd.read_csv("bank-additional-full.csv", sep=";")

In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [4]:
# Feature names grouped by the preprocessing they receive later in the notebook.

# Text-valued features (dtype `object` in df.info()); these get one-hot encoded.
categorical_columns = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
]

# Integer/float features; these get standardised.
numerical_columns = [
    "age",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
]

In [5]:
# Map each categorical column to its number of distinct values.
# `Series.nunique()` ignores NaN by default, so missing entries are not counted
# as a category.
num_category = {column: df[column].nunique() for column in categorical_columns}

num_category

{'job': 12,
 'marital': 4,
 'education': 8,
 'default': 3,
 'housing': 3,
 'loan': 3,
 'contact': 2,
 'month': 10,
 'day_of_week': 5,
 'poutcome': 3}

In [6]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import uniform, randint

In [7]:
# Turn the string target `y` into integer class ids, stored next to the raw data.
label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['y'])

# Target vector, and the feature matrix with both target representations removed.
y = df['labels']
X = df.drop(['y', 'labels'], axis=1)

In [8]:

# Hold out 30% of the rows for testing; stratifying on the target keeps the
# class ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [9]:
# Column-wise preprocessing reused by every model pipeline:
#   * categorical features -> one-hot vectors (categories unseen at fit time are
#     encoded as all zeros because of handle_unknown='ignore')
#   * numerical features   -> zero mean / unit variance
preprocessor = ColumnTransformer([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('standard_scaler', StandardScaler(), numerical_columns),
])

## <b>Logistic Regression Model</b>

In [10]:
# Logistic-regression baseline. Preprocessing lives inside the pipeline so the
# encoder and scaler are re-fit on the training part of every CV fold.
logistic_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # Seeded like the other estimators in this notebook: both candidate
        # solvers (liblinear, saga) shuffle the data, so an unseeded model is
        # not reproducible between runs.
        ('logistic_regression', LogisticRegression(random_state=42))
    ]
)

# Search space. Both solvers support the l1 and l2 penalties, so every sampled
# combination is a valid configuration.
logistic_param = {
    'logistic_regression__solver': ['liblinear', 'saga'],
    'logistic_regression__penalty': ['l1', 'l2'],
    'logistic_regression__C': [0.01, 0.1, 1, 10],
    'logistic_regression__max_iter': [500, 1000],
    'logistic_regression__class_weight': [None, 'balanced']
}


# Randomly sample 10 configurations, scored by macro-F1 over 5 folds.
logistic_search = RandomizedSearchCV(
    estimator=logistic_pipeline,
    param_distributions=logistic_param,
    cv=5,
    n_iter=10,
    verbose=1,
    random_state=42,
    n_jobs=-1,
    scoring='f1_macro'
)

logistic_search.fit(X_train, y_train)

print(f"Best Logistic Regression score : {logistic_search.best_score_}")
print(f"Best Logistic Regression Parametes : \n {logistic_search.best_params_}")

# best_estimator_ is the winning pipeline refit on the whole training set.
best_logistic = logistic_search.best_estimator_

# Evaluate on the held-out test split.
logistic_pred = best_logistic.predict(X_test)

print(f"Accuracy Score : {accuracy_score(y_test, logistic_pred)}")
print(f"Confusion Matrix : \n{confusion_matrix(y_test, logistic_pred)}")
print(f"classification Report : \n{classification_report(y_test, logistic_pred)}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Logistic Regression score : 0.7511651087481637
Best Logistic Regression Parametes : 
 {'logistic_regression__solver': 'liblinear', 'logistic_regression__penalty': 'l2', 'logistic_regression__max_iter': 500, 'logistic_regression__class_weight': 'balanced', 'logistic_regression__C': 10}
Accuracy Score : 0.8625880067977665
Confusion Matrix : 
[[9406 1559]
 [ 139 1253]]
classification Report : 
              precision    recall  f1-score   support

           0       0.99      0.86      0.92     10965
           1       0.45      0.90      0.60      1392

    accuracy                           0.86     12357
   macro avg       0.72      0.88      0.76     12357
weighted avg       0.92      0.86      0.88     12357



## <b>XGBoost Model</b>

In [11]:
# XGBoost: shared preprocessing followed by the gradient-boosted tree classifier.
# `use_label_encoder` is intentionally not passed: the target is already encoded
# as integer class ids, and current XGBoost releases no longer use that argument.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        random_state=42,
        eval_metric='logloss'
    ))
])

# Sampling distributions for the random search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale]; randint's upper
# bound is exclusive.
xgb_param_dist = {
    'classifier__n_estimators': randint(200, 600),
    'classifier__learning_rate': uniform(0.01, 0.2),
    'classifier__max_depth': randint(3, 8),
    'classifier__subsample': uniform(0.6, 0.4),
    'classifier__colsample_bytree': uniform(0.6, 0.4),
    'classifier__gamma': uniform(0, 5),
    'classifier__min_child_weight': randint(1, 6)
}

# 20 sampled configurations, scored by macro-F1 over 3 folds.
xgb_search = RandomizedSearchCV(
    xgb_pipeline,
    xgb_param_dist,
    n_iter=20,
    scoring='f1_macro',
    cv=3,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

xgb_search.fit(X_train, y_train)
print("\n🎯 Best XGBoost Params:", xgb_search.best_params_)

# Evaluate the refit best pipeline on the held-out test split.
xgb_best = xgb_search.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)
print("\n📊 XGBoost (Tuned) Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("F1:", f1_score(y_test, y_pred_xgb, average='macro'))
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))

Fitting 3 folds for each of 20 candidates, totalling 60 fits

🎯 Best XGBoost Params: {'classifier__colsample_bytree': np.float64(0.7824279936868144), 'classifier__gamma': np.float64(3.925879806965068), 'classifier__learning_rate': np.float64(0.04993475643167195), 'classifier__max_depth': 6, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 330, 'classifier__subsample': np.float64(0.9439761626945282)}

📊 XGBoost (Tuned) Results:
Accuracy: 0.9186695799951444
F1: 0.7759814151007809
              precision    recall  f1-score   support

           0       0.94      0.97      0.95     10965
           1       0.68      0.54      0.60      1392

    accuracy                           0.92     12357
   macro avg       0.81      0.75      0.78     12357
weighted avg       0.91      0.92      0.91     12357

[[10607   358]
 [  647   745]]


## <b>Gradient Boosting Model</b>

In [12]:
# Gradient boosting: shared preprocessing followed by scikit-learn's
# GradientBoostingClassifier.
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

# Sampling distributions for the random search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale]; randint's upper
# bound is exclusive.
gb_param_dist = {
    'classifier__n_estimators': randint(low=100, high=400),
    'classifier__learning_rate': uniform(loc=0.01, scale=0.2),
    'classifier__max_depth': randint(low=2, high=6),
    'classifier__subsample': uniform(loc=0.7, scale=0.3),
    'classifier__min_samples_split': randint(low=2, high=10),
}

# 20 sampled configurations, scored by macro-F1 over 3 folds.
gb_search = RandomizedSearchCV(
    estimator=gb_pipeline,
    param_distributions=gb_param_dist,
    scoring='f1_macro',
    n_iter=20,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
gb_search.fit(X_train, y_train)

print("\n🎯 Best Gradient Boosting Params:", gb_search.best_params_)

# Score the refit best pipeline on the held-out test split.
gb_best = gb_search.best_estimator_
y_pred_gb = gb_best.predict(X_test)

print("\n📊 Gradient Boosting (Tuned) Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("F1:", f1_score(y_test, y_pred_gb, average='macro'))
print(classification_report(y_test, y_pred_gb))
print(confusion_matrix(y_test, y_pred_gb))

Fitting 3 folds for each of 20 candidates, totalling 60 fits

🎯 Best Gradient Boosting Params: {'classifier__learning_rate': np.float64(0.05158833257363777), 'classifier__max_depth': 5, 'classifier__min_samples_split': 7, 'classifier__n_estimators': 290, 'classifier__subsample': np.float64(0.9526854323784995)}

📊 Gradient Boosting (Tuned) Results:
Accuracy: 0.9191551347414421
F1: 0.7827113496866913
              precision    recall  f1-score   support

           0       0.95      0.96      0.95     10965
           1       0.67      0.56      0.61      1392

    accuracy                           0.92     12357
   macro avg       0.81      0.76      0.78     12357
weighted avg       0.91      0.92      0.92     12357

[[10575   390]
 [  609   783]]


## <b>Random Forest Model</b>

In [29]:
# Random forest: shared preprocessing followed by a bagged ensemble of trees.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Sampling distributions for the random search (randint's upper bound is
# exclusive; list entries are sampled uniformly).
rf_param_dist = {
    'classifier__n_estimators': randint(low=100, high=500),
    'classifier__max_depth': randint(low=5, high=20),
    'classifier__min_samples_split': randint(low=2, high=10),
    'classifier__min_samples_leaf': randint(low=1, high=5),
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__bootstrap': [True, False],
}

# 20 sampled configurations, scored by macro-F1 over 3 folds.
rf_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_param_dist,
    scoring='f1_macro',
    n_iter=20,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
rf_search.fit(X_train, y_train)

print("\n🎯 Best Random Forest Params:", rf_search.best_params_)

# Score the refit best pipeline on the held-out test split.
rf_best = rf_search.best_estimator_
y_pred_rf = rf_best.predict(X_test)

print("\n📊 Random Forest (Tuned) Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1:", f1_score(y_test, y_pred_rf, average='macro'))
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

Fitting 3 folds for each of 20 candidates, totalling 60 fits

🎯 Best Random Forest Params: {'classifier__bootstrap': True, 'classifier__max_depth': 7, 'classifier__max_features': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 266}

📊 Random Forest (Tuned) Results:
Accuracy: 0.9206117989803351
F1: 0.7848442623906641
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     10965
           1       0.68      0.56      0.61      1392

    accuracy                           0.92     12357
   macro avg       0.81      0.76      0.78     12357
weighted avg       0.92      0.92      0.92     12357

[[10596   369]
 [  612   780]]


## <b>More Classification Models</b>

You’ve already covered the “core four” for structured/tabular data:

* **Logistic Regression** → linear baseline
* **Random Forest** → bagging-based ensemble
* **Gradient Boosting / XGBoost** → boosting-based ensembles

But there are several **other strong classifiers** you can try — some classical, some modern. Here’s a quick breakdown by category 👇

---

## ⚙️ 1. Classic ML Models (fast and interpretable)

### 🔸 **Support Vector Machine (SVM)**

Excellent for smaller or medium datasets with clear margins between classes.

```python
from sklearn.svm import SVC

svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(probability=True, random_state=42))
])

svm_param_dist = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf', 'poly'],
    'classifier__gamma': ['scale', 'auto']
}
```

✅ Pros: Handles non-linear boundaries with kernel trick
⚠️ Cons: Slower on very large datasets (~40K rows might be borderline)

---

### 🔸 **K-Nearest Neighbors (KNN)**

Simple, non-parametric, distance-based method.

```python
from sklearn.neighbors import KNeighborsClassifier

knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

knn_param_dist = {
    'classifier__n_neighbors': [3, 5, 7, 9],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1=Manhattan, 2=Euclidean
}
```

✅ Pros: Easy to interpret, no training time
⚠️ Cons: Slow prediction on large datasets

---

## 🌳 2. Tree-based Boosting Variants

### 🔸 **LightGBM**

Highly optimized gradient boosting (faster than XGBoost on large tabular data).

```python
from lightgbm import LGBMClassifier

lgbm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(random_state=42))
])
```

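✅ Pros: Very fast; can handle categorical features natively, but only when they are passed as raw `category`-dtype columns — the one-hot `preprocessor` used in the snippet above bypasses that feature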
⚠️ Cons: Slightly more sensitive to tuning

---

### 🔸 **CatBoost**

Performs extremely well with categorical features — often best out-of-the-box.

```python
from catboost import CatBoostClassifier

cat_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(
        verbose=0, random_state=42
    ))
])
```

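✅ Pros: Handles categorical features natively and missing values automatically — but the native categorical handling only applies when the raw columns are passed via `cat_features`, not after the one-hot `preprocessor` used in the snippet above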
⚠️ Cons: Slower to train than LightGBM, but great accuracy

---

## 🧠 3. Advanced / Hybrid Models

### 🔸 **Naive Bayes**

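Quick probabilistic baseline. `GaussianNB` (used below) models every feature as Gaussian, so it suits the scaled numeric columns best; `MultinomialNB` and `CategoricalNB` are the variants aimed at text-like or categorical-heavy data.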

```python
from sklearn.naive_bayes import GaussianNB

nb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])
```

✅ Pros: Very fast, interpretable
⚠️ Cons: Strong independence assumptions — weaker on correlated features

---

### 🔸 **MLP (Neural Network)**

Scikit-learn’s `MLPClassifier` gives a shallow neural net baseline.

```python
from sklearn.neural_network import MLPClassifier

mlp_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier(hidden_layer_sizes=(64, 32),
                                 activation='relu',
                                 solver='adam',
                                 max_iter=500,
                                 random_state=42))
])
```

✅ Pros: Can capture non-linear relationships
⚠️ Cons: Slower to tune, sensitive to scaling and regularization

---

## 🔍 Recommended Steps

1. **LightGBM** — fast, strong, robust
2. **CatBoost** — especially great for categorical-heavy datasets like yours
3. **SVM** — if you want a strong linear/nonlinear boundary benchmark
4. **MLP** — if you want a lightweight neural approach

---