In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

sns.set()
%matplotlib inline
warnings.filterwarnings("ignore")

## Loading the cleaned data

In [21]:
df_cleaned_final = pd.read_csv("df_cleaned_final.csv")
df_cleaned_final.head()

Unnamed: 0,age,job,marital,education,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,y,PC1,PC2
0,56,housemaid,married,basic.4y,no,no,telephone,may,mon,261,1,0,nonexistent,0,-1.35089,0.681003
1,57,services,married,high.school,no,no,telephone,may,mon,149,1,0,nonexistent,0,-1.35089,0.681003
2,37,services,married,high.school,yes,no,telephone,may,mon,226,1,0,nonexistent,0,-1.35089,0.681003
3,40,admin.,married,basic.6y,no,no,telephone,may,mon,151,1,0,nonexistent,0,-1.35089,0.681003
4,56,services,married,high.school,no,yes,telephone,may,mon,307,1,0,nonexistent,0,-1.35089,0.681003


In [22]:
df_cleaned_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37089 entries, 0 to 37088
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          37089 non-null  int64  
 1   job          37089 non-null  object 
 2   marital      37089 non-null  object 
 3   education    37089 non-null  object 
 4   housing      37089 non-null  object 
 5   loan         37089 non-null  object 
 6   contact      37089 non-null  object 
 7   month        37089 non-null  object 
 8   day_of_week  37089 non-null  object 
 9   duration     37089 non-null  int64  
 10  campaign     37089 non-null  int64  
 11  previous     37089 non-null  int64  
 12  poutcome     37089 non-null  object 
 13  y            37089 non-null  int64  
 14  PC1          37089 non-null  float64
 15  PC2          37089 non-null  float64
dtypes: float64(2), int64(5), object(9)
memory usage: 4.5+ MB


In [23]:
print ("Number of unique values for each variable")
print("="*20)

for feature in df_cleaned_final.columns.values:
    print(f"{feature} ({df_cleaned_final[feature].dtypes}): {df_cleaned_final[feature].nunique()}")
    print("-"*20)

Number of unique values for each variable
age (int64): 78
--------------------
job (object): 11
--------------------
marital (object): 3
--------------------
education (object): 7
--------------------
housing (object): 2
--------------------
loan (object): 2
--------------------
contact (object): 2
--------------------
month (object): 10
--------------------
day_of_week (object): 5
--------------------
duration (int64): 753
--------------------
campaign (int64): 7
--------------------
previous (int64): 8
--------------------
poutcome (object): 3
--------------------
y (int64): 2
--------------------
PC1 (float64): 373
--------------------
PC2 (float64): 373
--------------------


In [24]:
X = df_cleaned_final.drop('y', axis=1)
y = df_cleaned_final['y']

## Preprocessing: One-hot encoding and ADASYN Over-sampling

### 1. One-hot encoding

In [25]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = X.select_dtypes(include="object").columns.values

# Perform one-hot encoding
encoder = OneHotEncoder(sparse=False,drop="first")
encoded_features = encoder.fit_transform(X[categorical_cols])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_cols))

# Merge the encoded DataFrame with the original DataFrame
X_encoded = pd.concat([X.drop(categorical_cols, axis=1), encoded_df], axis=1)
X_encoded.head()

Unnamed: 0,age,duration,campaign,previous,PC1,PC2,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,56,261,1,0,-1.35089,0.681003,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,57,149,1,0,-1.35089,0.681003,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,37,226,1,0,-1.35089,0.681003,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,40,151,1,0,-1.35089,0.681003,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,56,307,1,0,-1.35089,0.681003,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


### 2. ADASYN Oversampling

In [26]:
from imblearn.over_sampling import ADASYN

adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_encoded, y)

## Train-Validation-Test split

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled,stratify=y_resampled,random_state=24,shuffle=True,test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,stratify=y_train,random_state=25,shuffle=True,test_size=0.12)

## Searching for best model and best params

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time

# Define the models and their corresponding hyperparameters for grid search
models = [
    {
        'name': 'Logistic Regression',
        'model': LogisticRegression(),
        'params': {'C': [0.1, 1, 10]}
    },
    {
        'name': 'Decision Tree Classifier',
        'model': DecisionTreeClassifier(),
        'params': {'max_depth': [None, 5, 10]}
    },
    {
        'name': 'Random Forest Classifier',
        'model': RandomForestClassifier(),
        'params': {'n_estimators': [100, 200, 300]}
    },
    {
        'name': 'KNN Classifier',
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [3, 5, 7]}
    },
    {
        'name': 'XGBoost Classifier',
        'model': XGBClassifier(),
        'params': {'learning_rate': [0.1, 0.01], 'max_depth': [3, 5]}
    },
    {
        'name': 'Gradient Boosting Classifier',
        'model': GradientBoostingClassifier(),
        'params': {'learning_rate': [0.1, 0.01], 'n_estimators': [100, 200]}
    },
    {
        'name': 'Voting Classifier (Soft)',
        'model': VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('xgb', XGBClassifier())], voting='soft'),
        'params': {}
    },
    {
        'name': 'Voting Classifier (Hard)',
        'model': VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('xgb', XGBClassifier())], voting='hard'),
        'params': {}
    }
]

In [32]:
# Create empty DataFrames to store the metrics and best models
metrics_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score','ROC-AUC Score','Time taken (seconds)'])
best_models = {}
best_params = {}

# Perform GridSearchCV for each model
for model in models:
    print(f"Training {model['name']}...")
    start_time = time.time()
    grid_search = GridSearchCV(model['model'], model['params'], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    # Store the metrics in the metrics DataFrame
    y_pred = grid_search.predict(X_val)
    end_time = time.time()

    metrics_df.loc[len(metrics_df)] = [
        model['name'],
        accuracy_score(y_val, y_pred),
        precision_score(y_val, y_pred),
        recall_score(y_val, y_pred),
        f1_score(y_val, y_pred),
        roc_auc_score(y_val,y_pred),
        end_time-start_time
    ]
    # Store the best model and best parameters
    best_models[model['name']] = grid_search.best_estimator_
    best_params[model['name']] = grid_search.best_params_

    print("-"*35)

Training Logistic Regression...
-----------------------------------
Training Decision Tree Classifier...
-----------------------------------
Training Random Forest Classifier...
-----------------------------------
Training KNN Classifier...
-----------------------------------
Training XGBoost Classifier...
-----------------------------------
Training Gradient Boosting Classifier...
-----------------------------------
Training Voting Classifier (Soft)...
-----------------------------------
Training Voting Classifier (Hard)...
-----------------------------------


In [33]:
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,ROC-AUC Score,Time taken (seconds)
0,Logistic Regression,0.874825,0.855947,0.899215,0.877048,0.874995,8.550731
1,Decision Tree Classifier,0.945908,0.940957,0.950706,0.945807,0.945941,5.9235
2,Random Forest Classifier,0.963835,0.971575,0.955102,0.963268,0.963774,199.967754
3,KNN Classifier,0.918628,0.860157,0.99843,0.92415,0.919184,15.897513
4,XGBoost Classifier,0.965394,0.973172,0.956672,0.964851,0.965333,50.124197
5,Gradient Boosting Classifier,0.961496,0.969629,0.952276,0.960874,0.961432,473.234498
6,Voting Classifier (Soft),0.961029,0.950568,0.972057,0.961192,0.961106,60.916773
7,Voting Classifier (Hard),0.964302,0.96478,0.963265,0.964022,0.964295,62.949557


## Checking performance of XGBoost Classifier and Hard Voting Classifier on test data

In [34]:
chosen_models = ["XGBoost Classifier","Voting Classifier (Hard)"]

for model in chosen_models:
    best_model = best_models[model].set_params(**best_params[model])
    y_pred = best_model.predict(X_test)

    print(model)
    print("-"*20)

    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test,y_pred)}")
    print(f"f1-score: {f1_score(y_test,y_pred)}")
    print(f"ROC-AUC score: {roc_auc_score(y_test,y_pred)}")

    print("="*40)


XGBoost Classifier
--------------------
Accuracy: 0.9622147399925177
Precision: 0.9715428395631441
Recall: 0.9517781796262809
f1-score: 0.961558955621527
ROC-AUC score: 0.9621426193123231
Voting Classifier (Hard)
--------------------
Accuracy: 0.9642349420127198
Precision: 0.9667980594299576
Recall: 0.9609704641350211
f1-score: 0.9638754534461911
ROC-AUC score: 0.9642123832043809


## Choosing Voting Classifier (Hard) as the best model for prediction