In [71]:
pip install scikit-optimize



In [72]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, make_scorer
from skopt import BayesSearchCV
from sklearn.model_selection import cross_val_score
import numpy as np

In [83]:
# Load the training and testing datasets from provided locations
from google.colab import drive # Load data
drive.mount('/content/drive')

train_data = pd.read_csv('/content/drive/MyDrive/Masters/ML_Assignment2/Portugese Bank Data - TRAIN.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Masters/ML_Assignment2/Portugese Bank Data - TEST.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [84]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [85]:
# Separate features and target variable
X_train = train_data.drop(columns=['y'])
y_train= train_data['y']
X_test = test_data.drop(columns=['y'])
y_test = test_data['y']

# one-hot encoded - Convert categorical variables into dummy/indicator variables
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Ensure columns match in training and testing sets
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [86]:
#Default Decision Tree Classifier ========================================================================
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE ACCURACY
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)
print("accuracy Score (test set) for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))

clfr = RandomForestClassifier()
clfr.fit(X_train, y_train)
clfr_predict=clfr.predict(X_test)
print("accuracy Score (test set) for  Random Forest classifier:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix for Random Forest classifier")
print(confusion_matrix(y_test,clfr_predict))



accuracy Score (test set) for Decision Tree:0.885492
Confusion Matrix for Decision Tree
[[37288  2634]
 [ 2543  2746]]
accuracy Score (test set) for  Random Forest classifier:0.911614
Confusion Matrix for Random Forest classifier
[[39237   685]
 [ 3311  1978]]


In [96]:
# Hyperparameters for Decision Tree
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

# Hyperparameters for Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

In [97]:
# Create a custom F1 scoring function with pos_label set correctly
f1_scorer = make_scorer(f1_score, pos_label='yes')

In [98]:
# Create and run Randomized Search
dt_random_search = RandomizedSearchCV(
    DecisionTreeClassifier(),
    param_distributions=dt_param_grid,
    n_iter=10,
    cv=5,
    scoring=f1_scorer,
    n_jobs=-1,
    random_state=42
)

rf_random_search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=rf_param_grid,
    n_iter=10,
    cv=5,
    scoring=f1_scorer,
    n_jobs=-1,
    random_state=42
)

In [99]:
# Run Grid Search
dt_grid_search = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid=dt_param_grid,
    cv=5,
    scoring=f1_scorer,
    n_jobs=-1
)

rf_grid_search = GridSearchCV(
    RandomForestClassifier(),
    param_grid=rf_param_grid,
    cv=5,
    scoring=f1_scorer,
    n_jobs=-1
)

In [100]:
# Run Bayesian Search using skopt's BayesSearchCV
dt_bayes_search = BayesSearchCV(
    estimator=DecisionTreeClassifier(),
    search_spaces=dt_param_grid,
    cv=5,
    scoring=f1_scorer,
    n_jobs=-1,
    random_state=42
)

rf_bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(),
    search_spaces=rf_param_grid,
    cv=5,
    scoring=f1_scorer,
    n_jobs=-1,
    random_state=42
)

In [101]:
# Fit models
print("Fitting Decision Tree models...")
dt_random_search.fit(X_train, y_train)
dt_grid_search.fit(X_train, y_train)
dt_bayes_search.fit(X_train, y_train)

print("Fitting Random Forest models...")
rf_random_search.fit(X_train, y_train)
rf_grid_search.fit(X_train, y_train)
rf_bayes_search.fit(X_train, y_train)

Fitting Decision Tree models...




Fitting Random Forest models...




In [102]:
# Get the best models from each search
best_dt_random = dt_random_search.best_estimator_
best_dt_grid = dt_grid_search.best_estimator_
best_dt_bayes = dt_bayes_search.best_estimator_

best_rf_random = rf_random_search.best_estimator_
best_rf_grid = rf_grid_search.best_estimator_
best_rf_bayes = rf_bayes_search.best_estimator_

In [103]:
# Print the best score parameters
print("Random Seacrch Decision Tree ",best_dt_random)
print ("Random Seacrch Random Forest ",best_rf_random)

print("Bayesian Seacrch Decision Tree  Best F1 score",best_dt_bayes)
print ("Bayesian Seacrch Random Forest  Best F1 score",best_rf_bayes)

print("Grid Seacrch Decision Tree  Best F1 score",best_dt_grid)
print ("Grid Seacrch Random Forest  Best F1 score",best_rf_grid)

Random Seacrch Decision Tree  DecisionTreeClassifier(criterion='entropy', max_depth=15)
Random Seacrch Random Forest  RandomForestClassifier(bootstrap=False)
Bayesian Seacrch Decision Tree  Best F1 score DecisionTreeClassifier(criterion='entropy', max_depth=15, min_samples_split=5)
Bayesian Seacrch Random Forest  Best F1 score RandomForestClassifier(bootstrap=False, n_estimators=150)
Grid Seacrch Decision Tree  Best F1 score DecisionTreeClassifier(max_depth=15)
Grid Seacrch Random Forest  Best F1 score RandomForestClassifier(bootstrap=False, n_estimators=150)


In [104]:
# Evaluate models on the test set
def evaluate_model(model, X_test, y_test, X_train, y_train, cv=5):
    y_pred = model.predict(X_test)

    cross_val_f1 = cross_val_score(model, X_train, y_train, cv=cv, scoring="balanced_accuracy")
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, pos_label='yes'),
        'Recall': recall_score(y_test, y_pred, pos_label='yes'),
        'F1 Score': f1_score(y_test, y_pred, pos_label='yes'),
        'Confusion Matrix': confusion_matrix(y_test, y_pred).tolist(), # Converting to list for easy printing
        'Cross-Val F1 Score Mean': np.mean(cross_val_f1)
    }

# Compile results for comparison
results = {
    'DT Random': evaluate_model(best_dt_random, X_test, y_test, X_train, y_train, cv=5),
    'DT Grid': evaluate_model(best_dt_grid, X_test, y_test, X_train, y_train, cv=5),
    'DT Bayes': evaluate_model(best_dt_bayes, X_test, y_test, X_train, y_train, cv=5),
    'RF Random': evaluate_model(best_rf_random, X_test, y_test, X_train, y_train, cv=5),
    'RF Grid': evaluate_model(best_rf_grid, X_test, y_test, X_train, y_train, cv=5),
    'RF Bayes': evaluate_model(best_rf_bayes, X_test, y_test, X_train, y_train, cv=5)
}

# Display results
results_df = pd.DataFrame(results).T
print(results_df)

           Accuracy Precision    Recall  F1 Score  \
DT Random  0.892305  0.550311  0.434298   0.48547   
DT Grid    0.894871  0.560909  0.466629  0.509444   
DT Bayes   0.894052  0.562204  0.426357  0.484946   
RF Random    0.9123  0.708701  0.425033  0.531379   
RF Grid    0.912588   0.71095  0.425978   0.53275   
RF Bayes   0.912234  0.704806   0.42976  0.533944   

                        Confusion Matrix Cross-Val F1 Score Mean  
DT Random  [[38045, 1877], [2992, 2297]]                0.679038  
DT Grid    [[37990, 1932], [2821, 2468]]                 0.68348  
DT Bayes   [[38166, 1756], [3034, 2255]]                0.672227  
RF Random   [[38998, 924], [3041, 2248]]                0.633767  
RF Grid     [[39006, 916], [3036, 2253]]                0.625957  
RF Bayes    [[38970, 952], [3016, 2273]]                0.635402  
