In [4]:
# Add confidence interval
import config_cat_embedding
import pandas as pd
import numpy as np
import random
import time
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from scipy import stats

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network

# Seed both Python's and NumPy's global RNGs so repeated runs are comparable
np.random.seed(42)
random.seed(42)

# Read the semicolon-separated raw CSV and run the project's preparation step
data_path = config_cat_embedding.paths["data"]
bank_data = pd.read_csv(f"{data_path}bank-additional-full.csv", sep=";")
df_bank, cat_cols = bank_data_prep(bank_data)

# Features are every column except the last one; the target is the `y` column
y = df_bank["y"]
X = df_bank.iloc[:, :-1]

# NOTE(review): no label mapping is applied here; presumably bank_data_prep
# already returns a numeric 0/1 target -- confirm.

# Candidate classifiers. The MLP's input width (number_of_features) is not
# known at this point; it is set per fold inside the cross-validation loop,
# after preprocessing.
seed = 42

models = [
    (
        "LR",
        LogisticRegression(solver="lbfgs", random_state=seed, max_iter=1000),
    ),
    (
        "DT",
        DecisionTreeClassifier(
            criterion="entropy", max_depth=3, random_state=seed
        ),
    ),
    (
        "RF",
        RandomForestClassifier(
            n_estimators=200,
            max_depth=5,
            random_state=seed,
            min_samples_leaf=3,
        ),
    ),
    ("KNN", KNeighborsClassifier(n_neighbors=3)),
    ("XGB", XGBClassifier(eval_metric="logloss", random_state=seed)),
    # SVM is disabled in this run:
    # ("SVM", SVC(gamma="scale", random_state=seed, probability=True)),
    (
        "MLP",
        KerasClassifier(
            model=create_network,
            epochs=100,
            batch_size=100,
            verbose=0,
            random_state=seed,
        ),
    ),
]

# Shuffled, stratified 20-fold splitter shared by all classifiers
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=seed)

# Function to calculate confidence intervals
def confidence_interval(data, confidence=0.95):
    """Return the mean of *data* and a two-sided Student-t confidence interval.

    Args:
        data: One-dimensional sequence of numeric observations, e.g. the
            per-fold scores of a single metric.
        confidence: Coverage of the interval; expected to lie strictly
            between 0 and 1.

    Returns:
        Tuple ``(mean, lower, upper)``. The half-width is the standard error
        of the mean times the t quantile with ``n - 1`` degrees of freedom.
        With fewer than two observations the standard error is undefined, so
        both bounds are NaN (and the mean is NaN too when *data* is empty).
    """
    values = np.asarray(data, dtype=float)
    n = len(values)

    # Handle the degenerate sizes explicitly instead of relying on the
    # warning-emitting NaN paths inside NumPy/SciPy.
    if n == 0:
        return np.nan, np.nan, np.nan
    m = np.mean(values)
    if n < 2:
        return m, np.nan, np.nan

    std_err = stats.sem(values)
    # (1 + confidence) / 2 is the upper-tail quantile of a two-sided interval
    h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return m, m - h, m + h

# Cross-validate every classifier and report the mean and 95% confidence
# interval of each metric across folds. The hashing encoder and the scaler are
# refit on the training part of every fold and only applied to the held-out
# part, so the held-out rows never influence preprocessing.
import category_encoders as ce  # loop-invariant: import once, not per fold

n_components = 9  # hashing dimension passed to the encoder

for name, model in models:
    print(f"Classifier: {name}")
    # Per-fold scores for this classifier
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    roc_aucs = []

    # perf_counter is a monotonic clock intended for measuring intervals.
    # The measured span covers per-fold preprocessing, fitting and scoring.
    start_time = time.perf_counter()

    for train_index, test_index in cv.split(X, y):
        # Split the data into training and testing sets for this fold
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        # Hash-encode the categorical columns (fit on the training fold only)
        hashing_encoder = ce.HashingEncoder(cols=cat_cols, n_components=n_components, return_df=True)
        X_train_hash = hashing_encoder.fit_transform(X_train_fold, y_train_fold)
        X_test_hash = hashing_encoder.transform(X_test_fold)

        # Standardise using statistics from the training fold only
        stc = StandardScaler()
        X_train_scaled = stc.fit_transform(X_train_hash)
        X_test_scaled = stc.transform(X_test_hash)

        # The network's input width depends on the encoded feature count,
        # which is only known after preprocessing.
        number_of_features = X_train_scaled.shape[1]
        if name == 'MLP':
            model.set_params(model__number_of_features=number_of_features)

        # Fit on the training fold, predict on the held-out fold
        model.fit(X_train_scaled, y_train_fold)
        y_pred_fold = model.predict(X_test_scaled)

        # Scores for ROC AUC
        if hasattr(model, "predict_proba"):
            y_pred_prob_fold = model.predict_proba(X_test_scaled)[:, 1]
        else:
            # ROC AUC depends only on the ranking of the scores, so the raw
            # decision values are used as-is. Min-max rescaling would not
            # change the result and divides by zero when all scores are equal.
            y_pred_prob_fold = model.decision_function(X_test_scaled)

        # zero_division=0 on all three keeps a fold with no positive
        # predictions from emitting undefined-metric warnings.
        accuracies.append(accuracy_score(y_test_fold, y_pred_fold))
        precisions.append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
        recalls.append(recall_score(y_test_fold, y_pred_fold, zero_division=0))
        f1s.append(f1_score(y_test_fold, y_pred_fold, zero_division=0))
        roc_aucs.append(roc_auc_score(y_test_fold, y_pred_prob_fold))

    # Stop the timer after cross-validation
    end_time = time.perf_counter()
    total_computation_time = end_time - start_time  # Total time for the model

    # Mean and 95% confidence interval of every metric across folds
    acc_mean, acc_ci_lower, acc_ci_upper = confidence_interval(accuracies)
    prec_mean, prec_ci_lower, prec_ci_upper = confidence_interval(precisions)
    rec_mean, rec_ci_lower, rec_ci_upper = confidence_interval(recalls)
    f1_mean, f1_ci_lower, f1_ci_upper = confidence_interval(f1s)
    roc_mean, roc_ci_lower, roc_ci_upper = confidence_interval(roc_aucs)

    # Print results
    print(f"Accuracy: {acc_mean:.3f} (95% CI: {acc_ci_lower:.3f} - {acc_ci_upper:.3f})")
    print(f"Precision: {prec_mean:.3f} (95% CI: {prec_ci_lower:.3f} - {prec_ci_upper:.3f})")
    print(f"Recall: {rec_mean:.3f} (95% CI: {rec_ci_lower:.3f} - {rec_ci_upper:.3f})")
    print(f"F1 Score: {f1_mean:.3f} (95% CI: {f1_ci_lower:.3f} - {f1_ci_upper:.3f})")
    print(f"ROC AUC: {roc_mean:.3f} (95% CI: {roc_ci_lower:.3f} - {roc_ci_upper:.3f})")
    print(f"Total Computation Time: {total_computation_time:.3f} seconds\n")


Classifier: LR
Accuracy: 0.905 (95% CI: 0.902 - 0.907)
Precision: 0.649 (95% CI: 0.631 - 0.667)
Recall: 0.330 (95% CI: 0.311 - 0.349)
F1 Score: 0.437 (95% CI: 0.417 - 0.456)
ROC AUC: 0.883 (95% CI: 0.878 - 0.889)
Total Computation Time: 60.075 seconds

Classifier: DT
Accuracy: 0.906 (95% CI: 0.904 - 0.908)
Precision: 0.650 (95% CI: 0.632 - 0.668)
Recall: 0.358 (95% CI: 0.342 - 0.374)
F1 Score: 0.461 (95% CI: 0.445 - 0.478)
ROC AUC: 0.842 (95% CI: 0.835 - 0.849)
Total Computation Time: 46.349 seconds

Classifier: RF
Accuracy: 0.900 (95% CI: 0.898 - 0.902)
Precision: 0.764 (95% CI: 0.739 - 0.789)
Recall: 0.160 (95% CI: 0.144 - 0.176)
F1 Score: 0.263 (95% CI: 0.241 - 0.285)
ROC AUC: 0.882 (95% CI: 0.876 - 0.888)
Total Computation Time: 121.604 seconds

Classifier: KNN
Accuracy: 0.890 (95% CI: 0.887 - 0.893)
Precision: 0.519 (95% CI: 0.498 - 0.540)
Recall: 0.339 (95% CI: 0.319 - 0.359)
F1 Score: 0.409 (95% CI: 0.390 - 0.429)
ROC AUC: 0.759 (95% CI: 0.749 - 0.768)
Total Computation Time: 79

In [None]:
import config_cat_embedding
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm_notebook as tqdm
import logging

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network
import time
import random

# Set up logging

In [None]:
import config_cat_embedding
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm_notebook as tqdm
import logging

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network
import time
import random

# Route INFO-level messages through the root logger with a timestamped format
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Seed both Python's and NumPy's global RNGs so repeated runs are comparable
np.random.seed(42)
random.seed(42)

# Input/output locations come from the project configuration
data_path = config_cat_embedding.paths["data"]
data_path_out = config_cat_embedding.paths["data_output"]

# Read the semicolon-separated raw CSV and run the project's preparation step
bank_data = pd.read_csv(f"{data_path}bank-additional-full.csv", sep=";")
df_bank, cat_cols = bank_data_prep(bank_data)

# Features are every column except the last one; the target is the `y` column
target = df_bank["y"]
X = df_bank.iloc[:, :-1]

# Number of repeated train/test splits, and the base seed they are derived from
n_runs = 5
seed = 42

# Hashing encoder for the categorical columns (this is not one-hot encoding).
# A value of 9 was noted next to n_components as an alternative setting.
import category_encoders as ce
hash_encoder = ce.HashingEncoder(cols=cat_cols, n_components=2)

# Standard scaler, refit on every run's training split
stc = StandardScaler()

# NOTE(review): this is the raw column count taken before hashing, which may
# differ from the encoded width; it is only referenced by the disabled MLP
# entry below -- confirm before re-enabling that entry.
number_of_features = X.shape[1]

# Classifiers under evaluation; the commented entries are currently disabled
models = [
    (
        "LR",
        LogisticRegression(solver="lbfgs", random_state=seed, max_iter=1000),
    ),
    (
        "DT",
        DecisionTreeClassifier(
            criterion="entropy", max_depth=3, random_state=seed
        ),
    ),
    # ('RF', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed, min_samples_leaf=3)),
    # ('KNN', KNeighborsClassifier(n_neighbors=3)),
    # ('XGB', XGBClassifier(eval_metric='logloss', use_label_encoder=False)),
    # ('SVM', SVC(kernel='linear', random_state=seed, probability=True)),
    # ('MLP', KerasClassifier(build_fn=lambda: create_network(number_of_features), epochs=100, batch_size=100, verbose=0)),
]

# Maps classifier name -> list of [metric label, mean, std] rows
results = {}

# Evaluate each classifier over n_runs random train/test splits and summarise
# every metric as mean ± standard deviation (np.std default, i.e. ddof=0).
for name, model in models:
    logging.info(f"Classifier: {name}")
    print(f"Classifier: {name}")
    # Per-run scores for this classifier
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    roc_auc_scores = []  # stays empty for models without predict_proba
    computation_times = []

    for run in range(n_runs):
        # A different random_state per run yields a different 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=seed + run)

        # Hash-encode the categorical columns, fitting on the training split
        # only. The `_ohe` suffix is historical: the encoder is a
        # HashingEncoder, not a one-hot encoder.
        X_train_ohe = hash_encoder.fit_transform(X_train)
        X_test_ohe = hash_encoder.transform(X_test)

        # Standardise using statistics from the training split only
        X_train_scaled = stc.fit_transform(X_train_ohe)
        X_test_scaled = stc.transform(X_test_ohe)

        # Only the fit call is timed
        start_time = time.time()
        model.fit(X_train_scaled, y_train)
        end_time = time.time()
        computation_time = end_time - start_time

        # Predictions; probabilities are optional
        y_pred = model.predict(X_test_scaled)
        y_pred_prob = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None

        # Calculate metrics
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, zero_division=0))
        recall_scores.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))
        if y_pred_prob is not None:
            roc_auc_scores.append(roc_auc_score(y_test, y_pred_prob))
        computation_times.append(computation_time)

    # Compute mean and standard deviation for all metrics
    mean_accuracy = np.mean(accuracy_scores)
    std_accuracy = np.std(accuracy_scores)
    mean_precision = np.mean(precision_scores)
    std_precision = np.std(precision_scores)
    mean_recall = np.mean(recall_scores)
    std_recall = np.std(recall_scores)
    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)
    # ROC AUC is reported as None when no probabilities were available
    mean_roc_auc = np.mean(roc_auc_scores) if roc_auc_scores else None
    std_roc_auc = np.std(roc_auc_scores) if roc_auc_scores else None
    mean_time = np.mean(computation_times)
    std_time = np.std(computation_times)

    # Store results in a matrix format
    results[name] = [
        ["Accuracy", mean_accuracy, std_accuracy],
        ["Precision", mean_precision, std_precision],
        ["Recall", mean_recall, std_recall],
        ["F1 Score", mean_f1, std_f1],
        ["ROC AUC", mean_roc_auc, std_roc_auc],
        ["Computation Time (s)", mean_time, std_time]
    ]

    # Log results in a matrix format
    logging.info(f"Results for {name}:")
    for label, mean_value, std_value in results[name]:
        if mean_value is None or std_value is None:
            # Formatting None with ':.3f' raises TypeError, so an
            # unavailable metric is logged as such instead.
            logging.info(f"{label}: n/a")
        else:
            logging.info(f"{label}: {mean_value:.3f} ± {std_value:.3f}")
    logging.info("\n")

In [1]:
import config_cat_embedding
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from tqdm import tqdm_notebook as tqdm

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network
import time
import random

# Seed both Python's and NumPy's global RNGs so repeated runs are comparable
np.random.seed(42)
random.seed(42)

# Input/output locations come from the project configuration
data_path = config_cat_embedding.paths["data"]
data_path_out = config_cat_embedding.paths["data_output"]

# Read the semicolon-separated raw CSV and run the project's preparation step
bank_data = pd.read_csv(f"{data_path}bank-additional-full.csv", sep=";")
df_bank, cat_cols = bank_data_prep(bank_data)

# Features are every column except the last one; the target is the `y` column
target = df_bank["y"]
X = df_bank.iloc[:, :-1]

# Single 80/20 hold-out split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=1500
)

# Hash-encode the categorical columns (this is not one-hot encoding; the
# `_ohe` suffix is historical). The encoder is fitted on the training split
# and only applied to the test split.
import category_encoders as ce
hash_encoder = ce.HashingEncoder(cols=cat_cols, n_components=9)
X_train_ohe = hash_encoder.fit_transform(X_train)
X_test_ohe = hash_encoder.transform(X_test)

# Standardise using statistics from the training split only
stc = StandardScaler()
X_train_scaled = stc.fit_transform(X_train_ohe)
X_test_scaled = stc.transform(X_test_ohe)

# Width of the encoded, scaled input; the MLP factory below reads it
number_of_features = X_train_scaled.shape[1]

# Classifiers under evaluation
seed = 42
models = [
    (
        "LR",
        LogisticRegression(solver="lbfgs", random_state=seed, max_iter=1000),
    ),
    (
        "DT",
        DecisionTreeClassifier(
            criterion="entropy", max_depth=3, random_state=seed
        ),
    ),
    (
        "RF",
        RandomForestClassifier(
            n_estimators=200,
            max_depth=5,
            random_state=seed,
            min_samples_leaf=3,
        ),
    ),
    ("KNN", KNeighborsClassifier(n_neighbors=3)),
    ("XGB", XGBClassifier(eval_metric="logloss", use_label_encoder=False)),
    # probability=True so that the SVM exposes predict_proba for ROC AUC
    ("SVM", SVC(gamma="scale", random_state=seed, probability=True)),
    # NOTE(review): the recorded cell output echoes this entry, which looks
    # like a warning about `build_fn` -- confirm against the installed wrapper.
    (
        "MLP",
        KerasClassifier(
            build_fn=lambda: create_network(number_of_features),
            epochs=100,
            batch_size=100,
            verbose=0,
        ),
    ),
]

# Fit each classifier on the hold-out split and print its report.
# The timed span covers fitting plus both prediction calls.
for name, model in models:
    start_time = time.time()
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_pred_prob = model.predict_proba(X_test_scaled)
    computation_time = time.time() - start_time

    # One print call with newline separators emits the same text as printing
    # each item on its own; the trailing "" yields the blank separator line.
    print(
        f"Classifier: {name}",
        confusion_matrix(y_test, y_pred),
        classification_report(y_test, y_pred, digits=3),  # three decimal places
        f"ROC AUC Score: {roc_auc_score(y_test, y_pred_prob[:, 1]):.3f}",
        f"Computation Time: {computation_time:.3f} seconds",
        "",
        sep="\n",
    )

  ('MLP', KerasClassifier(build_fn=lambda: create_network(number_of_features), epochs=100, batch_size=100, verbose=0))


Classifier: LR
[[7126  168]
 [ 637  307]]
              precision    recall  f1-score   support

           0      0.918     0.977     0.947      7294
           1      0.646     0.325     0.433       944

    accuracy                          0.902      8238
   macro avg      0.782     0.651     0.690      8238
weighted avg      0.887     0.902     0.888      8238

ROC AUC Score: 0.886
Computation Time: 0.032 seconds

Classifier: DT
[[7106  188]
 [ 618  326]]
              precision    recall  f1-score   support

           0      0.920     0.974     0.946      7294
           1      0.634     0.345     0.447       944

    accuracy                          0.902      8238
   macro avg      0.777     0.660     0.697      8238
weighted avg      0.887     0.902     0.889      8238

ROC AUC Score: 0.839
Computation Time: 0.027 seconds

Classifier: RF
[[7253   41]
 [ 777  167]]
              precision    recall  f1-score   support

           0      0.903     0.994     0.947      7294
   