In [1]:
import config_cat_embedding
import numpy as np
import pandas as pd
import random
import time

from gensim.models import Word2Vec
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
import networkx as nx
from node2vec import Node2Vec
import matplotlib.pyplot as plt

from scipy.sparse import coo_matrix, csr_matrix

from tqdm.notebook import tqdm
from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network

from scipy import stats  # For confidence intervals

# Seed Python's `random` module and NumPy's global generator so that code
# drawing from either of them is repeatable between runs.
# NOTE(review): libraries that keep their own random state are not affected by
# these two calls -- confirm whether any of them need separate seeding.
random.seed(42)
np.random.seed(42)

def create_cooccurrence_matrix_diag(df_cat):
    """Build a row-normalised co-occurrence matrix of category values.

    Every distinct value found anywhere in ``df_cat`` is treated as one
    category. Off-diagonal entry ``(a, b)`` counts how often ``a`` and ``b``
    appear in the same row in two different columns. Diagonal entry
    ``(a, a)`` is the total number of cells equal to ``a``. Each row is then
    divided by its own sum.

    Parameters
    ----------
    df_cat : pandas.DataFrame
        Frame holding only the categorical columns.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are the sorted unique
        categories and whose rows each sum to 1.
    """
    # np.unique flattens the frame and returns the distinct values sorted.
    categories = np.unique(df_cat)
    category_to_index = {cat: i for i, cat in enumerate(categories)}

    n_categories = len(categories)
    matrix = np.zeros((n_categories, n_categories))
    for row in df_cat.itertuples(index=False):
        # Translate the row to matrix indices once instead of once per pair.
        indices = [category_to_index[cat] for cat in row]
        for i, idx1 in enumerate(indices):
            for j, idx2 in enumerate(indices):
                # Pair every column position with every *other* position.
                if i != j:
                    matrix[idx1, idx2] += 1

    # Count all categories in a single pass over the frame; the counts do not
    # depend on the category being looked up, so they are computed only once.
    category_counts = df_cat.stack().value_counts()
    np.fill_diagonal(matrix, [category_counts[cat] for cat in categories])

    # Each diagonal entry is a count >= 1, so no row sum can be zero here.
    normalized = matrix / matrix.sum(axis=1, keepdims=True)

    return pd.DataFrame(normalized, index=categories, columns=categories)

# get_embeddings function outside the loop
def get_embeddings(df, model):
    """Turn every row of ``df`` into the mean vector of its categories.

    Each cell is converted to ``str`` and looked up in ``model.wv``; a cell
    that is not present there contributes a zero vector of length
    ``model.vector_size``. The vectors of one row are averaged element-wise.

    Returns a NumPy array with one averaged vector per row of ``df``.
    """
    def _average_row(row):
        # One vector per cell: the learned one if known, zeros otherwise.
        vectors = [
            model.wv[str(cat)] if str(cat) in model.wv
            else np.zeros(model.vector_size)
            for cat in row
        ]
        return np.mean(vectors, axis=0)

    return np.array([_average_row(row) for _, row in df.iterrows()])

# =============================================
# Start of the adapted code for cross-validation
# =============================================

# Data: read the raw CSV from the configured data directory and run the
# project's preparation routine, which returns the prepared frame together
# with `cat_cols` (used further down to select the categorical columns).
data_path = config_cat_embedding.paths['data']
bank_data = pd.read_csv(data_path + 'adult.csv', sep=',')
df_bank, cat_cols = adult_data_prep(bank_data)

# Target is the "y" column; features are every column except the last one.
y = df_bank['y']
X = df_bank.iloc[:, :-1]

# One seed shared by the seeded estimators and by the fold splitter.
seed = 42

# (label, estimator) pairs evaluated by the cross-validation loop.
models = [
    (
        'LR',
        LogisticRegression(max_iter=1000, random_state=seed, solver='lbfgs'),
    ),
    (
        'DT',
        DecisionTreeClassifier(max_depth=3, criterion='entropy', random_state=seed),
    ),
    (
        'RF',
        RandomForestClassifier(
            n_estimators=200,
            max_depth=5,
            min_samples_leaf=3,
            random_state=seed,
        ),
    ),
    (
        'KNN',
        KNeighborsClassifier(n_neighbors=3),
    ),
    (
        'XGB',
        XGBClassifier(random_state=seed, eval_metric='logloss'),
    ),
    (
        'MLP',
        KerasClassifier(
            model=create_network,
            batch_size=100,
            epochs=100,
            random_state=seed,
            verbose=0,
        ),
    ),
]

# 20 shuffled, stratified folds with a fixed random state.
cv = StratifiedKFold(shuffle=True, n_splits=20, random_state=seed)

# Function to calculate confidence intervals
def confidence_interval(data, confidence=0.95):
    """Return ``(mean, lower, upper)`` of a Student-t interval around the mean.

    The half-width is the standard error of ``data`` multiplied by the
    two-sided t critical value for ``len(data) - 1`` degrees of freedom at
    the requested ``confidence`` level.
    """
    center = np.mean(data)
    degrees_of_freedom = len(data) - 1
    # Two-sided interval: put (1 - confidence) / 2 of the mass in each tail.
    t_critical = stats.t.ppf((1 + confidence) / 2, degrees_of_freedom)
    half_width = stats.sem(data) * t_critical
    return center, center - half_width, center + half_width

# Evaluate every classifier on the folds produced by `cv`. All fold-level
# preprocessing (co-occurrence matrix, Node2Vec embedding, scaling) is fitted
# on the training split only and then applied to the test split.
# NOTE(review): the embedding is rebuilt for every classifier although `cv` is
# seeded and should therefore produce the same folds each time. That keeps the
# embedding cost inside "Total Computation Time"; cache the per-fold features
# instead if only the classifier's own time should be reported.

# Non-categorical columns are the same for every fold, so resolve them once.
num_cols = X.drop(columns=cat_cols).columns.tolist()

# Loop over models
for name, model in models:
    print(f"Classifier: {name}")
    # Per-fold metric values for the current classifier
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    roc_aucs = []

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments during the (long) run.
    start_time = time.perf_counter()

    for fold, (train_index, test_index) in enumerate(cv.split(X, y), start=1):
        # Split the data into training and testing sets for this fold
        X_train_fold, X_test_fold = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        # Categorical part of each split
        cat_data_train = X_train_fold[cat_cols]
        cat_data_test = X_test_fold[cat_cols]

        # Co-occurrence statistics come from the training split only
        co_occurrence_matrix = create_cooccurrence_matrix_diag(cat_data_train)

        # Interpret the matrix as a weighted adjacency matrix.
        # NOTE(review): the row-normalised matrix is generally not symmetric,
        # yet the default graph type here is undirected, so only one weight
        # per node pair survives. Pass create_using=nx.DiGraph if direction
        # is meant to matter -- confirm intent.
        G = nx.from_pandas_adjacency(co_occurrence_matrix)

        # NOTE(review): gensim documents that seeded training is only fully
        # reproducible with a single worker thread; with workers=4 the
        # embeddings may differ between runs -- confirm whether that matters.
        node2vec = Node2Vec(G, dimensions=50, walk_length=10, num_walks=20, workers=4, seed=seed)
        node2vec_model = node2vec.fit(window=10, min_count=1, batch_words=4)

        # One averaged embedding vector per row
        X_train_emb = get_embeddings(cat_data_train, node2vec_model)
        X_test_emb = get_embeddings(cat_data_test, node2vec_model)

        # Append the numerical features, if there are any
        if num_cols:
            num_data_train = X_train_fold[num_cols].values
            num_data_test = X_test_fold[num_cols].values

            X_train_combined = np.hstack((X_train_emb, num_data_train))
            X_test_combined = np.hstack((X_test_emb, num_data_test))
        else:
            X_train_combined = X_train_emb
            X_test_combined = X_test_emb

        # Standard scaling, fitted on the training split only
        stc = StandardScaler()
        X_train_scaled = stc.fit_transform(X_train_combined)
        X_test_scaled = stc.transform(X_test_combined)

        # The network builder needs the input width, which is only known here
        number_of_features = X_train_scaled.shape[1]
        if name == 'MLP':
            model.set_params(model__number_of_features=number_of_features)

        # Fit on the training split and predict the held-out split
        model.fit(X_train_scaled, y_train_fold)
        y_pred_fold = model.predict(X_test_scaled)

        # Scores for ROC AUC: the second predict_proba column when available,
        # otherwise the raw decision_function output. ROC AUC depends only on
        # the ranking of the scores, so no rescaling to [0, 1] is needed (and
        # a min-max rescale would divide by zero if all scores were equal).
        if hasattr(model, "predict_proba"):
            y_pred_prob_fold = model.predict_proba(X_test_scaled)[:, 1]
        else:
            y_pred_prob_fold = model.decision_function(X_test_scaled)

        # Calculate metrics; zero_division=0 is applied uniformly so a fold
        # without positive predictions scores 0 without emitting warnings.
        accuracies.append(accuracy_score(y_test_fold, y_pred_fold))
        precisions.append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
        recalls.append(recall_score(y_test_fold, y_pred_fold, zero_division=0))
        f1s.append(f1_score(y_test_fold, y_pred_fold, zero_division=0))
        roc_aucs.append(roc_auc_score(y_test_fold, y_pred_prob_fold))

    # Stop the timer after cross-validation
    end_time = time.perf_counter()
    total_computation_time = end_time - start_time  # Total time for the model

    # Mean and confidence interval of every metric across the folds
    acc_mean, acc_ci_lower, acc_ci_upper = confidence_interval(accuracies)
    prec_mean, prec_ci_lower, prec_ci_upper = confidence_interval(precisions)
    rec_mean, rec_ci_lower, rec_ci_upper = confidence_interval(recalls)
    f1_mean, f1_ci_lower, f1_ci_upper = confidence_interval(f1s)
    roc_mean, roc_ci_lower, roc_ci_upper = confidence_interval(roc_aucs)

    # Print results
    print(f"Accuracy: {acc_mean:.3f} (95% CI: {acc_ci_lower:.3f} - {acc_ci_upper:.3f})")
    print(f"Precision: {prec_mean:.3f} (95% CI: {prec_ci_lower:.3f} - {prec_ci_upper:.3f})")
    print(f"Recall: {rec_mean:.3f} (95% CI: {rec_ci_lower:.3f} - {rec_ci_upper:.3f})")
    print(f"F1 Score: {f1_mean:.3f} (95% CI: {f1_ci_lower:.3f} - {f1_ci_upper:.3f})")
    print(f"ROC AUC: {roc_mean:.3f} (95% CI: {roc_ci_lower:.3f} - {roc_ci_upper:.3f})")
    print(f"Total Computation Time: {total_computation_time:.3f} seconds\n")


Classifier: LR


Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/97 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Accuracy: 0.847 (95% CI: 0.844 - 0.850)
Precision: 0.734 (95% CI: 0.726 - 0.742)
Recall: 0.601 (95% CI: 0.591 - 0.611)
F1 Score: 0.661 (95% CI: 0.653 - 0.669)
ROC AUC: 0.902 (95% CI: 0.899 - 0.906)
Total Computation Time: 371.079 seconds

Classifier: DT


Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/97 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Accuracy: 0.813 (95% CI: 0.806 - 0.820)
Precision: 0.722 (95% CI: 0.656 - 0.789)
Recall: 0.491 (95% CI: 0.411 - 0.570)
F1 Score: 0.544 (95% CI: 0.484 - 0.604)
ROC AUC: 0.830 (95% CI: 0.822 - 0.837)
Total Computation Time: 379.433 seconds

Classifier: RF


Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/97 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Accuracy: 0.836 (95% CI: 0.834 - 0.839)
Precision: 0.768 (95% CI: 0.758 - 0.779)
Recall: 0.487 (95% CI: 0.474 - 0.501)
F1 Score: 0.596 (95% CI: 0.586 - 0.606)
ROC AUC: 0.887 (95% CI: 0.884 - 0.890)
Total Computation Time: 888.065 seconds

Classifier: KNN


Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/97 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Accuracy: 0.815 (95% CI: 0.813 - 0.818)
Precision: 0.642 (95% CI: 0.635 - 0.649)
Recall: 0.578 (95% CI: 0.569 - 0.587)
F1 Score: 0.608 (95% CI: 0.602 - 0.615)
ROC AUC: 0.815 (95% CI: 0.810 - 0.820)
Total Computation Time: 399.483 seconds

Classifier: XGB


Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/97 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Accuracy: 0.865 (95% CI: 0.862 - 0.868)
Precision: 0.771 (95% CI: 0.763 - 0.779)
Recall: 0.651 (95% CI: 0.642 - 0.660)
F1 Score: 0.706 (95% CI: 0.699 - 0.712)
ROC AUC: 0.921 (95% CI: 0.919 - 0.923)
Total Computation Time: 448.455 seconds

Classifier: MLP


Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/97 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/98 [00:00<?, ?it/s]

Accuracy: 0.847 (95% CI: 0.843 - 0.851)
Precision: 0.729 (95% CI: 0.716 - 0.741)
Recall: 0.611 (95% CI: 0.593 - 0.629)
F1 Score: 0.664 (95% CI: 0.653 - 0.674)
ROC AUC: 0.903 (95% CI: 0.899 - 0.906)
Total Computation Time: 2248.979 seconds

