In [1]:
# Add confidence interval with dimension 5
import config_cat_embedding
import pandas as pd
import numpy as np
import random
import time
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from scipy import stats

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network
from gensim.models import Word2Vec
# Set the random seed for reproducibility
random.seed(42)
np.random.seed(42)


# Load and preprocess data
data_path = config_cat_embedding.paths['data']
data_path_out = config_cat_embedding.paths['data_output']
bank_data = pd.read_csv(data_path+'adult.csv', sep=',')

df_bank, cat_cols = adult_data_prep(bank_data)

X = df_bank.iloc[:, :-1]
y = df_bank.y


# Define the classifiers
seed = 42

models = [
    ('LR', LogisticRegression(solver='lbfgs', random_state=seed, max_iter=1000)),
    ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=seed)),
    ('RF', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed, min_samples_leaf=3)),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('XGB', XGBClassifier(eval_metric='logloss', random_state=seed)),
    # ('SVM', SVC(gamma='scale', random_state=seed, probability=True)),
    ('MLP', KerasClassifier(model=create_network,
        epochs=100, batch_size=100, verbose=0, random_state=seed))
]

# Cross-validation setup
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=seed)  # You can adjust n_splits as needed

# Function to calculate confidence intervals
def confidence_interval(data, confidence=0.95):
    n = len(data)
    m = np.mean(data)
    std_err = stats.sem(data)
    h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return m, m - h, m + h

# Move the embedding function outside the loop
def get_word2vec_embeddings(df, cat_cols, model, dimpool):
    elements = []
    for _, row in df.iterrows():
        categorical_embeddings = []
        for col in cat_cols:
            try:
                categorical_embeddings.append(model.wv[row[col]])
            except KeyError:
                categorical_embeddings.append(np.zeros((dimpool,)))
        elements.append(np.array(categorical_embeddings))
    reshaped_x = np.reshape(elements, (len(elements), len(cat_cols) * dimpool))
    return reshaped_x

# Main loop over models
for name, classifier in models:
    print(f"Classifier: {name}")
    # Lists to store metrics for each fold
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    roc_aucs = []
    computation_times = []

    fold = 1
    for train_index, test_index in cv.split(X, y):
        # Split data into training and test sets for this fold
        X_train_fold = X.iloc[train_index].copy()
        X_test_fold = X.iloc[test_index].copy()
        y_train_fold = y.iloc[train_index].reset_index(drop=True)
        y_test_fold = y.iloc[test_index].reset_index(drop=True)

        # Combine categorical columns into a single string for Word2Vec
        X_train_fold['stringcat'] = X_train_fold[cat_cols].apply(lambda x: ' '.join(x), axis=1)
        print("##############################")
        print(X_train_fold['stringcat'].iloc[0])
        
        # Train Word2Vec model on training data
        dimpool = 50  # Embedding dimension
        word2vec_model = Word2Vec(sentences=X_train_fold['stringcat'].str.split(" "), vector_size=dimpool,
                                  window=2, min_count=1, workers=1, seed=42)

        # Generate embeddings for training data
        X_train_emb = get_word2vec_embeddings(X_train_fold, cat_cols, word2vec_model, dimpool)

        # Handle numerical features
        numerical_cols = X_train_fold.select_dtypes(exclude='object').columns.tolist()
        X_train_num = X_train_fold[numerical_cols].reset_index(drop=True)

        # Create DataFrame for embeddings with string column names
        emb_col_names = [f'emb_{i}' for i in range(X_train_emb.shape[1])]
        X_train_emb_df = pd.DataFrame(X_train_emb, columns=emb_col_names)

        # Concatenate numerical features and embeddings
        X_train_combined = pd.concat([X_train_num, X_train_emb_df], axis=1)

        # Ensure all column names are strings
        X_train_combined.columns = X_train_combined.columns.astype(str)

        # Generate embeddings for test data
        X_test_emb = get_word2vec_embeddings(X_test_fold, cat_cols, word2vec_model, dimpool)
        X_test_num = X_test_fold[numerical_cols].reset_index(drop=True)
        X_test_emb_df = pd.DataFrame(X_test_emb, columns=emb_col_names)
        X_test_combined = pd.concat([X_test_num, X_test_emb_df], axis=1)
        X_test_combined.columns = X_test_combined.columns.astype(str)

        # Standard scaling
        stc = StandardScaler()
        X_train_scaled = stc.fit_transform(X_train_combined)
        X_test_scaled = stc.transform(X_test_combined)

        # Update number_of_features for MLP
        number_of_features = X_train_scaled.shape[1]
        if name == 'MLP':
            classifier.set_params(model__number_of_features=number_of_features)

        # Start timing
        start_time = time.time()

        # Fit the model
        classifier.fit(X_train_scaled, y_train_fold)

        # End timing
        end_time = time.time()
        elapsed_time = end_time - start_time
        computation_times.append(elapsed_time)

        # Predict on test data
        y_pred_fold = classifier.predict(X_test_scaled)
        if hasattr(classifier, "predict_proba"):
            y_pred_prob_fold = classifier.predict_proba(X_test_scaled)[:, 1]
        else:
            y_pred_scores = classifier.decision_function(X_test_scaled)
            y_pred_prob_fold = (y_pred_scores - y_pred_scores.min()) / (y_pred_scores.max() - y_pred_scores.min())

        # Collect performance metrics
        accuracies.append(accuracy_score(y_test_fold, y_pred_fold))
        precisions.append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
        recalls.append(recall_score(y_test_fold, y_pred_fold))
        f1s.append(f1_score(y_test_fold, y_pred_fold))
        roc_aucs.append(roc_auc_score(y_test_fold, y_pred_prob_fold))

        fold += 1

    # Calculate mean and confidence intervals
    acc_mean, acc_ci_lower, acc_ci_upper = confidence_interval(accuracies)
    prec_mean, prec_ci_lower, prec_ci_upper = confidence_interval(precisions)
    rec_mean, rec_ci_lower, rec_ci_upper = confidence_interval(recalls)
    f1_mean, f1_ci_lower, f1_ci_upper = confidence_interval(f1s)
    roc_mean, roc_ci_lower, roc_ci_upper = confidence_interval(roc_aucs)
    time_mean = np.mean(computation_times)

    # Print results
    print(f"Accuracy: {acc_mean:.4f} (95% CI: {acc_ci_lower:.4f} - {acc_ci_upper:.4f})")
    print(f"Precision: {prec_mean:.4f} (95% CI: {prec_ci_lower:.4f} - {prec_ci_upper:.4f})")
    print(f"Recall: {rec_mean:.4f} (95% CI: {rec_ci_lower:.4f} - {rec_ci_upper:.4f})")
    print(f"F1 Score: {f1_mean:.4f} (95% CI: {f1_ci_lower:.4f} - {f1_ci_upper:.4f})")
    print(f"ROC AUC: {roc_mean:.4f} (95% CI: {roc_ci_lower:.4f} - {roc_ci_upper:.4f})")
    print(f"Average Computation Time per Fold: {time_mean:.4f} seconds\n")


Classifier: LR
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male 

In [None]:
# Add confidence interval with dimension 5
import config_cat_embedding
import pandas as pd
import numpy as np
import random
import time
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from scipy import stats

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network
from gensim.models import Word2Vec
# Set the random seed for reproducibility
random.seed(42)
np.random.seed(42)


# Load and preprocess data
data_path = config_cat_embedding.paths['data']
data_path_out = config_cat_embedding.paths['data_output']
bank_data = pd.read_csv(data_path+'adult.csv', sep=',')

df_bank, cat_cols = adult_data_prep(bank_data)

X = df_bank.iloc[:, :-1]
y = df_bank.y


# Define the classifiers
seed = 42

models = [
    ('LR', LogisticRegression(solver='lbfgs', random_state=seed, max_iter=1000)),
    ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=seed)),
    ('RF', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed, min_samples_leaf=3)),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('XGB', XGBClassifier(eval_metric='logloss', random_state=seed)),
    # ('SVM', SVC(gamma='scale', random_state=seed, probability=True)),
    ('MLP', KerasClassifier(model=create_network,
        epochs=100, batch_size=100, verbose=0, random_state=seed))
]

# Cross-validation setup
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=seed)  # You can adjust n_splits as needed

# Function to calculate confidence intervals
def confidence_interval(data, confidence=0.95):
    n = len(data)
    m = np.mean(data)
    std_err = stats.sem(data)
    h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return m, m - h, m + h

# Move the embedding function outside the loop
def get_word2vec_embeddings(df, cat_cols, model, dimpool):
    elements = []
    for _, row in df.iterrows():
        categorical_embeddings = []
        for col in cat_cols:
            try:
                categorical_embeddings.append(model.wv[row[col]])
            except KeyError:
                categorical_embeddings.append(np.zeros((dimpool,)))
        elements.append(np.array(categorical_embeddings))
    reshaped_x = np.reshape(elements, (len(elements), len(cat_cols) * dimpool))
    return reshaped_x

# Main loop over models
for name, classifier in models:
    print(f"Classifier: {name}")
    # Lists to store metrics for each fold
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    roc_aucs = []
    computation_times = []

    fold = 1
    for train_index, test_index in cv.split(X, y):
        # Split data into training and test sets for this fold
        X_train_fold = X.iloc[train_index].copy()
        X_test_fold = X.iloc[test_index].copy()
        y_train_fold = y.iloc[train_index].reset_index(drop=True)
        y_test_fold = y.iloc[test_index].reset_index(drop=True)

        # Combine categorical columns into a single string for Word2Vec
        X_train_fold['stringcat'] = X_train_fold[cat_cols].apply(lambda x: ' '.join(x), axis=1)
        print("##############################")
        print(X_train_fold['stringcat'].iloc[0])
        
        # Train Word2Vec model on training data
        dimpool = 50  # Embedding dimension
        word2vec_model = Word2Vec(sentences=X_train_fold['stringcat'].str.split(" "), vector_size=dimpool,
                                  window=2, min_count=1, workers=1, seed=42)

        # Generate embeddings for training data
        X_train_emb = get_word2vec_embeddings(X_train_fold, cat_cols, word2vec_model, dimpool)

        # Handle numerical features
        numerical_cols = X_train_fold.select_dtypes(exclude='object').columns.tolist()
        X_train_num = X_train_fold[numerical_cols].reset_index(drop=True)

        # Create DataFrame for embeddings with string column names
        emb_col_names = [f'emb_{i}' for i in range(X_train_emb.shape[1])]
        X_train_emb_df = pd.DataFrame(X_train_emb, columns=emb_col_names)

        # Concatenate numerical features and embeddings
        X_train_combined = pd.concat([X_train_num, X_train_emb_df], axis=1)

        # Ensure all column names are strings
        X_train_combined.columns = X_train_combined.columns.astype(str)

        # Generate embeddings for test data
        X_test_emb = get_word2vec_embeddings(X_test_fold, cat_cols, word2vec_model, dimpool)
        X_test_num = X_test_fold[numerical_cols].reset_index(drop=True)
        X_test_emb_df = pd.DataFrame(X_test_emb, columns=emb_col_names)
        X_test_combined = pd.concat([X_test_num, X_test_emb_df], axis=1)
        X_test_combined.columns = X_test_combined.columns.astype(str)

        # Standard scaling
        stc = StandardScaler()
        X_train_scaled = stc.fit_transform(X_train_combined)
        X_test_scaled = stc.transform(X_test_combined)

        # Update number_of_features for MLP
        number_of_features = X_train_scaled.shape[1]
        if name == 'MLP':
            classifier.set_params(model__number_of_features=number_of_features)

        # Start timing
        start_time = time.time()

        # Fit the model
        classifier.fit(X_train_scaled, y_train_fold)

        # End timing
        end_time = time.time()
        elapsed_time = end_time - start_time
        computation_times.append(elapsed_time)

        # Predict on test data
        y_pred_fold = classifier.predict(X_test_scaled)
        if hasattr(classifier, "predict_proba"):
            y_pred_prob_fold = classifier.predict_proba(X_test_scaled)[:, 1]
        else:
            y_pred_scores = classifier.decision_function(X_test_scaled)
            y_pred_prob_fold = (y_pred_scores - y_pred_scores.min()) / (y_pred_scores.max() - y_pred_scores.min())

        # Collect performance metrics
        accuracies.append(accuracy_score(y_test_fold, y_pred_fold))
        precisions.append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
        recalls.append(recall_score(y_test_fold, y_pred_fold))
        f1s.append(f1_score(y_test_fold, y_pred_fold))
        roc_aucs.append(roc_auc_score(y_test_fold, y_pred_prob_fold))

        fold += 1

    # Calculate mean and confidence intervals
    acc_mean, acc_ci_lower, acc_ci_upper = confidence_interval(accuracies)
    prec_mean, prec_ci_lower, prec_ci_upper = confidence_interval(precisions)
    rec_mean, rec_ci_lower, rec_ci_upper = confidence_interval(recalls)
    f1_mean, f1_ci_lower, f1_ci_upper = confidence_interval(f1s)
    roc_mean, roc_ci_lower, roc_ci_upper = confidence_interval(roc_aucs)
    time_mean = np.mean(computation_times)

    # Print results
    print(f"Accuracy: {acc_mean:.4f} (95% CI: {acc_ci_lower:.4f} - {acc_ci_upper:.4f})")
    print(f"Precision: {prec_mean:.4f} (95% CI: {prec_ci_lower:.4f} - {prec_ci_upper:.4f})")
    print(f"Recall: {rec_mean:.4f} (95% CI: {rec_ci_lower:.4f} - {rec_ci_upper:.4f})")
    print(f"F1 Score: {f1_mean:.4f} (95% CI: {f1_ci_lower:.4f} - {f1_ci_upper:.4f})")
    print(f"ROC AUC: {roc_mean:.4f} (95% CI: {roc_ci_lower:.4f} - {roc_ci_upper:.4f})")
    print(f"Average Computation Time per Fold: {time_mean:.4f} seconds\n")


Classifier: LR
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male 

In [2]:
# Add confidence interval with dimension 5
import config_cat_embedding
import pandas as pd
import numpy as np
import random
import time
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from scipy import stats

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network
from gensim.models import Word2Vec
# Set the random seed for reproducibility
random.seed(42)
np.random.seed(42)


# Load and preprocess data
data_path = config_cat_embedding.paths['data']
data_path_out = config_cat_embedding.paths['data_output']
bank_data = pd.read_csv(data_path+'adult.csv', sep=',')

df_bank, cat_cols = adult_data_prep(bank_data)

X = df_bank.iloc[:, :-1]
y = df_bank.y


# Define the classifiers
seed = 42

models = [
    ('LR', LogisticRegression(solver='lbfgs', random_state=seed, max_iter=1000)),
    ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=seed)),
   # ('RF', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed, min_samples_leaf=3)),
   # ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('XGB', XGBClassifier(eval_metric='logloss', random_state=seed))
    # ('SVM', SVC(gamma='scale', random_state=seed, probability=True)),
   # ('MLP', KerasClassifier(
    #    model=create_network,
      #  epochs=100, batch_size=100, verbose=0, random_state=seed))
]

# Cross-validation setup
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=seed)  # You can adjust n_splits as needed

# Function to calculate confidence intervals
def confidence_interval(data, confidence=0.95):
    n = len(data)
    m = np.mean(data)
    std_err = stats.sem(data)
    h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return m, m - h, m + h

# Move the embedding function outside the loop
def get_word2vec_embeddings(df, cat_cols, model, dimpool):
    elements = []
    for _, row in df.iterrows():
        categorical_embeddings = []
        for col in cat_cols:
            try:
                categorical_embeddings.append(model.wv[row[col]])
            except KeyError:
                categorical_embeddings.append(np.zeros((dimpool,)))
        elements.append(np.array(categorical_embeddings))
    reshaped_x = np.reshape(elements, (len(elements), len(cat_cols) * dimpool))
    return reshaped_x

# Main loop over models
for name, classifier in models:
    print(f"Classifier: {name}")
    # Lists to store metrics for each fold
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    roc_aucs = []
    computation_times = []

    fold = 1
    for train_index, test_index in cv.split(X, y):
        # Split data into training and test sets for this fold
        X_train_fold = X.iloc[train_index].copy()
        X_test_fold = X.iloc[test_index].copy()
        y_train_fold = y.iloc[train_index].reset_index(drop=True)
        y_test_fold = y.iloc[test_index].reset_index(drop=True)

        # Combine categorical columns into a single string for Word2Vec
        X_train_fold['stringcat'] = X_train_fold[cat_cols].apply(lambda x: ' '.join(x), axis=1)
        print("##############################")
        print(X_train_fold['stringcat'].iloc[0])
        
        # Train Word2Vec model on training data
        dimpool = 50  # Embedding dimension
        word2vec_model = Word2Vec(sentences=X_train_fold['stringcat'].str.split(" "), vector_size=dimpool,
                                  window=2, min_count=1, workers=1, seed=42)

        # Generate embeddings for training data
        X_train_emb = get_word2vec_embeddings(X_train_fold, cat_cols, word2vec_model, dimpool)

        # Handle numerical features
        numerical_cols = X_train_fold.select_dtypes(exclude='object').columns.tolist()
        X_train_num = X_train_fold[numerical_cols].reset_index(drop=True)

        # Create DataFrame for embeddings with string column names
        emb_col_names = [f'emb_{i}' for i in range(X_train_emb.shape[1])]
        X_train_emb_df = pd.DataFrame(X_train_emb, columns=emb_col_names)

        # Concatenate numerical features and embeddings
        X_train_combined = pd.concat([X_train_num, X_train_emb_df], axis=1)

        # Ensure all column names are strings
        X_train_combined.columns = X_train_combined.columns.astype(str)

        # Generate embeddings for test data
        X_test_emb = get_word2vec_embeddings(X_test_fold, cat_cols, word2vec_model, dimpool)
        X_test_num = X_test_fold[numerical_cols].reset_index(drop=True)
        X_test_emb_df = pd.DataFrame(X_test_emb, columns=emb_col_names)
        X_test_combined = pd.concat([X_test_num, X_test_emb_df], axis=1)
        X_test_combined.columns = X_test_combined.columns.astype(str)

        # Standard scaling
        stc = StandardScaler()
        X_train_scaled = stc.fit_transform(X_train_combined)
        X_test_scaled = stc.transform(X_test_combined)

        # Update number_of_features for MLP
        number_of_features = X_train_scaled.shape[1]
        if name == 'MLP':
            classifier.set_params(model__number_of_features=number_of_features)

        # Start timing
        start_time = time.time()

        # Fit the model
        classifier.fit(X_train_scaled, y_train_fold)

        # End timing
        end_time = time.time()
        elapsed_time = end_time - start_time
        computation_times.append(elapsed_time)

        # Predict on test data
        y_pred_fold = classifier.predict(X_test_scaled)
        if hasattr(classifier, "predict_proba"):
            y_pred_prob_fold = classifier.predict_proba(X_test_scaled)[:, 1]
        else:
            y_pred_scores = classifier.decision_function(X_test_scaled)
            y_pred_prob_fold = (y_pred_scores - y_pred_scores.min()) / (y_pred_scores.max() - y_pred_scores.min())

        # Collect performance metrics
        accuracies.append(accuracy_score(y_test_fold, y_pred_fold))
        precisions.append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
        recalls.append(recall_score(y_test_fold, y_pred_fold))
        f1s.append(f1_score(y_test_fold, y_pred_fold))
        roc_aucs.append(roc_auc_score(y_test_fold, y_pred_prob_fold))

        fold += 1

    # Calculate mean and confidence intervals
    acc_mean, acc_ci_lower, acc_ci_upper = confidence_interval(accuracies)
    prec_mean, prec_ci_lower, prec_ci_upper = confidence_interval(precisions)
    rec_mean, rec_ci_lower, rec_ci_upper = confidence_interval(recalls)
    f1_mean, f1_ci_lower, f1_ci_upper = confidence_interval(f1s)
    roc_mean, roc_ci_lower, roc_ci_upper = confidence_interval(roc_aucs)
    time_mean = np.mean(computation_times)

    # Print results
    print(f"Accuracy: {acc_mean:.4f} (95% CI: {acc_ci_lower:.4f} - {acc_ci_upper:.4f})")
    print(f"Precision: {prec_mean:.4f} (95% CI: {prec_ci_lower:.4f} - {prec_ci_upper:.4f})")
    print(f"Recall: {rec_mean:.4f} (95% CI: {rec_ci_lower:.4f} - {rec_ci_upper:.4f})")
    print(f"F1 Score: {f1_mean:.4f} (95% CI: {f1_ci_lower:.4f} - {f1_ci_upper:.4f})")
    print(f"ROC AUC: {roc_mean:.4f} (95% CI: {roc_ci_lower:.4f} - {roc_ci_upper:.4f})")
    print(f"Average Computation Time per Fold: {time_mean:.4f} seconds\n")


Classifier: LR
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male United-States
##############################
Private 11th Never-married Machine-op-inspct Own-child Black Male 

In [5]:
# Add confidence interval with dimension 5
import config_cat_embedding
import pandas as pd
import numpy as np
import random
import time
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from scipy import stats

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network
from gensim.models import Word2Vec
# Set the random seed for reproducibility
random.seed(42)
np.random.seed(42)


# Load and preprocess data
data_path = config_cat_embedding.paths['data']
data_path_out = config_cat_embedding.paths['data_output']
bank_data = pd.read_csv(data_path+'adult.csv', sep=',')

df_bank, cat_cols = adult_data_prep(bank_data)

X = df_bank.iloc[:, :-1]
y = df_bank.y


# Define the classifiers
seed = 42

models = [
    ('LR', LogisticRegression(solver='lbfgs', random_state=seed, max_iter=1000)),
    ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=seed)),
    ('RF', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed, min_samples_leaf=3)),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('XGB', XGBClassifier(eval_metric='logloss', random_state=seed)),
    # ('SVM', SVC(gamma='scale', random_state=seed, probability=True)),
    ('MLP', KerasClassifier(
        model=create_network,
        epochs=100, batch_size=100, verbose=0, random_state=seed))
]

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)  # You can adjust n_splits as needed

# Function to calculate confidence intervals
def confidence_interval(data, confidence=0.95):
    n = len(data)
    m = np.mean(data)
    std_err = stats.sem(data)
    h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return m, m - h, m + h

# Move the embedding function outside the loop
def get_word2vec_embeddings(df, cat_cols, model, dimpool):
    elements = []
    for _, row in df.iterrows():
        categorical_embeddings = []
        for col in cat_cols:
            try:
                categorical_embeddings.append(model.wv[row[col]])
            except KeyError:
                categorical_embeddings.append(np.zeros((dimpool,)))
        elements.append(np.array(categorical_embeddings))
    reshaped_x = np.reshape(elements, (len(elements), len(cat_cols) * dimpool))
    return reshaped_x

# Main loop over models
for name, classifier in models:
    print(f"Classifier: {name}")
    # Lists to store metrics for each fold
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    roc_aucs = []
    computation_times = []

    fold = 1
    for train_index, test_index in cv.split(X, y):
        # Split data into training and test sets for this fold
        X_train_fold = X.iloc[train_index].copy()
        X_test_fold = X.iloc[test_index].copy()
        y_train_fold = y.iloc[train_index].reset_index(drop=True)
        y_test_fold = y.iloc[test_index].reset_index(drop=True)

        # Combine categorical columns into a single string for Word2Vec
        X_train_fold['stringcat'] = X_train_fold[cat_cols].apply(lambda x: ' '.join(x), axis=1)

        # Train Word2Vec model on training data
        dimpool = 10  # Embedding dimension
        word2vec_model = Word2Vec(sentences=X_train_fold['stringcat'].str.split(" "), vector_size=dimpool,
                                  window=2, min_count=1, workers=1, seed=42)

        # Generate embeddings for training data
        X_train_emb = get_word2vec_embeddings(X_train_fold, cat_cols, word2vec_model, dimpool)

        # Handle numerical features
        numerical_cols = X_train_fold.select_dtypes(exclude='object').columns.tolist()
        X_train_num = X_train_fold[numerical_cols].reset_index(drop=True)

        # Create DataFrame for embeddings with string column names
        emb_col_names = [f'emb_{i}' for i in range(X_train_emb.shape[1])]
        X_train_emb_df = pd.DataFrame(X_train_emb, columns=emb_col_names)

        # Concatenate numerical features and embeddings
        X_train_combined = pd.concat([X_train_num, X_train_emb_df], axis=1)

        # Ensure all column names are strings
        X_train_combined.columns = X_train_combined.columns.astype(str)

        # Generate embeddings for test data
        X_test_emb = get_word2vec_embeddings(X_test_fold, cat_cols, word2vec_model, dimpool)
        X_test_num = X_test_fold[numerical_cols].reset_index(drop=True)
        X_test_emb_df = pd.DataFrame(X_test_emb, columns=emb_col_names)
        X_test_combined = pd.concat([X_test_num, X_test_emb_df], axis=1)
        X_test_combined.columns = X_test_combined.columns.astype(str)

        # Standard scaling
        stc = StandardScaler()
        X_train_scaled = stc.fit_transform(X_train_combined)
        X_test_scaled = stc.transform(X_test_combined)

        # Update number_of_features for MLP
        number_of_features = X_train_scaled.shape[1]
        if name == 'MLP':
            classifier.set_params(model__number_of_features=number_of_features)

        # Start timing
        start_time = time.time()

        # Fit the model
        classifier.fit(X_train_scaled, y_train_fold)

        # End timing
        end_time = time.time()
        elapsed_time = end_time - start_time
        computation_times.append(elapsed_time)

        # Predict on test data
        y_pred_fold = classifier.predict(X_test_scaled)
        if hasattr(classifier, "predict_proba"):
            y_pred_prob_fold = classifier.predict_proba(X_test_scaled)[:, 1]
        else:
            y_pred_scores = classifier.decision_function(X_test_scaled)
            y_pred_prob_fold = (y_pred_scores - y_pred_scores.min()) / (y_pred_scores.max() - y_pred_scores.min())

        # Collect performance metrics
        accuracies.append(accuracy_score(y_test_fold, y_pred_fold))
        precisions.append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
        recalls.append(recall_score(y_test_fold, y_pred_fold))
        f1s.append(f1_score(y_test_fold, y_pred_fold))
        roc_aucs.append(roc_auc_score(y_test_fold, y_pred_prob_fold))

        fold += 1

    # Calculate mean and confidence intervals
    acc_mean, acc_ci_lower, acc_ci_upper = confidence_interval(accuracies)
    prec_mean, prec_ci_lower, prec_ci_upper = confidence_interval(precisions)
    rec_mean, rec_ci_lower, rec_ci_upper = confidence_interval(recalls)
    f1_mean, f1_ci_lower, f1_ci_upper = confidence_interval(f1s)
    roc_mean, roc_ci_lower, roc_ci_upper = confidence_interval(roc_aucs)
    time_mean = np.mean(computation_times)

    # Print results
    print(f"Accuracy: {acc_mean:.4f} (95% CI: {acc_ci_lower:.4f} - {acc_ci_upper:.4f})")
    print(f"Precision: {prec_mean:.4f} (95% CI: {prec_ci_lower:.4f} - {prec_ci_upper:.4f})")
    print(f"Recall: {rec_mean:.4f} (95% CI: {rec_ci_lower:.4f} - {rec_ci_upper:.4f})")
    print(f"F1 Score: {f1_mean:.4f} (95% CI: {f1_ci_lower:.4f} - {f1_ci_upper:.4f})")
    print(f"ROC AUC: {roc_mean:.4f} (95% CI: {roc_ci_lower:.4f} - {roc_ci_upper:.4f})")
    print(f"Average Computation Time per Fold: {time_mean:.4f} seconds\n")


Classifier: LR
Accuracy: 0.8483 (95% CI: 0.8446 - 0.8520)
Precision: 0.7364 (95% CI: 0.7272 - 0.7455)
Recall: 0.6044 (95% CI: 0.5962 - 0.6125)
F1 Score: 0.6639 (95% CI: 0.6556 - 0.6722)
ROC AUC: 0.9046 (95% CI: 0.8995 - 0.9097)
Average Computation Time per Fold: 1.0897 seconds

Classifier: DT
Accuracy: 0.8404 (95% CI: 0.8372 - 0.8435)
Precision: 0.7695 (95% CI: 0.7576 - 0.7815)
Recall: 0.5081 (95% CI: 0.5037 - 0.5125)
F1 Score: 0.6121 (95% CI: 0.6055 - 0.6187)
ROC AUC: 0.8606 (95% CI: 0.8564 - 0.8648)
Average Computation Time per Fold: 0.2175 seconds

Classifier: RF
Accuracy: 0.8420 (95% CI: 0.8368 - 0.8473)
Precision: 0.7618 (95% CI: 0.7499 - 0.7737)
Recall: 0.5275 (95% CI: 0.5125 - 0.5424)
F1 Score: 0.6233 (95% CI: 0.6090 - 0.6376)
ROC AUC: 0.8967 (95% CI: 0.8915 - 0.9020)
Average Computation Time per Fold: 5.7035 seconds

Classifier: KNN
Accuracy: 0.8168 (95% CI: 0.8094 - 0.8242)
Precision: 0.6443 (95% CI: 0.6289 - 0.6597)
Recall: 0.5819 (95% CI: 0.5613 - 0.6026)
F1 Score: 0.6115 (9

In [4]:
# Add confidence interval with dimension 5
import config_cat_embedding
import pandas as pd
import numpy as np
import random
import time
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from scipy import stats

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network
from gensim.models import Word2Vec
# Set the random seed for reproducibility
random.seed(42)
np.random.seed(42)


# Load and preprocess data
data_path = config_cat_embedding.paths['data']
data_path_out = config_cat_embedding.paths['data_output']
bank_data = pd.read_csv(data_path+'adult.csv', sep=',')

df_bank, cat_cols = adult_data_prep(bank_data)

X = df_bank.iloc[:, :-1]
y = df_bank.y


# Define the classifiers
seed = 42

models = [
    ('LR', LogisticRegression(solver='lbfgs', random_state=seed, max_iter=1000)),
    ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=seed)),
    ('RF', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed, min_samples_leaf=3)),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('XGB', XGBClassifier(eval_metric='logloss', random_state=seed)),
    # ('SVM', SVC(gamma='scale', random_state=seed, probability=True)),
    ('MLP', KerasClassifier(
        model=create_network,
        epochs=100, batch_size=100, verbose=0, random_state=seed))
]

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)  # You can adjust n_splits as needed

# Function to calculate confidence intervals
def confidence_interval(data, confidence=0.95):
    n = len(data)
    m = np.mean(data)
    std_err = stats.sem(data)
    h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return m, m - h, m + h

# Move the embedding function outside the loop
def get_word2vec_embeddings(df, cat_cols, model, dimpool):
    elements = []
    for _, row in df.iterrows():
        categorical_embeddings = []
        for col in cat_cols:
            try:
                categorical_embeddings.append(model.wv[row[col]])
            except KeyError:
                categorical_embeddings.append(np.zeros((dimpool,)))
        elements.append(np.array(categorical_embeddings))
    reshaped_x = np.reshape(elements, (len(elements), len(cat_cols) * dimpool))
    return reshaped_x

# Main loop over models
for name, classifier in models:
    print(f"Classifier: {name}")
    # Lists to store metrics for each fold
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    roc_aucs = []
    computation_times = []

    fold = 1
    for train_index, test_index in cv.split(X, y):
        # Split data into training and test sets for this fold
        X_train_fold = X.iloc[train_index].copy()
        X_test_fold = X.iloc[test_index].copy()
        y_train_fold = y.iloc[train_index].reset_index(drop=True)
        y_test_fold = y.iloc[test_index].reset_index(drop=True)

        # Combine categorical columns into a single string for Word2Vec
        X_train_fold['stringcat'] = X_train_fold[cat_cols].apply(lambda x: ' '.join(x), axis=1)

        # Train Word2Vec model on training data
        dimpool = 5  # Embedding dimension
        word2vec_model = Word2Vec(sentences=X_train_fold['stringcat'].str.split(" "), vector_size=dimpool,
                                  window=2, min_count=1, workers=1, seed=42)

        # Generate embeddings for training data
        X_train_emb = get_word2vec_embeddings(X_train_fold, cat_cols, word2vec_model, dimpool)

        # Handle numerical features
        numerical_cols = X_train_fold.select_dtypes(exclude='object').columns.tolist()
        X_train_num = X_train_fold[numerical_cols].reset_index(drop=True)

        # Create DataFrame for embeddings with string column names
        emb_col_names = [f'emb_{i}' for i in range(X_train_emb.shape[1])]
        X_train_emb_df = pd.DataFrame(X_train_emb, columns=emb_col_names)

        # Concatenate numerical features and embeddings
        X_train_combined = pd.concat([X_train_num, X_train_emb_df], axis=1)

        # Ensure all column names are strings
        X_train_combined.columns = X_train_combined.columns.astype(str)

        # Generate embeddings for test data
        X_test_emb = get_word2vec_embeddings(X_test_fold, cat_cols, word2vec_model, dimpool)
        X_test_num = X_test_fold[numerical_cols].reset_index(drop=True)
        X_test_emb_df = pd.DataFrame(X_test_emb, columns=emb_col_names)
        X_test_combined = pd.concat([X_test_num, X_test_emb_df], axis=1)
        X_test_combined.columns = X_test_combined.columns.astype(str)

        # Standard scaling
        stc = StandardScaler()
        X_train_scaled = stc.fit_transform(X_train_combined)
        X_test_scaled = stc.transform(X_test_combined)

        # Update number_of_features for MLP
        number_of_features = X_train_scaled.shape[1]
        if name == 'MLP':
            classifier.set_params(model__number_of_features=number_of_features)

        # Start timing
        start_time = time.time()

        # Fit the model
        classifier.fit(X_train_scaled, y_train_fold)

        # End timing
        end_time = time.time()
        elapsed_time = end_time - start_time
        computation_times.append(elapsed_time)

        # Predict on test data
        y_pred_fold = classifier.predict(X_test_scaled)
        if hasattr(classifier, "predict_proba"):
            y_pred_prob_fold = classifier.predict_proba(X_test_scaled)[:, 1]
        else:
            y_pred_scores = classifier.decision_function(X_test_scaled)
            y_pred_prob_fold = (y_pred_scores - y_pred_scores.min()) / (y_pred_scores.max() - y_pred_scores.min())

        # Collect performance metrics
        accuracies.append(accuracy_score(y_test_fold, y_pred_fold))
        precisions.append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
        recalls.append(recall_score(y_test_fold, y_pred_fold))
        f1s.append(f1_score(y_test_fold, y_pred_fold))
        roc_aucs.append(roc_auc_score(y_test_fold, y_pred_prob_fold))

        fold += 1

    # Calculate mean and confidence intervals
    acc_mean, acc_ci_lower, acc_ci_upper = confidence_interval(accuracies)
    prec_mean, prec_ci_lower, prec_ci_upper = confidence_interval(precisions)
    rec_mean, rec_ci_lower, rec_ci_upper = confidence_interval(recalls)
    f1_mean, f1_ci_lower, f1_ci_upper = confidence_interval(f1s)
    roc_mean, roc_ci_lower, roc_ci_upper = confidence_interval(roc_aucs)
    time_mean = np.mean(computation_times)

    # Print results
    print(f"Accuracy: {acc_mean:.4f} (95% CI: {acc_ci_lower:.4f} - {acc_ci_upper:.4f})")
    print(f"Precision: {prec_mean:.4f} (95% CI: {prec_ci_lower:.4f} - {prec_ci_upper:.4f})")
    print(f"Recall: {rec_mean:.4f} (95% CI: {rec_ci_lower:.4f} - {rec_ci_upper:.4f})")
    print(f"F1 Score: {f1_mean:.4f} (95% CI: {f1_ci_lower:.4f} - {f1_ci_upper:.4f})")
    print(f"ROC AUC: {roc_mean:.4f} (95% CI: {roc_ci_lower:.4f} - {roc_ci_upper:.4f})")
    print(f"Average Computation Time per Fold: {time_mean:.4f} seconds\n")


Classifier: LR
Accuracy: 0.8467 (95% CI: 0.8423 - 0.8511)
Precision: 0.7392 (95% CI: 0.7312 - 0.7472)
Recall: 0.5894 (95% CI: 0.5742 - 0.6046)
F1 Score: 0.6558 (95% CI: 0.6436 - 0.6681)
ROC AUC: 0.9022 (95% CI: 0.8960 - 0.9083)
Average Computation Time per Fold: 0.4866 seconds

Classifier: DT
Accuracy: 0.8402 (95% CI: 0.8366 - 0.8438)
Precision: 0.7687 (95% CI: 0.7551 - 0.7824)
Recall: 0.5081 (95% CI: 0.5037 - 0.5125)
F1 Score: 0.6118 (95% CI: 0.6046 - 0.6190)
ROC AUC: 0.8605 (95% CI: 0.8562 - 0.8649)
Average Computation Time per Fold: 0.0880 seconds

Classifier: RF
Accuracy: 0.8434 (95% CI: 0.8410 - 0.8459)
Precision: 0.7639 (95% CI: 0.7520 - 0.7758)
Recall: 0.5333 (95% CI: 0.5134 - 0.5531)
F1 Score: 0.6279 (95% CI: 0.6165 - 0.6393)
ROC AUC: 0.8987 (95% CI: 0.8943 - 0.9031)
Average Computation Time per Fold: 3.5121 seconds

Classifier: KNN
Accuracy: 0.8190 (95% CI: 0.8118 - 0.8262)
Precision: 0.6483 (95% CI: 0.6328 - 0.6639)
Recall: 0.5892 (95% CI: 0.5718 - 0.6066)
F1 Score: 0.6173 (9

In [3]:
# Add confidence interval
import config_cat_embedding
import pandas as pd
import numpy as np
import random
import time
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from scipy import stats

from data_prep import bank_data_prep, adult_data_prep
from embedding_helper import create_network
from gensim.models import Word2Vec
# Set the random seed for reproducibility
random.seed(42)
np.random.seed(42)


# Load and preprocess data
data_path = config_cat_embedding.paths['data']
data_path_out = config_cat_embedding.paths['data_output']
bank_data = pd.read_csv(data_path+'adult.csv', sep=',')

df_bank, cat_cols = adult_data_prep(bank_data)

X = df_bank.iloc[:, :-1]
y = df_bank.y


# Define the classifiers
seed = 42

models = [
    ('LR', LogisticRegression(solver='lbfgs', random_state=seed, max_iter=1000)),
    ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=seed)),
    ('RF', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed, min_samples_leaf=3)),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('XGB', XGBClassifier(eval_metric='logloss', random_state=seed)),
    # ('SVM', SVC(gamma='scale', random_state=seed, probability=True)),
    ('MLP', KerasClassifier(
        model=create_network,
        epochs=100, batch_size=100, verbose=0, random_state=seed))
]

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)  # You can adjust n_splits as needed

# Function to calculate confidence intervals
def confidence_interval(data, confidence=0.95):
    n = len(data)
    m = np.mean(data)
    std_err = stats.sem(data)
    h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return m, m - h, m + h

# Move the embedding function outside the loop
def get_word2vec_embeddings(df, cat_cols, model, dimpool):
    elements = []
    for _, row in df.iterrows():
        categorical_embeddings = []
        for col in cat_cols:
            try:
                categorical_embeddings.append(model.wv[row[col]])
            except KeyError:
                categorical_embeddings.append(np.zeros((dimpool,)))
        elements.append(np.array(categorical_embeddings))
    reshaped_x = np.reshape(elements, (len(elements), len(cat_cols) * dimpool))
    return reshaped_x

# Main loop over models
for name, classifier in models:
    print(f"Classifier: {name}")
    # Lists to store metrics for each fold
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    roc_aucs = []
    computation_times = []

    fold = 1
    for train_index, test_index in cv.split(X, y):
        # Split data into training and test sets for this fold
        X_train_fold = X.iloc[train_index].copy()
        X_test_fold = X.iloc[test_index].copy()
        y_train_fold = y.iloc[train_index].reset_index(drop=True)
        y_test_fold = y.iloc[test_index].reset_index(drop=True)

        # Combine categorical columns into a single string for Word2Vec
        X_train_fold['stringcat'] = X_train_fold[cat_cols].apply(lambda x: ' '.join(x), axis=1)

        # Train Word2Vec model on training data
        dimpool = 30  # Embedding dimension
        word2vec_model = Word2Vec(sentences=X_train_fold['stringcat'].str.split(" "), vector_size=dimpool,
                                  window=2, min_count=1, workers=1, seed=42)

        # Generate embeddings for training data
        X_train_emb = get_word2vec_embeddings(X_train_fold, cat_cols, word2vec_model, dimpool)

        # Handle numerical features
        numerical_cols = X_train_fold.select_dtypes(exclude='object').columns.tolist()
        X_train_num = X_train_fold[numerical_cols].reset_index(drop=True)

        # Create DataFrame for embeddings with string column names
        emb_col_names = [f'emb_{i}' for i in range(X_train_emb.shape[1])]
        X_train_emb_df = pd.DataFrame(X_train_emb, columns=emb_col_names)

        # Concatenate numerical features and embeddings
        X_train_combined = pd.concat([X_train_num, X_train_emb_df], axis=1)

        # Ensure all column names are strings
        X_train_combined.columns = X_train_combined.columns.astype(str)

        # Generate embeddings for test data
        X_test_emb = get_word2vec_embeddings(X_test_fold, cat_cols, word2vec_model, dimpool)
        X_test_num = X_test_fold[numerical_cols].reset_index(drop=True)
        X_test_emb_df = pd.DataFrame(X_test_emb, columns=emb_col_names)
        X_test_combined = pd.concat([X_test_num, X_test_emb_df], axis=1)
        X_test_combined.columns = X_test_combined.columns.astype(str)

        # Standard scaling
        stc = StandardScaler()
        X_train_scaled = stc.fit_transform(X_train_combined)
        X_test_scaled = stc.transform(X_test_combined)

        # Update number_of_features for MLP
        number_of_features = X_train_scaled.shape[1]
        if name == 'MLP':
            classifier.set_params(model__number_of_features=number_of_features)

        # Start timing
        start_time = time.time()

        # Fit the model
        classifier.fit(X_train_scaled, y_train_fold)

        # End timing
        end_time = time.time()
        elapsed_time = end_time - start_time
        computation_times.append(elapsed_time)

        # Predict on test data
        y_pred_fold = classifier.predict(X_test_scaled)
        if hasattr(classifier, "predict_proba"):
            y_pred_prob_fold = classifier.predict_proba(X_test_scaled)[:, 1]
        else:
            y_pred_scores = classifier.decision_function(X_test_scaled)
            y_pred_prob_fold = (y_pred_scores - y_pred_scores.min()) / (y_pred_scores.max() - y_pred_scores.min())

        # Collect performance metrics
        accuracies.append(accuracy_score(y_test_fold, y_pred_fold))
        precisions.append(precision_score(y_test_fold, y_pred_fold, zero_division=0))
        recalls.append(recall_score(y_test_fold, y_pred_fold))
        f1s.append(f1_score(y_test_fold, y_pred_fold))
        roc_aucs.append(roc_auc_score(y_test_fold, y_pred_prob_fold))

        fold += 1

    # Calculate mean and confidence intervals
    acc_mean, acc_ci_lower, acc_ci_upper = confidence_interval(accuracies)
    prec_mean, prec_ci_lower, prec_ci_upper = confidence_interval(precisions)
    rec_mean, rec_ci_lower, rec_ci_upper = confidence_interval(recalls)
    f1_mean, f1_ci_lower, f1_ci_upper = confidence_interval(f1s)
    roc_mean, roc_ci_lower, roc_ci_upper = confidence_interval(roc_aucs)
    time_mean = np.mean(computation_times)

    # Print results
    print(f"Accuracy: {acc_mean:.4f} (95% CI: {acc_ci_lower:.4f} - {acc_ci_upper:.4f})")
    print(f"Precision: {prec_mean:.4f} (95% CI: {prec_ci_lower:.4f} - {prec_ci_upper:.4f})")
    print(f"Recall: {rec_mean:.4f} (95% CI: {rec_ci_lower:.4f} - {rec_ci_upper:.4f})")
    print(f"F1 Score: {f1_mean:.4f} (95% CI: {f1_ci_lower:.4f} - {f1_ci_upper:.4f})")
    print(f"ROC AUC: {roc_mean:.4f} (95% CI: {roc_ci_lower:.4f} - {roc_ci_upper:.4f})")
    print(f"Average Computation Time per Fold: {time_mean:.4f} seconds\n")


Classifier: LR
Accuracy: 0.8483 (95% CI: 0.8451 - 0.8516)
Precision: 0.7355 (95% CI: 0.7275 - 0.7435)
Recall: 0.6061 (95% CI: 0.5992 - 0.6129)
F1 Score: 0.6645 (95% CI: 0.6574 - 0.6717)
ROC AUC: 0.9047 (95% CI: 0.8997 - 0.9097)
Average Computation Time per Fold: 2.3147 seconds

Classifier: DT
Accuracy: 0.8404 (95% CI: 0.8372 - 0.8435)
Precision: 0.7695 (95% CI: 0.7576 - 0.7815)
Recall: 0.5081 (95% CI: 0.5037 - 0.5125)
F1 Score: 0.6121 (95% CI: 0.6055 - 0.6187)
ROC AUC: 0.8606 (95% CI: 0.8564 - 0.8648)
Average Computation Time per Fold: 0.5492 seconds

Classifier: RF
Accuracy: 0.8337 (95% CI: 0.8295 - 0.8379)
Precision: 0.7372 (95% CI: 0.7253 - 0.7490)
Recall: 0.5115 (95% CI: 0.5022 - 0.5209)
F1 Score: 0.6039 (95% CI: 0.5937 - 0.6142)
ROC AUC: 0.8905 (95% CI: 0.8864 - 0.8946)
Average Computation Time per Fold: 7.1090 seconds

Classifier: KNN
Accuracy: 0.8140 (95% CI: 0.8082 - 0.8197)
Precision: 0.6383 (95% CI: 0.6274 - 0.6492)
Recall: 0.5753 (95% CI: 0.5560 - 0.5946)
F1 Score: 0.6051 (9

In [1]:
import config_cat_embedding
import numpy as np
import pandas as pd
import random

from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
#from keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,auc, roc_auc_score

from tqdm import tqdm_notebook as tqdm
from data_prep import bank_data_prep,adult_data_prep
from embedding_helper import create_network
# Set the random seed for reproducibility
import time

random.seed(42)
np.random.seed(42)



    



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for row in tqdm(X_train.iterrows()):


0it [00:00, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for row in tqdm(X_test.iterrows()):


0it [00:00, ?it/s]

Classifier: LR
Computation Time: 1.7425615787506104 seconds
[[6283  535]
 [ 898 1329]]
              precision    recall  f1-score   support

           0     0.8749    0.9215    0.8976      6818
           1     0.7130    0.5968    0.6497      2227

    accuracy                         0.8416      9045
   macro avg     0.7940    0.7591    0.7737      9045
weighted avg     0.8351    0.8416    0.8366      9045

0.8996662602216617

Classifier: DT
Computation Time: 0.5335593223571777 seconds
[[6453  365]
 [1089 1138]]
              precision    recall  f1-score   support

           0     0.8556    0.9465    0.8987      6818
           1     0.7572    0.5110    0.6102      2227

    accuracy                         0.8392      9045
   macro avg     0.8064    0.7287    0.7545      9045
weighted avg     0.8314    0.8392    0.8277      9045

0.8572403631107758



In [9]:
import config_cat_embedding
import numpy as np
import pandas as pd
import random

from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
#from keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,auc, roc_auc_score

from tqdm import tqdm_notebook as tqdm
from data_prep import bank_data_prep,adult_data_prep
from embedding_helper import create_network
# Set the random seed for reproducibility
import time
import os
random.seed(42)
np.random.seed(42)
# Set the random seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)

os.environ['PYTHONHASHSEED'] = str(seed)

# Load the data and complete the data pre-processing
data_path = config_cat_embedding.paths['data']
data_path_out = config_cat_embedding.paths['data_output']
bank_data = pd.read_csv(data_path + 'adult.csv', sep=',')

dat_bank, cat_cols = adult_data_prep(bank_data)

X = dat_bank.iloc[:, :-1]
target = dat_bank.y

# Split the data into training and test sets
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X, target, test_size=0.2, random_state=1500)

# Get categorical columns and create 'stringcat' for Word2Vec
X_train_full['stringcat'] = X_train_full[cat_cols].apply(lambda x: ' '.join(x), axis=1)

# Train Word2Vec model
dimpool = 30
model = Word2Vec(sentences=X_train_full['stringcat'].str.split(" "), vector_size=dimpool, window=2,
                 min_count=1, workers=1, seed=seed)
model.save("word2vec.model.bank")

# Apply embeddings to training data
elements = []
for _, row in tqdm(X_train_full.iterrows(), total=X_train_full.shape[0]):
    categorical_embeddings = []
    for col in cat_cols:
        try:
            categorical_embeddings.append(model.wv[row[col]])
        except KeyError:
            categorical_embeddings.append(np.zeros(dimpool))
    elements.append(np.concatenate(categorical_embeddings))

reshaped_x = np.array(elements)

# Get the numerical columns
numerical_cols = X_train_full.select_dtypes(exclude='object').columns
my_data = pd.concat([X_train_full[numerical_cols].reset_index(drop=True), pd.DataFrame(reshaped_x)], axis=1)
# Reset the index of y_train
y_train = y_train_full.reset_index(drop=True)

# Apply embeddings to test data
elements_test = []
for _, row in tqdm(X_test_full.iterrows(), total=X_test_full.shape[0]):
    categorical_embeddings = []
    for col in cat_cols:
        try:
            categorical_embeddings.append(model.wv[row[col]])
        except KeyError:
            categorical_embeddings.append(np.zeros(dimpool))
    elements_test.append(np.concatenate(categorical_embeddings))

reshaped_x_test = np.array(elements_test)
my_test_data = pd.concat([X_test_full[numerical_cols].reset_index(drop=True), pd.DataFrame(reshaped_x_test)], axis=1)
y_test = y_test_full.reset_index(drop=True)

# Split my_data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(my_data, y_train, test_size=0.2, random_state=1500)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(my_test_data.values)


# Define the models
models = [
    ('LR', LogisticRegression(solver='lbfgs', random_state=seed, max_iter=1000)),
    ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=seed)),
    ('RF', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed, min_samples_leaf=3)),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('XGB', XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=seed)),
    ('MLP', KerasClassifier(build_fn=lambda: create_network(X_train_scaled.shape[1]),
                            epochs=100, batch_size=100, verbose=0, random_state=seed)),
    ('SVM', SVC(gamma='scale', random_state=seed, probability=True))
]

# Define number of runs for cross-validation
n_runs = 5
kf = StratifiedKFold(n_splits=n_runs, shuffle=True, random_state=seed)

# Initialize list to store results
results = []

# Loop over the models
for name, model in models:
    print(f"Classifier: {name}")
    start_time_total = time.time()

    # Lists to store metrics for each fold
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    roc_auc_scores = []
    computation_times = []

    # Perform cross-validation
    for train_idx, val_idx in kf.split(X_train_scaled, y_train):
        # Clone the model to ensure fresh training for each fold
        if name == 'MLP':
            # Re-instantiate KerasClassifier to reset weights
            clf = KerasClassifier(build_fn=lambda: create_network(X_train_scaled.shape[1]),
                                  epochs=100, batch_size=100, verbose=0, random_state=seed)
        else:
            clf = clone(model)

        X_fold_train, X_fold_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        start_time = time.time()
        clf.fit(X_fold_train, y_fold_train)
        end_time = time.time()
        computation_time = end_time - start_time

        y_pred = clf.predict(X_fold_val)

        # Handle probability predictions
        if hasattr(clf, "predict_proba"):
            y_pred_prob = clf.predict_proba(X_fold_val)[:, 1]
        elif hasattr(clf, "decision_function"):
            y_scores = clf.decision_function(X_fold_val)
            from scipy.special import expit
            y_pred_prob = expit(y_scores)
        else:
            y_pred_prob = y_pred

        # Append metrics for this fold
        accuracy_scores.append(accuracy_score(y_fold_val, y_pred))
        precision_scores.append(precision_score(y_fold_val, y_pred, zero_division=0))
        recall_scores.append(recall_score(y_fold_val, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_fold_val, y_pred, zero_division=0))
        roc_auc_scores.append(roc_auc_score(y_fold_val, y_pred_prob))
        computation_times.append(computation_time)

    # Compute mean and standard deviation of all metrics
    mean_accuracy = np.mean(accuracy_scores)
    std_accuracy = np.std(accuracy_scores)

    mean_precision = np.mean(precision_scores)
    std_precision = np.std(precision_scores)

    mean_recall = np.mean(recall_scores)
    std_recall = np.std(recall_scores)

    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)

    mean_roc_auc = np.mean(roc_auc_scores)
    std_roc_auc = np.std(roc_auc_scores)

    mean_time = np.mean(computation_times)
    std_time = np.std(computation_times)

    end_time_total = time.time()
    total_time = end_time_total - start_time_total

    # Store results for the current model
    results.append({
        "Classifier": name,
        "Mean Accuracy": mean_accuracy, "STD Accuracy": std_accuracy,
        "Mean Precision": mean_precision, "STD Precision": std_precision,
        "Mean Recall": mean_recall, "STD Recall": std_recall,
        "Mean F1 Score": mean_f1, "STD F1 Score": std_f1,
        "Mean ROC AUC": mean_roc_auc, "STD ROC AUC": std_roc_auc,
        "Mean Computation Time": mean_time, "STD Computation Time": std_time,
        "Total Computation Time": total_time
    })

    # Print results
    print(f"Mean Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
    print(f"Mean Precision: {mean_precision:.4f} ± {std_precision:.4f}")
    print(f"Mean Recall: {mean_recall:.4f} ± {std_recall:.4f}")
    print(f"Mean F1 Score: {mean_f1:.4f} ± {std_f1:.4f}")
    print(f"Mean ROC AUC: {mean_roc_auc:.4f} ± {std_roc_auc:.4f}")
    print(f"Mean Computation Time per Fold: {mean_time:.4f} ± {std_time:.4f} seconds")
    print(f"Total Computation Time: {total_time:.4f} seconds")
    print()

# Create a DataFrame to display all results
results_df = pd.DataFrame(results)
print(results_df)

# Final evaluation on the test set
print("Final Evaluation on Test Set:")
for name, model in models:
    print(f"Classifier: {name}")
    start_time = time.time()
    if name == 'MLP':
        model = KerasClassifier(build_fn=lambda: create_network(X_train_scaled.shape[1]),
                                epochs=100, batch_size=100, verbose=0, random_state=seed)
    else:
        model = clone(model)

    # Combine X_train_scaled and X_val_scaled for final training
    X_combined = np.vstack((X_train_scaled, X_val_scaled))
    y_combined = pd.concat([y_train.reset_index(drop=True), y_val.reset_index(drop=True)], ignore_index=True)
    model.fit(X_combined, y_combined)
    end_time = time.time()
    total_time = end_time - start_time

    y_pred_test = model.predict(X_test_scaled)

    # Handle probability predictions
    if hasattr(model, "predict_proba"):
        y_pred_prob_test = model.predict_proba(X_test_scaled)[:, 1]
    elif hasattr(model, "decision_function"):
        y_scores = model.decision_function(X_test_scaled)
        from scipy.special import expit
        y_pred_prob_test = expit(y_scores)
    else:
        y_pred_prob_test = y_pred_test

    print(confusion_matrix(y_test, y_pred_test))
    print(classification_report(y_test, y_pred_test, digits=4))
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_prob_test):.4f}")
    print(f"Total Computation Time: {total_time:.4f} seconds")
    print()


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _, row in tqdm(X_train_full.iterrows(), total=X_train_full.shape[0]):


  0%|          | 0/36177 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _, row in tqdm(X_test_full.iterrows(), total=X_test_full.shape[0]):


  0%|          | 0/9045 [00:00<?, ?it/s]

Classifier: LR
Mean Accuracy: 0.8509 ± 0.0035
Mean Precision: 0.7393 ± 0.0100
Mean Recall: 0.6150 ± 0.0076
Mean F1 Score: 0.6714 ± 0.0074
Mean ROC AUC: 0.9060 ± 0.0050
Mean Computation Time per Fold: 1.5066 ± 0.1590 seconds
Total Computation Time: 7.9106 seconds

Classifier: DT
Mean Accuracy: 0.8416 ± 0.0039
Mean Precision: 0.7742 ± 0.0140
Mean Recall: 0.5087 ± 0.0070
Mean F1 Score: 0.6140 ± 0.0087
Mean ROC AUC: 0.8632 ± 0.0039
Mean Computation Time per Fold: 0.4020 ± 0.0142 seconds
Total Computation Time: 2.3698 seconds

Classifier: RF
Mean Accuracy: 0.8340 ± 0.0039
Mean Precision: 0.7352 ± 0.0118
Mean Recall: 0.5156 ± 0.0085
Mean F1 Score: 0.6061 ± 0.0094
Mean ROC AUC: 0.8920 ± 0.0051
Mean Computation Time per Fold: 5.2733 ± 0.2306 seconds
Total Computation Time: 27.7126 seconds

Classifier: KNN
Mean Accuracy: 0.8149 ± 0.0054
Mean Precision: 0.6405 ± 0.0143
Mean Recall: 0.5761 ± 0.0106
Mean F1 Score: 0.6065 ± 0.0100
Mean ROC AUC: 0.8120 ± 0.0046
Mean Computation Time per Fold: 0.0107

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Mean Accuracy: 0.8671 ± 0.0038
Mean Precision: 0.7695 ± 0.0099
Mean Recall: 0.6616 ± 0.0093
Mean F1 Score: 0.7115 ± 0.0084
Mean ROC AUC: 0.9241 ± 0.0040
Mean Computation Time per Fold: 1.1387 ± 0.0419 seconds
Total Computation Time: 6.3224 seconds

Classifier: MLP


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Mean Accuracy: 0.8350 ± 0.0025
Mean Precision: 0.7120 ± 0.0177
Mean Recall: 0.5627 ± 0.0238
Mean F1 Score: 0.6280 ± 0.0098
Mean ROC AUC: 0.8877 ± 0.0037
Mean Computation Time per Fold: 31.1877 ± 0.9844 seconds
Total Computation Time: 158.3211 seconds

Classifier: SVM
Mean Accuracy: 0.8439 ± 0.0038
Mean Precision: 0.7441 ± 0.0087
Mean Recall: 0.5631 ± 0.0131
Mean F1 Score: 0.6410 ± 0.0105
Mean ROC AUC: 0.8875 ± 0.0045
Mean Computation Time per Fold: 5950.3404 ± 10616.7794 seconds
Total Computation Time: 30068.8079 seconds

  Classifier  Mean Accuracy  STD Accuracy  Mean Precision  STD Precision  \
0         LR       0.850938      0.003518        0.739301       0.009968   
1         DT       0.841574      0.003854        0.774217       0.013957   
2         RF       0.834042      0.003913        0.735209       0.011781   
3        KNN       0.814865      0.005423        0.640486       0.014314   
4        XGB       0.867109      0.003837        0.769482       0.009863   
5        MLP    

Parameters: { "use_label_encoder" } are not used.



[[6359  459]
 [ 732 1495]]
              precision    recall  f1-score   support

           0     0.8968    0.9327    0.9144      6818
           1     0.7651    0.6713    0.7151      2227

    accuracy                         0.8683      9045
   macro avg     0.8309    0.8020    0.8148      9045
weighted avg     0.8644    0.8683    0.8653      9045

ROC AUC Score: 0.9253
Total Computation Time: 1.8989 seconds

Classifier: MLP


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[[6171  647]
 [ 786 1441]]
              precision    recall  f1-score   support

           0     0.8870    0.9051    0.8960      6818
           1     0.6901    0.6471    0.6679      2227

    accuracy                         0.8416      9045
   macro avg     0.7886    0.7761    0.7819      9045
weighted avg     0.8385    0.8416    0.8398      9045

ROC AUC Score: 0.8898
Total Computation Time: 137.1079 seconds

Classifier: SVM
[[6336  482]
 [ 954 1273]]
              precision    recall  f1-score   support

           0     0.8691    0.9293    0.8982      6818
           1     0.7254    0.5716    0.6394      2227

    accuracy                         0.8412      9045
   macro avg     0.7972    0.7505    0.7688      9045
weighted avg     0.8337    0.8412    0.8345      9045

ROC AUC Score: 0.8870
Total Computation Time: 79273.3585 seconds



In [3]:
import config_cat_embedding
import numpy as np
import pandas as pd
import random

from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,auc, roc_auc_score

from tqdm import tqdm_notebook as tqdm
from data_prep import bank_data_prep,adult_data_prep
from embedding_helper import create_network
# Set the random seed for reproducibility
import time

random.seed(42)
np.random.seed(42)


#%% load the data and completed the data pre-processing
data_path = config_cat_embedding.paths['data']
data_path_out = config_cat_embedding.paths['data_output']
bank_data = pd.read_csv(data_path+'adult.csv', sep=',')

dat_bank, cat_cols = adult_data_prep(bank_data)

X = dat_bank.iloc[:, :-1]
target = dat_bank.y

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=1500)

#Get categorical columns
X_train['stringcat'] = X_train[cat_cols].apply(lambda x: ' '.join(x), axis=1)

#train word2vec model
dimpool = 30
model = Word2Vec(sentences=X_train['stringcat'].str.split(" "), vector_size=dimpool, window=2, min_count=1, workers=1,seed=42)
model.save("word2vec.model.bank")


#model.wv['entrepreneur']
elements = []
for row in tqdm(X_train.iterrows()):
    categorical_embeddings = []
    for i in cat_cols:
        #print(i,row[1][i])
        try:
            
            categorical_embeddings.append(model.wv[row[1][i]])
        except:
            categorical_embeddings.append(np.zeros((dimpool)))
    elements.append(np.array(categorical_embeddings))
#elements 
reshaped_x = (np.reshape(elements,(len(elements),len(cat_cols)*dimpool)))   
# Get the numerical columns
numerical_cols = np.where(X_train.dtypes!="object")[0]
my_data = pd.concat([X_train.iloc[:, numerical_cols].reset_index(drop=True), pd.DataFrame(reshaped_x)], axis=1)
# due to the new index of my_data, we have to change the index of y_train
y_train = y_train.reset_index(drop=True)



#%% apply it on the X_test dataset 
elements_test = []
for row in tqdm(X_test.iterrows()):
    categorical_embeddings = []
    for i in cat_cols:
        #print(i,row[1][i])
        try:
            categorical_embeddings.append(model.wv[row[1][i]])
        except:
            categorical_embeddings.append(np.zeros((dimpool)))
    elements_test .append(np.array(categorical_embeddings))

#elements 
reshaped_x_test = (np.reshape(elements_test ,(len(elements_test ),len(cat_cols)*dimpool)))   
# Get the numerical columns
my_test_data = pd.concat([X_test.iloc[:, numerical_cols].reset_index(drop=True), pd.DataFrame(reshaped_x_test)], axis=1)
# due to the new index of my_data, we have to change the index of y_train
y_test = y_test.reset_index(drop=True)


X_train2, X_test2, y_train2, y_test2 = train_test_split(my_data,y_train,test_size=0.2, random_state=1500)
stc = StandardScaler()
X_scaled = stc.fit_transform(X_train2.values)


seed=42
models = [
    ('LR', LogisticRegression(solver='lbfgs', random_state=seed, max_iter=1000)),
    ('DT', DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=seed)),
    ('RF', RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed, min_samples_leaf=3)),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('XGB', XGBClassifier(eval_metric='logloss')),
    ('SVM', SVC(gamma='scale', random_state=seed, probability=True)),
    ('MLP', KerasClassifier(build_fn=create_network, number_of_features=my_data.shape[1], epochs=100, batch_size=100, verbose=0))
]

for name, classifier in models:
    start_time = time.time()
    classifier.fit(X_scaled, y_train2)
    end_time = time.time()
    elapsed_time = end_time - start_time

    print(f"Classifier: {name}")
    print(f"Computation Time: {elapsed_time} seconds")      
#     y_pred = classifier.predict(stc.transform(X_test2.values))
#     y_pred_prob = classifier.predict_proba(stc.transform(X_test2.values))

#     print(confusion_matrix(y_test2,y_pred))
#     print(classification_report(y_test2,y_pred, digits=4))
#     print(roc_auc_score(y_test2,y_pred_prob[:,1]))
    
    y_pred_test = classifier.predict(stc.transform(my_test_data.values))
    y_pred_prob_test = classifier.predict_proba(stc.transform(my_test_data.values))

    print(confusion_matrix(y_test,y_pred_test))
    print(classification_report(y_test,y_pred_test, digits=4))

    print(roc_auc_score(y_test,y_pred_prob_test[:,1]))
    print()
    



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for row in tqdm(X_train.iterrows()):


0it [00:00, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for row in tqdm(X_test.iterrows()):


0it [00:00, ?it/s]

  ('MLP', KerasClassifier(build_fn=create_network, number_of_features=my_data.shape[1], epochs=100, batch_size=100, verbose=0))


Classifier: LR
Computation Time: 3.842664957046509 seconds
[[6278  540]
 [ 896 1331]]
              precision    recall  f1-score   support

           0     0.8751    0.9208    0.8974      6818
           1     0.7114    0.5977    0.6496      2227

    accuracy                         0.8412      9045
   macro avg     0.7932    0.7592    0.7735      9045
weighted avg     0.8348    0.8412    0.8364      9045

0.8996557225959495

Classifier: DT
Computation Time: 0.20313644409179688 seconds
[[6453  365]
 [1089 1138]]
              precision    recall  f1-score   support

           0     0.8556    0.9465    0.8987      6818
           1     0.7572    0.5110    0.6102      2227

    accuracy                         0.8392      9045
   macro avg     0.8064    0.7287    0.7545      9045
weighted avg     0.8314    0.8392    0.8277      9045

0.8572403631107758

Classifier: RF
Computation Time: 4.99088191986084 seconds
[[6391  427]
 [1076 1151]]
              precision    recall  f1-score   s