# Battle of Classifiers: Comparative Study of Different Classifier Models

<p style="text-align:center;">Junghwan Kim</p>
<p style="text-align:center;">12/21/2024</p>
<p style="text-align:center;">junghwk11@gmail.com</p>


## Library Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from sklearn.decomposition import PCA
import os
import numpy as np
import kagglehub

## Data Imports

In [2]:
mushroom = kagglehub.dataset_download("uciml/mushroom-classification")
wine_quality = kagglehub.dataset_download("uciml/red-wine-quality-cortez-et-al-2009")
income = kagglehub.dataset_download("uciml/adult-census-income")




## SVM Evaluation/Cross Validation Functions

In [17]:
def SVM_evaluate_with_train_test_split(X, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    model = SVC(kernel='linear', C=1.0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"Train-Test Split: {1 - test_size:.0%}/{test_size:.0%}")
    print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")

def SVM_evaluate_with_kfold(X_train, y_train, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    train_accuracies = []
    val_accuracies = []
    if not isinstance(y_train, np.ndarray):
        y_train = np.array(y_train)

    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        model = SVC(kernel='linear', C=1.0)
        model.fit(X_train_fold, y_train_fold)

        train_accuracy = model.score(X_train_fold, y_train_fold)
        val_accuracy = model.score(X_val_fold, y_val_fold)

        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)

    mean_train_accuracy = sum(train_accuracies) / len(train_accuracies)
    mean_val_accuracy = sum(val_accuracies) / len(val_accuracies)

    print(f"K-Fold Cross-Validation Training Accuracy: {mean_train_accuracy * 100:.2f}%")
    print(f"K-Fold Cross-Validation Validation Accuracy: {mean_val_accuracy * 100:.2f}%")

## Boosted Trees Evaluation/Cross Validation Functions

In [4]:
def BT_evaluate_with_train_test_split(X, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    model = XGBClassifier(eval_metric='logloss')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"Train-Test Split: {1 - test_size:.0%}/{test_size:.0%}")
    print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")

def BT_evaluate_with_kfold(X_train, y_train, n_splits=5):
    if isinstance(y_train, pd.Series):
        y_train = y_train.to_numpy()
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    train_accuracies = []
    val_accuracies = []

    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        model = XGBClassifier(eval_metric='logloss')
        model.fit(X_train_fold, y_train_fold)
        train_accuracy = model.score(X_train_fold, y_train_fold)
        val_accuracy = model.score(X_val_fold, y_val_fold)

        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)
    mean_train_accuracy = np.mean(train_accuracies)
    mean_val_accuracy = np.mean(val_accuracies)

    print(f"K-Fold Cross-Validation Training Accuracy: {mean_train_accuracy * 100:.2f}%")
    print(f"K-Fold Cross-Validation Validation Accuracy: {mean_val_accuracy * 100:.2f}%")

## Neural Nets Evaluation/Cross Validation Functions

In [5]:
def build_neural_network(input_dim, output_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(output_dim, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model
def NN_evaluate_with_train_test_split(X, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = build_neural_network(input_dim=X_train_scaled.shape[1], output_dim=y.shape[1])
    model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=0)
    _, test_accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)

    print(f"Train-Test Split: {1 - test_size:.0%}/{test_size:.0%}")
    print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")

def NN_evaluate_with_kfold(X_train, y_train, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    train_accuracies = []
    val_accuracies = []

    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        scaler = StandardScaler()
        X_train_fold_scaled = scaler.fit_transform(X_train_fold)
        X_val_fold_scaled = scaler.transform(X_val_fold)

        model = build_neural_network(input_dim=X_train_fold_scaled.shape[1], output_dim=y_train.shape[1])
        model.fit(X_train_fold_scaled, y_train_fold, epochs=10, batch_size=32, verbose=0)

        train_accuracy = model.evaluate(X_train_fold_scaled, y_train_fold, verbose=0)[1]
        val_accuracy = model.evaluate(X_val_fold_scaled, y_val_fold, verbose=0)[1]

        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)

    mean_train_accuracy = np.mean(train_accuracies)
    mean_val_accuracy = np.mean(val_accuracies)

    print(f"K-Fold Cross-Validation Mean Training Accuracy: {mean_train_accuracy * 100:.2f}%")
    print(f"K-Fold Cross-Validation Mean Validation Accuracy: {mean_val_accuracy * 100:.2f}%")

## Data #1: Mushroom Dataset

### Data Loading and Preprocessing for SVM and Boosted Trees

In [21]:
def mushroom_load_and_preprocess_data(directory_path):
    files = os.listdir(directory_path)
    print("Files in directory:", files)
    csv_file = [f for f in files if f.endswith('.csv')][0]
    file_path = os.path.join(directory_path, csv_file)
    data = pd.read_csv(file_path)
    if 'stalk-root' in data.columns:
        data['stalk-root'] = data['stalk-root'].replace('?', 'u')
    if 'veil-type' in data.columns:
        data = data.drop('veil-type', axis=1)
    X = data.drop('class', axis=1)
    y = data['class']
    encoder = OneHotEncoder()
    X_encoded = encoder.fit_transform(X).toarray()
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    return X_encoded, y_encoded

### SVM Execution

In [22]:
X_encoded, y_encoded = mushroom_load_and_preprocess_data(mushroom)
SVM_evaluate_with_train_test_split(X_encoded, y_encoded, test_size=0.2)
SVM_evaluate_with_train_test_split(X_encoded, y_encoded, test_size=0.5)
SVM_evaluate_with_train_test_split(X_encoded, y_encoded, test_size=0.8)
X_train_80_20, _, y_train_80_20, _ = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=24)
SVM_evaluate_with_kfold(X_train_80_20, y_train_80_20)
X_train_50_50, _, y_train_50_50, _ = train_test_split(X_encoded, y_encoded, test_size=0.5, random_state=64)
SVM_evaluate_with_kfold(X_train_50_50, y_train_50_50)
X_train_20_80, _, y_train_20_80, _ = train_test_split(X_encoded, y_encoded, test_size=0.8, random_state=4)
SVM_evaluate_with_kfold(X_train_20_80, y_train_20_80)


Files in directory: ['mushrooms.csv']
Train-Test Split: 80%/20%
Testing Accuracy: 100.00%
Train-Test Split: 50%/50%
Testing Accuracy: 100.00%
Train-Test Split: 20%/80%
Testing Accuracy: 99.88%
K-Fold Cross-Validation Training Accuracy: 100.00%
K-Fold Cross-Validation Validation Accuracy: 100.00%
K-Fold Cross-Validation Training Accuracy: 100.00%
K-Fold Cross-Validation Validation Accuracy: 100.00%
K-Fold Cross-Validation Training Accuracy: 100.00%
K-Fold Cross-Validation Validation Accuracy: 99.94%


### Boosted Trees Execution

In [50]:
X_encoded, y_encoded = mushroom_load_and_preprocess_data(mushroom)
BT_evaluate_with_train_test_split(X_encoded, y_encoded, test_size=0.2)
BT_evaluate_with_train_test_split(X_encoded, y_encoded, test_size=0.5)
BT_evaluate_with_train_test_split(X_encoded, y_encoded, test_size=0.8)
X_train_80_20, _, y_train_80_20, _ = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=1)
BT_evaluate_with_kfold(X_train_80_20, y_train_80_20)
X_train_50_50, _, y_train_50_50, _ = train_test_split(X_encoded, y_encoded, test_size=0.5, random_state=2)
BT_evaluate_with_kfold(X_train_50_50, y_train_50_50)
X_train_20_80, _, y_train_20_80, _ = train_test_split(X_encoded, y_encoded, test_size=0.8, random_state=4)
BT_evaluate_with_kfold(X_train_20_80, y_train_20_80)


Files in directory: ['mushrooms.csv']
Train-Test Split: 80%/20%
Testing Accuracy: 100.00%
Train-Test Split: 50%/50%
Testing Accuracy: 100.00%
Train-Test Split: 20%/80%
Testing Accuracy: 100.00%
K-Fold Cross-Validation Training Accuracy: 100.00%
K-Fold Cross-Validation Validation Accuracy: 100.00%
K-Fold Cross-Validation Training Accuracy: 100.00%
K-Fold Cross-Validation Validation Accuracy: 100.00%
K-Fold Cross-Validation Training Accuracy: 100.00%
K-Fold Cross-Validation Validation Accuracy: 99.94%


### Neural Nets Excecution

In [51]:
def mushroom_nn_load_and_preprocess_data(directory_path):
    files = os.listdir(directory_path)
    print("Files in directory:", files)
    csv_file = [f for f in files if f.endswith('.csv')][0]
    file_path = os.path.join(directory_path, csv_file)
    data = pd.read_csv(file_path)
    if 'stalk-root' in data.columns:
        data['stalk-root'] = data['stalk-root'].replace('?', 'u')
    if 'veil-type' in data.columns:
        data = data.drop('veil-type', axis=1)
    X = data.drop('class', axis=1)
    y = data['class']
    encoder = OneHotEncoder()
    X_encoded = encoder.fit_transform(X).toarray()
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    y_one_hot = to_categorical(y_encoded)
    return X_encoded, y_one_hot

X_final, y_one_hot = mushroom_nn_load_and_preprocess_data(mushroom)
NN_evaluate_with_train_test_split(X_encoded, y_one_hot, test_size=0.2)
NN_evaluate_with_train_test_split(X_encoded, y_one_hot, test_size=0.5)
NN_evaluate_with_train_test_split(X_encoded, y_one_hot, test_size=0.8)

X_train_80_20, _, y_train_80_20, _ = train_test_split(X_encoded, y_one_hot, test_size=0.2, random_state=43)
NN_evaluate_with_kfold(X_train_80_20, y_train_80_20)
X_train_50_50, _, y_train_50_50, _ = train_test_split(X_encoded, y_one_hot, test_size=0.5, random_state=756)
NN_evaluate_with_kfold(X_train_50_50, y_train_50_50)
X_train_20_80, _, y_train_20_80, _ = train_test_split(X_encoded, y_one_hot, test_size=0.8, random_state=543)
NN_evaluate_with_kfold(X_train_20_80, y_train_20_80)

Files in directory: ['mushrooms.csv']
Train-Test Split: 80%/20%
Testing Accuracy: 100.00%
Train-Test Split: 50%/50%
Testing Accuracy: 100.00%
Train-Test Split: 20%/80%
Testing Accuracy: 99.86%
K-Fold Cross-Validation Mean Training Accuracy: 100.00%
K-Fold Cross-Validation Mean Validation Accuracy: 100.00%
K-Fold Cross-Validation Mean Training Accuracy: 100.00%
K-Fold Cross-Validation Mean Validation Accuracy: 100.00%
K-Fold Cross-Validation Mean Training Accuracy: 100.00%
K-Fold Cross-Validation Mean Validation Accuracy: 99.94%


For Mushroom sata, the result corresponds with the baseline model performance given by the UCI Machine Learning Repository. Though it might seem concerning to have near 100% accuracy, baseline model performance shows that it is the nature of the data rather than overfitting in the model. 

## Data #2: Wine Dataset

### Data Loading and Preprocessing for SVM and Boosted Trees

In [52]:
def wine_load_and_preprocess_data(directory_path, target_column='quality', relabel_target=True, scale_features=True):
    files = os.listdir(directory_path)
    print("Files in directory:", files)
    csv_file = [f for f in files if f.endswith('.csv')][0]
    file_path = os.path.join(directory_path, csv_file)
    data = pd.read_csv(file_path)
    X = data.drop(target_column, axis=1)
    y = data[target_column]
    if relabel_target:
        y = y - y.min()
    if scale_features:
        scaler = StandardScaler()
        X_processed = scaler.fit_transform(X)
    else:
        X_processed = X.values

    return X_processed, y

 ### SVM Execution

In [53]:
X_scaled, y = wine_load_and_preprocess_data(wine_quality)
SVM_evaluate_with_train_test_split(X_scaled, y, test_size=0.2)
SVM_evaluate_with_train_test_split(X_scaled, y, test_size=0.5)
SVM_evaluate_with_train_test_split(X_scaled, y, test_size=0.8)
X_train_80_20, _, y_train_80_20, _ = train_test_split(X_scaled, y, test_size=0.2, random_state=1)
SVM_evaluate_with_kfold(X_train_80_20, y_train_80_20)

X_train_50_50, _, y_train_50_50, _ = train_test_split(X_scaled, y, test_size=0.5, random_state=89)
SVM_evaluate_with_kfold(X_train_50_50, y_train_50_50)

X_train_20_80, _, y_train_20_80, _ = train_test_split(X_scaled, y, test_size=0.8, random_state=58)
SVM_evaluate_with_kfold(X_train_20_80, y_train_20_80)


Files in directory: ['winequality-red.csv']
Train-Test Split: 80%/20%
Testing Accuracy: 55.94%
Train-Test Split: 50%/50%
Testing Accuracy: 56.12%
Train-Test Split: 20%/80%
Testing Accuracy: 55.16%
K-Fold Cross-Validation Training Accuracy: 59.89%
K-Fold Cross-Validation Validation Accuracy: 58.48%
K-Fold Cross-Validation Training Accuracy: 60.73%
K-Fold Cross-Validation Validation Accuracy: 58.70%
K-Fold Cross-Validation Training Accuracy: 62.14%
K-Fold Cross-Validation Validation Accuracy: 51.71%


### Boosted Trees Execution

In [54]:
X_scaled, y = wine_load_and_preprocess_data(wine_quality)
BT_evaluate_with_train_test_split(X_scaled, y, test_size=0.2)
BT_evaluate_with_train_test_split(X_scaled, y, test_size=0.5)
BT_evaluate_with_train_test_split(X_scaled, y, test_size=0.8)

X_train_80_20, _, y_train_80_20, _ = train_test_split(X_scaled, y, test_size=0.2, random_state=54)
BT_evaluate_with_kfold(X_train_80_20, y_train_80_20)
X_train_50_50, _, y_train_50_50, _ = train_test_split(X_scaled, y, test_size=0.5, random_state=55)
BT_evaluate_with_kfold(X_train_50_50, y_train_50_50)
X_train_20_80, _, y_train_20_80, _ = train_test_split(X_scaled, y, test_size=0.8, random_state=56)
BT_evaluate_with_kfold(X_train_20_80, y_train_20_80)


Files in directory: ['winequality-red.csv']
Train-Test Split: 80%/20%
Testing Accuracy: 69.69%
Train-Test Split: 50%/50%
Testing Accuracy: 62.88%
Train-Test Split: 20%/80%
Testing Accuracy: 57.11%
K-Fold Cross-Validation Training Accuracy: 100.00%
K-Fold Cross-Validation Validation Accuracy: 64.35%
K-Fold Cross-Validation Training Accuracy: 100.00%
K-Fold Cross-Validation Validation Accuracy: 61.46%
K-Fold Cross-Validation Training Accuracy: 100.00%
K-Fold Cross-Validation Validation Accuracy: 58.29%


### Neural Nets Execution

In [55]:
def wine_nn_load_and_preprocess_data(directory_path):
    files = os.listdir(directory_path)
    print("Files in directory:", files)
    csv_file = [f for f in files if f.endswith('.csv')][0]
    file_path = os.path.join(directory_path, csv_file)
    data = pd.read_csv(file_path)
    X = data.drop('quality', axis=1)
    y = data['quality']
    y = y - y.min()
    y_one_hot = to_categorical(y)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y_one_hot

X_final_wine_quality, y_one_hot_wine_quality = wine_nn_load_and_preprocess_data(wine_quality)

NN_evaluate_with_train_test_split(X_final_wine_quality, y_one_hot_wine_quality, test_size=0.2)
NN_evaluate_with_train_test_split(X_final_wine_quality, y_one_hot_wine_quality, test_size=0.5)
NN_evaluate_with_train_test_split(X_final_wine_quality, y_one_hot_wine_quality, test_size=0.8)

X_train_80_20, _, y_train_80_20, _ = train_test_split(X_final_wine_quality, y_one_hot_wine_quality,test_size=0.2, random_state=43)
NN_evaluate_with_kfold(X_train_80_20, y_train_80_20)
X_train_50_50, _, y_train_50_50, _ = train_test_split(X_final_wine_quality, y_one_hot_wine_quality,test_size=0.5, random_state=1)
NN_evaluate_with_kfold(X_train_50_50, y_train_50_50)
X_train_20_80, _, y_train_20_80, _ = train_test_split(X_final_wine_quality,y_one_hot_wine_quality,test_size=0.8 , random_state=4)
NN_evaluate_with_kfold(X_train_20_80,y_train_20_80)


Files in directory: ['winequality-red.csv']
Train-Test Split: 80%/20%
Testing Accuracy: 58.13%
Train-Test Split: 50%/50%
Testing Accuracy: 56.75%
Train-Test Split: 20%/80%
Testing Accuracy: 55.94%
K-Fold Cross-Validation Mean Training Accuracy: 62.61%
K-Fold Cross-Validation Mean Validation Accuracy: 58.09%
K-Fold Cross-Validation Mean Training Accuracy: 64.52%
K-Fold Cross-Validation Mean Validation Accuracy: 60.70%
K-Fold Cross-Validation Mean Training Accuracy: 64.03%
K-Fold Cross-Validation Mean Validation Accuracy: 57.36%


## Data #3: Adult Census Income Dataset

### Data Loading and Preprocessing for SVM and Boosted Trees

In [56]:
def ACI_load_and_preprocess_data(directory_path, n_components=None):
    files = os.listdir(directory_path)
    print("Files in directory:", files)
    csv_file = [f for f in files if f.endswith('.csv')][0]
    file_path = os.path.join(directory_path, csv_file)
    data = pd.read_csv(file_path)
    data.replace(' ?', pd.NA, inplace=True)
    data.dropna(inplace=True)
    X = data.drop('income', axis=1)
    y = data['income']
    X_encoded = pd.get_dummies(X, drop_first=True)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_encoded)

    if n_components is not None:
        pca = PCA(n_components=n_components)
        X_reduced = pca.fit_transform(X_scaled)
    else:
        X_reduced = X_scaled

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    return X_reduced, y_encoded

### SVM Execution

In [57]:
X_final, y_encoded = ACI_load_and_preprocess_data(income)

SVM_evaluate_with_train_test_split(X_final, y_encoded, test_size=0.2)
SVM_evaluate_with_train_test_split(X_final, y_encoded, test_size=0.5)
SVM_evaluate_with_train_test_split(X_final, y_encoded, test_size=0.8)

X_train_80_20, _, y_train_80_20, _ = train_test_split(X_final, y_encoded, test_size=0.2, random_state=23)
SVM_evaluate_with_kfold(X_train_80_20, y_train_80_20)

X_train_50_50, _, y_train_50_50, _ = train_test_split(X_final, y_encoded, test_size=0.5, random_state=60)
SVM_evaluate_with_kfold(X_train_50_50, y_train_50_50)

X_train_20_80, _, y_train_20_80, _ = train_test_split(X_final, y_encoded, test_size=0.8, random_state=71)
SVM_evaluate_with_kfold(X_train_20_80, y_train_20_80)


Files in directory: ['adult.csv']
Train-Test Split: 80%/20%
Testing Accuracy: 85.03%
Train-Test Split: 50%/50%
Testing Accuracy: 84.82%
Train-Test Split: 20%/80%
Testing Accuracy: 84.46%
K-Fold Cross-Validation Training Accuracy: 85.22%
K-Fold Cross-Validation Validation Accuracy: 85.02%
K-Fold Cross-Validation Training Accuracy: 85.57%
K-Fold Cross-Validation Validation Accuracy: 85.25%
K-Fold Cross-Validation Training Accuracy: 85.77%
K-Fold Cross-Validation Validation Accuracy: 85.07%


### Boosted Trees Execution

In [58]:
X_final, y_encoded = ACI_load_and_preprocess_data(income)

BT_evaluate_with_train_test_split(X_final, y_encoded, test_size=0.2)
BT_evaluate_with_train_test_split(X_final, y_encoded, test_size=0.5)
BT_evaluate_with_train_test_split(X_final, y_encoded, test_size=0.8)

X_train_80_20, _, y_train_80_20, _ = train_test_split(X_final, y_encoded, test_size=0.2, random_state=12)
BT_evaluate_with_kfold(X_train_80_20, y_train_80_20)

X_train_50_50, _, y_train_50_50, _ = train_test_split(X_final, y_encoded, test_size=0.5, random_state=24)
BT_evaluate_with_kfold(X_train_50_50, y_train_50_50)

X_train_20_80, _, y_train_20_80, _ = train_test_split(X_final, y_encoded, test_size=0.8, random_state=46)
BT_evaluate_with_kfold(X_train_20_80, y_train_20_80)

Files in directory: ['adult.csv']
Train-Test Split: 80%/20%
Testing Accuracy: 87.21%
Train-Test Split: 50%/50%
Testing Accuracy: 86.51%
Train-Test Split: 20%/80%
Testing Accuracy: 85.87%
K-Fold Cross-Validation Training Accuracy: 90.96%
K-Fold Cross-Validation Validation Accuracy: 86.90%
K-Fold Cross-Validation Training Accuracy: 92.24%
K-Fold Cross-Validation Validation Accuracy: 86.76%
K-Fold Cross-Validation Training Accuracy: 95.28%
K-Fold Cross-Validation Validation Accuracy: 85.50%


### Neural Nets Execution

In [59]:
def ACI_nn_load_and_preprocess_data(directory_path):
    files = os.listdir(directory_path)
    print("Files in directory:", files)
    csv_file = [f for f in files if f.endswith('.csv')][0]
    file_path = os.path.join(directory_path, csv_file)
    data = pd.read_csv(file_path)

    data.replace(' ?', pd.NA, inplace=True)
    data.dropna(inplace=True)

    X = data.drop('income', axis=1)
    y = data['income']

    categorical_features = X.select_dtypes(include=['object']).columns
    encoder = OneHotEncoder(sparse_output=False)
    X_encoded = encoder.fit_transform(X[categorical_features])

    numerical_features = X.select_dtypes(exclude=['object']).values
    X_final = np.hstack((numerical_features, X_encoded))

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    y_one_hot = to_categorical(y_encoded)

    return X_final, y_one_hot

X_final, y_one_hot = ACI_nn_load_and_preprocess_data(income)

NN_evaluate_with_train_test_split(X_final, y_one_hot, test_size=0.2)
NN_evaluate_with_train_test_split(X_final, y_one_hot, test_size=0.5)
NN_evaluate_with_train_test_split(X_final, y_one_hot, test_size=0.8)

X_train_80_20, _, y_train_80_20, _ = train_test_split(X_final, y_one_hot, test_size=0.2, random_state=8)
NN_evaluate_with_kfold(X_train_80_20, y_train_80_20)
X_train_50_50, _, y_train_50_50, _ = train_test_split(X_final, y_one_hot, test_size=0.5, random_state=88)
NN_evaluate_with_kfold(X_train_50_50, y_train_50_50)
X_train_20_80, _, y_train_20_80, _ = train_test_split(X_final, y_one_hot, test_size=0.8, random_state=888)
NN_evaluate_with_kfold(X_train_20_80, y_train_20_80)


Files in directory: ['adult.csv']
Train-Test Split: 80%/20%
Testing Accuracy: 84.68%
Train-Test Split: 50%/50%
Testing Accuracy: 84.33%
Train-Test Split: 20%/80%
Testing Accuracy: 83.89%
K-Fold Cross-Validation Mean Training Accuracy: 87.83%
K-Fold Cross-Validation Mean Validation Accuracy: 84.56%
K-Fold Cross-Validation Mean Training Accuracy: 88.26%
K-Fold Cross-Validation Mean Validation Accuracy: 84.69%
K-Fold Cross-Validation Mean Training Accuracy: 90.29%
K-Fold Cross-Validation Mean Validation Accuracy: 84.23%
