In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import MinMaxScaler
import requests
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

2023-04-23 16:49:53.037423: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def run_classifiers(X, y, output=None, n_splits=10, random_state=42, epochs=50, batch_size=16, learning_rate=0.001):
    if output == None:
        output = len(np.unique(y))
        
    # Split the dataset into training and testing sets
    X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # KFold
    #kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    mse_list_knn = []
    mse_list_neural_network = []
    mse_list_extra_trees = []
    accuracy_list_knn = []
    accuracy_list_neural_network = []
    accuracy_list_extra_trees = []

    # Cross-validation on the training set
    for train_index, val_index in kf.split(X_train_all, y_train_all):
        X_train, X_val = X_train_all[train_index], X_train_all[val_index]
        y_train, y_val = y_train_all[train_index], y_train_all[val_index]

        # KNN
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)
        y_pred_knn = knn.predict(X_val)
        mse_knn = mean_squared_error(y_val, y_pred_knn)
        mse_list_knn.append(mse_knn)
        accuracy_knn = accuracy_score(y_val, y_pred_knn)
        accuracy_list_knn.append(accuracy_knn)

        # Neural Networks
        model = Sequential()
        model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(16, activation='relu'))
        #model.add(Dense(len(np.unique(y)), activation='softmax'))
        #model.add(Dense(len(np.unique(y_train)), activation='softmax'))
        model.add(Dense(output, activation='softmax'))

        optimizer = Adam(learning_rate=learning_rate)
        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        y_train_categorical = to_categorical(y_train)
        y_val_categorical = to_categorical(y_val)
        model.fit(X_train, y_train_categorical, epochs=epochs, batch_size=batch_size, verbose=0)

        y_pred_val_neural_network = np.argmax(model.predict(X_val), axis=1)
        mse_neural_network = mean_squared_error(y_val, y_pred_val_neural_network)
        mse_list_neural_network.append(mse_neural_network)
        accuracy_neural_network = accuracy_score(y_val, y_pred_val_neural_network)
        accuracy_list_neural_network.append(accuracy_neural_network)

        # ExtraTreesClassifier
        extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=random_state)
        extra_trees.fit(X_train, y_train)
        y_pred_extra_trees = extra_trees.predict(X_val)
        mse_extra_trees = mean_squared_error(y_val, y_pred_extra_trees)
        mse_list_extra_trees.append(mse_extra_trees)
        accuracy_extra_trees = accuracy_score(y_val, y_pred_extra_trees)
        accuracy_list_extra_trees.append(accuracy_extra_trees)


    y_pred_knn_test = knn.predict(X_test_all)
    test_error_knn = 1 - accuracy_score(y_test_all, y_pred_knn_test)
    test_mse_knn = mean_squared_error(y_test_all, y_pred_knn_test)


    y_pred_test_neural_network = np.argmax(model.predict(X_test_all), axis=1)
    test_error_neural_network = 1 - accuracy_score(y_test_all, y_pred_test_neural_network)
    test_mse_neural_network = mean_squared_error(y_test_all, y_pred_test_neural_network)


    y_pred_extra_trees_test = extra_trees.predict(X_test_all)
    test_error_extra_trees = 1 - accuracy_score(y_test_all, y_pred_extra_trees_test)
    test_mse_extra_trees = mean_squared_error(y_test_all, y_pred_extra_trees_test)


    return {
        'knn': {
            'training_mse': np.mean(mse_list_knn),
            'training_avg_error': 1 - np.mean(accuracy_list_knn),
            'test_error': test_error_knn,
            'test_mse': test_mse_knn
        },
        'neural_network': {
            'training_mse': np.mean(mse_list_neural_network),
            'training_avg_error': 1 - np.mean(accuracy_list_neural_network),
            'test_error': test_error_neural_network,
            'test_mse': test_mse_neural_network
        },
        'extra_trees': {
            'training_mse': np.mean(mse_list_extra_trees),
            'training_avg_error': 1 - np.mean(accuracy_list_extra_trees),
            'test_error': test_error_extra_trees,
            'test_mse': test_mse_extra_trees
        }
    }

In [3]:
def load_wine_quality():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    data = requests.get(url).text
    df = pd.read_csv(StringIO(data), sep=';')
    X = df.drop('quality', axis=1).values
    y = df['quality'].values
    return X, y


def load_energy_efficiency():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
    df = pd.read_excel(url)
    X = df.drop(['Y1', 'Y2'], axis=1).values
    # Y1 Heating Load
    y = df['Y1'].values
    return X, y


def load_spam():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
    data = requests.get(url).text
    df = pd.read_csv(StringIO(data), header=None)
    X = df.drop(57, axis=1).values
    y = df[57].values
    return X, y


def load_heart_disease():
    # Cleveland Dataset
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
    df = pd.read_csv(url, header=None, na_values='?')
    df = df.dropna()
    X = df.drop(13, axis=1).values
    y = df[13].values
    return X, y

def load_handwritten_digits():
    digits = load_digits()
    df = pd.DataFrame(digits.data, columns=digits.feature_names)
    df['target'] = digits.target

    X = df.drop('target', axis=1).values
    y = df['target'].values
    return X, y


In [4]:
# Load datasets
X_wine, y_wine = load_wine_quality()
X_energy, y_energy = load_energy_efficiency()
X_spam, y_spam = load_spam()
X_heart, y_heart = load_heart_disease()
X_digits, y_digits = load_handwritten_digits()

# Scale datasets
scaler = MinMaxScaler()
X_wine_scaled = scaler.fit_transform(X_wine)
X_energy_scaled = scaler.fit_transform(X_energy)
X_spam_scaled = scaler.fit_transform(X_spam)
X_heart_scaled = scaler.fit_transform(X_heart)
X_digits_scaled = scaler.fit_transform(X_digits)

# Run classifiers for each dataset
results_wine = run_classifiers(X_wine_scaled, y_wine, output = 9)
#results_energy = run_classifiers(X_energy_scaled, y_energy)
results_spam = run_classifiers(X_spam_scaled, y_spam)
results_heart = run_classifiers(X_heart_scaled, y_heart)
results_digits = run_classifiers(X_digits_scaled, y_digits)



2023-04-23 16:50:04.160675: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [10]:
datasets = {
    'Wine Quality': results_wine,
    'Spam': results_spam,
    'Heart Disease': results_heart,
    'Handwritten Digits': results_digits
}

models = ['knn', 'neural_network', 'extra_trees']

for dataset_name, dataset_results in datasets.items():
    print(dataset_name)
    for model_name in models:
        model_results = dataset_results[model_name]
        print(f"{model_name.capitalize()}: Training MSE: {model_results['training_mse']:.4f}, Training Average Error: {model_results['training_avg_error']:.4f}, Test MSE: {model_results['test_mse']:.4f}, Test Error: {model_results['test_error']:.4f}")
    print()

Wine Quality
Knn: Training MSE: 0.6904, Training Average Error: 0.4355, Test MSE: 0.6844, Test Error: 0.4719
Neural_network: Training MSE: 0.4972, Training Average Error: 0.3855, Test MSE: 0.4750, Test Error: 0.4187
Extra_trees: Training MSE: 0.4175, Training Average Error: 0.3112, Test MSE: 0.4031, Test Error: 0.3313

Spam
Knn: Training MSE: 0.1022, Training Average Error: 0.1022, Test MSE: 0.1216, Test Error: 0.1216
Neural_network: Training MSE: 0.0584, Training Average Error: 0.0584, Test MSE: 0.0445, Test Error: 0.0445
Extra_trees: Training MSE: 0.0467, Training Average Error: 0.0467, Test MSE: 0.0434, Test Error: 0.0434

Heart Disease
Knn: Training MSE: 1.3335, Training Average Error: 0.4507, Test MSE: 1.3000, Test Error: 0.3167
Neural_network: Training MSE: 0.9583, Training Average Error: 0.4513, Test MSE: 1.0500, Test Error: 0.3333
Extra_trees: Training MSE: 1.0942, Training Average Error: 0.4346, Test MSE: 1.2500, Test Error: 0.3500

Handwritten Digits
Knn: Training MSE: 0.3371