In [1]:
import numpy as np
import pandas as pd
import pickle
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import matplotlib.pyplot as plt

RAW = "../data/raw/"
PROCESSED = "../data/processed/"
RESULTS = "../results/"

os.makedirs(RAW, exist_ok=True)
os.makedirs(PROCESSED, exist_ok=True)
os.makedirs(RESULTS, exist_ok=True)

loading pre-processed datasets

In [2]:
def load_dataset(path):
    with open(path, "rb") as f:
        return pickle.load(f)  # returns (X, y)

datasets = {
    "UCI Digits": load_dataset(PROCESSED + "uci_scaled.pkl"),
    "MNIST": load_dataset(PROCESSED + "mnist_scaled.pkl"),
    "Pen Digits": load_dataset(PROCESSED + "pen_scaled.pkl"),
    "Semeion": load_dataset(PROCESSED + "semeion_scaled.pkl"),
}

for name, (X, y) in datasets.items():
    print(f"{name}: X={X.shape}, y={y.shape}")

UCI Digits: X=(1797, 64), y=(1797,)
MNIST: X=(70000, 784), y=(70000,)
Pen Digits: X=(10992, 16), y=(10992, 1)
Semeion: X=(1593, 256), y=(1593,)


converting to binary classification problem: "7" would be positive, and all other digits are negative.

In [9]:
def make_binary(y):
    return (y == 7).astype(int)

classifiers

In [28]:
# tuning for decision tree
X_tune, y_tune = datasets["Semeion"]
y_tune = np.ravel(y_tune)
y_tune = make_binary(y_tune)
X_train_tune, X_tmp, y_train_tune, y_tmp = train_test_split(
    X_tune, y_tune, test_size=0.5, random_state=42, stratify=y_tune
)
dt_tune = DecisionTreeClassifier()
dt_tuning = GridSearchCV(
    estimator = dt_tune,
    param_grid = {"max_depth": list(range(1, 11))},
    cv = 5,
    scoring = "accuracy"
)
dt_tuning.fit(X_train_tune, y_train_tune)
best_depth = dt_tuning.best_params_
print(best_depth)

{'max_depth': 5}


In [29]:
splits = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]
models = {
    "LogisticRegression": (
        LogisticRegression(max_iter=2000),
        {
            "C": [0.001, 0.01, 0.1, 1, 10],
            "penalty": ["l2"]
        }
    ),
    "DecisionTree": (
        DecisionTreeClassifier(),
        {
            "max_depth": [5]
        }
    ),
    "SVM": (
        SVC(),
        {
            "C": [0.001, 0.01, 0.1, 1, 10],
            "kernel": ["linear"]
        }
    ),
}

Process for each dataset, each classifier, each split

In [50]:
def run_experiment(X, y, model, split, seed):
    y = make_binary(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split[1], random_state=seed)
    model_instance, param_grid = models[model]
    if "random_state" in model_instance.get_params():
        model_instance.set_params(random_state=seed)
    tuning = GridSearchCV(
        estimator= model_instance,
        param_grid= param_grid,
        cv=3,
        scoring='accuracy'
        )
    tuning.fit(X_train, y_train)
    best_model = tuning.best_estimator_
    best_model.fit(X_train, y_train)
    train_acc = best_model.score(X_train, y_train)
    val_acc = tuning.cv_results_['mean_test_score'][tuning.best_index_]
    y_pred = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    return {
        "best_params": tuning.best_params_,
        "train_accuracy": train_acc,
        "validation_accuracy": val_acc,
        "test_accuracy": test_acc,
        "cv_mean_scores": tuning.cv_results_['mean_test_score'],
        "cv_std_scores": tuning.cv_results_['std_test_score'],
        "cv_params": tuning.cv_results_['params'],
        "y_test": y_test,
        "y_pred": y_pred
    }

train models

In [51]:
# going dataset to dataset
results_list = []

#semeion
dataset = "Semeion"
X, y = datasets[dataset]
y = y.ravel()
for model in models.keys():
    for split in splits:
        for trial in range(3):
            seed = 42 + trial
            result = run_experiment(X, y, model, split, seed)
            result.update({
                "dataset": dataset,
                "model": model,
                "train_ratio": split[0],
                "test_ratio": split[1],
                "seed": seed
            })
            results_list.append(result)


In [52]:
#uci
dataset = "UCI Digits"
X, y = datasets[dataset]
y = y.ravel()
for model in models.keys():
    for split in splits:
        for trial in range(3):
            seed = 42 + trial
            result = run_experiment(X, y, model, split, seed)
            result.update({
                "dataset": dataset,
                "model": model,
                "train_ratio": split[0],
                "test_ratio": split[1],
                "seed": seed
            })
            results_list.append(result)

In [53]:
#Pen
dataset = "Pen Digits"
X, y = datasets[dataset]
y = y.ravel()
for model in models.keys():
    for split in splits:
        for trial in range(3):
            seed = 42 + trial
            result = run_experiment(X, y, model, split, seed)
            result.update({
                "dataset": dataset,
                "model": model,
                "train_ratio": split[0],
                "test_ratio": split[1],
                "seed": seed
            })
            results_list.append(result)

In [57]:
#MNIST
dataset = "MNIST"
X, y = datasets[dataset]
y = y.ravel()
for model in models.keys():
    for split in splits:
        for trial in range(3):
            seed = 42 + trial
            result = run_experiment(X, y, model, split, seed)
            result.update({
                "dataset": dataset,
                "model": model,
                "train_ratio": split[0],
                "test_ratio": split[1],
                "seed": seed
            })
            results_list.append(result)

In [58]:
results_df = pd.DataFrame(results_list)
print(results_df)

                         best_params  train_accuracy  validation_accuracy  \
0         {'C': 10, 'penalty': 'l2'}        1.000000             0.965409   
1         {'C': 10, 'penalty': 'l2'}        1.000000             0.965409   
2         {'C': 10, 'penalty': 'l2'}        1.000000             0.968553   
3        {'C': 0.1, 'penalty': 'l2'}        1.000000             0.977392   
4          {'C': 1, 'penalty': 'l2'}        1.000000             0.977377   
..                               ...             ...                  ...   
103  {'C': 0.01, 'kernel': 'linear'}        0.988057             0.984314   
104  {'C': 0.01, 'kernel': 'linear'}        0.988200             0.983971   
105  {'C': 0.01, 'kernel': 'linear'}        0.987339             0.984464   
106  {'C': 0.01, 'kernel': 'linear'}        0.987321             0.984179   
107  {'C': 0.01, 'kernel': 'linear'}        0.987536             0.984286   

     test_accuracy                                     cv_mean_scores  \
0 

In [11]:
# Old version
'''
results_list = []
for dataset, (X, y) in datasets.items():
    for model in models.keys():
        for split in splits:
            result = run_experiment(X, y, model, split)
            result.update({
                "dataset": dataset,
                "model": model,
                "train_ratio": split[0],
                "test_ratio": split[1]
            })
            results_list.append(result)

results_df = pd.DataFrame(results_list)
print(results_df)
'''

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

                        best_params  accuracy  \
0        {'C': 10, 'penalty': 'l2'}  0.990264   
1        {'C': 10, 'penalty': 'l2'}  0.991101   
2       {'C': 0.1, 'penalty': 'l2'}  0.994444   
3                  {'max_depth': 5}  0.953408   
4                  {'max_depth': 5}  0.964405   
5                  {'max_depth': 5}  0.975000   
6    {'C': 0.1, 'kernel': 'linear'}  0.990264   
7      {'C': 1, 'kernel': 'linear'}  0.991101   
8   {'C': 0.01, 'kernel': 'linear'}  0.994444   
9      {'C': 0.01, 'penalty': 'l2'}  0.981839   
10     {'C': 0.01, 'penalty': 'l2'}  0.982914   
11     {'C': 0.01, 'penalty': 'l2'}  0.984214   
12                 {'max_depth': 5}  0.962411   
13                 {'max_depth': 5}  0.962229   
14                 {'max_depth': 5}  0.962429   
15  {'C': 0.01, 'kernel': 'linear'}  0.983196   
16  {'C': 0.01, 'kernel': 'linear'}  0.983143   
17  {'C': 0.01, 'kernel': 'linear'}  0.985143   
18        {'C': 1, 'penalty': 'l2'}  0.979418   
19       {'C': 10, '

clean results and save

In [59]:
df = results_df.copy()
df["C"] = df["best_params"].apply(lambda d: d.get("C") if isinstance(d, dict) else np.nan)
df["split"] = df.apply(lambda r: f"{r['train_ratio']}/{r['test_ratio']}", axis=1)
clean_df = df[["dataset", "model", "seed", "split", "train_accuracy", "validation_accuracy", "test_accuracy", "C", "cv_mean_scores", "cv_std_scores"]]
print(clean_df)

     dataset               model  seed    split  train_accuracy  \
0    Semeion  LogisticRegression    42  0.2/0.8        1.000000   
1    Semeion  LogisticRegression    43  0.2/0.8        1.000000   
2    Semeion  LogisticRegression    44  0.2/0.8        1.000000   
3    Semeion  LogisticRegression    42  0.5/0.5        1.000000   
4    Semeion  LogisticRegression    43  0.5/0.5        1.000000   
..       ...                 ...   ...      ...             ...   
103    MNIST                 SVM    43  0.5/0.5        0.988057   
104    MNIST                 SVM    44  0.5/0.5        0.988200   
105    MNIST                 SVM    42  0.8/0.2        0.987339   
106    MNIST                 SVM    43  0.8/0.2        0.987321   
107    MNIST                 SVM    44  0.8/0.2        0.987536   

     validation_accuracy  test_accuracy      C  \
0               0.965409       0.975686  10.00   
1               0.965409       0.959216  10.00   
2               0.968553       0.969412  10.0

In [60]:
clean_df.to_csv("../results/clean_df.csv", index=False)
results_df.to_csv("../results/results_df.csv", index=False)

Addition: trying to recover y_pred and y_test...

In [61]:
print(results_list)

[{'best_params': {'C': 10, 'penalty': 'l2'}, 'train_accuracy': 1.0, 'validation_accuracy': np.float64(0.9654088050314465), 'test_accuracy': 0.9756862745098039, 'cv_mean_scores': array([0.89937107, 0.93081761, 0.95597484, 0.96226415, 0.96540881]), 'cv_std_scores': array([0.00444721, 0.00444721, 0.02223606, 0.01334164, 0.01603465]), 'cv_params': [{'C': 0.001, 'penalty': 'l2'}, {'C': 0.01, 'penalty': 'l2'}, {'C': 0.1, 'penalty': 'l2'}, {'C': 1, 'penalty': 'l2'}, {'C': 10, 'penalty': 'l2'}], 'y_test': array([0, 0, 0, ..., 0, 1, 0]), 'y_pred': array([0, 0, 0, ..., 0, 1, 0]), 'dataset': 'Semeion', 'model': 'LogisticRegression', 'train_ratio': 0.2, 'test_ratio': 0.8, 'seed': 42}, {'best_params': {'C': 10, 'penalty': 'l2'}, 'train_accuracy': 1.0, 'validation_accuracy': np.float64(0.9654088050314465), 'test_accuracy': 0.9592156862745098, 'cv_mean_scores': array([0.90880503, 0.94339623, 0.9591195 , 0.96226415, 0.96540881]), 'cv_std_scores': array([0.00444721, 0.0077028 , 0.00444721, 0.        , 

In [63]:
len(results_list[0]['y_test'])

1275

In [65]:
df = pd.DataFrame(results_list)
df['y_test'] = df['y_test'].apply(lambda x: x.tolist())
df['y_pred'] = df['y_pred'].apply(lambda x: x.tolist())
df.to_csv("../results/results_list.csv", index=False)