# Introduction to Machine Learning - Practical Assignment 2
## Questions 2 & 3 across 5 datasets (10x10 CV + T-tests)

This notebook runs:
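- Question 2: 8 algorithms (no L1 NN) on 5 datasets using 10x10 CV (10 repetitions of stratified 10-fold cross-validation, reporting accuracy, F-measure and AUC).
- Question 3: Paired t-tests per dataset at significance level 0.05 (WIN/TIE/LOSS matrices for accuracy, F-measure and AUC) and an overall aggregation across datasets.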

Datasets used (from `limited` folder):
- Taiwan Bankruptcy (Binary bankruptcy prediction)
- Breast Cancer (Binary cancer diagnosis: M/B)
- Biodegradation (Binary biodegradability prediction)
- Autism Screening (Binary autism diagnosis)
- Student Data (Binary academic performance prediction)

In [1]:
# =============================================================================
# CELL 1: Imports, setup, and Kaggle/Colab notes
# =============================================================================

import warnings

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

# For Colab (commented):
# from google.colab import drive
# drive.mount('/content/drive')
# base_path = '/content/drive/MyDrive/imlpa2/data'

# For Local:
# base_path = 'limited'

# For Kaggle
base_path = '/kaggle/input/binarylimited'

In [2]:
# List the dataset files under the configured base_path (IPython expands {base_path}),
# so this check follows whichever Kaggle/Colab/local path is active.
!ls {base_path}

Autism-Adult-Data.csv  breastcancer.csv  taiwanesebankruptcyprediction.csv
biodeg.csv	       student_data.csv


In [3]:
# =============================================================================
# CELL 2: Load and preprocess all datasets
# =============================================================================

def load_all_datasets(base_path: str):
    """Read the five CSV datasets under ``base_path`` and split each into (X, y).

    Parameters
    ----------
    base_path : str
        Directory that contains the five CSV files.

    Returns
    -------
    dict
        Maps a dataset key to a ``(features, target)`` tuple. Taiwan Bankruptcy
        takes its target from the first column; Breast Cancer takes it from the
        ``Diagnosis`` column (``'M'`` -> 1, anything else -> 0) and drops ``ID``;
        the remaining three datasets take the last column as the target.
    """
    def read_table(filename):
        # All files are addressed relative to base_path with a forward slash.
        return pd.read_csv(f"{base_path}/{filename}")

    datasets = {}

    print("Loading Taiwan Bankruptcy...")
    frame = read_table("taiwanesebankruptcyprediction.csv")
    datasets['Taiwan_Bankruptcy'] = (frame.iloc[:, 1:], frame.iloc[:, 0])

    print("Loading Breast Cancer...")
    frame = read_table("breastcancer.csv")
    # Encode the diagnosis in place so the target column is 0/1 integers.
    frame['Diagnosis'] = (frame['Diagnosis'] == 'M').astype(int)
    feature_columns = [col for col in frame.columns if col not in ('ID', 'Diagnosis')]
    datasets['Breast_Cancer'] = (frame[feature_columns], frame['Diagnosis'])

    # These three files share one layout: every column but the last is a feature.
    last_column_targets = (
        ('Biodegradation', "Loading Biodegradation...", "biodeg.csv"),
        ('Autism_Screening', "Loading Autism Screening...", "Autism-Adult-Data.csv"),
        ('Student_Data', "Loading Student Data...", "student_data.csv"),
    )
    for key, message, filename in last_column_targets:
        print(message)
        frame = read_table(filename)
        datasets[key] = (frame.iloc[:, :-1], frame.iloc[:, -1])

    # One-line shape summary per dataset.
    for name, (Xn, yn) in datasets.items():
        print(f"- {name}: samples={Xn.shape[0]}, features={Xn.shape[1]}, classes={len(np.unique(yn))}")

    return datasets

# Load every dataset once; the evaluation and t-test cells iterate over `all_datasets`.
all_datasets = load_all_datasets(base_path)

Loading Taiwan Bankruptcy...
Loading Breast Cancer...
Loading Biodegradation...
Loading Autism Screening...
Loading Student Data...
- Taiwan_Bankruptcy: samples=500, features=95, classes=2
- Breast_Cancer: samples=500, features=30, classes=2
- Biodegradation: samples=500, features=41, classes=2
- Autism_Screening: samples=500, features=91, classes=2
- Student_Data: samples=500, features=36, classes=2


In [4]:
# =============================================================================
# CELL 3: Define algorithms (without L1 NN)
# =============================================================================

def get_algorithms():
    """Build a fresh, unfitted instance of each of the eight classifiers.

    Returns
    -------
    dict
        Display name -> estimator, in the order the results are reported.
        A new set of estimator objects is created on every call.
    """
    seed = 42
    iterations = 1000
    hidden_layers = (100, 50)

    models = {}
    # Both trees are DecisionTreeClassifier; the labels differ only by split criterion.
    models['Decision Tree (C4.5)'] = DecisionTreeClassifier(criterion='entropy', random_state=seed)
    models['Decision Tree (CART)'] = DecisionTreeClassifier(criterion='gini', random_state=seed)
    # probability=True is needed so predict_proba is available for the AUC computation.
    models['SVM (Linear)'] = SVC(kernel='linear', random_state=seed, probability=True)
    models['SVM (RBF Kernel)'] = SVC(kernel='rbf', random_state=seed, probability=True)
    models['Naive Bayes'] = GaussianNB()
    models['Logistic Regression'] = LogisticRegression(random_state=seed, max_iter=iterations)
    # The plain network leaves `alpha` at the library default; the "(L2)" variant sets alpha=0.01.
    models['3-Layer NN'] = MLPClassifier(
        hidden_layer_sizes=hidden_layers, random_state=seed, max_iter=iterations)
    models['3-Layer NN (L2)'] = MLPClassifier(
        hidden_layer_sizes=hidden_layers, alpha=0.01, random_state=seed, max_iter=iterations)
    return models

# Instantiate the models once just to list their names; the evaluation loop
# calls get_algorithms() again to obtain its own instances.
algorithms = get_algorithms()
print("Algorithms:")
for k in algorithms.keys():
    print(" -", k)


Algorithms:
 - Decision Tree (C4.5)
 - Decision Tree (CART)
 - SVM (Linear)
 - SVM (RBF Kernel)
 - Naive Bayes
 - Logistic Regression
 - 3-Layer NN
 - 3-Layer NN (L2)


In [5]:
# =============================================================================
# CELL 4: 10x10 CV evaluation utilities and saving 10x10 tables
# =============================================================================

def evaluate_algorithm(X, y, algorithm, algorithm_name, return_grid=False,
                       n_runs=10, n_folds=10):
    """Evaluate one classifier with repeated stratified k-fold cross-validation.

    For each of ``n_runs`` repetitions a ``StratifiedKFold`` with ``n_folds``
    shuffled splits (seeded with the run index) is built. In every fold the
    features are standardized and mean-imputed using statistics fitted on the
    training part only, the model is fitted, and accuracy, weighted F1 and AUC
    are recorded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (indexed positionally with ``.iloc``).
    y : pandas.Series or array-like
        Class labels; ``.iloc`` is used when available, plain indexing otherwise.
    algorithm : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        re-fitted in place on every fold.
    algorithm_name : str
        Display name. It selects whether the model is fed the scaled features
        (SVMs, logistic regression, neural networks) or the raw ones.
    return_grid : bool, default False
        When true, also return the per-fold metric grids.
    n_runs, n_folds : int, default 10
        Number of repetitions and of folds per repetition.

    Returns
    -------
    dict, or (dict, ndarray, ndarray, ndarray)
        Summary with keys ``accuracy``, ``f1_score``, ``auc`` (mean of the
        per-run means) and ``std_accuracy``, ``std_f1``, ``std_auc`` (standard
        deviation of the per-run means). With ``return_grid=True`` the
        ``(n_runs, n_folds)`` accuracy, F1 and AUC grids follow.

    Notes
    -----
    A fold whose fit/predict raises is reported on stdout and scored as
    accuracy 0.0, F1 0.0 and AUC NaN, in both the grids and the summary.
    AUC is NaN for a fold whose test part does not contain every class; NaN
    AUC folds are ignored when averaging.
    """
    # Models fitted on the standardized + imputed features; all others get raw features.
    scaled_algorithms = ('SVM (Linear)', 'SVM (RBF Kernel)', 'Logistic Regression',
                         '3-Layer NN', '3-Layer NN (L2)')
    use_scaled = algorithm_name in scaled_algorithms

    # Loop-invariant: number of distinct classes in the full label vector.
    n_classes = len(np.unique(y))

    acc_grid = np.zeros((n_runs, n_folds))  # runs x folds
    f1_grid = np.zeros((n_runs, n_folds))
    auc_grid = np.full((n_runs, n_folds), np.nan)

    for run_idx in range(n_runs):
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=run_idx)
        for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]

            if hasattr(y, 'iloc'):
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            else:
                y_train, y_test = y[train_idx], y[test_idx]

            # Fit preprocessing on the training part only to avoid test leakage.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            imputer = SimpleImputer(strategy='mean')
            X_train_scaled = imputer.fit_transform(X_train_scaled)
            X_test_scaled = imputer.transform(X_test_scaled)

            if use_scaled:
                fit_features, eval_features = X_train_scaled, X_test_scaled
            else:
                # NOTE(review): the raw branch is not imputed; confirm the tree and
                # Naive Bayes models never see missing values in these datasets.
                fit_features, eval_features = X_train, X_test

            try:
                algorithm.fit(fit_features, y_train)
                y_pred = algorithm.predict(eval_features)
                y_proba = algorithm.predict_proba(eval_features)

                acc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average='weighted')

                if len(np.unique(y_test)) < n_classes:
                    # AUC is undefined when a class is absent from the test fold;
                    # keep the fold's accuracy/F1 instead of failing the whole fold.
                    auc = np.nan
                elif n_classes == 2:
                    auc = roc_auc_score(y_test, y_proba[:, 1])
                else:
                    auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')
            except Exception as e:
                print(f"Error in {algorithm_name}: {str(e)}")
                acc, f1, auc = 0.0, 0.0, np.nan

            # Single write path so the grids and the summary always agree.
            acc_grid[run_idx, fold_idx] = acc
            f1_grid[run_idx, fold_idx] = f1
            auc_grid[run_idx, fold_idx] = auc

    # Per-run means; NaN AUC folds are skipped rather than voiding the whole run.
    run_accuracies = acc_grid.mean(axis=1)
    run_f1s = f1_grid.mean(axis=1)
    run_aucs = np.nanmean(auc_grid, axis=1)

    summary = {
        'accuracy': float(run_accuracies.mean()),
        'f1_score': float(run_f1s.mean()),
        'auc': float(np.nanmean(run_aucs)),
        'std_accuracy': float(run_accuracies.std()),
        'std_f1': float(run_f1s.std()),
        'std_auc': float(np.nanstd(run_aucs)),
    }
    if return_grid:
        return summary, acc_grid, f1_grid, auc_grid
    return summary

def save_metric_grids(dataset_name, algorithm_name, acc_grid, f1_grid, auc_grid):
    """Write the accuracy, F1 and AUC grids of one dataset/algorithm pair to CSV.

    Three files are created in the current working directory, named
    ``<dataset>__<algorithm>__<metric>_grid.csv`` with ``metric`` in
    ``acc``, ``f1``, ``auc``. Spaces in both names become underscores and
    parentheses are stripped from the algorithm name. Each file is written
    exactly once, without the index column.

    Parameters
    ----------
    dataset_name : str
    algorithm_name : str
    acc_grid, f1_grid, auc_grid : array-like
        Two-dimensional metric tables (anything ``pandas.DataFrame`` accepts).
    """
    safe_ds = dataset_name.replace(' ', '_')
    safe_alg = algorithm_name.replace(' ', '_').replace('(', '').replace(')', '')

    for metric, grid in (('acc', acc_grid), ('f1', f1_grid), ('auc', auc_grid)):
        pd.DataFrame(grid).to_csv(f"{safe_ds}__{safe_alg}__{metric}_grid.csv", index=False)

In [6]:
# =============================================================================
# CELL 5: Run Question 2 for all datasets, save summaries and 10x10 tables
# =============================================================================

# Run every algorithm on every dataset, keep the summaries in memory and
# persist the per-fold grids so the t-test cell can reuse them.
dataset_results = {}
for dataset_name, (Xd, yd) in all_datasets.items():
    print(f"\n{'='*60}\nDATASET: {dataset_name}\n{'='*60}")
    dataset_results[dataset_name] = {}
    for alg_name, alg in get_algorithms().items():
        print(f"- {alg_name}")
        # `clone` is imported once in the notebook's import cell.
        alg_copy = clone(alg)
        summary, acc_grid, f1_grid, auc_grid = evaluate_algorithm(Xd, yd, alg_copy, alg_name, return_grid=True)
        dataset_results[dataset_name][alg_name] = summary
        save_metric_grids(dataset_name, alg_name, acc_grid, f1_grid, auc_grid)
        print(f"  Acc {summary['accuracy']:.4f}±{summary['std_accuracy']:.4f} | F1 {summary['f1_score']:.4f}±{summary['std_f1']:.4f} | AUC {summary['auc']:.4f}±{summary['std_auc']:.4f}")

# One summary table per dataset, best accuracy first.
for dataset_name, alg_metrics in dataset_results.items():
    # Rank on the numeric accuracy directly instead of re-parsing the formatted text.
    ranked = sorted(alg_metrics.items(), key=lambda item: item[1]['accuracy'], reverse=True)
    rows = []
    for alg_name, m in ranked:
        rows.append({
            'Algorithm': alg_name,
            'Average Accuracy': f"{m['accuracy']:.4f} ± {m['std_accuracy']:.4f}",
            'Average F-Measure': f"{m['f1_score']:.4f} ± {m['std_f1']:.4f}",
            'Average AUC': f"{m['auc']:.4f} ± {m['std_auc']:.4f}",
        })
    df = pd.DataFrame(rows)
    print(f"\nSummary: {dataset_name}")
    print(df.to_string(index=False))
    df.to_csv(f"{dataset_name}__summary.csv", index=False)



DATASET: Taiwan_Bankruptcy
- Decision Tree (C4.5)
  Acc 0.9488±0.0065 | F1 0.9474±0.0060 | AUC 0.7691±0.0294
- Decision Tree (CART)
  Acc 0.9442±0.0067 | F1 0.9430±0.0068 | AUC 0.7588±0.0310
- SVM (Linear)
  Acc 0.9424±0.0055 | F1 0.9404±0.0053 | AUC 0.8449±0.0103
- SVM (RBF Kernel)
  Acc 0.9370±0.0018 | F1 0.9093±0.0024 | AUC 0.9081±0.0057
- Naive Bayes
  Acc 0.8480±0.0403 | F1 0.8354±0.0370 | AUC 0.5791±0.0144
- Logistic Regression
  Acc 0.9498±0.0038 | F1 0.9451±0.0040 | AUC 0.8641±0.0101
- 3-Layer NN
  Acc 0.9546±0.0036 | F1 0.9461±0.0039 | AUC 0.8463±0.0135
- 3-Layer NN (L2)
  Acc 0.9550±0.0038 | F1 0.9465±0.0042 | AUC 0.8475±0.0134

DATASET: Breast_Cancer
- Decision Tree (C4.5)
  Acc 0.9246±0.0064 | F1 0.9246±0.0064 | AUC 0.9221±0.0067
- Decision Tree (CART)
  Acc 0.9252±0.0109 | F1 0.9250±0.0110 | AUC 0.9212±0.0117
- SVM (Linear)
  Acc 0.9756±0.0012 | F1 0.9755±0.0012 | AUC 0.9941±0.0014
- SVM (RBF Kernel)
  Acc 0.9748±0.0020 | F1 0.9747±0.0021 | AUC 0.9945±0.0009
- Naive Bayes

In [7]:
# =============================================================================
# CELL 6: Question 3 - T-tests using saved grid files
# =============================================================================

def load_metric_grids(dataset_name, algorithm_name):
    """Load the saved metric grids of a dataset/algorithm pair as per-run means.

    Reads ``<dataset>__<algorithm>__{acc,f1,auc}_grid.csv`` from the current
    working directory (spaces replaced by underscores, parentheses removed
    from the algorithm name) and averages each row.

    Parameters
    ----------
    dataset_name : str
    algorithm_name : str

    Returns
    -------
    dict
        Keys ``accuracies``, ``f1_scores`` and ``auc_scores``, each a list with
        one mean per grid row. NaN cells are skipped when averaging, so one
        undefined fold does not void the whole row. If the files cannot be
        loaded, the error is printed and each list holds ten NaN values, so
        downstream tests see "no data" rather than a fabricated score of zero.
    """
    safe_ds = dataset_name.replace(' ', '_')
    safe_alg = algorithm_name.replace(' ', '_').replace('(', '').replace(')', '')

    try:
        acc_grid = pd.read_csv(f"{safe_ds}__{safe_alg}__acc_grid.csv").values
        f1_grid = pd.read_csv(f"{safe_ds}__{safe_alg}__f1_grid.csv").values
        auc_grid = pd.read_csv(f"{safe_ds}__{safe_alg}__auc_grid.csv").values

        # Row-wise means; nanmean ignores cells saved as NaN (empty CSV fields).
        acc_means = np.nanmean(acc_grid, axis=1)
        f1_means = np.nanmean(f1_grid, axis=1)
        auc_means = np.nanmean(auc_grid, axis=1)

        return {
            'accuracies': acc_means.tolist(),
            'f1_scores': f1_means.tolist(),
            'auc_scores': auc_means.tolist()
        }
    except Exception as e:
        # Best-effort: report and continue, but mark the scores as missing.
        print(f"Error loading grids for {dataset_name} - {algorithm_name}: {str(e)}")
        return {
            'accuracies': [float('nan')] * 10,
            'f1_scores': [float('nan')] * 10,
            'auc_scores': [float('nan')] * 10
        }

def collect_fold_scores_from_files(dataset_name):
    """Gather the saved per-run scores of every algorithm for one dataset.

    Returns a dict mapping each algorithm name from ``get_algorithms()`` to the
    result of ``load_metric_grids`` for that dataset, so nothing is re-trained.
    """
    return {
        alg_name: load_metric_grids(dataset_name, alg_name)
        for alg_name in get_algorithms().keys()
    }

def create_ttest_matrix(algorithm_fold_results, metric='accuracies', alpha=0.05):
    """Compare every pair of algorithms with a paired t-test on one metric.

    Parameters
    ----------
    algorithm_fold_results : dict
        Algorithm name -> dict of score lists keyed by metric name.
    metric : str, default 'accuracies'
        Which score list to compare.
    alpha : float, default 0.05
        Significance threshold applied to the p-value.

    Returns
    -------
    (win, tie, loss, names)
        Three square 0/1 arrays indexed [row, column] in the order of ``names``.
        ``win[i, j] == 1`` means algorithm i is significantly better than j,
        ``loss[i, j] == 1`` the opposite, and ``tie`` marks the diagonal and
        every pair whose p-value is not below ``alpha``.
    """
    names = list(algorithm_fold_results.keys())
    size = len(names)
    win, tie, loss = (np.zeros((size, size)) for _ in range(3))

    for row, first in enumerate(names):
        for col, second in enumerate(names):
            if row == col:
                # An algorithm always ties with itself.
                tie[row, col] = 1
                continue

            scores_first = algorithm_fold_results[first][metric]
            scores_second = algorithm_fold_results[second][metric]
            _, p_value = stats.ttest_rel(scores_first, scores_second)

            # A NaN p-value fails the comparison and is therefore a tie.
            if not p_value < alpha:
                tie[row, col] = 1
                tie[col, row] = 1
                continue

            # Significant difference: the higher mean wins; fill both cells.
            if np.mean(scores_first) > np.mean(scores_second):
                better, worse = row, col
            else:
                better, worse = col, row
            win[better, worse] = 1
            loss[worse, better] = 1

    return win, tie, loss, names

def save_wtl_matrices(dataset_name, names, prefix, win, tie, loss):
    """Write the WIN/TIE/LOSS matrices of one dataset and metric to CSV.

    Files are named ``<dataset_name>__<prefix>__{win,tie,loss}.csv`` and use the
    algorithm names as both row and column labels.
    """
    for suffix, matrix in (('win', win), ('tie', tie), ('loss', loss)):
        pd.DataFrame(matrix, index=names, columns=names).to_csv(f"{dataset_name}__{prefix}__{suffix}.csv")


# Process each dataset
print("Starting t-test analysis using saved grid files...")
datasets = list(all_datasets.keys())
overall_wtl = {}       # (algorithm_i, algorithm_j) -> win/tie/loss counts of i against j
alg_names_ref = None   # algorithm order taken from the first processed dataset

for dataset_name in datasets:
    print(f"\nProcessing t-tests for: {dataset_name}")

    fold_scores = collect_fold_scores_from_files(dataset_name)

    w_acc, t_acc, l_acc, names = create_ttest_matrix(fold_scores, 'accuracies')
    w_f1, t_f1, l_f1, _ = create_ttest_matrix(fold_scores, 'f1_scores')
    w_auc, t_auc, l_auc, _ = create_ttest_matrix(fold_scores, 'auc_scores')

    if alg_names_ref is None:
        alg_names_ref = names

    save_wtl_matrices(dataset_name, names, 'ACC', w_acc, t_acc, l_acc)
    save_wtl_matrices(dataset_name, names, 'F1', w_f1, t_f1, l_f1)
    save_wtl_matrices(dataset_name, names, 'AUC', w_auc, t_auc, l_auc)

    # Accumulate one W/T/L vote per metric for every ordered pair of algorithms.
    metric_matrices = [(w_acc, t_acc, l_acc), (w_f1, t_f1, l_f1), (w_auc, t_auc, l_auc)]
    for i, ai in enumerate(names):
        for j, aj in enumerate(names):
            if i == j:
                continue
            key = (ai, aj)
            if key not in overall_wtl:
                overall_wtl[key] = {'W': 0, 'T': 0, 'L': 0}

            for win_m, tie_m, loss_m in metric_matrices:
                if win_m[i, j] == 1:
                    overall_wtl[key]['W'] += 1
                elif loss_m[i, j] == 1:
                    overall_wtl[key]['L'] += 1
                elif tie_m[i, j] == 1:
                    overall_wtl[key]['T'] += 1

print("\nGenerating overall WIN-TIE-LOSS summary...")
rows = []
# `or []` keeps this cell from crashing when no dataset was processed.
for i, ai in enumerate(alg_names_ref or []):
    row = {'Algorithm': ai}
    for j, aj in enumerate(alg_names_ref):
        if i == j:
            row[aj] = '—'
        else:
            wtl = overall_wtl.get((ai, aj), {'W': 0, 'T': 0, 'L': 0})
            row[aj] = f"{wtl['W']}-{wtl['T']}-{wtl['L']}"
    rows.append(row)

overall_df = pd.DataFrame(rows)
print(overall_df.to_string(index=False))
overall_df.to_csv('OVERALL_WTL.csv', index=False)
print("\nDone. All CSV outputs are saved in the working directory.")

Starting t-test analysis using saved grid files...

Processing t-tests for: Taiwan_Bankruptcy

Processing t-tests for: Breast_Cancer

Processing t-tests for: Biodegradation

Processing t-tests for: Autism_Screening

Processing t-tests for: Student_Data

Generating overall WIN-TIE-LOSS summary...
           Algorithm Decision Tree (C4.5) Decision Tree (CART) SVM (Linear) SVM (RBF Kernel) Naive Bayes Logistic Regression 3-Layer NN 3-Layer NN (L2)
Decision Tree (C4.5)                    —               0-15-0       4-1-10           5-0-10       8-2-5              3-2-10     3-1-11          3-1-11
Decision Tree (CART)               0-15-0                    —       2-3-10           5-0-10       7-1-7              3-1-11     3-1-11          3-1-11
        SVM (Linear)               10-1-4               10-3-2            —            8-5-2      15-0-0               5-6-4      6-5-4           6-4-5
    SVM (RBF Kernel)               10-0-5               10-0-5        2-5-8                —   