In [1]:
import os
import struct
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
def load_mnist_file(filename, data_dir='data'):
    """Load one MNIST IDX file (images or labels) into a uint8 NumPy array.

    The file begins with two big-endian uint32 values: a magic number and the
    item count. Magic 2051 marks an image file, whose header continues with
    the row and column counts; magic 2049 marks a label file.

    Args:
        filename: Name of the IDX file inside ``data_dir``.
        data_dir: Directory that contains the file.

    Returns:
        For images, an array of shape ``(num, rows, cols)``; for labels, a
        1-D array of length ``num``. The dtype is ``uint8`` in both cases.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the header is truncated, the magic number is unknown,
            or the payload size disagrees with the counts in the header.
    """
    path = os.path.join(data_dir, filename)
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")

    with open(path, 'rb') as f:
        header = f.read(8)
        if len(header) < 8:
            raise ValueError(f"Truncated header in {filename}")
        magic, num = struct.unpack(">II", header)

        if magic == 2051:  # Images
            dims = f.read(8)
            if len(dims) < 8:
                raise ValueError(f"Truncated image header in {filename}")
            rows, cols = struct.unpack(">II", dims)
            shape = (num, rows, cols)
            expected = num * rows * cols
        elif magic == 2049:  # Labels
            shape = (num,)
            expected = num
        else:
            raise ValueError(f"Invalid magic number {magic} in {filename}")

        # Everything after the header is the uint8 payload.
        data = np.fromfile(f, dtype=np.uint8)

    # Trust the header, not the file length: a truncated or padded file must
    # fail loudly instead of yielding a mis-sized (or mis-aligned) array.
    if data.size != expected:
        raise ValueError(
            f"Payload size mismatch in {filename}: header declares "
            f"{expected} bytes, file contains {data.size}"
        )
    return data.reshape(shape)

# Load Raw Data
# Reads the four MNIST IDX files (train/test images and labels) from the
# default 'data/' directory used by load_mnist_file.
try:
    X_train_raw = load_mnist_file('train-images.idx3-ubyte')
    y_train = load_mnist_file('train-labels.idx1-ubyte')
    X_test_raw = load_mnist_file('t10k-images.idx3-ubyte')
    y_test = load_mnist_file('t10k-labels.idx1-ubyte')
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please make sure the MNIST binary files are in the 'data/' directory.")
    # Stop here: swallowing the error would leave the arrays undefined (or
    # stale from a previous kernel run) and make later cells fail confusingly.
    raise

# Every image needs exactly one label; catch mismatched files early.
if len(X_train_raw) != len(y_train) or len(X_test_raw) != len(y_test):
    raise ValueError(
        f"Image/label count mismatch: train {len(X_train_raw)} vs {len(y_train)}, "
        f"test {len(X_test_raw)} vs {len(y_test)}"
    )

print(f"Loaded raw data: Train {X_train_raw.shape}, Test {X_test_raw.shape}")

Loaded raw data: Train (60000, 28, 28), Test (10000, 28, 28)


In [None]:
# Preprocessing (PyTorch Format)
# Builds three views of the same pixels: normalized, flattened, channel-first.

# --- Training split ---------------------------------------------------------
# Scale the uint8 pixel values into float32 in [0, 1].
X_train = X_train_raw.astype(np.float32) / 255.0
# One 784-long row per image, for classical ML and dense autoencoders.
X_train_flat = X_train.reshape(X_train.shape[0], 784)
# (N, Channels, Height, Width) layout, for PyTorch conv layers.
X_train_img = X_train.reshape(X_train.shape[0], 1, 28, 28)

# --- Test split -------------------------------------------------------------
X_test = X_test_raw.astype(np.float32) / 255.0
X_test_flat = X_test.reshape(X_test.shape[0], 784)
X_test_img = X_test.reshape(X_test.shape[0], 1, 28, 28)

print("Data preprocessing complete.")
print(f"Flat shape: {X_train_flat.shape}")
print(f"Image shape (PyTorch Channel-First): {X_train_img.shape}")

# Persist every derived array, plus the labels, in a single .npz archive.
np.savez(
    'data/processed_data.npz',
    X_train_flat=X_train_flat,
    y_train=y_train,
    X_test_flat=X_test_flat,
    y_test=y_test,
    X_train_img=X_train_img,
    X_test_img=X_test_img,
)
print("Saved processed data to 'data/processed_data.npz'")

Data preprocessing complete.
Flat shape: (60000, 784)
Image shape (PyTorch Channel-First): (60000, 1, 28, 28)
Saved processed data to 'data/processed_data.npz'


In [None]:
# Establish Baseline (Random Forest)
# Fits a Random Forest on the flattened pixels and records test accuracy and
# wall-clock training time as a reference point.
print("Training Baseline Random Forest on raw (784 dimensions)...")
start_time = time.time()

# Random Forest works on flattened numpy arrays, agnostic to Deep Learning framework.
# random_state is fixed so the forest is reproducible across runs.
clf_baseline = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
clf_baseline.fit(X_train_flat, y_train)

train_time = time.time() - start_time
print(f"Training Time: {train_time:.2f} seconds")

# Evaluate on the held-out test split
y_pred = clf_baseline.predict(X_test_flat)
baseline_acc = accuracy_score(y_test, y_pred)

print(f"Baseline Accuracy: {baseline_acc:.4f}")

# Save baseline metric as "<accuracy>,<train_seconds>".
# Create the output directory first so a fresh checkout does not train the
# model and then crash on the write.
os.makedirs('results', exist_ok=True)
with open('results/baseline_metrics.txt', 'w') as f:
    f.write(f"{baseline_acc},{train_time}")


Training Baseline Random Forest on raw (784 dimensions)...
Training Time: 3.34 seconds
Baseline Accuracy: 0.9692
