In [1]:
import numpy as np
import torch
from torch.utils.data import TensorDataset


def _grayscale(x_img: np.ndarray) -> np.ndarray:
    """Collapse (N, 3, H, W) RGB images to (N, H, W) luminance (0.299/0.587/0.114 weights)."""
    return (
        0.299 * x_img[:, 0] +
        0.587 * x_img[:, 1] +
        0.114 * x_img[:, 2]
    )


def _grid_means(gray: np.ndarray, grid_size: int) -> np.ndarray:
    """Mean luminance of each cell of a grid_size x grid_size tiling, row-major: (N, grid_size**2)."""
    step = gray.shape[1] // grid_size
    cells = [
        gray[:, r * step:(r + 1) * step, c * step:(c + 1) * step].mean(axis=(1, 2))
        for r in range(grid_size)
        for c in range(grid_size)
    ]
    return np.stack(cells, axis=1)


def _color_histograms(x_img: np.ndarray, n_bins: int) -> np.ndarray:
    """Per-channel relative-frequency histograms of values in [0, 1]: (N, C * n_bins)."""
    # Equal-width bins over [0, 1]. The last edge is pushed to 1.01 so that a
    # pixel value of exactly 1.0 lands in the final half-open bin [lo, hi).
    edges = np.linspace(0.0, 1.0, n_bins + 1, dtype=np.float32)
    edges[-1] = 1.01

    per_channel = []
    for c in range(x_img.shape[1]):
        plane = x_img[:, c]                                  # (N, H, W)
        # Reduce over the spatial axes directly instead of reshape(N, -1):
        # numpy cannot infer -1 for a size-0 array, so the reshape form
        # raised ValueError on an empty batch.
        fractions = [
            ((plane >= lo) & (plane < hi)).mean(axis=(1, 2))  # (N,)
            for lo, hi in zip(edges[:-1], edges[1:])
        ]
        per_channel.append(np.stack(fractions, axis=1))      # (N, n_bins)
    return np.concatenate(per_channel, axis=1)


def engineer_cifar_features(
    x_flat: np.ndarray,
    grid_size: int = 4,
    n_bins: int = 4,
) -> np.ndarray:
    """
    Build a compact hand-crafted feature vector for each flattened CIFAR image.

    Each row of ``x_flat`` is divided by 255 and reshaped to (3, 32, 32).
    The returned columns are, in order:

    1. per-channel mean (3)
    2. per-channel standard deviation (3)
    3. grayscale mean (1) and grayscale standard deviation (1)
    4. grayscale mean of every cell of a ``grid_size`` x ``grid_size`` grid,
       row-major (``grid_size ** 2``)
    5. per-channel histogram with ``n_bins`` equal-width bins over [0, 1],
       expressed as the fraction of pixels per bin (``3 * n_bins``)

    Parameters
    ----------
    x_flat : np.ndarray
        Array of shape (N, 3072); N may be 0.
    grid_size : int, default 4
        Number of grid cells per side; must be a positive divisor of 32.
    n_bins : int, default 4
        Number of histogram bins per channel; must be >= 1.

    Returns
    -------
    np.ndarray
        float32 array of shape (N, 8 + grid_size**2 + 3*n_bins), i.e. (N, 36)
        with the defaults.

    Raises
    ------
    AssertionError
        If ``x_flat`` is not 2-D with 3072 columns.
    ValueError
        If ``grid_size`` or ``n_bins`` is out of range.
    """
    # Kept as an assert so callers relying on AssertionError are unaffected.
    assert x_flat.ndim == 2 and x_flat.shape[1] == 3072, "Expected (N, 3072)"
    if grid_size < 1 or 32 % grid_size != 0:
        raise ValueError(f"grid_size must be a positive divisor of 32, got {grid_size}")
    if n_bins < 1:
        raise ValueError(f"n_bins must be >= 1, got {n_bins}")

    n = x_flat.shape[0]

    # Scale by 1/255 and restore the (N, 3, 32, 32) channel-first layout.
    x_img = (x_flat.astype(np.float32) / 255.0).reshape(n, 3, 32, 32)

    # ---------- 1) Global color statistics ----------
    ch_mean = x_img.mean(axis=(2, 3))                    # (N, 3)
    ch_std = x_img.std(axis=(2, 3))                      # (N, 3)

    # ---------- 2) Global brightness / contrast (grayscale) ----------
    gray = _grayscale(x_img)                             # (N, 32, 32)
    gray_mean = gray.mean(axis=(1, 2))[:, np.newaxis]    # (N, 1)
    gray_std = gray.std(axis=(1, 2))[:, np.newaxis]      # (N, 1)

    # ---------- 3) Grid of grayscale means ----------
    grid_feats = _grid_means(gray, grid_size)            # (N, grid_size**2)

    # ---------- 4) Color histograms ----------
    hist_feats = _color_histograms(x_img, n_bins)        # (N, 3 * n_bins)

    # ---------- Concatenate all features ----------
    feats = np.concatenate(
        [ch_mean, ch_std, gray_mean, gray_std, grid_feats, hist_feats],
        axis=1
    ).astype(np.float32)

    return feats


def cifar_to_tensors(
    x_flat: np.ndarray,
    y: np.ndarray,
):
    """
    Turn raw flattened images and their labels into a pair of torch tensors.

    The images are passed through ``engineer_cifar_features`` and the labels
    are cast to int64. Returns ``(features, labels)``.
    """
    # Features first, labels second: same evaluation order as before.
    features = torch.from_numpy(engineer_cifar_features(x_flat))
    labels = torch.from_numpy(y.astype(np.int64))
    return features, labels


def cifar_torch_dataset(
    x_flat: np.ndarray,
    y: np.ndarray,
) -> TensorDataset:
    """
    Build a TensorDataset from raw flattened images and labels.

    Delegates to ``cifar_to_tensors`` and wraps the resulting
    (features, labels) pair.
    """
    return TensorDataset(*cifar_to_tensors(x_flat, y))


In [2]:
import os
import pickle


def load_cifar_batch(path: str):
    """
    Read one pickled batch file and return ``(data, labels)``.

    ``data`` is returned exactly as stored under the "data" key; the entry
    under "labels" is converted to a numpy array.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(path, "rb") as fh:
        payload = pickle.load(fh, encoding="latin1")
    return payload["data"], np.array(payload["labels"])


def load_cifar10(root: str):
    """
    Load the five training batches and the test batch found under ``root``.

    Files ``data_batch_1`` .. ``data_batch_5`` are read in order and
    concatenated along axis 0; ``test_batch`` is read afterwards.
    Returns ``(x_train, y_train, x_test, y_test)``.
    """
    train_batches = [
        load_cifar_batch(os.path.join(root, f"data_batch_{idx}"))
        for idx in range(1, 6)
    ]
    x_train = np.concatenate([images for images, _ in train_batches], axis=0)
    y_train = np.concatenate([labels for _, labels in train_batches], axis=0)

    x_test, y_test = load_cifar_batch(os.path.join(root, "test_batch"))
    return x_train, y_train, x_test, y_test


In [3]:
# Input directory with the raw batch files and destination of the saved dataset.
CIFAR_ROOT = "../data/cifar-10-batches-py"
OUTPUT_PATH = "../data/cifar_dataset.pt"

# Load the raw splits, then convert each one to (features, labels) tensors.
x_train_raw, y_train_raw, x_test_raw, y_test_raw = load_cifar10(CIFAR_ROOT)
X_train, y_train = cifar_to_tensors(x_train_raw, y_train_raw)
X_test, y_test = cifar_to_tensors(x_test_raw, y_test_raw)

# Persist both splits together with the task metadata.
torch.save(
    dict(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        classification=True,
        n_classes=10,
    ),
    OUTPUT_PATH,
)

# Summary of what was written.
print(
    f"Dataset saved to {OUTPUT_PATH}",
    f"Train set shape: X={X_train.shape}, y={y_train.shape}",
    f"Test set shape: X={X_test.shape}, y={y_test.shape}",
    f"Number of features: {X_train.shape[1]}",
    sep="\n",
)


Dataset saved to ../data/cifar_dataset.pt
Train set shape: X=torch.Size([50000, 36]), y=torch.Size([50000])
Test set shape: X=torch.Size([10000, 36]), y=torch.Size([10000])
Number of features: 36
