# Face Recognition (AT&T / ORL) — 2 Pipelines in Google Colab

**Pipeline A (Baseline):** PCA (“Eigenfaces-style”) + Linear SVM  
**Pipeline B (Winner):** Pretrained CNN embeddings (ResNet18, feature extractor only) + Linear SVM  

✅ Offline evaluation only (accuracy, macro P/R/F1, confusion matrix, runtime)  
✅ Same identity-stratified split for both pipelines (7 train / 3 test per person)  
✅ Extra comparison graphs (per-seed + aggregate)

> Run the notebook **top → bottom** on a fresh runtime.


In [None]:
# CELL 0 — Setup + Settings + Utilities

import os, sys, time, zipfile, shutil, math
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# -----------------------------
# Experiment settings
# -----------------------------
# One train/test split is generated per seed; results are reported per seed
# and aggregated (mean/std) across seeds.
SEEDS = [42, 123, 256, 789, 1024]
N_TEST_PER_SUBJECT = 3      # ORL has 10 images/subject -> 7 train / 3 test
APPLY_HIST_EQ = True        # histogram equalization on grayscale

# Output folder for every figure and CSV; created up front so later saves
# never fail on a missing directory.
RESULTS_DIR = Path("/content/results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Pipeline A grids
# PCA_COMPONENTS_GRID is searched only by Pipeline A; SVM_C_GRID is the C grid
# used by the SVM in both pipelines.
PCA_COMPONENTS_GRID = [20, 40, 60, 80, 100, 120, 150]
SVM_C_GRID = [0.1, 1, 10, 100]

# Embedding extraction
# Number of images pushed through the CNN per forward pass.
EMB_BATCH_SIZE = 64

# -----------------------------
# Helpers
# -----------------------------
def eval_metrics(y_true, y_pred):
    """Score predictions with accuracy and macro-averaged precision/recall/F1.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        Tuple ``(accuracy, macro_precision, macro_recall, macro_f1)``.
    """
    # zero_division=0 scores an undefined per-class ratio as 0 instead of
    # emitting a warning.
    macro_scores = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    precision, recall, f1 = macro_scores[:3]  # 4th entry (support) is unused
    return accuracy_score(y_true, y_pred), precision, recall, f1

def per_class_accuracy(cm):
    """Return the fraction of correctly classified samples for each true class.

    Args:
        cm: Square confusion matrix (rows = true class, columns = predicted).

    Returns:
        1-D array with one value per row: diagonal count divided by row total.
        Rows with no samples use a denominator of 1, so they score 0 rather
        than dividing by zero.
    """
    row_totals = np.clip(cm.sum(axis=1), 1, None)
    correct = np.diagonal(cm)
    return correct / row_totals

def save_confusion(y_true, y_pred, title, outpath: Path, n_classes):
    """Plot a confusion matrix, save it as an image, show it, and return it.

    Args:
        y_true: Ground-truth labels, expected in ``0 .. n_classes - 1``.
        y_pred: Predicted labels.
        title: Title drawn above the plot.
        outpath: Destination image file.
        n_classes: Number of classes; fixes the matrix size and label order
            even when some classes are absent from ``y_true``/``y_pred``.

    Returns:
        The confusion matrix (rows = true class, columns = predicted).
    """
    class_ids = list(range(n_classes))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    # Tick labels are 1-based subject names ("s1", "s2", ...).
    tick_names = [f"s{cid+1}" for cid in class_ids]

    fig, ax = plt.subplots(figsize=(10, 10))
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tick_names).plot(
        ax=ax, cmap="Blues", colorbar=False, xticks_rotation="vertical"
    )
    ax.set_title(title)
    # Small tick font so all class labels fit on each axis.
    for axis_name in ("x", "y"):
        ax.tick_params(axis=axis_name, labelsize=6)
    plt.tight_layout()
    fig.savefig(outpath, dpi=200)
    plt.show()
    return cm

def show_grid(X, y, n=20, cols=10, seed=0):
    """Display a random sample of images in a grid, titled by subject.

    Args:
        X: Indexable collection of 2-D grayscale images.
        y: Zero-based subject labels aligned with ``X``.
        n: Maximum number of images to show (capped at ``len(X)``).
        cols: Number of grid columns.
        seed: Seed for the random image selection.
    """
    rng = np.random.default_rng(seed)
    n_shown = min(n, len(X))
    picks = rng.choice(len(X), size=n_shown, replace=False)
    rows = math.ceil(len(picks) / cols)

    plt.figure(figsize=(cols*1.2, rows*1.2))
    for slot, k in enumerate(picks, start=1):
        plt.subplot(rows, cols, slot)
        plt.imshow(X[k], cmap="gray")
        # Titles use the 1-based subject number.
        plt.title(f"s{y[k]+1}", fontsize=8)
        plt.axis("off")
    plt.tight_layout()
    plt.show()

def split_per_identity(y, seed: int, n_test_per_subject: int):
    """Split sample indices so each class contributes a fixed number of test samples.

    For every distinct label in ``y`` the sample indices are shuffled with a
    generator seeded by ``seed``; the first ``n_test_per_subject`` go to the
    test set and the rest to the training set.

    Args:
        y: 1-D sequence of class labels, one per sample.
        seed: Seed for the shuffle; the same seed gives the same split.
        n_test_per_subject: Number of test samples taken from each class.

    Returns:
        Tuple ``(train_idx, test_idx)`` of sorted ``int64`` index arrays.

    Raises:
        ValueError: If ``n_test_per_subject`` is negative, or if some class has
            too few samples to leave at least one for training.
    """
    if n_test_per_subject < 0:
        raise ValueError(
            f"n_test_per_subject must be >= 0, got {n_test_per_subject}"
        )

    y = np.asarray(y)
    classes, counts = np.unique(y, return_counts=True)

    # Without this check a small class would silently end up with no training
    # samples (or fewer test samples than requested).
    too_small = classes[counts <= n_test_per_subject]
    if too_small.size:
        raise ValueError(
            f"Classes {too_small.tolist()} have <= {n_test_per_subject} samples; "
            "each class needs at least n_test_per_subject + 1."
        )

    rng = np.random.default_rng(seed)
    train_idx, test_idx = [], []
    # Classes are visited in np.unique order with one shuffle each, so the
    # random stream (and the split) is reproducible per seed.
    for cls in classes:
        idx = np.where(y == cls)[0]
        rng.shuffle(idx)
        test_idx.extend(idx[:n_test_per_subject].tolist())
        train_idx.extend(idx[n_test_per_subject:].tolist())

    return (
        np.array(sorted(train_idx), dtype=np.int64),
        np.array(sorted(test_idx), dtype=np.int64),
    )

def find_att_root(search_root: Path, min_subjects: int = 40):
    """Find the folder that directly contains the AT&T/ORL subject directories.

    ``search_root`` itself is checked first, then every directory below it.
    A candidate is accepted when it holds at least ``min_subjects``
    sub-directories named ``s<number>`` and its ``s1`` folder contains at
    least one ``.pgm`` file.

    Args:
        search_root: Directory to search (for example the ZIP extraction folder).
        min_subjects: Minimum number of ``s<number>`` folders required.

    Returns:
        The first matching directory, or ``None`` if there is none.
    """
    def _is_subject_dir(p: Path) -> bool:
        # Only "s" + digits counts; unrelated folders such as "src" or
        # "samples" must not inflate the subject count.
        return p.is_dir() and p.name.startswith("s") and p.name[1:].isdecimal()

    candidates = [search_root] + [p for p in search_root.rglob("*") if p.is_dir()]
    for root in candidates:
        try:
            n_subjects = sum(1 for p in root.iterdir() if _is_subject_dir(p))
        except OSError:
            # Unreadable or vanished directory: skip it and keep searching.
            continue
        if n_subjects < min_subjects:
            continue
        s1 = root / "s1"
        if not s1.is_dir():
            continue
        if not any(s1.glob("*.pgm")):
            continue
        return root
    return None

# Confirmation banner: shows the cell ran and where result files are written.
print("‚úÖ CELL 0 ready")
print("RESULTS_DIR:", RESULTS_DIR)

In [None]:
# CELL 1 — Runtime sanity check

import cv2
import sklearn
import torch
import torchvision

# Print interpreter/library versions and the compute device so a run can be
# reproduced or debugged later.
cuda_ok = torch.cuda.is_available()
version_report = (
    ("Python:", sys.version),
    ("NumPy:", np.__version__),
    ("OpenCV:", cv2.__version__),
    ("scikit-learn:", sklearn.__version__),
    ("Torch:", torch.__version__),
    ("Torchvision:", torchvision.__version__),
    ("CUDA available:", cuda_ok),
    ("Device:", "cuda" if cuda_ok else "cpu"),
)
for label, value in version_report:
    print(label, value)

In [None]:
# CELL 2 — Upload AT&T/ORL Faces ZIP and extract (auto-detect root)

from google.colab import files

# Parent folder for all extracted data.
DATA_PARENT = Path("/content/data")
DATA_PARENT.mkdir(parents=True, exist_ok=True)

print("Upload ORL/AT&T faces ZIP (must contain s1..s40 folders somewhere inside).")
# Opens the Colab upload widget; the returned mapping is keyed by file name.
uploaded = files.upload()

# Keep only ZIP archives; the first one found is used.
zip_files = [n for n in uploaded.keys() if n.lower().endswith(".zip")]
assert len(zip_files) > 0, "‚ùå No ZIP uploaded. Upload the ORL/AT&T ZIP."

# NOTE(review): assumes the uploaded file was saved under /content (the usual
# Colab working directory) — confirm if the working directory is changed.
zip_path = Path("/content") / zip_files[0]
extract_to = DATA_PARENT / "att_extracted"
# Start from an empty folder so files from an earlier upload cannot mix in.
if extract_to.exists():
    shutil.rmtree(extract_to)
extract_to.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_to)

# The archive layout varies, so the subject-folder root is searched for
# rather than assumed.
DATA_ROOT = find_att_root(extract_to)
assert DATA_ROOT is not None, "‚ùå Could not find s1..s40 with .pgm inside the uploaded ZIP."

print("‚úÖ Detected DATA_ROOT:", DATA_ROOT)
# Names are sorted as strings here, so "s10" comes before "s2" in this preview.
print("Example folders:", sorted([p.name for p in DATA_ROOT.iterdir() if p.is_dir()])[:12])

In [None]:
# CELL 3 — Load dataset (memory-safe), show samples

import cv2

def load_att_faces(root: Path, hist_eq: bool = True):
    """Load every ``.pgm`` image below the ``s<number>`` subject folders of ``root``.

    Subject folders are ordered by their numeric suffix (``s2`` before ``s10``)
    and each image is labelled with the zero-based position of its folder in
    that order. Files that OpenCV cannot decode are skipped.

    Args:
        root: Directory that directly contains the subject folders.
        hist_eq: When true, apply histogram equalization to each image.

    Returns:
        Tuple ``(X, y, paths)``: ``X`` is the stacked ``uint8`` image array,
        ``y`` the ``int64`` label array, and ``paths`` the source file paths in
        the same order.

    Raises:
        ValueError: If no readable image is found, or (from ``np.stack``) if
            the images do not all share one shape.
    """
    # Only "s" + digits is a subject folder. Without the digit check, any
    # other folder starting with "s" would crash the int() sort key below.
    subject_dirs = sorted(
        (
            p for p in root.iterdir()
            if p.is_dir() and p.name.startswith("s") and p.name[1:].isdecimal()
        ),
        key=lambda p: int(p.name[1:]),
    )

    images, labels, paths = [], [], []
    for s_idx, sdir in enumerate(subject_dirs):
        for f in sorted(sdir.glob("*.pgm")):
            img = cv2.imread(str(f), cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread signals an unreadable file with None; skip it.
                continue
            if hist_eq:
                img = cv2.equalizeHist(img)
            images.append(img.astype(np.uint8, copy=False))
            labels.append(s_idx)
            paths.append(str(f))

    if not images:
        # Clear message instead of np.stack's generic empty-input error.
        raise ValueError(f"No readable .pgm images found under {root}")

    X = np.stack(images, axis=0)          # (N,H,W)
    y = np.array(labels, dtype=np.int64)  # (N,)
    return X, y, paths

# Load the whole dataset once; both pipelines reuse X_img / y.
X_img, y, img_paths = load_att_faces(DATA_ROOT, hist_eq=APPLY_HIST_EQ)
n_classes = int(np.unique(y).size)

first_image = X_img[0]
print("Loaded images:", X_img.shape)
print("Unique subjects:", n_classes)
print("Image shape:", first_image.shape, "dtype:", X_img.dtype)

# Stop here if the archive did not contain the full set of subjects.
assert n_classes == 40, "Expected 40 subjects (s1..s40)."

# Visual spot check of a random sample.
show_grid(X_img, y, n=20, cols=10, seed=1)

In [None]:
# CELL 4 — Identity-stratified split (same split reused for both pipelines)

# One train/test index pair per seed, computed once so both pipelines are
# evaluated on identical splits.
SPLITS = {
    split_seed: dict(
        zip(
            ("train_idx", "test_idx"),
            split_per_identity(
                y, seed=split_seed, n_test_per_subject=N_TEST_PER_SUBJECT
            ),
        )
    )
    for split_seed in SEEDS
}

example_split = SPLITS[42]
print("Example split (seed=42):")
print("Train size:", len(example_split["train_idx"]), "Test size:", len(example_split["test_idx"]))
assert len(example_split["test_idx"]) == 40 * N_TEST_PER_SUBJECT

## Pipeline A — PCA (Eigenfaces-style) + Linear SVM

In [None]:
# CELL 5 — Preprocess for PCA

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler


def _scale_to_unit_interval(Z):
    """Convert pixel values to float32 and divide by 255."""
    return (Z.astype(np.float32) / 255.0)


# One flat row vector per image.
X_flat = X_img.reshape(X_img.shape[0], -1)  # (N, 112*92=10304)
to_float01 = FunctionTransformer(_scale_to_unit_interval, feature_names_out="one-to-one")
# Mean-centering only (no variance scaling) ahead of PCA.
center_only = StandardScaler(with_std=False)

print("Flattened:", X_flat.shape)

In [None]:
# CELL 6 — Tune PCA components + SVM + plot PCA-components curve (seed=42)

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

def build_pipeline_a(seed: int):
    """Assemble Pipeline A: scale to [0, 1], mean-center, whitened PCA, linear SVM.

    Args:
        seed: Random state for the randomized PCA solver.

    Returns:
        An unfitted scikit-learn ``Pipeline``; ``pca__n_components`` and
        ``svm__C`` keep their defaults until the caller sets them.
    """
    stages = [
        ("to_float01", to_float01),
        ("center", center_only),
        ("pca", PCA(whiten=True, svd_solver="randomized", random_state=seed)),
        ("svm", SVC(kernel="linear")),
    ]
    return Pipeline(stages)

def tune_pipeline_a(X, y, train_idx, seed: int):
    """Grid-search PCA size and SVM C for Pipeline A on the training subset only.

    Args:
        X: Flattened image matrix for the whole dataset.
        y: Labels aligned with ``X``.
        train_idx: Indices of the training samples; nothing else is used.
        seed: Seeds both the pipeline and the CV fold shuffling.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        build_pipeline_a(seed),
        param_grid={
            "pca__n_components": PCA_COMPONENTS_GRID,
            "svm__C": SVM_C_GRID,
        },
        scoring="accuracy",
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=seed),
        n_jobs=-1,
        verbose=0,
    )
    search.fit(X[train_idx], y[train_idx])
    return search

# CV curve (seed=42, fixed C=1)
# cross_val_score reports the same mean fold accuracy that a one-point
# GridSearchCV would, without the extra refit on the full training set.
from sklearn.model_selection import cross_val_score

seed_curve = 42
tr_curve = SPLITS[seed_curve]["train_idx"]
Xtr_curve, ytr_curve = X_flat[tr_curve], y[tr_curve]

curve_scores = []
for k in PCA_COMPONENTS_GRID:
    pipe = build_pipeline_a(seed_curve)
    pipe.set_params(pca__n_components=k, svm__C=1)

    # Fresh splitter with the same seed each time, so every k is scored on
    # identical folds.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed_curve)
    fold_scores = cross_val_score(
        pipe, Xtr_curve, ytr_curve, scoring="accuracy", cv=cv, n_jobs=-1
    )
    curve_scores.append(float(fold_scores.mean()))

curve_path = RESULTS_DIR / "pca_components_curve_seed42.png"

plt.figure(figsize=(6,4))
plt.plot(PCA_COMPONENTS_GRID, curve_scores, marker="o")
plt.xlabel("PCA components")
plt.ylabel("CV accuracy (seed=42, C=1)")
plt.title("Pipeline A: PCA components vs CV accuracy")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(curve_path, dpi=200)
plt.show()

print("Saved:", curve_path)

In [None]:
# CELL 7 — Train + evaluate Pipeline A (all seeds) + confusion matrix (seed=42)

PIPE_A_RUNS = []
PIPE_A_CM_SEED42 = None

for seed in SEEDS:
    tr, te = SPLITS[seed]["train_idx"], SPLITS[seed]["test_idx"]
    Xtr, ytr = X_flat[tr], y[tr]
    Xte, yte = X_flat[te], y[te]

    # Hyper-parameters are chosen by CV on the training indices only.
    gs = tune_pipeline_a(X_flat, y, tr, seed=seed)
    model = gs.best_estimator_

    # Refit the chosen configuration so training time is measured on its own,
    # separate from the grid search.
    t0 = time.perf_counter()
    model.fit(Xtr, ytr)
    train_s = time.perf_counter() - t0

    t1 = time.perf_counter()
    yhat = model.predict(Xte)
    infer_s = time.perf_counter() - t1

    acc, p, r, f1 = eval_metrics(yte, yhat)
    infer_ms_per_img = (infer_s / len(Xte)) * 1000.0

    PIPE_A_RUNS.append(dict(
        pipeline="A_PCA+LinearSVM",
        seed=seed,
        best_params=gs.best_params_,
        acc=float(acc),
        macro_precision=float(p),
        macro_recall=float(r),
        macro_f1=float(f1),
        train_s=float(train_s),
        infer_ms_per_img=float(infer_ms_per_img),
    ))

    # The confusion matrix is drawn and kept for the reference seed only.
    if seed != 42:
        continue
    PIPE_A_CM_SEED42 = save_confusion(
        y_true=yte,
        y_pred=yhat,
        title="Pipeline A ‚Äî PCA + Linear SVM (seed=42)",
        outpath=RESULTS_DIR / "cm_pipelineA_seed42.png",
        n_classes=n_classes,
    )

print("‚úÖ Pipeline A runs:", len(PIPE_A_RUNS))

## Pipeline B — Transfer Learning Embeddings (ResNet18) + Linear SVM

In [None]:
# CELL 8 — Load pretrained embedding model (ResNet18, feature extractor only)

import torch
import torch.nn as nn
import torchvision
from PIL import Image
from torchvision.models import ResNet18_Weights, resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = ResNet18_Weights.DEFAULT
# Preprocessing transform that ships with the chosen weights.
preprocess = weights.transforms()

base = resnet18(weights=weights)
# Drop the last child module so the network returns features, not class scores.
feature_layers = list(base.children())[:-1]
embedder = nn.Sequential(*feature_layers).eval().to(device)  # outputs (B,512,1,1)


def gray_to_rgb_pil(gray_uint8: np.ndarray) -> Image.Image:
    """Wrap a grayscale array in a PIL image and convert it to RGB mode."""
    pil_gray = Image.fromarray(gray_uint8)
    return pil_gray.convert("RGB")


print("‚úÖ Embedding model: ResNet18 (ImageNet pretrained)")
print("Device:", device)

In [None]:
# CELL 9 — Compute embeddings (batched, memory-safe)

def compute_resnet_embeddings(X_gray_uint8: np.ndarray, batch_size: int = 64):
    """Embed grayscale images with the module-level ``embedder`` in batches.

    Each image is converted to RGB, passed through ``preprocess``, run through
    ``embedder`` on ``device`` with gradients disabled, flattened to one vector
    per image and L2-normalized.

    Args:
        X_gray_uint8: Sequence of grayscale images.
        batch_size: Number of images per forward pass.

    Returns:
        ``float32`` array with one embedding row per input image.
    """
    chunks = []
    n_images = len(X_gray_uint8)
    with torch.no_grad():
        for lo in range(0, n_images, batch_size):
            tensors = [
                preprocess(gray_to_rgb_pil(img))
                for img in X_gray_uint8[lo:lo+batch_size]
            ]
            inputs = torch.stack(tensors, dim=0).to(device)

            pooled = embedder(inputs)               # (B,512,1,1)
            flat = pooled.view(pooled.size(0), -1)  # (B,512)
            unit = torch.nn.functional.normalize(flat, p=2, dim=1)
            # Move to host memory per batch so device memory is not accumulated.
            chunks.append(unit.cpu().numpy().astype(np.float32))
    return np.vstack(chunks)

# Embeddings are computed once for the whole dataset and reused by every seed.
extract_start = time.perf_counter()
X_emb = compute_resnet_embeddings(X_img, batch_size=EMB_BATCH_SIZE)
extract_s = time.perf_counter() - extract_start

print("Embeddings:", X_emb.shape, X_emb.dtype)
print(f"Embedding extraction: {extract_s:.2f}s total, {extract_s/len(X_emb)*1000:.2f} ms/img")

In [None]:
# CELL 10 — Tune + train + evaluate Pipeline B (all seeds) + confusion matrix (seed=42)

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def tune_pipeline_b(Xemb, y, train_idx, seed: int):
    """Grid-search the SVM C for Pipeline B on the training subset only.

    The pipeline standardizes the embeddings and fits a linear SVM.

    Args:
        Xemb: Embedding matrix for the whole dataset.
        y: Labels aligned with ``Xemb``.
        train_idx: Indices of the training samples; nothing else is used.
        seed: Seeds the CV fold shuffling.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    stages = [
        ("scaler", StandardScaler()),
        ("svm", SVC(kernel="linear")),
    ]
    search = GridSearchCV(
        Pipeline(stages),
        param_grid={"svm__C": SVM_C_GRID},
        scoring="accuracy",
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=seed),
        n_jobs=-1,
        verbose=0,
    )
    search.fit(Xemb[train_idx], y[train_idx])
    return search

PIPE_B_RUNS = []
PIPE_B_CM_SEED42 = None

for seed in SEEDS:
    tr, te = SPLITS[seed]["train_idx"], SPLITS[seed]["test_idx"]
    Xtr, ytr = X_emb[tr], y[tr]
    Xte, yte = X_emb[te], y[te]

    # Hyper-parameters are chosen by CV on the training indices only.
    gs = tune_pipeline_b(X_emb, y, tr, seed=seed)
    model = gs.best_estimator_

    # NOTE: both timings below run on precomputed embeddings, so they cover
    # the scaler + SVM only and exclude CNN feature extraction.
    t0 = time.perf_counter()
    model.fit(Xtr, ytr)
    train_s = time.perf_counter() - t0

    t1 = time.perf_counter()
    yhat = model.predict(Xte)
    infer_s = time.perf_counter() - t1

    acc, p, r, f1 = eval_metrics(yte, yhat)
    infer_ms_per_img = (infer_s / len(Xte)) * 1000.0

    PIPE_B_RUNS.append(dict(
        pipeline="B_ResNet18Embeddings+LinearSVM",
        seed=seed,
        best_params=gs.best_params_,
        acc=float(acc),
        macro_precision=float(p),
        macro_recall=float(r),
        macro_f1=float(f1),
        train_s=float(train_s),
        infer_ms_per_img=float(infer_ms_per_img),
    ))

    # The confusion matrix is drawn and kept for the reference seed only.
    if seed != 42:
        continue
    PIPE_B_CM_SEED42 = save_confusion(
        y_true=yte,
        y_pred=yhat,
        title="Pipeline B ‚Äî ResNet18 embeddings + Linear SVM (seed=42)",
        outpath=RESULTS_DIR / "cm_pipelineB_seed42.png",
        n_classes=n_classes,
    )

print("‚úÖ Pipeline B runs:", len(PIPE_B_RUNS))

## Comparison — tables + extra graphs

In [None]:
# CELL 11 — Build comparison tables + save CSVs

import pandas as pd

# One row per (pipeline, seed) run.
df_a = pd.DataFrame(PIPE_A_RUNS)
df_b = pd.DataFrame(PIPE_B_RUNS)
df = pd.concat([df_a, df_b], ignore_index=True)

per_run_columns = ["pipeline","seed","acc","macro_f1","train_s","infer_ms_per_img","best_params"]
display(df[per_run_columns])

# Mean and std across seeds for each reported metric, as named aggregations.
agg_spec = {}
for out_prefix, source_col in (
    ("acc", "acc"),
    ("f1", "macro_f1"),
    ("train_s", "train_s"),
    ("infer_ms", "infer_ms_per_img"),
):
    agg_spec[f"{out_prefix}_mean"] = (source_col, "mean")
    agg_spec[f"{out_prefix}_std"] = (source_col, "std")

summary = df.groupby("pipeline").agg(**agg_spec).reset_index()

display(summary)

df.to_csv(RESULTS_DIR / "runs_all_seeds.csv", index=False)
summary.to_csv(RESULTS_DIR / "summary_mean_std.csv", index=False)
print("Saved CSVs to:", RESULTS_DIR)

In [None]:
# CELL 12 — Extra comparison graphs (per-seed bars + boxplots)

# Bars appear in the order the seeds were configured, not sorted.
seed_order = SEEDS


def plot_seed_bars(metric, title, ylabel, outname):
    """Draw a grouped bar chart of ``metric`` per seed, one bar per pipeline.

    Reads the module-level ``df`` and ``seed_order``; saves the figure under
    ``RESULTS_DIR / outname`` and shows it.

    Args:
        metric: Column of ``df`` to plot.
        title: Plot title.
        ylabel: Y-axis label.
        outname: File name of the saved image.
    """
    wide = df.pivot(index="seed", columns="pipeline", values=metric)
    ax = wide.loc[seed_order].plot(kind="bar", figsize=(9,4))
    ax.set_title(title)
    ax.set_xlabel("Seed")
    ax.set_ylabel(ylabel)
    ax.grid(True, axis="y", alpha=0.3)

    target = RESULTS_DIR / outname
    plt.tight_layout()
    plt.savefig(target, dpi=200)
    plt.show()
    print("Saved:", target)

# Per-seed bar charts for each reported metric.
for _metric, _title, _ylabel, _outname in (
    ("acc", "Accuracy by seed", "Accuracy", "acc_by_seed.png"),
    ("macro_f1", "Macro F1 by seed", "Macro F1", "f1_by_seed.png"),
    ("train_s", "Training time by seed", "Seconds", "train_time_by_seed.png"),
    ("infer_ms_per_img", "Inference time by seed", "ms / image", "infer_time_by_seed.png"),
):
    plot_seed_bars(_metric, _title, _ylabel, _outname)


def _save_metric_boxplot(metric, title, ylabel, outname):
    """Draw a per-pipeline boxplot of ``metric`` across seeds, save and show it.

    Args:
        metric: Column of the module-level ``df`` to plot.
        title: Plot title.
        ylabel: Y-axis label.
        outname: File name of the image saved under ``RESULTS_DIR``.
    """
    # Pass the axes explicitly: without ``ax``, ``DataFrame.boxplot(by=...)``
    # opens its own figure, which leaves an empty figure behind and ignores
    # the requested figsize.
    fig, ax = plt.subplots(figsize=(8, 4))
    df.boxplot(column=metric, by="pipeline", ax=ax)
    ax.set_title(title)
    fig.suptitle("")  # remove the "Boxplot grouped by ..." title pandas adds
    ax.set_ylabel(ylabel)
    ax.grid(True, axis="y", alpha=0.3)
    fig.tight_layout()
    fig.savefig(RESULTS_DIR / outname, dpi=200)
    plt.show()
    print("Saved:", RESULTS_DIR / outname)


_save_metric_boxplot("acc", "Accuracy distribution across seeds", "Accuracy", "acc_boxplot.png")
_save_metric_boxplot("macro_f1", "Macro F1 distribution across seeds", "Macro F1", "f1_boxplot.png")

In [None]:
# CELL 13 — Per-class accuracy plot (seed=42) for both pipelines

# Both confusion matrices come from the seed-42 runs above; fail early if
# either run was skipped.
assert PIPE_A_CM_SEED42 is not None and PIPE_B_CM_SEED42 is not None

accA = per_class_accuracy(PIPE_A_CM_SEED42)
accB = per_class_accuracy(PIPE_B_CM_SEED42)

x = np.arange(len(accA))
per_class_path = RESULTS_DIR / "per_class_accuracy_seed42.png"

fig, ax = plt.subplots(figsize=(12,4))
for series, series_label in ((accA, "Pipeline A"), (accB, "Pipeline B")):
    ax.plot(x, series, marker="o", linewidth=1, label=series_label)
ax.set_ylim(0, 1.05)
ax.set_xlabel("Class (subject index 0..39)")
ax.set_ylabel("Per-class accuracy (seed=42)")
ax.set_title("Per-class accuracy comparison (seed=42)")
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()
fig.savefig(per_class_path, dpi=200)
plt.show()

print("Saved:", per_class_path)

In [None]:
# CELL 14 — Winner decision

# Rank by mean macro-F1, breaking ties by mean accuracy; the top row wins.
ranking = summary.sort_values(by=["f1_mean","acc_mean"], ascending=False)
best = ranking.iloc[0]
winner_name = best["pipeline"]

print("üèÜ Winner:", winner_name)
print(best)

print("\nConclusion:")
print(f"- Recommended deployment pipeline: {winner_name}")
for conclusion_line in (
    "- Pipeline A is the classical baseline (PCA/Eigenfaces-style).",
    "- Pipeline B uses transfer learning embeddings (feature extractor) for robustness and higher accuracy.",
):
    print(conclusion_line)

In [None]:
# CELL 15 — Zip results + download

from google.colab import files

zip_path = Path("/content/results.zip")
# Remove an archive left by an earlier run before writing a new one.
zip_path.unlink(missing_ok=True)

# make_archive appends ".zip" itself, so it gets the path without the suffix.
archive_base = str(zip_path.with_suffix(""))
shutil.make_archive(archive_base, "zip", RESULTS_DIR)
print("Created:", zip_path)

files.download(str(zip_path))

## Optional: FaceNet embeddings (advanced, off by default)

Your Colab runtime may use very new Torch/Torchvision versions; some FaceNet packages can break on these.
This optional cell is written so it **won’t crash** your notebook if FaceNet fails.

If FaceNet import fails, keep Pipeline B (ResNet18) — it’s already very strong on ORL.


In [None]:
# OPTIONAL CELL — FaceNet embeddings (run only if you want to experiment)

# Opt-in switch: while False, nothing below runs and nothing is installed.
USE_FACENET = False

if USE_FACENET:
    # Install, import, model load and embedding all sit inside one try block,
    # so any failure is reported by the except branch instead of stopping the
    # notebook.
    try:
        # IPython shell magic; valid only when run as a notebook cell.
        # NOTE(review): installing this package may change the installed
        # torch/torchvision versions — confirm on the target runtime.
        !pip -q install facenet-pytorch
        from facenet_pytorch import InceptionResnetV1, fixed_image_standardization
        import torchvision.transforms as T
        from PIL import Image
        import torch

        # Rebinds the notebook-level `device` name.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        facenet = InceptionResnetV1(pretrained="vggface2").eval().to(device)

        # Despite its name, EMB_SIZE is the side length images are resized to
        # (see T.Resize below), not the embedding dimensionality.
        EMB_SIZE = 160
        facenet_tf = T.Compose([
            T.Resize((EMB_SIZE, EMB_SIZE)),
            T.ToTensor(),
            fixed_image_standardization
        ])

        def facenet_embed(X_gray_uint8: np.ndarray, batch_size=32):
            """Embed grayscale images with `facenet` in batches.

            Each image is converted to RGB, transformed with `facenet_tf`, run
            through the model with gradients disabled, and L2-normalized.
            Returns a float32 array with one row per input image.
            """
            embs = []
            with torch.no_grad():
                for start in range(0, len(X_gray_uint8), batch_size):
                    batch = X_gray_uint8[start:start+batch_size]
                    bt = torch.stack([facenet_tf(Image.fromarray(im).convert("RGB")) for im in batch], dim=0).to(device)
                    out = facenet(bt)
                    out = torch.nn.functional.normalize(out, p=2, dim=1)
                    # Copy each batch to host memory before the next one runs.
                    embs.append(out.cpu().numpy().astype(np.float32))
            return np.vstack(embs)

        X_emb_facenet = facenet_embed(X_img, batch_size=32)
        print("‚úÖ FaceNet embeddings:", X_emb_facenet.shape)
        print("Next: reuse Pipeline B SVM code with X_emb_facenet instead of X_emb.")
    except Exception as e:
        # Deliberate catch-all: this cell is experimental and must never abort
        # the notebook; the error is printed so the cause stays visible.
        print("‚ùå FaceNet failed on this runtime. Keep the stable ResNet18 pipeline.")
        print("Error:", repr(e))