In [2]:
import torch
# Query CUDA availability once and reuse the answer for both the report and the guard.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
if cuda_ok:
    # Name of the first visible GPU (device index 0).
    print(torch.cuda.get_device_name(0))


CUDA available: True
Tesla T4


## 1) Imports and seeds

In [3]:
import os, random, json, math, time
from pathlib import Path
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

import pandas as pd
import matplotlib.pyplot as plt


# Global switch for mixed-precision training, read by the training loops.
USE_AMP = True
# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Project root and its output sub-folders (checkpoints, metric files, figures).
ROOT = Path('/content/ATML_A3')
CKPTS, RES, FIGS = (ROOT / sub for sub in ('ckpts', 'results', 'figures'))
for p in (CKPTS, RES, FIGS):
    # Idempotent: existing folders are left untouched, missing parents are created.
    p.mkdir(parents=True, exist_ok=True)

def set_seed(seed=1337):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator, PyTorch's CPU
    generator and the generators of all CUDA devices, then switches cuDNN to
    deterministic mode with auto-tuning (benchmark) turned off.

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    # Trade cuDNN speed for repeatability of convolution results.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once at start-up, then report which device was selected.
set_seed(seed=1337)
print(f"Device: {DEVICE}")


Device: cuda


## 2) Data: CIFAR-100

In [4]:
def get_cifar100(batch_size=128, num_workers=2):
    """Build the CIFAR-100 training and held-out data loaders.

    The dataset is downloaded into ``ROOT/'data'`` if it is not already there.
    Training images get random 32x32 crops (4-pixel padding) and random
    horizontal flips; both splits are converted to tensors and normalised
    per channel.

    Args:
        batch_size: Batch size used by both loaders.
        num_workers: Number of worker processes used by both loaders.

    Returns:
        ``(train_loader, val_loader)``. The "val" loader wraps the
        ``train=False`` split and is not shuffled.
    """
    # Per-channel mean / std used for normalisation
    # (presumably CIFAR-100 training-set statistics -- TODO confirm).
    normalize = transforms.Normalize(
        (0.5071, 0.4867, 0.4408),
        (0.2675, 0.2565, 0.2761),
    )
    augmented = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    plain = transforms.Compose([transforms.ToTensor(), normalize])

    data_root = str(ROOT/'data')
    train_ds = datasets.CIFAR100(root=data_root, train=True, download=True, transform=augmented)
    val_ds = datasets.CIFAR100(root=data_root, train=False, download=True, transform=plain)

    # Settings shared by both loaders; only the shuffling differs.
    shared = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=True)
    train_loader = DataLoader(train_ds, shuffle=True, **shared)
    val_loader = DataLoader(val_ds, shuffle=False, **shared)
    return train_loader, val_loader

# Module-level loaders shared by every training / evaluation cell below.
train_loader, val_loader = get_cifar100(batch_size=128, num_workers=2)


100%|██████████| 169M/169M [00:02<00:00, 73.9MB/s]


## 3) Models: VGG-16/19 (teacher), VGG-11 (student)

In [5]:
def make_vgg(name='vgg16', num_classes=100, pretrained=False):
    """Build a torchvision VGG whose final classifier layer has ``num_classes`` outputs.

    Args:
        name: One of ``'vgg11'``, ``'vgg16'`` or ``'vgg19'``.
        num_classes: Output size of the replacement final ``nn.Linear`` layer.
        pretrained: When true, the backbone starts from the ImageNet
            ``IMAGENET1K_V1`` weights. This is honoured for every supported
            architecture, including ``'vgg11'`` (it used to be silently
            ignored there). The replaced final layer is always freshly
            initialised.

    Returns:
        The model, moved to ``DEVICE``.

    Raises:
        ValueError: If ``name`` is not a supported architecture.
    """
    # Architecture name -> (constructor, ImageNet weights enum member).
    builders = {
        'vgg11': (models.vgg11, models.VGG11_Weights.IMAGENET1K_V1),
        'vgg16': (models.vgg16, models.VGG16_Weights.IMAGENET1K_V1),
        'vgg19': (models.vgg19, models.VGG19_Weights.IMAGENET1K_V1),
    }
    if name not in builders:
        raise ValueError(f'Unsupported VGG: {name}')
    build, imagenet_weights = builders[name]
    net = build(weights=imagenet_weights if pretrained else None)

    # Swap the 1000-way ImageNet head for one sized to the target task.
    old_head = net.classifier[-1]
    net.classifier[-1] = nn.Linear(old_head.in_features, num_classes)
    return net.to(DEVICE)

# Build one teacher / student pair up front; with pretrained=True the VGG-16
# ImageNet weights are fetched if they are not cached yet.
teacher16 = make_vgg(name='vgg16', pretrained=True)
student11 = make_vgg(name='vgg11', pretrained=False)
print("Built VGG-16 (teacher) & VGG-11 (student).")


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


100%|██████████| 528M/528M [00:06<00:00, 79.6MB/s]


Built VGG-16 (teacher) & VGG-11 (student).


## 4) Losses: CE, Label Smoothing, KD-LM, DKD

In [6]:
class LabelSmoothingCE(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - eps``; the remaining ``eps`` is
    split evenly over the other ``C - 1`` classes.

    Args:
        eps: Total probability mass moved away from the true class.
    """

    def __init__(self, eps=0.1):
        super().__init__()
        self.eps = eps
        self.logsoft = nn.LogSoftmax(dim=1)

    def forward(self, logits, targets):
        """Return the batch-mean smoothed cross-entropy.

        Args:
            logits: Class scores; classes are indexed along dim 1.
            targets: Integer class indices, one per row of ``logits``.
        """
        num_classes = logits.size(1)
        log_probs = self.logsoft(logits)
        # The smoothed target is a constant: keep it out of the autograd graph.
        with torch.no_grad():
            off_value = self.eps / (num_classes - 1)
            smooth = torch.full_like(log_probs, off_value)
            smooth.scatter_(1, targets.unsqueeze(1), 1 - self.eps)
        per_sample = -(smooth * log_probs).sum(dim=1)
        return per_sample.mean()

def kd_loss_logits(student_logits, teacher_logits, T=4.0):
    """Classic logit-matching distillation loss.

    Computes KL(teacher || student) between the temperature-softened class
    distributions, averaged over the batch and multiplied by ``T * T``.

    Args:
        student_logits: Student class scores; classes along dim 1.
        teacher_logits: Teacher class scores with the same layout.
        T: Softmax temperature applied to both sets of logits.
    """
    log_student = F.log_softmax(student_logits / T, dim=1)
    teacher_probs = F.softmax(teacher_logits / T, dim=1)
    # F.kl_div expects log-probabilities as input and probabilities as target.
    divergence = F.kl_div(log_student, teacher_probs, reduction='batchmean')
    return divergence * (T * T)

class DKDLoss(nn.Module):
    """Decoupled Knowledge Distillation loss.

    The distillation signal is split into two proper KL divergences:

    * TCKD (target-class KD): KL between the teacher's and student's *binary*
      distributions ``[p_target, 1 - p_target]``.
    * NCKD (non-target-class KD): KL between the teacher's and student's
      distributions over the non-target classes only, each renormalised to
      sum to one.

    The result is ``(alpha * TCKD + beta * NCKD) * T**2``, averaged over the
    batch. Both terms are genuine KL divergences, so the loss is non-negative
    and is zero when student and teacher logits agree.

    Args:
        alpha: Weight of the target-class term.
        beta: Weight of the non-target-class term.
        T: Softmax temperature applied to both sets of logits.
    """

    def __init__(self, alpha=1.0, beta=8.0, T=4.0):
        super().__init__()
        self.alpha, self.beta, self.T = alpha, beta, T

    def forward(self, s_logits, t_logits, targets):
        """Return the scalar DKD loss.

        Args:
            s_logits: Student class scores; classes along dim 1.
            t_logits: Teacher class scores with the same layout.
            targets: Integer ground-truth class indices, one per row.
        """
        T = self.T
        # Work in float32 log-space so half-precision student logits cannot
        # underflow the small non-target probabilities.
        log_s = F.log_softmax(s_logits.float() / T, dim=1)
        log_t = F.log_softmax(t_logits.float() / T, dim=1)
        gt_mask = F.one_hot(targets, num_classes=s_logits.size(1)).bool()
        neg_inf = float('-inf')

        # log p(target) and log of the total non-target mass, each of shape (B, 1).
        tgt_idx = targets.unsqueeze(1)
        log_s_tgt = log_s.gather(1, tgt_idx)
        log_t_tgt = log_t.gather(1, tgt_idx)
        log_s_rest = log_s.masked_fill(gt_mask, neg_inf).logsumexp(dim=1, keepdim=True)
        log_t_rest = log_t.masked_fill(gt_mask, neg_inf).logsumexp(dim=1, keepdim=True)

        # TCKD: KL between the binary [target, non-target] distributions.
        log_s_bin = torch.cat([log_s_tgt, log_s_rest], dim=1)
        log_t_bin = torch.cat([log_t_tgt, log_t_rest], dim=1)
        tckd = F.kl_div(log_s_bin, log_t_bin, reduction='batchmean', log_target=True)

        # NCKD: KL between the distributions renormalised over non-target classes.
        log_s_hat = log_s - log_s_rest
        log_t_hat = log_t - log_t_rest
        # Mask *before* exponentiating / multiplying so the target column
        # contributes an exact zero in both the forward and backward pass.
        t_hat = log_t_hat.masked_fill(gt_mask, neg_inf).exp()
        log_ratio = (log_t_hat - log_s_hat).masked_fill(gt_mask, 0.0)
        nckd = (t_hat * log_ratio).sum(dim=1).mean()

        return (self.alpha * tckd + self.beta * nckd) * (T * T)


## 5) Eval + generic CE training loop

In [7]:
def accuracy_topk(logits, targets, topk=(1,)):
    """Compute top-k accuracy (in percent) for every ``k`` in ``topk``.

    Args:
        logits: Class scores; classes along dim 1.
        targets: Integer class indices, one per row of ``logits``.
        topk: Iterable of ``k`` values to report.

    Returns:
        A list with one 1-element float tensor per requested ``k``, holding
        the percentage of rows whose target is among the ``k`` highest scores.
    """
    largest_k = max(topk)
    n_samples = targets.size(0)
    # Rank the classes once for the largest k; transpose to (largest_k, batch).
    ranked = logits.topk(largest_k, dim=1, largest=True, sorted=True).indices.t()
    hits = ranked.eq(targets.view(1, -1).expand_as(ranked))
    to_percent = 100.0 / n_samples
    # The first k rows of `hits` cover the top-k guesses of every sample.
    return [
        hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(to_percent)
        for k in topk
    ]

def evaluate(model, loader):
    """Evaluate ``model`` over ``loader`` without tracking gradients.

    Puts the model in eval mode (the mode is not restored afterwards) and
    accumulates sample-weighted cross-entropy and top-1 / top-5 accuracy.

    Args:
        model: Network mapping a batch of inputs to class logits.
        loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        ``(mean_loss, top1_percent, top5_percent)`` over all samples seen.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    total_loss, seen = 0.0, 0
    top1_hits, top5_hits = 0.0, 0.0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            logits = model(inputs)
            batch_loss = criterion(logits, labels)
            acc1, acc5 = accuracy_topk(logits, labels, topk=(1, 5))
            bs = inputs.size(0)
            seen += bs
            # Weight per-batch means by batch size so a smaller last batch
            # does not skew the averages.
            total_loss += batch_loss.item()*bs
            top1_hits += acc1.item()*bs/100.0
            top5_hits += acc5.item()*bs/100.0
    return total_loss/seen, 100*top1_hits/seen, 100*top5_hits/seen

def train_ce(model, train_loader, val_loader, epochs=60, lr=0.1, weight_decay=5e-4, use_ls=False, ls_eps=0.1):
    """Train ``model`` with (optionally label-smoothed) cross-entropy.

    Uses SGD with momentum 0.9, a cosine learning-rate schedule stepped once
    per epoch, and mixed precision when ``USE_AMP`` is set and ``DEVICE`` is a
    CUDA device. The model is evaluated on ``val_loader`` after every epoch;
    progress is printed at epoch 1 and every 10th epoch.

    Args:
        model: Network to train; updated in place.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Loader passed to ``evaluate`` after each epoch.
        epochs: Number of epochs; also the cosine schedule's ``T_max``.
        lr: Initial learning rate.
        weight_decay: L2 weight decay passed to SGD.
        use_ls: Use ``LabelSmoothingCE`` instead of plain cross-entropy.
        ls_eps: Smoothing factor used when ``use_ls`` is true.

    Returns:
        ``(val_loss, top1, top5)`` of the epoch with the highest top-1
        accuracy, or ``(1e9, 0, 0)`` if no epoch improved on 0. The model
        itself keeps its final-epoch weights.
    """
    model.to(DEVICE)
    opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)
    # torch.cuda.amp.{GradScaler,autocast} are deprecated in favour of torch.amp.*;
    # AMP is only switched on when actually running on CUDA.
    amp_on = bool(USE_AMP) and DEVICE.type == 'cuda'
    scaler = torch.amp.GradScaler('cuda', enabled=amp_on)
    ce = LabelSmoothingCE(ls_eps) if use_ls else nn.CrossEntropyLoss()
    tag = 'LS-CE' if use_ls else 'CE'
    best = (1e9, 0, 0)
    for ep in range(1, epochs+1):
        model.train()  # evaluate() leaves the model in eval mode
        for x, y in train_loader:
            x = x.to(DEVICE); y = y.to(DEVICE)
            opt.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=amp_on):
                logits = model(x)
                loss = ce(logits, y)
            # Scale the loss so half-precision gradients do not underflow;
            # scaler.step unscales before the optimizer update.
            scaler.scale(loss).backward()
            scaler.step(opt); scaler.update()
        sched.step()
        vl, a1, a5 = evaluate(model, val_loader)
        if a1 > best[1]:
            best = (vl, a1, a5)
        if ep % 10 == 0 or ep == 1:
            print(f"[{tag}] {ep}/{epochs} | val_loss={vl:.3f} top1={a1:.2f} top5={a5:.2f}")
    return best


## 6) KD-LM training loop

In [8]:
def train_kd_lm(student, teacher, train_loader, val_loader, epochs=60, lr=0.1, alpha=0.5, T=4.0):
    """Distil ``teacher`` into ``student`` with classic logit matching.

    The per-batch loss is
    ``alpha * kd_loss_logits(student, teacher, T) + (1 - alpha) * CE(student, y)``.
    The optimiser is SGD (momentum 0.9, weight decay 5e-4) with a cosine
    schedule stepped once per epoch; mixed precision is used when ``USE_AMP``
    is set and ``DEVICE`` is a CUDA device. The teacher is put in eval mode
    and only ever run under ``torch.no_grad()``.

    Args:
        student: Network to train; updated in place.
        teacher: Network providing the soft targets.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Loader passed to ``evaluate`` after each epoch.
        epochs: Number of epochs; also the cosine schedule's ``T_max``.
        lr: Initial learning rate.
        alpha: Weight of the distillation term (``1 - alpha`` goes to CE).
        T: Distillation temperature.

    Returns:
        ``(val_loss, top1, top5)`` of the epoch with the highest top-1
        accuracy, or ``(1e9, 0, 0)`` if no epoch improved on 0. The student
        keeps its final-epoch weights.
    """
    student.to(DEVICE); teacher.eval().to(DEVICE)
    opt = torch.optim.SGD(student.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)
    # torch.cuda.amp.{GradScaler,autocast} are deprecated in favour of torch.amp.*;
    # AMP is only switched on when actually running on CUDA.
    amp_on = bool(USE_AMP) and DEVICE.type == 'cuda'
    scaler = torch.amp.GradScaler('cuda', enabled=amp_on)
    ce = nn.CrossEntropyLoss()
    best = (1e9, 0, 0)
    for ep in range(1, epochs+1):
        student.train()  # evaluate() leaves the student in eval mode
        for x, y in train_loader:
            x = x.to(DEVICE); y = y.to(DEVICE)
            # Teacher forward pass runs outside autocast and without gradients.
            with torch.no_grad():
                t_logits = teacher(x)
            opt.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=amp_on):
                s_logits = student(x)
                loss = alpha*kd_loss_logits(s_logits, t_logits, T=T) + (1-alpha)*ce(s_logits, y)
            # Scale the loss so half-precision gradients do not underflow.
            scaler.scale(loss).backward()
            scaler.step(opt); scaler.update()
        sched.step()
        vl, a1, a5 = evaluate(student, val_loader)
        if a1 > best[1]:
            best = (vl, a1, a5)
        if ep % 10 == 0 or ep == 1:
            print(f"[KD-LM] {ep}/{epochs} | val_loss={vl:.3f} top1={a1:.2f} top5={a5:.2f}")
    return best


## 7) DKD training loop

In [9]:
def train_dkd(student, teacher, train_loader, val_loader, epochs=60, lr=0.1, alpha=1.0, beta=8.0, T=4.0):
    """Distil ``teacher`` into ``student`` with cross-entropy plus ``DKDLoss``.

    The per-batch loss is ``CE(student, y) + DKDLoss(alpha, beta, T)``.
    The optimiser is SGD (momentum 0.9, weight decay 5e-4) with a cosine
    schedule stepped once per epoch; mixed precision is used when ``USE_AMP``
    is set and ``DEVICE`` is a CUDA device. The teacher is put in eval mode
    and only ever run under ``torch.no_grad()``.

    Args:
        student: Network to train; updated in place.
        teacher: Network providing the soft targets.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Loader passed to ``evaluate`` after each epoch.
        epochs: Number of epochs; also the cosine schedule's ``T_max``.
        lr: Initial learning rate.
        alpha: ``DKDLoss`` weight of the target-class term.
        beta: ``DKDLoss`` weight of the non-target-class term.
        T: Distillation temperature.

    Returns:
        ``(val_loss, top1, top5)`` of the epoch with the highest top-1
        accuracy, or ``(1e9, 0, 0)`` if no epoch improved on 0. The student
        keeps its final-epoch weights.
    """
    student.to(DEVICE); teacher.eval().to(DEVICE)
    dkd = DKDLoss(alpha=alpha, beta=beta, T=T)
    ce = nn.CrossEntropyLoss()
    opt = torch.optim.SGD(student.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)
    # torch.cuda.amp.{GradScaler,autocast} are deprecated in favour of torch.amp.*;
    # AMP is only switched on when actually running on CUDA.
    amp_on = bool(USE_AMP) and DEVICE.type == 'cuda'
    scaler = torch.amp.GradScaler('cuda', enabled=amp_on)
    best = (1e9, 0, 0)
    for ep in range(1, epochs+1):
        student.train()  # evaluate() leaves the student in eval mode
        for x, y in train_loader:
            x = x.to(DEVICE); y = y.to(DEVICE)
            # Teacher forward pass runs outside autocast and without gradients.
            with torch.no_grad():
                t_logits = teacher(x)
            opt.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=amp_on):
                s_logits = student(x)
                loss = ce(s_logits, y) + dkd(s_logits, t_logits, y)
            # Scale the loss so half-precision gradients do not underflow.
            scaler.scale(loss).backward()
            scaler.step(opt); scaler.update()
        sched.step()
        vl, a1, a5 = evaluate(student, val_loader)
        if a1 > best[1]:
            best = (vl, a1, a5)
        if ep % 10 == 0 or ep == 1:
            print(f"[DKD] {ep}/{epochs} | val_loss={vl:.3f} top1={a1:.2f} top5={a5:.2f}")
    return best


## 8) Save/Load helpers

In [10]:
def save_ckpt(model, path):
    """Write ``model``'s weights to ``path`` as ``{'state_dict': ...}``."""
    payload = {'state_dict': model.state_dict()}
    torch.save(payload, path)
def load_ckpt(model, path):
    """Load weights from a ``{'state_dict': ...}`` checkpoint into ``model`` in place.

    Args:
        model: Module whose parameters are overwritten.
        path: Checkpoint file in the format written by ``save_ckpt``.
    """
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all this checkpoint format holds, so a tampered file cannot
    # execute arbitrary code while being loaded.
    ckpt = torch.load(path, map_location=DEVICE, weights_only=True)
    model.load_state_dict(ckpt['state_dict'])


## 9) Orchestrate Task 3.1 (Teacher → SI → LS → LM → DKD)

In [None]:
# End-to-end pipeline: train the teacher, then four VGG-11 students
# (plain CE, label smoothing, logit-matching KD, decoupled KD).
# Every stage saves the model as it stands after its final epoch; the printed
# tuple is the (val_loss, top1, top5) of the best-top-1 epoch, so the two can
# come from different epochs.
# Statement order matters: the students in steps 4 and 5 distil from the
# in-memory `teacher16` trained in step 1.
EPOCHS = 60  # Epoch budget shared by all five runs; use 30 for a quick smoke test, then increase.

# 1) Train the teacher: VGG-16 initialised from ImageNet weights, plain cross-entropy.
teacher16 = make_vgg('vgg16', pretrained=True)
best_T = train_ce(teacher16, train_loader, val_loader, epochs=EPOCHS, lr=0.1)
save_ckpt(teacher16, str(CKPTS/'teacher_vgg16.pt'))
print("Teacher saved:", best_T)

# 2) Independent student (SI) baseline: VGG-11 from scratch, plain cross-entropy.
si = make_vgg('vgg11', pretrained=False)
best_SI = train_ce(si, train_loader, val_loader, epochs=EPOCHS, lr=0.1)
save_ckpt(si, str(CKPTS/'student_SI.pt'))
print("SI saved:", best_SI)

# 3) Label-smoothing (LS) baseline: VGG-11 from scratch, smoothed cross-entropy with eps=0.1.
ls = make_vgg('vgg11', pretrained=False)
best_LS = train_ce(ls, train_loader, val_loader, epochs=EPOCHS, lr=0.1, use_ls=True, ls_eps=0.1)
save_ckpt(ls, str(CKPTS/'student_LS.pt'))
print("LS saved:", best_LS)

# 4) KD, basic logit matching (LM): VGG-11 distilled from the VGG-16 trained in step 1.
lm = make_vgg('vgg11', pretrained=False)
best_LM = train_kd_lm(lm, teacher16, train_loader, val_loader, epochs=EPOCHS, lr=0.1, alpha=0.5, T=4.0)
save_ckpt(lm, str(CKPTS/'student_LM.pt'))
print("LM saved:", best_LM)

# 5) KD, decoupled (DKD): VGG-11 distilled from the same VGG-16 teacher.
dkd = make_vgg('vgg11', pretrained=False)
best_DKD = train_dkd(dkd, teacher16, train_loader, val_loader, epochs=EPOCHS, lr=0.1, alpha=1.0, beta=8.0, T=4.0)
save_ckpt(dkd, str(CKPTS/'student_DKD.pt'))
print("DKD saved:", best_DKD)


  scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)
  with torch.cuda.amp.autocast(enabled=USE_AMP):


[CE] 1/60 | val_loss=4.185 top1=3.94 top5=16.51
[CE] 10/60 | val_loss=3.615 top1=10.73 top5=36.56
[CE] 20/60 | val_loss=3.065 top1=23.00 top5=54.52
[CE] 30/60 | val_loss=2.441 top1=36.78 top5=69.43
[CE] 40/60 | val_loss=1.816 top1=52.38 top5=80.72
[CE] 50/60 | val_loss=1.409 top1=63.54 top5=86.53
[CE] 60/60 | val_loss=1.370 top1=67.15 top5=88.74
Teacher saved: (1.3701172494888305, 67.15, 88.74)
[CE] 1/60 | val_loss=4.607 top1=1.00 top5=5.00
[CE] 10/60 | val_loss=3.714 top1=10.56 top5=35.29
[CE] 20/60 | val_loss=3.116 top1=23.86 top5=52.75
[CE] 30/60 | val_loss=2.496 top1=37.69 top5=67.73
[CE] 40/60 | val_loss=1.896 top1=51.73 top5=78.81
[CE] 50/60 | val_loss=1.503 top1=61.32 top5=85.32
[CE] 60/60 | val_loss=1.510 top1=64.05 top5=87.16


## 10) Quick metrics dump (for your report table/plot later)

In [None]:
def snapshot_metrics(method_name, model):
    """Evaluate ``model`` on the module-level ``val_loader`` and return one table row.

    Args:
        method_name: Label stored under the ``'method'`` key.
        model: Network to evaluate.

    Returns:
        Dict with keys ``method``, ``top1_acc``, ``top5_acc`` and ``val_loss``.
    """
    val_loss, top1, top5 = evaluate(model, val_loader)
    return dict(method=method_name, top1_acc=top1, top5_acc=top5, val_loss=val_loss)

# Rebuild every network and restore its weights from disk, so this cell does
# not depend on the trained models still being alive in the kernel.
T = make_vgg('vgg16')
load_ckpt(T, str(CKPTS/'teacher_vgg16.pt'))
SI = make_vgg('vgg11')
load_ckpt(SI, str(CKPTS/'student_SI.pt'))
LS = make_vgg('vgg11')
load_ckpt(LS, str(CKPTS/'student_LS.pt'))
LM = make_vgg('vgg11')
load_ckpt(LM, str(CKPTS/'student_LM.pt'))
DK = make_vgg('vgg11')
load_ckpt(DK, str(CKPTS/'student_DKD.pt'))

# One metrics row per student; the teacher `T` is loaded above but is not
# added to the table.
rows = []
for name, m in (('SI', SI), ('LS', LS), ('LM', LM), ('DKD', DK)):
    rows.append(snapshot_metrics(name, m))

# Persist the table for the report, then echo it.
df = pd.DataFrame(rows)
df.to_csv(RES/'task3_part1_metrics.csv', index=False)
print(df)
print("Saved:", RES/'task3_part1_metrics.csv')
