In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from google.colab import drive

# Mount Google Drive; the teacher checkpoint is written there during training.
drive.mount('/content/drive')

# Mean / std triples handed to Normalize by both pipelines below.
norm_mean = [0.5071, 0.4867, 0.4408]
norm_std = [0.2675, 0.2565, 0.2761]

# Training pipeline: padded random crop and random horizontal flip, then
# tensor conversion and normalisation.
train_steps = [
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
]
transform_train = transforms.Compose(train_steps)

# Test pipeline: tensor conversion and the same normalisation, no augmentation.
test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
]
transform_test = transforms.Compose(test_steps)

# CIFAR-100 training and test sets (downloaded into ./data when missing).
trainset = torchvision.datasets.CIFAR100(
    root='./data', train=True, download=True, transform=transform_train)
testset = torchvision.datasets.CIFAR100(
    root='./data', train=False, download=True, transform=transform_test)

# Shuffled mini-batches for training, fixed order for evaluation.
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2)

class BasicBlock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm stages plus a shortcut branch.

    The shortcut is an empty ``nn.Sequential`` (it returns its input unchanged)
    unless the stride differs from 1 or the channel count changes; in that case
    it is a 1x1 convolution followed by batch norm so that both branches have
    matching shapes when they are added.

    Args:
        in_planes: number of input channels.
        planes: number of channels produced by the two 3x3 convolutions.
        stride: stride of the first 3x3 convolution and of the projection
            shortcut.
    """

    # Multiplier applied to ``planes`` to get the block's output channel count.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        shape_preserved = stride == 1 and in_planes == out_planes
        if shape_preserved:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
        branch = torch.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))
        branch += self.shortcut(x)
        return torch.relu(branch)

class ResNet32(nn.Module):
    """Residual image classifier built from ``BasicBlock`` stages.

    Layout: a 3x3 stem convolution (3 -> 16 channels) with batch norm, three
    stages of two ``BasicBlock``s each (16, 32 and 64 channels; the second and
    third stage start with stride 2), an 8x8 average pool and a linear layer.

    NOTE(review): two blocks per stage gives 1 + 3*2*2 + 1 = 14 conv/linear
    layers on the main path, although the class name suggests 32. Confirm the
    intended depth before changing it -- saved checkpoints depend on this layout.

    Args:
        num_classes: number of outputs of the final linear layer.
    """

    def __init__(self, num_classes=100):
        super().__init__()
        # Channel count fed to the next block; advanced by _make_layer.
        self.in_planes = 16

        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.layer1 = self._make_layer(16, 2, stride=1)
        self.layer2 = self._make_layer(32, 2, stride=2)
        self.layer3 = self._make_layer(64, 2, stride=2)
        self.linear = nn.Linear(64, num_classes)

    def _make_layer(self, planes, num_blocks, stride):
        """Build one stage of ``num_blocks`` blocks; only the first uses ``stride``."""
        blocks = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            blocks.append(BasicBlock(self.in_planes, planes, block_stride))
            self.in_planes = planes * BasicBlock.expansion
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Map a batch of images to ``num_classes`` logits."""
        features = torch.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)
        pooled = nn.functional.avg_pool2d(features, 8)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)

# Build the network together with its loss and SGD optimizer.
net = ResNet32()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

# Number of mini-batches per epoch, used to average the running loss.
num_train_batches = len(trainloader)

for epoch in range(60):
    epoch_no = epoch + 1

    # ---- one pass over the training set ----
    net.train()
    running_loss = 0.0
    for inputs, labels in trainloader:
        optimizer.zero_grad()
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('[Epoch %d] loss: %.3f' % (epoch_no, running_loss / num_train_batches))

    # ---- top-1 accuracy on the test set ----
    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in testloader:
            predicted = torch.max(net(images).data, 1)[1]
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('[Epoch %d] test accuracy: %.3f%%' % (epoch_no, 100 * correct / total))

    # Written after every epoch, so the file always holds the latest weights.
    torch.save(net.state_dict(), '/content/drive/MyDrive/Teacher32_Cifar100_final_weights.pth')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files already downloaded and verified
Files already downloaded and verified
[Epoch 1] loss: 3.977
[Epoch 1] test accuracy: 11.350%
[Epoch 2] loss: 3.442
[Epoch 2] test accuracy: 18.540%
[Epoch 3] loss: 2.962
[Epoch 3] test accuracy: 26.200%
[Epoch 4] loss: 2.648
[Epoch 4] test accuracy: 28.370%
[Epoch 5] loss: 2.435
[Epoch 5] test accuracy: 33.500%
[Epoch 6] loss: 2.281
[Epoch 6] test accuracy: 32.480%
[Epoch 7] loss: 2.188
[Epoch 7] test accuracy: 36.720%
[Epoch 8] loss: 2.098
[Epoch 8] test accuracy: 39.020%
[Epoch 9] loss: 2.033
[Epoch 9] test accuracy: 41.560%
[Epoch 10] loss: 1.981
[Epoch 10] test accuracy: 39.150%
[Epoch 11] loss: 1.943
[Epoch 11] test accuracy: 36.840%
[Epoch 12] loss: 1.914
[Epoch 12] test accuracy: 39.330%
[Epoch 13] loss: 1.888
[Epoch 13] test accuracy: 44.300%
[Epoch 14] loss: 1.856
[Epoch 14] test accuracy: 41.010%
[Epoch 15] loss

In [None]:
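# Define the teacher (ResNet32) and teacher-assistant (ResNet20) models and the residual blocks they use; the student (ResNet8) is defined in a later cell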

class BasicBlock_32(nn.Module):
    """Residual block used by ``ResNet32``: two 3x3 conv/BN stages plus shortcut.

    When the stride is 1 and the channel count is unchanged the shortcut is an
    empty ``nn.Sequential`` (returns its input as-is); otherwise it is a 1x1
    convolution with batch norm that brings the input to the output shape.

    Args:
        in_planes: number of input channels.
        planes: number of channels produced by the two 3x3 convolutions.
        stride: stride of the first 3x3 convolution and of the projection
            shortcut.
    """

    # Multiplier applied to ``planes`` to get the block's output channel count.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        projected_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        projection = []
        if stride != 1 or in_planes != projected_planes:
            projection = [
                nn.Conv2d(in_planes, projected_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(projected_planes),
            ]
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        """Apply both conv stages, add the shortcut branch and apply ReLU."""
        hidden = torch.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        hidden += self.shortcut(x)
        return torch.relu(hidden)

class ResNet32(nn.Module):
    """Teacher network: 3x3 stem, three ``BasicBlock_32`` stages, pool, linear.

    The stages have 16, 32 and 64 channels with two blocks each; stages two and
    three open with a stride-2 block. An 8x8 average pool and a linear layer
    produce the logits. Attribute names (``conv1``, ``bn1``, ``layer1..3``,
    ``linear``) determine the state-dict keys used when loading a checkpoint.

    NOTE(review): two blocks per stage gives 1 + 3*2*2 + 1 = 14 conv/linear
    layers on the main path, although the class name suggests 32. Confirm the
    intended depth before changing it -- saved checkpoints depend on this layout.

    Args:
        num_classes: number of outputs of the final linear layer.
    """

    def __init__(self, num_classes=100):
        super().__init__()
        # Channel count fed to the next block; advanced by _make_layer.
        self.in_planes = 16

        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.layer1 = self._make_layer(16, 2, stride=1)
        self.layer2 = self._make_layer(32, 2, stride=2)
        self.layer3 = self._make_layer(64, 2, stride=2)
        self.linear = nn.Linear(64, num_classes)

    def _make_layer(self, planes, num_blocks, stride):
        """Build one stage of ``num_blocks`` blocks; only the first uses ``stride``."""
        stage = []
        per_block_strides = [stride] + [1] * (num_blocks - 1)
        for block_stride in per_block_strides:
            stage.append(BasicBlock_32(self.in_planes, planes, block_stride))
            self.in_planes = planes * BasicBlock_32.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Map a batch of images to ``num_classes`` logits."""
        activations = torch.relu(self.bn1(self.conv1(x)))
        activations = self.layer3(self.layer2(self.layer1(activations)))
        pooled = nn.functional.avg_pool2d(activations, 8)
        return self.linear(pooled.view(pooled.size(0), -1))

class BasicBlock(nn.Module):
    """Residual block shared by ``ResNet20`` and ``ResNet8``.

    Main path: 3x3 conv -> BN -> ReLU -> 3x3 conv -> BN. The shortcut is an
    empty ``nn.Sequential`` when the stride is 1 and the channel count is
    unchanged, otherwise a 1x1 conv + BN projection. The two paths are summed
    and passed through ReLU.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        stride: stride of the first convolution and of the projection shortcut.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # One in-place ReLU module reused for both activations.
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.stride = stride

        keeps_shape = stride == 1 and in_channels == out_channels
        projection = [] if keeps_shape else [
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
            nn.BatchNorm2d(out_channels),
        ]
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        main += self.shortcut(x)
        return self.relu(main)

class ResNet20(nn.Module):
    """Teacher-assistant network: stem conv, three stages of three blocks, pool, fc.

    The stages have 16, 32 and 64 channels (stages two and three open with a
    stride-2 block), which gives 1 + 3*3*2 + 1 = 20 conv/linear layers on the
    main path. Features are pooled to 1x1 by adaptive average pooling before
    the final linear layer.

    Args:
        num_classes: number of outputs of the final linear layer.
    """

    def __init__(self, num_classes=100):
        super().__init__()
        # Channel count fed to the next block; advanced by make_layer.
        self.in_channels = 16
        self.conv = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(BasicBlock, 16, 3, stride=1)
        self.layer2 = self.make_layer(BasicBlock, 32, 3, stride=2)
        self.layer3 = self.make_layer(BasicBlock, 64, 3, stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, block, out_channels, num_blocks, stride):
        """Stack ``num_blocks`` instances of ``block``; only the first uses ``stride``."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_channels, out_channels, block_stride))
            self.in_channels = out_channels
        return nn.Sequential(*stage)

    def forward(self, x):
        """Map a batch of images to ``num_classes`` logits."""
        out = self.relu(self.bn(self.conv(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        out = self.avg_pool(out)
        return self.fc(out.view(out.size(0), -1))



In [None]:
from torch.utils.data import Subset, random_split

# Augmented pipeline, applied to the training split only.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276))
])

# Deterministic pipeline, applied to the held-out split only.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.507, 0.487, 0.441), (0.267, 0.256, 0.276))
])

# Two views of the CIFAR-100 training images that differ only in their
# transform. Keeping them as separate objects is essential: a Subset shares its
# parent dataset, so changing `.transform` on the parent would also change the
# transform seen by the training split.
cifar100_dataset = datasets.CIFAR100(root='./data', train=True, download=True, transform=train_transform)
cifar100_eval_view = datasets.CIFAR100(root='./data', train=True, download=False, transform=test_transform)

# 80% of the images for training, the remainder held out for evaluation.
train_length = int(len(cifar100_dataset) * 0.8)
test_length = len(cifar100_dataset) - train_length

# Seeded split so the same images are held out on every run.
SPLIT_SEED = 0
train_dataset, held_out_split = random_split(
    cifar100_dataset,
    [train_length, test_length],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Held-out images, read through the non-augmented view. Only the held-out
# indices are used, so the evaluation set does not overlap the training split.
test_dataset = Subset(cifar100_eval_view, held_out_split.indices)

assert len(train_dataset) == train_length
assert len(test_dataset) == test_length
assert set(train_dataset.indices).isdisjoint(test_dataset.indices)

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [00:15<00:00, 10624845.56it/s]


Extracting ./data/cifar-100-python.tar.gz to ./data


In [None]:
from google.colab import drive

# Instantiate the teacher (ResNet32) and the teacher assistant (ResNet20).
teacher = ResNet32()
TA = ResNet20()

drive.mount('/content/drive')

# Load the pre-trained teacher weights. Loading is strict (the default) so a
# checkpoint whose keys do not match the model raises immediately instead of
# silently leaving some layers at their random initialisation.
teacher_state = torch.load(
    '/content/drive/MyDrive/Teacher32_Cifar100_final_weights.pth',
    map_location='cpu',
)
teacher.load_state_dict(teacher_state)

# Distillation loss and the optimizer / LR schedule for the TA.
criterion = nn.KLDivLoss(reduction='batchmean')
optimizer = optim.SGD(TA.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

# The teacher only produces targets: keep batch norm in inference mode.
teacher.eval()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


ResNet32(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Sequential(
    (0): BasicBlock_32(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential()
    )
    (1): BasicBlock_32(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1,

In [None]:
# Train the TA model by knowledge distillation from the teacher model.

temperature = 3.3
num_epochs = 30

# The teacher is only queried for targets, so it stays in inference mode.
teacher.eval()

for epoch in range(num_epochs):
    # ---- training pass ----
    TA.train()
    train_loss = 0.0
    train_correct = 0

    for images, labels in train_loader:
        optimizer.zero_grad()

        # Temperature-scaled logits; the teacher forward pass needs no gradient.
        with torch.no_grad():
            teacher_output = teacher(images)
        teacher_output = teacher_output / temperature
        TA_output = TA(images) / temperature

        # KL divergence between the softened distributions, rescaled by T^2.
        ta_log_probs = nn.functional.log_softmax(TA_output, dim=1)
        teacher_probs = nn.functional.softmax(teacher_output, dim=1)
        loss = criterion(ta_log_probs, teacher_probs) * (temperature ** 2)

        loss.backward()
        optimizer.step()

        # Sample-weighted loss and number of correct top-1 predictions.
        batch_size = images.size(0)
        train_loss += loss.item() * batch_size
        predicted = torch.max(TA_output.data, 1)[1]
        train_correct += (predicted == labels).sum().item()

    scheduler.step()

    # ---- evaluation pass (cross-entropy against the true labels) ----
    TA.eval()
    test_loss = 0.0
    test_correct = 0

    with torch.no_grad():
        for images, labels in test_loader:
            TA_output = TA(images)
            batch_loss = nn.functional.cross_entropy(TA_output, labels)
            test_loss += batch_loss.item() * images.size(0)
            predicted = torch.max(TA_output.data, 1)[1]
            test_correct += (predicted == labels).sum().item()

    # Per-sample averages over each split.
    n_train = len(train_dataset)
    n_test = len(test_dataset)
    print('Epoch [{}/{}], Train Loss: {:.4f}, Train Acc: {:.2f}%, Test Loss: {:.4f}, Test Acc: {:.2f}%'
          .format(epoch+1, num_epochs, train_loss/n_train, 100*train_correct/n_train,
                  test_loss/n_test, 100*test_correct/n_test))

# Save the TA model weights
torch.save(TA.state_dict(), '/content/drive/MyDrive/TA_ResNet20_Cifar100_weights.pth')


Epoch [1/30], Train Loss: 5.1650, Train Acc: 7.09%, Test Loss: 4.1798, Test Acc: 10.02%
Epoch [2/30], Train Loss: 3.3149, Train Acc: 15.63%, Test Loss: 3.5676, Test Acc: 18.78%
Epoch [3/30], Train Loss: 2.3336, Train Acc: 23.71%, Test Loss: 3.1611, Test Acc: 24.09%
Epoch [4/30], Train Loss: 1.7625, Train Acc: 29.60%, Test Loss: 2.6766, Test Acc: 31.52%
Epoch [5/30], Train Loss: 1.4419, Train Acc: 33.68%, Test Loss: 2.7749, Test Acc: 32.14%
Epoch [6/30], Train Loss: 1.2116, Train Acc: 36.93%, Test Loss: 2.3883, Test Acc: 37.73%
Epoch [7/30], Train Loss: 1.0623, Train Acc: 39.16%, Test Loss: 2.3530, Test Acc: 38.73%
Epoch [8/30], Train Loss: 0.9589, Train Acc: 40.79%, Test Loss: 2.2348, Test Acc: 40.67%
Epoch [9/30], Train Loss: 0.8774, Train Acc: 41.88%, Test Loss: 2.1780, Test Acc: 41.91%
Epoch [10/30], Train Loss: 0.8090, Train Acc: 43.30%, Test Loss: 2.1008, Test Acc: 43.52%
Epoch [11/30], Train Loss: 0.7578, Train Acc: 44.17%, Test Loss: 2.1999, Test Acc: 41.60%
Epoch [12/30], Train

In [None]:
#without replacing the fc of student

# Reload the distilled TA weights from Drive and put the TA in inference mode.
# Loading is strict (the default) so a mismatched checkpoint raises instead of
# silently leaving layers untouched.
drive.mount('/content/drive')
ta_state = torch.load('/content/drive/MyDrive/TA_ResNet20_Cifar100_weights.pth', map_location='cpu')
TA.load_state_dict(ta_state)
TA.eval()


class ResNet8(nn.Module):
    """Student network: stem conv, three single-block stages, pool, fc.

    One ``BasicBlock`` per stage (16, 32 and 64 channels; stages two and three
    use stride 2) gives 1 + 3*1*2 + 1 = 8 conv/linear layers on the main path.
    The classifier ``fc`` is a freshly initialised linear layer.

    Args:
        num_classes: number of outputs of the final linear layer.
    """

    def __init__(self, num_classes=100):
        super().__init__()
        # Channel count fed to the next block; advanced by make_layer.
        self.in_channels = 16
        self.conv = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(BasicBlock, 16, 1, stride=1)
        self.layer2 = self.make_layer(BasicBlock, 32, 1, stride=2)
        self.layer3 = self.make_layer(BasicBlock, 64, 1, stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, block, out_channels, num_blocks, stride):
        """Stack ``num_blocks`` instances of ``block``; only the first uses ``stride``."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_channels, out_channels, block_stride))
            self.in_channels = out_channels
        return nn.Sequential(*stage)

    def forward(self, x):
        """Map a batch of images to ``num_classes`` logits."""
        out = self.relu(self.bn(self.conv(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        out = self.avg_pool(out)
        return self.fc(out.view(out.size(0), -1))


# Fresh student with randomly initialised weights.
student = ResNet8()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Experiment: student keeps its own fc layer; distillation loss is scaled by temperature**2.

criterion = nn.KLDivLoss(reduction='batchmean')
optimizer = optim.SGD(student.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

temperature = 3.3
num_epochs = 40

for epoch in range(num_epochs):
    # ---- training pass: match the TA's softened output distribution ----
    student.train()
    train_loss = 0.0
    train_correct = 0

    for images, labels in train_loader:
        optimizer.zero_grad()

        # The TA only supplies targets, so its forward pass needs no gradient.
        with torch.no_grad():
            TA_output = TA(images)
        TA_output = TA_output / temperature
        student_output = student(images) / temperature

        student_log_probs = nn.functional.log_softmax(student_output, dim=1)
        ta_probs = nn.functional.softmax(TA_output, dim=1)
        loss = criterion(student_log_probs, ta_probs) * (temperature ** 2)

        loss.backward()
        optimizer.step()

        # Sample-weighted loss and number of correct top-1 predictions.
        train_loss += loss.item() * images.size(0)
        predicted = torch.max(student_output.data, 1)[1]
        train_correct += (predicted == labels).sum().item()

    scheduler.step()

    # ---- evaluation pass: cross-entropy against the true labels ----
    student.eval()
    test_loss = 0.0
    test_correct = 0

    with torch.no_grad():
        for images, labels in test_loader:
            student_output = student(images)
            batch_loss = nn.functional.cross_entropy(student_output, labels)
            test_loss += batch_loss.item() * images.size(0)
            predicted = torch.max(student_output.data, 1)[1]
            test_correct += (predicted == labels).sum().item()

    # Per-sample averages over each split.
    n_train = len(train_dataset)
    n_test = len(test_dataset)
    print('Epoch [{}/{}], Train Loss: {:.4f}, Train Acc: {:.2f}%, Test Loss: {:.4f}, Test Acc: {:.2f}%'
          .format(epoch+1, num_epochs, train_loss/n_train, 100*train_correct/n_train,
                  test_loss/n_test, 100*test_correct/n_test))


# Save the student model weights
torch.save(student.state_dict(), '/content/drive/MyDrive/student_cifar100_fc_weights.pth')



Epoch [1/40], Train Loss: 4.2591, Train Acc: 8.58%, Test Loss: 4.1141, Test Acc: 12.68%
Epoch [2/40], Train Loss: 2.4071, Train Acc: 18.68%, Test Loss: 3.4664, Test Acc: 21.95%
Epoch [3/40], Train Loss: 1.6769, Train Acc: 25.21%, Test Loss: 3.0782, Test Acc: 25.99%
Epoch [4/40], Train Loss: 1.3363, Train Acc: 29.04%, Test Loss: 3.1960, Test Acc: 25.75%
Epoch [5/40], Train Loss: 1.1400, Train Acc: 31.77%, Test Loss: 3.1586, Test Acc: 27.43%
Epoch [6/40], Train Loss: 1.0293, Train Acc: 33.19%, Test Loss: 2.5180, Test Acc: 34.88%
Epoch [7/40], Train Loss: 0.9450, Train Acc: 34.64%, Test Loss: 2.7616, Test Acc: 32.01%
Epoch [8/40], Train Loss: 0.8962, Train Acc: 35.01%, Test Loss: 2.4855, Test Acc: 34.84%
Epoch [9/40], Train Loss: 0.8583, Train Acc: 35.74%, Test Loss: 2.6226, Test Acc: 33.80%
Epoch [10/40], Train Loss: 0.8274, Train Acc: 36.40%, Test Loss: 2.9713, Test Acc: 31.56%
Epoch [11/40], Train Loss: 0.8120, Train Acc: 36.37%, Test Loss: 2.8304, Test Acc: 32.47%
Epoch [12/40], Train

In [None]:
# Experiment: distillation loss NOT multiplied by temperature**2; the student
# keeps its own (randomly initialised) fc layer.

# Start from a freshly initialised student. Without this the cell would keep
# training whatever `student` the previous training cell left behind, so its
# numbers would depend on cell execution order rather than on this experiment.
student = ResNet8()

criterion = nn.KLDivLoss(reduction='batchmean')
optimizer = optim.SGD(student.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

temperature = 3.3
num_epochs = 40

for epoch in range(num_epochs):
    student.train()
    train_loss = 0.0
    train_correct = 0

    for images, labels in train_loader:
        optimizer.zero_grad()

        # The TA only supplies targets, so its forward pass needs no gradient.
        with torch.no_grad():
            TA_output = TA(images)
        student_output = student(images)

        # Apply temperature scaling to logits
        TA_output = TA_output / temperature
        student_output = student_output / temperature

        # Plain KL divergence between the softened distributions (no T^2 factor).
        loss = criterion(nn.functional.log_softmax(student_output, dim=1),
                         nn.functional.softmax(TA_output, dim=1))

        loss.backward()
        optimizer.step()

        # Sample-weighted loss and number of correct top-1 predictions.
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(student_output.data, 1)
        train_correct += (predicted == labels).sum().item()

    scheduler.step()

    # Evaluate with cross-entropy against the true labels.
    student.eval()
    test_loss = 0.0
    test_correct = 0

    with torch.no_grad():
        for images, labels in test_loader:
            student_output = student(images)
            loss = nn.functional.cross_entropy(student_output, labels)
            test_loss += loss.item() * images.size(0)
            _, predicted = torch.max(student_output.data, 1)
            test_correct += (predicted == labels).sum().item()

    print('Epoch [{}/{}], Train Loss: {:.4f}, Train Acc: {:.2f}%, Test Loss: {:.4f}, Test Acc: {:.2f}%'
          .format(epoch+1, num_epochs, train_loss/len(train_dataset), 100*train_correct/len(train_dataset),
                  test_loss/len(test_dataset), 100*test_correct/len(test_dataset)))


# Save the student model weights.
# NOTE(review): other training cells in this notebook write to the same path,
# so each run overwrites the previous checkpoint -- confirm that is intended.
torch.save(student.state_dict(), '/content/drive/MyDrive/student_cifar100_fc_weights.pth')



Epoch [1/40], Train Loss: 0.4853, Train Acc: 4.65%, Test Loss: 4.2361, Test Acc: 6.71%
Epoch [2/40], Train Loss: 0.3484, Train Acc: 8.46%, Test Loss: 3.9424, Test Acc: 10.57%
Epoch [3/40], Train Loss: 0.2836, Train Acc: 11.89%, Test Loss: 3.6119, Test Acc: 13.28%
Epoch [4/40], Train Loss: 0.2421, Train Acc: 14.61%, Test Loss: 3.6146, Test Acc: 15.13%
Epoch [5/40], Train Loss: 0.2108, Train Acc: 17.31%, Test Loss: 3.8683, Test Acc: 15.54%
Epoch [6/40], Train Loss: 0.1850, Train Acc: 19.72%, Test Loss: 3.2256, Test Acc: 20.05%
Epoch [7/40], Train Loss: 0.1671, Train Acc: 21.62%, Test Loss: 3.3436, Test Acc: 20.65%
Epoch [8/40], Train Loss: 0.1524, Train Acc: 23.16%, Test Loss: 3.1655, Test Acc: 21.39%
Epoch [9/40], Train Loss: 0.1432, Train Acc: 24.54%, Test Loss: 3.3051, Test Acc: 22.16%
Epoch [10/40], Train Loss: 0.1360, Train Acc: 25.56%, Test Loss: 3.1325, Test Acc: 23.95%
Epoch [11/40], Train Loss: 0.1294, Train Acc: 26.22%, Test Loss: 3.0319, Test Acc: 24.38%
Epoch [12/40], Train L

In [None]:
# Replace the student's fully connected layer with the teacher's linear layer (64 -> 100)


drive.mount('/content/drive')

# Load the trained TA model weights. Loading is strict (the default) so a
# mismatched checkpoint raises instead of silently leaving layers untouched.
ta_state = torch.load('/content/drive/MyDrive/TA_ResNet20_Cifar100_weights.pth', map_location='cpu')
TA.load_state_dict(ta_state)

# The TA only supplies distillation targets from here on.
TA.eval()


class ResNet8(nn.Module):
    """Student network whose classifier starts from the teacher's linear layer.

    Same backbone as the plain student (stem conv, one ``BasicBlock`` per stage
    with 16/32/64 channels, adaptive average pool), but ``fc`` is initialised
    with a copy of ``teacher.linear``'s weight and bias. ``teacher`` is read
    from the notebook's global scope when the model is constructed.

    Args:
        num_classes: accepted for signature compatibility; the classifier is
            fixed at 64 -> 100 to match the teacher's linear layer.
    """

    def __init__(self, num_classes=100):
        super(ResNet8, self).__init__()
        # Channel count fed to the next block; advanced by make_layer.
        self.in_channels = 16
        self.conv = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(BasicBlock, 16, 1, stride=1)
        self.layer2 = self.make_layer(BasicBlock, 32, 1, stride=2)
        self.layer3 = self.make_layer(BasicBlock, 64, 1, stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))

        # Copy (not alias) the teacher's classifier parameters. Assigning
        # `teacher.linear.weight.data` directly would make both layers share
        # one storage, so every in-place optimizer update on the student would
        # also overwrite the teacher's weights. load_state_dict copies values
        # into the student's own tensors.
        self.fc = nn.Linear(64, 100)
        self.fc.load_state_dict(teacher.linear.state_dict())

    def make_layer(self, block, out_channels, num_blocks, stride):
        """Stack ``num_blocks`` instances of ``block``; only the first uses ``stride``."""
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of images to 100 logits."""
        out = self.conv(x)
        out = self.bn(out)
        out = self.relu(out)

        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)

        out = self.avg_pool(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out


# Fresh student whose classifier starts from the teacher's weights.
student = ResNet8()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Experiment: student with the replaced fc layer; distillation loss without the temperature**2 factor.

criterion = nn.KLDivLoss(reduction='batchmean')
optimizer = optim.SGD(student.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

temperature = 3.3
num_epochs = 40

for epoch in range(num_epochs):
    # ---- training pass: match the TA's softened output distribution ----
    student.train()
    train_loss = 0.0
    train_correct = 0

    for images, labels in train_loader:
        optimizer.zero_grad()

        # The TA only supplies targets, so its forward pass needs no gradient.
        with torch.no_grad():
            TA_output = TA(images)
        TA_output = TA_output / temperature
        student_output = student(images) / temperature

        # Plain KL divergence between the softened distributions.
        student_log_probs = nn.functional.log_softmax(student_output, dim=1)
        ta_probs = nn.functional.softmax(TA_output, dim=1)
        loss = criterion(student_log_probs, ta_probs)

        loss.backward()
        optimizer.step()

        # Sample-weighted loss and number of correct top-1 predictions.
        train_loss += loss.item() * images.size(0)
        predicted = torch.max(student_output.data, 1)[1]
        train_correct += (predicted == labels).sum().item()

    scheduler.step()

    # ---- evaluation pass: cross-entropy against the true labels ----
    student.eval()
    test_loss = 0.0
    test_correct = 0

    with torch.no_grad():
        for images, labels in test_loader:
            student_output = student(images)
            batch_loss = nn.functional.cross_entropy(student_output, labels)
            test_loss += batch_loss.item() * images.size(0)
            predicted = torch.max(student_output.data, 1)[1]
            test_correct += (predicted == labels).sum().item()

    # Per-sample averages over each split.
    n_train = len(train_dataset)
    n_test = len(test_dataset)
    print('Epoch [{}/{}], Train Loss: {:.4f}, Train Acc: {:.2f}%, Test Loss: {:.4f}, Test Acc: {:.2f}%'
          .format(epoch+1, num_epochs, train_loss/n_train, 100*train_correct/n_train,
                  test_loss/n_test, 100*test_correct/n_test))


# Save the student model weights
torch.save(student.state_dict(), '/content/drive/MyDrive/student_cifar100_fc_weights.pth')



Epoch [1/40], Train Loss: 0.3581, Train Acc: 10.91%, Test Loss: 4.3687, Test Acc: 12.32%
Epoch [2/40], Train Loss: 0.2344, Train Acc: 18.12%, Test Loss: 3.5091, Test Acc: 18.72%
Epoch [3/40], Train Loss: 0.1815, Train Acc: 22.57%, Test Loss: 3.6278, Test Acc: 20.30%
Epoch [4/40], Train Loss: 0.1538, Train Acc: 25.19%, Test Loss: 3.3707, Test Acc: 21.87%
Epoch [5/40], Train Loss: 0.1365, Train Acc: 26.64%, Test Loss: 3.0284, Test Acc: 25.64%
Epoch [6/40], Train Loss: 0.1272, Train Acc: 27.66%, Test Loss: 3.1678, Test Acc: 24.59%
Epoch [7/40], Train Loss: 0.1209, Train Acc: 28.35%, Test Loss: 3.0752, Test Acc: 26.12%
Epoch [8/40], Train Loss: 0.1181, Train Acc: 28.52%, Test Loss: 3.1218, Test Acc: 25.65%
Epoch [9/40], Train Loss: 0.1146, Train Acc: 28.56%, Test Loss: 3.1547, Test Acc: 26.03%
Epoch [10/40], Train Loss: 0.1123, Train Acc: 29.05%, Test Loss: 3.3496, Test Acc: 22.77%
Epoch [11/40], Train Loss: 0.1112, Train Acc: 29.06%, Test Loss: 3.1527, Test Acc: 23.44%
Epoch [12/40], Trai

In [None]:
# Train the student (with the replaced fc layer) on a weighted sum of
#   - a hard-label negative log-likelihood (NLL) loss, and
#   - the distillation loss scaled by temperature**2.

criterion = nn.KLDivLoss(reduction='batchmean')
optimizer = optim.SGD(student.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

temperature = 3.16
num_epochs = 40
# Weight of the hard-label term; (1 - alpha) goes to the distillation term.
alpha = 0.5

for epoch in range(num_epochs):
    student.train()
    train_loss = 0.0
    train_correct = 0

    for images, labels in train_loader:
        optimizer.zero_grad()

        # The TA only supplies targets, so its forward pass needs no gradient.
        with torch.no_grad():
            TA_output = TA(images)
        # Keep the raw student logits: the hard-label term uses them unscaled.
        student_logits = student(images)

        # Apply temperature scaling to logits
        TA_output = TA_output / temperature
        student_output = student_logits / temperature

        # Distillation loss on the softened distributions, rescaled by T^2.
        distillation_loss = criterion(nn.functional.log_softmax(student_output, dim=1),
                                      nn.functional.softmax(TA_output, dim=1)) * (temperature ** 2)

        # Hard-label loss on the STUDENT's predictions. It was previously
        # computed from the TA output, which is produced under no_grad and
        # therefore contributed no gradient to the student at all.
        nll_loss = nn.functional.nll_loss(nn.functional.log_softmax(student_logits, dim=1), labels)

        # Combine the two losses using the weight alpha
        loss = alpha * nll_loss + (1 - alpha) * distillation_loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Sample-weighted loss and number of correct top-1 predictions.
        train_loss += loss.item() * images.size(0)
        _, predicted = torch.max(student_logits.data, 1)
        train_correct += (predicted == labels).sum().item()

    scheduler.step()

    # Evaluate the student model on the held-out set
    student.eval()
    test_loss = 0.0
    test_correct = 0

    with torch.no_grad():
        for images, labels in test_loader:
            student_output = student(images)
            loss = nn.functional.cross_entropy(student_output, labels)
            test_loss += loss.item() * images.size(0)
            _, predicted = torch.max(student_output.data, 1)
            test_correct += (predicted == labels).sum().item()

    print('Epoch [{}/{}], Train Loss: {:.4f}, Train Acc: {:.2f}%, Test Loss: {:.4f}, Test Acc: {:.2f}%'
          .format(epoch+1, num_epochs, train_loss/len(train_dataset), 100*train_correct/len(train_dataset),
                  test_loss/len(test_dataset), 100*test_correct/len(test_dataset)))








Epoch [1/40], Train Loss: 0.5111, Train Acc: 41.52%, Test Loss: 2.1876, Test Acc: 41.63%
Epoch [2/40], Train Loss: 0.5315, Train Acc: 41.05%, Test Loss: 2.4883, Test Acc: 37.52%
Epoch [3/40], Train Loss: 0.5670, Train Acc: 40.52%, Test Loss: 2.7210, Test Acc: 33.27%
Epoch [4/40], Train Loss: 0.6028, Train Acc: 39.81%, Test Loss: 2.3628, Test Acc: 37.87%
Epoch [5/40], Train Loss: 0.6140, Train Acc: 39.41%, Test Loss: 2.9370, Test Acc: 32.47%
Epoch [6/40], Train Loss: 0.6327, Train Acc: 39.34%, Test Loss: 2.5183, Test Acc: 36.93%
Epoch [7/40], Train Loss: 0.6384, Train Acc: 39.15%, Test Loss: 2.3909, Test Acc: 38.13%
Epoch [8/40], Train Loss: 0.6432, Train Acc: 39.15%, Test Loss: 2.4354, Test Acc: 37.88%
Epoch [9/40], Train Loss: 0.6482, Train Acc: 39.02%, Test Loss: 2.4595, Test Acc: 36.19%
Epoch [10/40], Train Loss: 0.6559, Train Acc: 38.90%, Test Loss: 2.3300, Test Acc: 38.22%
Epoch [11/40], Train Loss: 0.6482, Train Acc: 38.75%, Test Loss: 2.5488, Test Acc: 35.97%
Epoch [12/40], Trai