Experiment Setting:
1. Train with a large neural net with 2 hidden layers of 1200 rectified linear hidden units.
2. Distill the knowledge from neural net above to a two-layer distilled net with 800 rectified linear hidden units(at the temperature of 20).

In [2]:
# %%writefile teacher.py
# %load teacher.py
from __future__ import print_function
from __future__ import division

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data
from torchvision import datasets, transforms
from torch.autograd import Variable

model = None
optimizer = None
epochs = 20
batch_size = 128
lr = 1e-4

class Teacher(nn.Module):
    def __init__(self):
        super(Teacher, self).__init__()
        self.n_inputs = 28 * 28
        self.n_layer_1 = 1200
        self.n_layer_2 = 1200
        self.n_classes = 10
        self.drop = nn.Dropout(0.5)
        self.affine1 = nn.Linear(self.n_inputs, self.n_layer_1)
        self.affine2 = nn.Linear(self.n_layer_1, self.n_layer_2)
        self.affine3 = nn.Linear(self.n_layer_2, self.n_classes)
    
    def forward(self, x):
        x = x.view(-1, self.n_inputs)
        out1 = self.drop(F.relu(self.affine1(x)))
        out2 = self.drop(F.relu(self.affine2(out1)))
        out3 = self.affine3(out2)
        return out3
    
def train():
    model.train()
    for epoch in xrange(epochs):
        avg_loss = 0
        n_batches = len(train_loader)
        for batch_idx, (data, label) in enumerate(train_loader):
            data, label = data.cuda(), label.cuda()
            data, label = Variable(data), Variable(label)
            optimizer.zero_grad()
            output = F.log_softmax(model(data))
            loss = F.nll_loss(output, label)
            loss.backward()
            optimizer.step()
            avg_loss += loss.data[0]
        avg_loss /= n_batches
        print(avg_loss)

def test():
    model.eval()
    correct = 0
    for data, label in test_loader:
        data, label = data.cuda(), label.cuda()
        data, label = Variable(data, volatile=True), Variable(label)
        output = F.log_softmax(model(data))
        pred = output.data.max(1)[1]
        correct += pred.eq(label.data.view_as(pred)).cpu().sum()
    
    print(100. * correct / len(test_loader.dataset))

    
if __name__ == "__main__":
    model = Teacher()
    model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    kwargs = {'num_workers': 1, 'pin_memory': True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=batch_size, shuffle=True, **kwargs)
    train()
    with open('teacher.params', 'wb') as f:
        torch.save(model, f)
    test()
    

Overwriting teacher.py


In [4]:
# %%writefile student.py
# %load student.py
from __future__ import print_function
from __future__ import division

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data
from torchvision import datasets, transforms
from torch.autograd import Variable

class Student(nn.Module):
    def __init__(self):
        super(Student, self).__init__()
        self.n_inputs = 28 * 28
        self.n_layer_1 = 800
        self.n_layer_2 = 800
        self.n_classes = 10
        self.drop = nn.Dropout(0.5)
        self.affine1 = nn.Linear(self.n_inputs, self.n_layer_1)
        self.affine2 = nn.Linear(self.n_layer_1, self.n_layer_2)
        self.affine3 = nn.Linear(self.n_layer_2, self.n_classes)
    
    def forward(self, x):
        x = x.view(-1, self.n_inputs)
        out1 = self.drop(F.relu(self.affine1(x)))
        out2 = self.drop(F.relu(self.affine2(out1)))
        out3 = self.affine3(out2)
        return out3

Overwriting student.py


In [5]:
# %%writefile teacher-student.py
# %load teacher-student.py
from __future__ import print_function
from __future__ import division

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data
import numpy as np
from torchvision import datasets, transforms
from torch.autograd import Variable
from student import *
from teacher import *

teacher = None
student = None
epochs = 5
batch_size = 128
lr = 1e-3

optimizer = None

def train():
    T = 20
    student.train()
    for epoch in xrange(epochs):
        avg_loss = 0
        n_batches = len(transfer_loader)
        for data, label in transfer_loader:
            data, label = data.cuda(), label.cuda()
            data, label = Variable(data), Variable(label)
            optimizer.zero_grad()
            output_teacher = F.softmax(teacher(data) / T)
            output_student = F.softmax(student(data) / T)
            loss = F.binary_cross_entropy(output_student, output_teacher)
            loss.backward()
            optimizer.step()
            avg_loss += loss.data[0]
        avg_loss /= n_batches
        print(avg_loss)
        
    
def test():
    T = 1
    student.eval()
    correct = 0
    for data, label in test_loader:
        data, label = data.cuda(), label.cuda()
        data, label = Variable(data, volatile=True), Variable(label)
        output = F.log_softmax(student(data) / T)
        pred = output.data.max(1)[1]
        correct += pred.eq(label.data.view_as(pred)).cpu().sum()
    
    print(100. * correct / len(test_loader.dataset))

student = Student()
student.cuda()
with open('teacher.params', 'rb') as f:
    teacher = torch.load(f)

optimizer = optim.Adam(student.parameters(), lr=lr)

train_set = datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))

transfer_data, transfer_labels = [], []
for data, label in train_set:
    if label != 3:
        transfer_data.append(data.tolist())
        transfer_labels.append(label)
        
transfer_data, transfer_labels = torch.Tensor(transfer_data), torch.Tensor(transfer_labels)

kwargs = {'num_workers': 1, 'pin_memory': True}
transfer_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(transfer_data, transfer_labels),
        batch_size=batch_size, shuffle=True, **kwargs)

test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,))
                ])),
        batch_size=batch_size, shuffle=True, **kwargs)

print(teacher)

teacher.eval()
train()
test()

Teacher (
  (drop): Dropout (p = 0.5)
  (affine1): Linear (784 -> 1200)
  (affine2): Linear (1200 -> 1200)
  (affine3): Linear (1200 -> 10)
)
0.316319770031
0.315715478534
0.315667766149
0.315655072624
0.315634672758
95.51
