# SMAI Project - Model Compression with Two-stage Multi-teacher Knowledge Distillation for Web Question Answering System

## Group 2 - Synergy

| | |
|- | -|
| Team Members | Roll no |
| Anurag Ghosh | `2022202023` |
| Aryan Gupta | `2022202028` |
| Vedashree Ranade | `2022201073` |

### Step zero: Import Libraries

In [16]:
!pip3 install datasets
!pip3 install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable


In [17]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import datasets
import transformers
import os
from transformers import AutoTokenizer
import torch.nn.functional as F
from transformers import BertModel, BertConfig
import numpy as np

### Step one: Loading Datasets


MNLI Dataset

In [18]:
mnli = datasets.load_dataset('LysandreJik/glue-mnli-train')

train_dataset = mnli['train']
val_test_dataset = mnli['validation']

split_size = len(val_test_dataset) // 2
val_dataset = val_test_dataset.select(range(0, split_size))
test_dataset = val_test_dataset.select(range(split_size, len(val_test_dataset)))

print("train size", len(train_dataset))
print("val size", len(val_dataset))
print("test size", len(test_dataset))

Found cached dataset parquet (/Users/aryangupta/.cache/huggingface/datasets/LysandreJik___parquet/LysandreJik--glue-mnli-train-7ab8f8a28b0cb6f1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

train size 392702
val size 9823
test size 9824


In [19]:
class MNLIDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

### Step two: Finetuning Teacher Model


In [20]:
def get_data(train_dataset, val_dataset, test_dataset, train_size, val_size, test_size, tokenizer):

  train_dataset = train_dataset.shuffle()
  val_dataset = val_dataset.shuffle()
  test_dataset = test_dataset.shuffle()

  # select a subset of the shuffled datasets
  train_subset = train_dataset.select(range(train_size))
  val_subset = val_dataset.select(range(val_size))
  test_subset = test_dataset.select(range(test_size)) 

  # unique, counts = np.unique(test_subset['label'], return_counts=True)
  # dict(zip(unique, counts))

  train_encodings = tokenizer(train_subset['premise'], train_subset['hypothesis'], truncation=True, padding=True)
  val_encodings = tokenizer(val_subset['premise'], val_subset['hypothesis'], truncation=True, padding=True)
  test_encodings = tokenizer(test_subset['premise'], test_subset['hypothesis'], truncation=True, padding=True)

  train_labels = train_subset['label']
  val_labels = val_subset['label']
  test_labels = test_subset['label']

  train_dataset = MNLIDataset(train_encodings, train_labels)
  val_dataset = MNLIDataset(val_encodings, val_labels)
  test_dataset = MNLIDataset(test_encodings, test_labels)

  train_loader = DataLoader(train_dataset, batch_size=16)
  val_loader = DataLoader(val_dataset, batch_size=16)
  test_loader = DataLoader(test_dataset, batch_size=16)

  return train_loader, val_loader, test_loader, train_labels, val_labels, test_labels, train_subset, val_subset, test_subset

def train_model(model, train_loader, optimizer, lr_scheduler):
    model.train()
    correct_predictions = 0
    total_predictions = 0
    
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        predictions = outputs.logits.argmax(dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += predictions.shape[0]
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    # print("Training completed...")
    
    return correct_predictions / total_predictions
    
def evaluate_model(loader, model):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    logits = np.empty([0, 3])
    i=0
    
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            outputs = model(input_ids, attention_mask=attention_mask)
            batch_logits = outputs.logits.cpu().numpy()
            logits = np.concatenate((logits, batch_logits), axis = 0)
            predictions = outputs.logits.argmax(dim=-1)
            
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += predictions.shape[0]

    return correct_predictions / total_predictions, logits

def fine_tune(teacher_model, train_loader, val_loader, test_loader, lr=5e-5, num_epochs=5, checkpoint_dir=None, resume_from_checkpoint=True, checkpoint_file=None):
    optimizer = transformers.AdamW(teacher_model.parameters(), lr=lr)
    num_training_steps = num_epochs * len(train_loader)
    lr_scheduler = transformers.get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    checkpoint_path = checkpoint_dir + checkpoint_file

    if resume_from_checkpoint:
        if checkpoint_path is None or not os.path.exists(checkpoint_path):
            start_epoch = 0
            print("No saved checkpoints to resume")
        else:
            print("Checkpoint accessing...........")
            checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
            teacher_model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            lr_scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}")
    else:
        start_epoch = 0

    for epoch in range(start_epoch, num_epochs):
        train_acc = train_model(teacher_model, train_loader, optimizer, lr_scheduler)
        val_acc, val_logits = evaluate_model(val_loader, teacher_model)
        print(f'Epoch {epoch + 1}: Train accuracy = {train_acc} Validation accuracy = {val_acc}')

        if checkpoint_dir is not None:
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': teacher_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': lr_scheduler.state_dict()
            }
            torch.save(checkpoint, checkpoint_path)

    print("Evaluating on training data............")
    train_acc, train_logits = evaluate_model(train_loader, teacher_model)
    print("Evaluating on validation data............")
    val_acc, val_logits = evaluate_model(val_loader, teacher_model)
    print("Evaluating on testing data............")
    test_acc, test_logits = evaluate_model(test_loader, teacher_model)
    print(f"Test accuracy = {test_acc}")
    return train_logits, val_logits, test_logits

Teacher model bert base uncased

In [21]:
tokenizer_base_uncased = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')
teacher_model_1 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

train_loader_1, val_loader_1, test_loader_1, train_labels_1, val_labels_1, test_labels_1, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 1000, 220, 220, tokenizer_base_uncased)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [22]:
train_logits_1, val_logits_1, test_logits_1 = fine_tune(teacher_model_1, train_loader_1, val_loader_1, test_loader_1, lr=5e-5, num_epochs=5, checkpoint_dir="./checkpoints/", checkpoint_file="teacher_model_1.pth")



Checkpoint accessing...........
Resuming training from epoch 5
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.55


Teacher model bert large uncased

In [23]:
# tokenizer_large_uncased = transformers.BertTokenizerFast.from_pretrained('bert-large-uncased')
# teacher_model_2 = transformers.BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=3)

# train_loader_2, val_loader_2, test_loader_2, train_labels_2, val_labels_2, test_labels_2, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 500, 100, 100, tokenizer_large_uncased)

In [24]:
# train_logits_2, val_logits_2, test_logits_2 = fine_tune(teacher_model_2, train_loader_2, val_loader_2, test_loader_2,lr=5e-5, checkpoint_dir="./checkpoints/", checkpoint_file="teacher_model_2.pth")

### Step three: Student Model

In [25]:
class MLP(nn.Module):
    def __init__(self, l):
        super(MLP, self).__init__()
        bert_config = BertConfig(hidden_size=252, num_hidden_layers=2)
        self.bert = BertModel(bert_config)
        self.net1 = nn.Linear(252, 3)
        # self.net4 = nn.Linear(60, 15)
        # self.net5 = nn.Linear(15, 3)
        self.leaky_relu = nn.LeakyReLU(l)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids, attention_mask=attention_mask)[0]
        output = torch.softmax(self.net1(bert_output[:, 0]), dim=1)
        return output

In [66]:
def train(student_model, train_loader, optimizer, criterion, train_logits):
    student_model.train()
    train_loss = 0
    train_correct = 0
    i=0
    for data in train_loader:
        batch_size = len(data['input_ids'])
        optimizer.zero_grad()
        output = student_model(data['input_ids'], data['attention_mask'])
        target = torch.tensor(train_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32)
        loss = criterion(output, torch.softmax(target, dim=1, dtype = torch.float32))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        actual = target.argmax(dim=1, keepdim=True)
        train_correct += pred.eq(actual.view_as(pred)).sum().item()
        i +=1
    train_loss /= len(train_loader.dataset)
    train_acc = train_correct / len(train_loader.dataset)
    return train_loss, train_acc

def test(student_model, test_loader, criterion, test_logits):
    student_model.eval()
    test_loss = 0
    test_correct = 0
    pred_labels = []
    pred_soft_labels = []
    i=0
    with torch.no_grad():
        for data in test_loader:
            batch_size = len(data['input_ids'])
            output = student_model(data['input_ids'], data['attention_mask'])
            target = torch.tensor(test_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32)
            test_loss += criterion(output, torch.softmax(target, dim=1, dtype = torch.float32)).item()
            pred = output.argmax(dim=1, keepdim=True)
            pred_labels.extend(pred.tolist())
            pred_soft_labels.extend(output.max(dim=1)[0].tolist())
            actual = target.argmax(dim=1, keepdim=True)
            test_correct += pred.eq(actual.view_as(pred)).sum().item()
            i += 1
    test_loss /= len(test_loader.dataset)
    test_acc = test_correct / len(test_loader.dataset)
    
    print(len(pred_labels), len(pred_labels[0]))
    return test_loss, test_acc, np.array(pred_labels).reshape(-1,1), np.array(pred_soft_labels).reshape(-1,1)

def train_student(student_model, train_logits, val_logits, test_logits, train_red_loader, val_red_loader, test_red_loader, student_optimizer=None, lr=1e-4, epoch=50, checkpoint_dir=None, resume_from_checkpoint=True, checkpoint_file=None):
    if(student_optimizer is None): student_optimizer = torch.optim.Adam(student_model.parameters(), lr)
    criterion = nn.CrossEntropyLoss()
    # epoch = 30
    print("Training...........")
    checkpoint_path = checkpoint_dir + checkpoint_file

    if resume_from_checkpoint:
        if checkpoint_path is None or not os.path.exists(checkpoint_path):
            start_epoch = 0
            print("No saved checkpoints to resume")
        else:
            print("Checkpoint accessing...........")
            checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
            student_model.load_state_dict(checkpoint['model_state_dict'])
            student_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}")
    else:
        start_epoch = 0
    for i in range(start_epoch, epoch):
        train_loss, train_acc = train(student_model, train_red_loader, student_optimizer, criterion, train_logits)
        if(i%10 == 0):   print(f"Epoch {i}   Train loss: {train_loss}    Train accuracy: {train_acc}")
        # print(train_loss, train_acc)
        if checkpoint_dir is not None:
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': student_model.state_dict(),
                'optimizer_state_dict': student_optimizer.state_dict()
            }
            torch.save(checkpoint, checkpoint_path)
    
    print("Evaluating on training data..................")
    train_loss, train_acc, train_pred_labels, train_pred_soft_labels = test(student_model, train_red_loader, criterion, train_logits)
    print(f"Training loss: {train_loss} Train accuracy:  {train_acc}")
    
    print("Evaluating on validation data..................")
    val_loss, val_acc, val_pred_labels, val_pred_soft_labels = test(student_model, val_red_loader, criterion, val_logits)
    print(f"Validation loss: {val_loss} Validation accuracy:  {val_acc}")

    print("Evaluating on testing data..................")
    test_loss, test_acc, test_pred_labels, test_pred_soft_labels  = test(student_model, test_red_loader, criterion, test_logits)
    print(f"Testing loss: {test_loss} Test accuracy:  {test_acc}")
    
    # print(test_pred_labels.shape)
    return train_pred_labels, val_pred_labels, test_pred_labels, train_pred_soft_labels, val_pred_soft_labels, test_pred_soft_labels

def ensemble_models(train_pred_lists, val_pred_lists, test_pred_lists):
    
    train_pred = np.array([np.argmax(np.bincount(np.array(train_pred_lists)[:,i])) for i in range(len(train_pred_lists[1]))])
    val_pred = np.array([np.argmax(np.bincount(np.array(val_pred_lists)[:,i])) for i in range(len(val_pred_lists[1]))])
    test_pred = np.array([np.argmax(np.bincount(np.array(test_pred_lists)[:,i])) for i in range(len(test_pred_lists[1]))])
    
    return train_pred, val_pred, test_pred


Student model: - Adam optimizer - teacher_model_1

student_adam_1

In [27]:
student_adam_1 = MLP(0.1)
student_adam_1_optim = torch.optim.Adam(student_adam_1.parameters(), lr=1e-4)
train_pred_labels_student_adam_1, val_pred_labels_student_adam_1, test_pred_labels_student_adam_1, train_pred_soft_labels_student_adam_1, val_pred_soft_labels_student_adam_1, test_pred_soft_labels_student_adam_1 = train_student(student_adam_1, train_logits_1, val_logits_1, test_logits_1,train_loader_1, val_loader_1, test_loader_1, student_optimizer=student_adam_1_optim, lr=1e-4, epoch=50,  checkpoint_dir="./checkpoints/", checkpoint_file="student_adam_1.pth" )

Training...........
Checkpoint accessing...........
Resuming training from epoch 51
Evaluating on training data..................
1000 1
Training loss: 0.06988550788164138 Train accuracy:  0.418
Evaluating on validation data..................
220 1
Validation loss: 0.07294037016955289 Validation accuracy:  0.35909090909090907
Evaluating on testing data..................
220 1
Testing loss: 0.06973586949435147 Test accuracy:  0.44545454545454544


Student model: - SGD optimizer - teacher_model_1

student_sgd_1

In [28]:
student_sgd_1 = MLP(0.1)
student_sgd_1_optim = torch.optim.SGD(student_sgd_1.parameters(), lr=1e-4, momentum=0.9)
train_pred_labels_student_sgd_1, val_pred_labels_student_sgd_1, test_pred_labels_student_sgd_1, train_pred_soft_labels_student_sgd_1, val_pred_soft_labels_student_sgd_1, test_pred_soft_labels_student_sgd_1 = train_student(student_sgd_1, train_logits_1, val_logits_1, test_logits_1,train_loader_1, val_loader_1, test_loader_1, student_optimizer=student_sgd_1_optim, lr=1e-4, epoch=50,  checkpoint_dir="./checkpoints/", checkpoint_file="student_sgd_1.pth" )

Training...........
Checkpoint accessing...........
Resuming training from epoch 51
Evaluating on training data..................
1000 1
Training loss: 0.06988550788164138 Train accuracy:  0.418
Evaluating on validation data..................
220 1
Validation loss: 0.07294037016955289 Validation accuracy:  0.35909090909090907
Evaluating on testing data..................
220 1
Testing loss: 0.06973586949435147 Test accuracy:  0.44545454545454544


Using Adam optimiser

### Step four: Experimenting by increasing number of layers in student model

For 2 layers

In [29]:
class MLP_2(nn.Module):
    def __init__(self, l):
        super(MLP_2, self).__init__()
        bert_config = BertConfig(hidden_size=252, num_hidden_layers=2)
        self.bert = BertModel(bert_config)
        self.net1 = nn.Linear(252, 100)
        self.net4 = nn.Linear(100, 3)
        self.leaky_relu = nn.LeakyReLU(l)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]
        output = self.net1(bert_output[:, 0, :])
        output = self.leaky_relu(output)
        output = self.net4(output)
        output = torch.softmax(output, dim=1)
        return output

In [30]:
student_adam_2 = MLP_2(0.1)
student_adam_2_optim = torch.optim.Adam(student_adam_2.parameters(), lr=1e-4)
train_pred_labels_student_adam_2, val_pred_labels_student_adam_2, test_pred_labels_student_adam_2, train_pred_soft_labels_student_adam_2, val_pred_soft_labels_student_adam_2, test_pred_soft_labels_student_adam_2 = train_student(student_adam_2, train_logits_1, val_logits_1, test_logits_1,train_loader_1, val_loader_1, test_loader_1, student_optimizer=student_adam_2_optim, lr=1e-4, epoch=20, checkpoint_dir="./checkpoints/", checkpoint_file="student_adam_2.pth" )

Training...........
Checkpoint accessing...........
Resuming training from epoch 21
Evaluating on training data..................
1000 1
Training loss: 0.07218236720561981 Train accuracy:  0.402
Evaluating on validation data..................
220 1
Validation loss: 0.07623248154466802 Validation accuracy:  0.3409090909090909
Evaluating on testing data..................
220 1
Testing loss: 0.07265133424238725 Test accuracy:  0.4090909090909091


For 3 layers

In [31]:
class MLP_3(nn.Module):
    def __init__(self, l):
        super(MLP_3, self).__init__()
        bert_config = BertConfig(hidden_size=252, num_hidden_layers=2)
        self.bert = BertModel(bert_config)
        self.net1 = nn.Linear(252, 60)
        self.net4 = nn.Linear(60, 15)
        self.net5 = nn.Linear(15, 3)
        self.leaky_relu = nn.LeakyReLU(l)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids, attention_mask=attention_mask)[0]
        output = self.leaky_relu(self.net1(bert_output[:, 0, :]))
        output = self.leaky_relu(self.net4(output))
        output = torch.softmax(self.net5(output), dim=1)
        return output

In [32]:
student_adam_3 = MLP_3(0.1)
student_adam_3_optim = torch.optim.Adam(student_adam_3.parameters(), lr=1e-4)
train_pred_labels_student_adam_3, val_pred_labels_student_adam_3, test_pred_labels_student_adam_3, train_pred_soft_labels_student_adam_3, val_pred_soft_labels_student_adam_3, test_pred_soft_labels_student_adam_3 = train_student(student_adam_3, train_logits_1, val_logits_1, test_logits_1,train_loader_1, val_loader_1, test_loader_1, student_optimizer=student_adam_3_optim, lr=1e-4, epoch=20, checkpoint_dir="./checkpoints/", checkpoint_file="student_adam_3.pth" )

Training...........
Checkpoint accessing...........
Resuming training from epoch 21
Evaluating on training data..................
1000 1
Training loss: 0.0728652669787407 Train accuracy:  0.38
Evaluating on validation data..................
220 1
Validation loss: 0.0769100774418224 Validation accuracy:  0.3
Evaluating on testing data..................
220 1
Testing loss: 0.07193247161128304 Test accuracy:  0.41363636363636364


### Step five: Creating multiple 1-o-1 teacher student

Teacher models have learning rates 2e-5, 3e-5, 5e-5 as given in paper

Learning rate = 2e-5

In [33]:
teacher_model_11 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
train_loader_11, val_loader_11, test_loader_11, train_labels_11, val_labels_11, test_labels_11, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 1000, 220, 220, tokenizer_base_uncased)
train_logits_11, val_logits_11, test_logits_11 = fine_tune(teacher_model_11, train_loader_11, val_loader_11, test_loader_11, lr=2e-5, num_epochs=5, checkpoint_dir="./checkpoints/", checkpoint_file="teacher_model_11.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Checkpoint accessing...........
Resuming training from epoch 5
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.5545454545454546


In [34]:
student_11 = MLP(0.1)
student_11_optim = torch.optim.Adam(student_11.parameters(), lr=1e-4)
train_pred_labels_student_11, val_pred_labels_student_11, test_pred_labels_student_11, train_pred_soft_labels_student_11, val_pred_soft_labels_student_11, test_pred_soft_labels_student_11 = train_student(student_11, train_logits_11, val_logits_11, test_logits_11,train_loader_11, val_loader_11, test_loader_11, student_optimizer=student_11_optim, lr=1e-4, epoch=20, checkpoint_dir="./checkpoints/", checkpoint_file="student_11.pth" )

Training...........
Checkpoint accessing...........
Resuming training from epoch 21
Evaluating on training data..................
1000 1
Training loss: 0.06833493030071258 Train accuracy:  0.432
Evaluating on validation data..................
220 1
Validation loss: 0.07114373608068987 Validation accuracy:  0.34545454545454546
Evaluating on testing data..................
220 1
Testing loss: 0.06951195380904458 Test accuracy:  0.39545454545454545


Learning Rate = 3e-5

In [35]:
teacher_model_12 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
train_loader_12, val_loader_12, test_loader_12, train_labels_12, val_labels_12, test_labels_12, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 1000, 220, 220, tokenizer_base_uncased)
train_logits_12, val_logits_12, test_logits_12 = fine_tune(teacher_model_12, train_loader_12, val_loader_12, test_loader_12, lr=3e-5, num_epochs=5, checkpoint_dir="./checkpoints/", checkpoint_file="teacher_model_12.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Checkpoint accessing...........
Resuming training from epoch 5
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.5590909090909091


In [36]:
student_12 = MLP(0.1)
student_12_optim = torch.optim.Adam(student_12.parameters(), lr=1e-4)
train_pred_labels_student_12, val_pred_labels_student_12, test_pred_labels_student_12, train_pred_soft_labels_student_12, val_pred_soft_labels_student_12, test_pred_soft_labels_student_12 = train_student(student_12, train_logits_12, val_logits_12, test_logits_12,train_loader_12, val_loader_12, test_loader_12, student_optimizer=student_12_optim, lr=1e-4, epoch=20, checkpoint_dir="./checkpoints/", checkpoint_file="student_12.pth" )

Training...........
Checkpoint accessing...........
Resuming training from epoch 21
Evaluating on training data..................
1000 1
Training loss: 0.06961423182487488 Train accuracy:  0.384
Evaluating on validation data..................
220 1
Validation loss: 0.07084848040884191 Validation accuracy:  0.38636363636363635
Evaluating on testing data..................
220 1
Testing loss: 0.0704539878801866 Test accuracy:  0.39090909090909093


Learning Rate = 5e-5

In [37]:
teacher_model_13 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
train_loader_13, val_loader_13, test_loader_13, train_labels_13, val_labels_13, test_labels_13, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 1000, 220, 220, tokenizer_base_uncased)
train_logits_13, val_logits_13, test_logits_13 = fine_tune(teacher_model_13, train_loader_13, val_loader_13, test_loader_13, lr=5e-5, num_epochs=5, checkpoint_dir="./checkpoints/", checkpoint_file="teacher_model_13.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Checkpoint accessing...........
Resuming training from epoch 5
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.6272727272727273


In [38]:
student_13 = MLP(0.1)
student_13_optim = torch.optim.Adam(student_13.parameters(), lr=1e-4)
train_pred_labels_student_13, val_pred_labels_student_13, test_pred_labels_student_13, train_pred_soft_labels_student_13, val_pred_soft_labels_student_13, test_pred_soft_labels_student_13 = train_student(student_13, train_logits_13, val_logits_13, test_logits_13,train_loader_13, val_loader_13, test_loader_13, student_optimizer=student_13_optim, lr=1e-4, epoch=20, checkpoint_dir="./checkpoints/", checkpoint_file="student_13.pth" )

Training...........
Checkpoint accessing...........
Resuming training from epoch 21
Evaluating on training data..................
1000 1
Training loss: 0.07187032806873321 Train accuracy:  0.36
Evaluating on validation data..................
220 1
Validation loss: 0.07212309295480901 Validation accuracy:  0.38181818181818183
Evaluating on testing data..................
220 1
Testing loss: 0.0721461529081518 Test accuracy:  0.37727272727272726


### Step six: Performing majority voting ensemble on student models

In [67]:
import itertools
train_pred_lists = []
train_pred_lists.append(list(itertools.chain.from_iterable(train_pred_labels_student_11)))
train_pred_lists.append(list(itertools.chain.from_iterable(train_pred_labels_student_12)))
train_pred_lists.append(list(itertools.chain.from_iterable(train_pred_labels_student_13)))

val_pred_lists = []
val_pred_lists.append(list(itertools.chain.from_iterable(val_pred_labels_student_11)))
val_pred_lists.append(list(itertools.chain.from_iterable(val_pred_labels_student_12)))
val_pred_lists.append(list(itertools.chain.from_iterable(val_pred_labels_student_13)))

test_pred_lists = []
test_pred_lists.append(list(itertools.chain.from_iterable(test_pred_labels_student_11)))
test_pred_lists.append(list(itertools.chain.from_iterable(test_pred_labels_student_12)))
test_pred_lists.append(list(itertools.chain.from_iterable(test_pred_labels_student_13)))

train_pred, val_pred, test_pred = ensemble_models(train_pred_lists, val_pred_lists, test_pred_lists)

In [68]:
from sklearn.metrics import accuracy_score
train_acc = accuracy_score(train_labels_1, train_pred)
val_acc = accuracy_score(val_labels_1, val_pred)
test_acc = accuracy_score(test_labels_1, test_pred)

print(f"Train accuracy:  {train_acc}")
print(f"Validation accuracy:  {val_acc}")
print(f"Test accuracy:  {test_acc}")

Train accuracy:  0.354
Validation accuracy:  0.33181818181818185
Test accuracy:  0.36818181818181817
