# SMAI Project - Model Compression with Two-stage Multi-teacher Knowledge Distillation for Web Question Answering System

## Group 2 - Synergy

| | |
|- | -|
| Team Members | Roll no |
| Anurag Ghosh | `2022202023` |
| Aryan Gupta | `2022202028` |
| Vedashree Ranade | `2022201073` |

### Step zero: Import Libraries

In [2]:
!pip3 install datasets
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Do

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import datasets
import transformers
import os
from transformers import AutoTokenizer
import torch.nn.functional as F
from transformers import BertModel, BertConfig
import numpy as np

### Step one: Loading Datasets


RTE Dataset

In [4]:
############################# rte dataset ###########################

rte = datasets.load_dataset('SetFit/rte')

print("Dataset:")
print(rte)

train_dataset = rte['train']
val_test_dataset = rte['validation']
test_dataset = rte['test']

split_size = len(val_test_dataset) // 2
val_dataset = val_test_dataset.select(range(0, split_size))
test_dataset = val_test_dataset.select(range(split_size, len(val_test_dataset)))

print("train size", len(train_dataset))
print("val size", len(val_dataset))
print("test size", len(test_dataset))

print(train_dataset[0])

Downloading readme:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading and preparing dataset json/SetFit--rte to /root/.cache/huggingface/datasets/SetFit___json/SetFit--rte-2cc6a4d81c8aa68d/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/980k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/105k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/SetFit___json/SetFit--rte-2cc6a4d81c8aa68d/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset:
DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 277
    })
    test: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 3000
    })
})
train size 2490
val size 138
test size 139
{'text1': 'No Weapons of Mass Destruction Found in Iraq Yet.', 'text2': 'Weapons of Mass Destruction Found in Iraq.', 'label': 1, 'idx': 0, 'label_text': 'not entailment'}


In [5]:
class RTEDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

### Step two: Finetuning Teacher Model


In [10]:
def get_data(train_dataset, val_dataset, test_dataset, train_size, val_size, test_size, tokenizer):

  train_dataset = train_dataset.shuffle()
  val_dataset = val_dataset.shuffle()
  test_dataset = test_dataset.shuffle()

  # select a subset of the shuffled datasets
  train_subset = train_dataset.select(range(train_size))
  val_subset = val_dataset.select(range(val_size))
  test_subset = test_dataset.select(range(test_size)) 

  train_encodings = tokenizer(train_subset['text1'], train_subset['text2'], truncation=True, padding=True)
  val_encodings = tokenizer(val_subset['text1'], val_subset['text2'], truncation=True, padding=True)
  test_encodings = tokenizer(test_subset['text1'], test_subset['text2'], truncation=True, padding=True)

  train_labels = train_subset['label']
  val_labels = val_subset['label']
  test_labels = test_subset['label']

  train_dataset = RTEDataset(train_encodings, train_labels)
  val_dataset = RTEDataset(val_encodings, val_labels)
  test_dataset = RTEDataset(test_encodings, test_labels)

  train_loader = DataLoader(train_dataset, batch_size=16)
  val_loader = DataLoader(val_dataset, batch_size=16)
  test_loader = DataLoader(test_dataset, batch_size=16)

  return train_loader, val_loader, test_loader, train_labels, val_labels, test_labels, train_subset, val_subset, test_subset

def train_model(model, train_loader, optimizer, lr_scheduler):
    model.train()
    correct_predictions = 0
    total_predictions = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)    
        predictions = outputs.logits.argmax(dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += predictions.shape[0]      
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    
    return correct_predictions / total_predictions
    
def evaluate_model(loader, model):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    logits = np.empty([0, 2])
    i=0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            batch_logits = outputs.logits.cpu().numpy()
            logits = np.concatenate((logits, batch_logits), axis = 0)
            predictions = outputs.logits.argmax(dim=-1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += predictions.shape[0]

    return correct_predictions / total_predictions, logits

def fine_tune(teacher_model, train_loader, val_loader, test_loader, lr=5e-5, num_epochs=5, checkpoint_dir=None, resume_from_checkpoint=True, checkpoint_file=None):
    optimizer = transformers.AdamW(teacher_model.parameters(), lr=lr)
    num_training_steps = num_epochs * len(train_loader)
    lr_scheduler = transformers.get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )
    checkpoint_path = checkpoint_dir + checkpoint_file
    if resume_from_checkpoint:
        if checkpoint_path is None or not os.path.exists(checkpoint_path):
            start_epoch = 0
            print("No saved checkpoints to resume")
        else:
            print("Checkpoint accessing...........")
            checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
            teacher_model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            lr_scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}")
    else:
        start_epoch = 0

    for epoch in range(start_epoch, num_epochs):
        train_acc = train_model(teacher_model, train_loader, optimizer, lr_scheduler)
        val_acc, val_logits = evaluate_model(val_loader, teacher_model)
        print(f'Epoch {epoch + 1}: Train accuracy = {train_acc} Validation accuracy = {val_acc}')

        if checkpoint_dir is not None:
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': teacher_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': lr_scheduler.state_dict()
            }
            torch.save(checkpoint, checkpoint_path)

    print("Evaluating on training data............")
    train_acc, train_logits = evaluate_model(train_loader, teacher_model)
    print("Evaluating on validation data............")
    val_acc, val_logits = evaluate_model(val_loader, teacher_model)
    print("Evaluating on testing data............")
    test_acc, test_logits = evaluate_model(test_loader, teacher_model)
    print(f"Test accuracy = {test_acc}")
    return train_logits, val_logits, test_logits

Teacher model bert base uncased and
Fine Tuning 3 teacher models on 3 different learning rates.

In [None]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
# tokenizer_base_uncased = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')

# train_loader_1, val_loader_1, test_loader_1, train_labels_1, val_labels_1, test_labels_1, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 1000, 138, 139, tokenizer_base_uncased)

lr = 2e-5

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
teacher_model_11 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
train_logits_11, val_logits_11, test_logits_11 = fine_tune(teacher_model_11, train_loader_1, val_loader_1, test_loader_1, lr=2e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/m-o-1/", checkpoint_file="teacher_model_11.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

No saved checkpoints to resume
Epoch 1: Train accuracy = 0.48 Validation accuracy = 0.6086956521739131
Epoch 2: Train accuracy = 0.629 Validation accuracy = 0.6594202898550725
Epoch 3: Train accuracy = 0.814 Validation accuracy = 0.6014492753623188
Epoch 4: Train accuracy = 0.905 Validation accuracy = 0.572463768115942
Epoch 5: Train accuracy = 0.946 Validation accuracy = 0.572463768115942
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.5539568345323741


lr = 3e-5

In [15]:
teacher_model_12 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
train_logits_12, val_logits_12, test_logits_12 = fine_tune(teacher_model_12, train_loader_1, val_loader_1, test_loader_1, lr=3e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/m-o-1/", checkpoint_file="teacher_model_12.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

No saved checkpoints to resume
Epoch 1: Train accuracy = 0.523 Validation accuracy = 0.5434782608695652
Epoch 2: Train accuracy = 0.59 Validation accuracy = 0.572463768115942
Epoch 3: Train accuracy = 0.775 Validation accuracy = 0.5434782608695652
Epoch 4: Train accuracy = 0.926 Validation accuracy = 0.5797101449275363
Epoch 5: Train accuracy = 0.979 Validation accuracy = 0.5652173913043478
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.6258992805755396


lr = 5e-5

In [16]:
teacher_model_13 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
train_logits_13, val_logits_13, test_logits_13 = fine_tune(teacher_model_13, train_loader_1, val_loader_1, test_loader_1, lr=5e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/m-o-1/", checkpoint_file="teacher_model_13.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

No saved checkpoints to resume
Epoch 1: Train accuracy = 0.503 Validation accuracy = 0.6304347826086957
Epoch 2: Train accuracy = 0.565 Validation accuracy = 0.5072463768115942
Epoch 3: Train accuracy = 0.696 Validation accuracy = 0.572463768115942
Epoch 4: Train accuracy = 0.855 Validation accuracy = 0.5217391304347826
Epoch 5: Train accuracy = 0.919 Validation accuracy = 0.5652173913043478
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.5539568345323741


### Step three: Student Model

In [18]:
class MLP(nn.Module):
    def __init__(self, l):
        super(MLP, self).__init__()
        bert_config = BertConfig(hidden_size=252, num_hidden_layers=2)
        self.bert = BertModel(bert_config)
        self.net1 = nn.Linear(252, 2)
        self.leaky_relu = nn.LeakyReLU(l)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids, attention_mask=attention_mask)[0]
        output = torch.softmax(self.net1(bert_output[:, 0]),dim=1)
        return output

In [62]:
def train(student_model, train_loader, optimizer, criterion, train_logits_list, train_labels, alpha):
    student_model.train()
    train_loss = 0
    train_correct = 0
    i=0
    for data in train_loader:
        batch_size = len(data['input_ids'].to(device))
        optimizer.zero_grad()
        output = student_model(data['input_ids'].to(device), data['attention_mask'].to(device))

        # Loss using soft labels of teacher
        loss_s = 0
        for train_logits in train_logits_list:
            target = torch.tensor(train_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32).to(device)
            loss_s += criterion(output, torch.softmax(target, dim=1, dtype = torch.float32))
        loss_s /= len(train_logits_list)

        # Loss using ground truth
        batch_labels = train_labels[i*batch_size:(i+1)*batch_size]
        ground_truth = torch.tensor(batch_labels, dtype=torch.int).to(device)
        ground_truth = torch.eye(torch.max(ground_truth).item()+1, dtype = torch.int).to(device)[ground_truth]
        loss_g = criterion(output, torch.softmax(ground_truth, dim=1, dtype=torch.float32))

        #Combining the losses
        loss = (alpha)*loss_s + (1-alpha)*loss_g


        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        actual = target.argmax(dim=1, keepdim=True)
        train_correct += pred.eq(actual.view_as(pred)).sum().item()
        i +=1
    train_loss /= len(train_loader.dataset)
    train_acc = train_correct / len(train_loader.dataset)
    return train_loss, train_acc

def test(student_model, test_loader, criterion, test_logits_list, test_labels, alpha):
    student_model.eval()
    test_loss = 0
    test_correct = 0
    pred_labels = []
    pred_soft_labels = []
    i=0
    with torch.no_grad():
        for data in test_loader:
            batch_size = len(data['input_ids'].to(device))
            output = student_model(data['input_ids'].to(device), data['attention_mask'].to(device))
            
            loss_s = 0
            for test_logits in test_logits_list:
                target = torch.tensor(test_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32).to(device)
                loss_s += criterion(output, torch.softmax(target, dim=1, dtype = torch.float32))
            loss_s /= len(test_logits_list)

            # Loss using ground truth
            batch_labels = test_labels[i*batch_size:(i+1)*batch_size]
            ground_truth = torch.tensor(batch_labels, dtype=torch.int).to(device)
            ground_truth = torch.eye(torch.max(ground_truth).item()+1, dtype = torch.int).to(device)[ground_truth]
            loss_g = criterion(output, torch.softmax(ground_truth, dim=1, dtype=torch.float32))

            #Combining the losses
            loss = (alpha)*loss_s + (1-alpha)*loss_g

            test_loss += criterion(output, torch.softmax(target, dim=1, dtype = torch.float32)).item()
            pred = output.argmax(dim=1, keepdim=True)
            pred_labels.extend(pred.tolist())
            pred_soft_labels.extend(output.max(dim=1)[0].tolist())
            actual = target.argmax(dim=1, keepdim=True)
            test_correct += pred.eq(actual.view_as(pred)).sum().item()
            i += 1
    test_loss /= len(test_loader.dataset)
    test_acc = test_correct / len(test_loader.dataset)

    print(len(pred_labels), len(pred_labels[0]))
    return test_loss, test_acc, np.array(pred_labels).reshape(-1,1), np.array(pred_soft_labels).reshape(-1,1)

def train_student(student_model, train_logits_list, val_logits_list, test_logits_list, train_red_loader, val_red_loader, test_red_loader,train_labels, val_labels, test_labels, alpha, student_optimizer=None, lr=1e-4, epoch=50, checkpoint_dir=None, resume_from_checkpoint=True, checkpoint_file=None):
    if(student_optimizer is None): student_optimizer = torch.optim.Adam(student_model.parameters(), lr)
    criterion = nn.CrossEntropyLoss()
    # epoch = 30
    print("Training...........")
    checkpoint_path = checkpoint_dir + checkpoint_file

    if resume_from_checkpoint:
        if checkpoint_path is None or not os.path.exists(checkpoint_path):
            start_epoch = 0
            print("No saved checkpoints to resume")
        else:
            print("Checkpoint accessing...........")
            checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
            student_model.load_state_dict(checkpoint['model_state_dict'])
            student_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}")
    else:
        start_epoch = 0
    for i in range(start_epoch, epoch):
        train_loss, train_acc = train(student_model, train_red_loader, student_optimizer, criterion, train_logits_list, train_labels, alpha)
        if(i%10 == 0):   print(f"Epoch {i}   Train loss: {train_loss}    Train accuracy: {train_acc}")
        # print(train_loss, train_acc)
        if checkpoint_dir is not None:
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': student_model.state_dict(),
                'optimizer_state_dict': student_optimizer.state_dict()
            }
            torch.save(checkpoint, checkpoint_path)
    
    print("Evaluating on training data..................")
    train_loss, train_acc, train_pred_labels, train_pred_soft_labels = test(student_model, train_red_loader, criterion, train_logits_list, train_labels, alpha)
    print(f"Training loss: {train_loss} Train accuracy:  {train_acc}")
    
    print("Evaluating on validation data..................")
    val_loss, val_acc, val_pred_labels, val_pred_soft_labels = test(student_model, val_red_loader, criterion, val_logits_list, val_labels, alpha)
    print(f"Validation loss: {val_loss} Validation accuracy:  {val_acc}")

    print("Evaluating on testing data..................")
    test_loss, test_acc, test_pred_labels, test_pred_soft_labels  = test(student_model, test_red_loader, criterion, test_logits_list, test_labels, alpha)
    print(f"Testing loss: {test_loss} Test accuracy:  {test_acc}")
    
    # print(test_pred_labels.shape)
    return train_pred_labels, val_pred_labels, test_pred_labels, train_pred_soft_labels, val_pred_soft_labels, test_pred_soft_labels

def ensemble_models(train_pred_lists, val_pred_lists, test_pred_lists):
    
    train_pred = np.array([np.argmax(np.bincount(np.array(train_pred_lists)[:,i])) for i in range(len(train_pred_lists[1]))])
    val_pred = np.array([np.argmax(np.bincount(np.array(val_pred_lists)[:,i])) for i in range(len(val_pred_lists[1]))])
    test_pred = np.array([np.argmax(np.bincount(np.array(test_pred_lists)[:,i])) for i in range(len(test_pred_lists[1]))])
    
    return train_pred, val_pred, test_pred


Creating student model

In [64]:
student_11 = MLP(0.1).to(device)
student_11_optim = torch.optim.Adam(student_11.parameters(), lr=1e-4)
train_logits_list_1 = [train_logits_11, train_logits_12, train_logits_13]
val_logits_list_1 = [val_logits_11, val_logits_12, val_logits_13]
test_logits_list_1 = [test_logits_11, test_logits_12, test_logits_13]
train_pred_labels_student_11, val_pred_labels_student_11, test_pred_labels_student_11, train_pred_soft_labels_student_11, val_pred_soft_labels_student_11, test_pred_soft_labels_student_11 = train_student(student_11, train_logits_list_1, val_logits_list_1, test_logits_list_1,train_loader_1, val_loader_1, test_loader_1,train_labels_1, val_labels_1, test_labels_1,alpha=0.7, student_optimizer=student_11_optim, lr=1e-4, epoch=50, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/m-o-1/", checkpoint_file="student_11.pth" )

Training...........
Checkpoint accessing...........
Resuming training from epoch 21
Epoch 30   Train loss: 0.0290820110142231    Train accuracy: 0.962
Epoch 40   Train loss: 0.02895363214612007    Train accuracy: 0.97
Evaluating on training data..................
1000 1
Training loss: 0.02603114864230156 Train accuracy:  0.969
Evaluating on validation data..................
138 1
Validation loss: 0.04896077340927677 Validation accuracy:  0.5869565217391305
Evaluating on testing data..................
139 1
Testing loss: 0.04710530548644581 Test accuracy:  0.5827338129496403


## Stage 2 Fine tuning student model on qnli dataset

First fine tuning teachers on qnli dataset

In [65]:
qnli = datasets.load_dataset('SetFit/qnli')

print("Dataset:")
print(qnli)

train_dataset_qnli = qnli['train']
val_test_dataset = qnli['validation']
# test_dataset = qnli['test']

split_size = len(val_test_dataset) // 2
val_dataset_qnli = val_test_dataset.select(range(0, split_size))
test_dataset_qnli = val_test_dataset.select(range(split_size, len(val_test_dataset)))

print("train size", len(train_dataset_qnli))
print("val size", len(val_dataset_qnli))
print("test size", len(test_dataset_qnli))

# print(test_dataset[1])

Downloading readme:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading and preparing dataset json/SetFit--qnli to /root/.cache/huggingface/datasets/SetFit___json/SetFit--qnli-324fd6914ad1beff/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/SetFit___json/SetFit--qnli-324fd6914ad1beff/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset:
DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 5463
    })
})
train size 104743
val size 2731
test size 2732


In [67]:
class  QNLIDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [68]:
def get_data_qnli(train_dataset, val_dataset, test_dataset, train_size, val_size, test_size, tokenizer):

  train_dataset = train_dataset.shuffle()
  val_dataset = val_dataset.shuffle()
  test_dataset = test_dataset.shuffle()

  # select a subset of the shuffled datasets
  train_subset = train_dataset.select(range(train_size))
  val_subset = val_dataset.select(range(val_size))
  test_subset = test_dataset.select(range(test_size)) 

  # unique, counts = np.unique(test_subset['label'], return_counts=True)
  # dict(zip(unique, counts))

  train_encodings = tokenizer(train_subset['text1'], train_subset['text2'], truncation=True, padding=True)
  val_encodings = tokenizer(val_subset['text1'], val_subset['text2'], truncation=True, padding=True)
  test_encodings = tokenizer(test_subset['text1'], test_subset['text2'], truncation=True, padding=True)

  train_labels = train_subset['label']
  val_labels = val_subset['label']
  test_labels = test_subset['label']

  train_dataset = QNLIDataset(train_encodings, train_labels)
  val_dataset = QNLIDataset(val_encodings, val_labels)
  test_dataset = QNLIDataset(test_encodings, test_labels)

  train_loader = DataLoader(train_dataset, batch_size=16)
  val_loader = DataLoader(val_dataset, batch_size=16)
  test_loader = DataLoader(test_dataset, batch_size=16)

  return train_loader, val_loader, test_loader, train_labels, val_labels, test_labels, train_subset, val_subset, test_subset



In [69]:
# tokenizer_base_uncased = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')
# teacher_model_21 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

train_loader_21, val_loader_21, test_loader_21, train_labels_21, val_labels_21, test_labels_21, train_subset_qnli, val_subset_qnli, test_subset_qnli = get_data_qnli(train_dataset_qnli, val_dataset_qnli, test_dataset_qnli, 1000, 220, 220, tokenizer_base_uncased)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

lr = 5e-5

In [70]:
teacher_model_21 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
train_logits_21, val_logits_21, test_logits_21 = fine_tune(teacher_model_21, train_loader_21, val_loader_21, test_loader_21, lr=5e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/qnli/", checkpoint_file="teacher_model_21.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

No saved checkpoints to resume
Epoch 1: Train accuracy = 0.478 Validation accuracy = 0.5409090909090909
Epoch 2: Train accuracy = 0.722 Validation accuracy = 0.7772727272727272
Epoch 3: Train accuracy = 0.854 Validation accuracy = 0.7045454545454546
Epoch 4: Train accuracy = 0.957 Validation accuracy = 0.7636363636363637
Epoch 5: Train accuracy = 0.985 Validation accuracy = 0.7545454545454545
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.8181818181818182


lr = 3e-5

In [71]:
teacher_model_22 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
train_logits_22, val_logits_22, test_logits_22 = fine_tune(teacher_model_22, train_loader_21, val_loader_21, test_loader_21, lr=3e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/qnli/", checkpoint_file="teacher_model_22.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

No saved checkpoints to resume
Epoch 1: Train accuracy = 0.509 Validation accuracy = 0.5727272727272728
Epoch 2: Train accuracy = 0.744 Validation accuracy = 0.7272727272727273
Epoch 3: Train accuracy = 0.857 Validation accuracy = 0.740909090909091
Epoch 4: Train accuracy = 0.918 Validation accuracy = 0.759090909090909
Epoch 5: Train accuracy = 0.955 Validation accuracy = 0.7636363636363637
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.7636363636363637


lr = 2e-5

In [72]:
teacher_model_23 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
train_logits_23, val_logits_23, test_logits_23 = fine_tune(teacher_model_23, train_loader_21, val_loader_21, test_loader_21, lr=2e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/qnli/", checkpoint_file="teacher_model_23.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

No saved checkpoints to resume
Epoch 1: Train accuracy = 0.499 Validation accuracy = 0.5454545454545454
Epoch 2: Train accuracy = 0.677 Validation accuracy = 0.6954545454545454
Epoch 3: Train accuracy = 0.848 Validation accuracy = 0.6909090909090909
Epoch 4: Train accuracy = 0.916 Validation accuracy = 0.7136363636363636
Epoch 5: Train accuracy = 0.969 Validation accuracy = 0.7136363636363636
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.759090909090909


Fine tuning pre-trained student on qnli dataset

In [73]:
train_logits_list_2 = [train_logits_21, train_logits_22, train_logits_23]
val_logits_list_2 = [val_logits_21, val_logits_22, val_logits_23]
test_logits_list_2 = [test_logits_21, test_logits_22, test_logits_23]
train_pred_labels_student_12, val_pred_labels_student_12, test_pred_labels_student_12, train_pred_soft_labels_student_12, val_pred_soft_labels_student_12, test_pred_soft_labels_student_12 = train_student(student_11, train_logits_list_2, val_logits_list_2, test_logits_list_2,train_loader_21, val_loader_21, test_loader_21,train_labels_21, val_labels_21, test_labels_21,alpha=0.7, student_optimizer=student_11_optim, lr=1e-4, epoch=50, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/m-o-1/", checkpoint_file="student_12.pth" )

Training...........
No saved checkpoints to resume
Epoch 0   Train loss: 0.04956310951709747    Train accuracy: 0.509
Epoch 10   Train loss: 0.028006032228469847    Train accuracy: 0.975
Epoch 20   Train loss: 0.027910725146532058    Train accuracy: 0.977
Epoch 30   Train loss: 0.027913129150867463    Train accuracy: 0.978
Epoch 40   Train loss: 0.027914887696504593    Train accuracy: 0.978
Evaluating on training data..................
1000 1
Training loss: 0.025544155269861223 Train accuracy:  0.978
Evaluating on validation data..................
220 1
Validation loss: 0.05101537812839855 Validation accuracy:  0.4818181818181818
Evaluating on testing data..................
220 1
Testing loss: 0.04940701127052307 Test accuracy:  0.5318181818181819
