# SMAI Project - Model Compression with Two-stage Multi-teacher Knowledge Distillation for Web Question Answering System

## Group 2 - Synergy

| | |
|- | -|
| Team Members | Roll no |
| Anurag Ghosh | `2022202023` |
| Aryan Gupta | `2022202028` |
| Vedashree Ranade | `2022201073` |

### Step zero: Import Libraries

In [None]:
!pip3 install datasets
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import datasets
import transformers
import os
from transformers import AutoTokenizer
import torch.nn.functional as F
from transformers import BertModel, BertConfig
import numpy as np

### Step one: Loading Datasets


RTE Dataset

In [None]:
############################# rte dataset ###########################

rte = datasets.load_dataset('SetFit/rte')

print("Dataset:")
print(rte)

train_dataset = rte['train']
val_test_dataset = rte['validation']
test_dataset = rte['test']

split_size = len(val_test_dataset) // 2
val_dataset = val_test_dataset.select(range(0, split_size))
test_dataset = val_test_dataset.select(range(split_size, len(val_test_dataset)))

print("train size", len(train_dataset))
print("val size", len(val_dataset))
print("test size", len(test_dataset))

print(train_dataset[0])

Downloading readme:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading and preparing dataset json/SetFit--rte to /root/.cache/huggingface/datasets/SetFit___json/SetFit--rte-2cc6a4d81c8aa68d/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/980k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/105k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/SetFit___json/SetFit--rte-2cc6a4d81c8aa68d/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset:
DatasetDict({
    train: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 277
    })
    test: Dataset({
        features: ['text1', 'text2', 'label', 'idx', 'label_text'],
        num_rows: 3000
    })
})
train size 2490
val size 138
test size 139
{'text1': 'No Weapons of Mass Destruction Found in Iraq Yet.', 'text2': 'Weapons of Mass Destruction Found in Iraq.', 'label': 1, 'idx': 0, 'label_text': 'not entailment'}


In [None]:
class RTEDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

### Step two: Finetuning Teacher Model


In [None]:
def get_data(train_dataset, val_dataset, test_dataset, train_size, val_size, test_size, tokenizer):

  train_dataset = train_dataset.shuffle()
  val_dataset = val_dataset.shuffle()
  test_dataset = test_dataset.shuffle()

  # select a subset of the shuffled datasets
  train_subset = train_dataset.select(range(train_size))
  val_subset = val_dataset.select(range(val_size))
  test_subset = test_dataset.select(range(test_size)) 

  # unique, counts = np.unique(test_subset['label'], return_counts=True)
  # dict(zip(unique, counts))

  train_encodings = tokenizer(train_subset['text1'], train_subset['text2'], truncation=True, padding=True)
  val_encodings = tokenizer(val_subset['text1'], val_subset['text2'], truncation=True, padding=True)
  test_encodings = tokenizer(test_subset['text1'], test_subset['text2'], truncation=True, padding=True)

  train_labels = train_subset['label']
  val_labels = val_subset['label']
  test_labels = test_subset['label']

  train_dataset = RTEDataset(train_encodings, train_labels)
  val_dataset = RTEDataset(val_encodings, val_labels)
  test_dataset = RTEDataset(test_encodings, test_labels)

  train_loader = DataLoader(train_dataset, batch_size=16)
  val_loader = DataLoader(val_dataset, batch_size=16)
  test_loader = DataLoader(test_dataset, batch_size=16)

  return train_loader, val_loader, test_loader, train_labels, val_labels, test_labels, train_subset, val_subset, test_subset

def train_model(model, train_loader, optimizer, lr_scheduler):
    model.train()
    correct_predictions = 0
    total_predictions = 0
    
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        predictions = outputs.logits.argmax(dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += predictions.shape[0]
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    # print("Training completed...")
    
    return correct_predictions / total_predictions
    
def evaluate_model(loader, model):
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    logits = np.empty([0, 2])
    i=0
    
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            batch_logits = outputs.logits.cpu().numpy()
            logits = np.concatenate((logits, batch_logits), axis = 0)
            predictions = outputs.logits.argmax(dim=-1)
            
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += predictions.shape[0]

    return correct_predictions / total_predictions, logits

def fine_tune(teacher_model, train_loader, val_loader, test_loader, lr=5e-5, num_epochs=5, checkpoint_dir=None, resume_from_checkpoint=True, checkpoint_file=None):
    optimizer = transformers.AdamW(teacher_model.parameters(), lr=lr)
    num_training_steps = num_epochs * len(train_loader)
    lr_scheduler = transformers.get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    checkpoint_path = checkpoint_dir + checkpoint_file

    if resume_from_checkpoint:
        if checkpoint_path is None or not os.path.exists(checkpoint_path):
            start_epoch = 0
            print("No saved checkpoints to resume")
        else:
            print("Checkpoint accessing...........")
            checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
            teacher_model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            lr_scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}")
    else:
        start_epoch = 0

    for epoch in range(start_epoch, num_epochs):
        train_acc = train_model(teacher_model, train_loader, optimizer, lr_scheduler)
        val_acc, val_logits = evaluate_model(val_loader, teacher_model)
        print(f'Epoch {epoch + 1}: Train accuracy = {train_acc} Validation accuracy = {val_acc}')

        if checkpoint_dir is not None:
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': teacher_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': lr_scheduler.state_dict()
            }
            torch.save(checkpoint, checkpoint_path)

    print("Evaluating on training data............")
    train_acc, train_logits = evaluate_model(train_loader, teacher_model)
    print("Evaluating on validation data............")
    val_acc, val_logits = evaluate_model(val_loader, teacher_model)
    print("Evaluating on testing data............")
    test_acc, test_logits = evaluate_model(test_loader, teacher_model)
    print(f"Test accuracy = {test_acc}")
    return train_logits, val_logits, test_logits

Teacher model bert base uncased

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
tokenizer_base_uncased = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')
teacher_model_1 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

train_loader_1, val_loader_1, test_loader_1, train_labels_1, val_labels_1, test_labels_1, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 1000, 138, 139, tokenizer_base_uncased)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
train_logits_1, val_logits_1, test_logits_1 = fine_tune(teacher_model_1, train_loader_1, val_loader_1, test_loader_1, lr=5e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/rte/", checkpoint_file="teacher_model_1.pth")



Checkpoint accessing...........
Resuming training from epoch 5
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.5251798561151079


Teacher model bert large uncased

In [None]:
# tokenizer_large_uncased = transformers.BertTokenizerFast.from_pretrained('bert-large-uncased')
# teacher_model_2 = transformers.BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)

# train_loader_2, val_loader_2, test_loader_2, train_labels_2, val_labels_2, test_labels_2, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 500, 100, 100, tokenizer_large_uncased)

In [None]:
# train_logits_2, val_logits_2, test_logits_2 = fine_tune(teacher_model_2, train_loader_2, val_loader_2, test_loader_2,lr=5e-5, checkpoint_dir="./checkpoints/rte/", checkpoint_file="teacher_model_2.pth")

### Step three: Student Model

In [None]:
class MLP(nn.Module):
    def __init__(self, l):
        super(MLP, self).__init__()
        bert_config = BertConfig(hidden_size=252, num_hidden_layers=2)
        self.bert = BertModel(bert_config)
        self.net1 = nn.Linear(252, 2)
        # self.net4 = nn.Linear(60, 15)
        # self.net5 = nn.Linear(15, 3)
        self.leaky_relu = nn.LeakyReLU(l)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids, attention_mask=attention_mask)[0]
        output = torch.softmax(self.net1(bert_output[:, 0]),dim=1)
        # output = self.net1(bert_output[:, 0])
        # output = self.leaky_relu(self.net4(output))
        # output = self.leaky_relu(self.net5(output))
        return output

In [None]:
def train(student_model, train_loader, optimizer, criterion, train_logits):
    student_model.train()
    train_loss = 0
    train_correct = 0
    i=0
    for data in train_loader:
        batch_size = len(data['input_ids'].to(device))
        optimizer.zero_grad()
        output = student_model(data['input_ids'].to(device), data['attention_mask'].to(device))
        target = torch.tensor(train_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32).to(device)
        loss = criterion(output, torch.softmax(target, dim=1, dtype = torch.float32))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        actual = target.argmax(dim=1, keepdim=True)
        train_correct += pred.eq(actual.view_as(pred)).sum().item()
        i +=1
    train_loss /= len(train_loader.dataset)
    train_acc = train_correct / len(train_loader.dataset)
    return train_loss, train_acc

def test(student_model, test_loader, criterion, test_logits):
    student_model.eval()
    test_loss = 0
    test_correct = 0
    pred_labels = []
    pred_soft_labels = []
    i=0
    with torch.no_grad():
        for data in test_loader:
            batch_size = len(data['input_ids'].to(device))
            output = student_model(data['input_ids'].to(device), data['attention_mask'].to(device))
            target = torch.tensor(test_logits[i*batch_size:(i+1)*batch_size, :], dtype=torch.float32).to(device)
            test_loss += criterion(output, torch.softmax(target, dim=1, dtype = torch.float32)).item()
            pred = output.argmax(dim=1, keepdim=True)
            pred_labels.extend(pred.tolist())
            pred_soft_labels.extend(output.max(dim=1)[0].tolist())
            actual = target.argmax(dim=1, keepdim=True)
            test_correct += pred.eq(actual.view_as(pred)).sum().item()
            i += 1
    test_loss /= len(test_loader.dataset)
    test_acc = test_correct / len(test_loader.dataset)
    
    print(len(pred_labels), len(pred_labels[0]))
    return test_loss, test_acc, np.array(pred_labels).reshape(-1,1), np.array(pred_soft_labels).reshape(-1,1)

def train_student(student_model, train_logits, val_logits, test_logits, train_red_loader, val_red_loader, test_red_loader, student_optimizer=None, lr=1e-4, epoch=50, checkpoint_dir=None, resume_from_checkpoint=True, checkpoint_file=None):
    if(student_optimizer is None): student_optimizer = torch.optim.Adam(student_model.parameters(), lr)
    criterion = nn.CrossEntropyLoss()
    # epoch = 30
    print("Training...........")
    checkpoint_path = checkpoint_dir + checkpoint_file

    if resume_from_checkpoint:
        if checkpoint_path is None or not os.path.exists(checkpoint_path):
            start_epoch = 0
            print("No saved checkpoints to resume")
        else:
            print("Checkpoint accessing...........")
            checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
            student_model.load_state_dict(checkpoint['model_state_dict'])
            student_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_epoch = checkpoint['epoch']
            print(f"Resuming training from epoch {start_epoch}")
    else:
        start_epoch = 0
    for i in range(start_epoch, epoch):
        train_loss, train_acc = train(student_model, train_red_loader, student_optimizer, criterion, train_logits)
        if(i%10 == 0):   print(f"Epoch {i}   Train loss: {train_loss}    Train accuracy: {train_acc}")
        # print(train_loss, train_acc)
        if checkpoint_dir is not None:
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': student_model.state_dict(),
                'optimizer_state_dict': student_optimizer.state_dict()
            }
            torch.save(checkpoint, checkpoint_path)
    
    print("Evaluating on training data..................")
    train_loss, train_acc, train_pred_labels, train_pred_soft_labels = test(student_model, train_red_loader, criterion, train_logits)
    print(f"Training loss: {train_loss} Train accuracy:  {train_acc}")
    
    print("Evaluating on validation data..................")
    val_loss, val_acc, val_pred_labels, val_pred_soft_labels = test(student_model, val_red_loader, criterion, val_logits)
    print(f"Validation loss: {val_loss} Validation accuracy:  {val_acc}")

    print("Evaluating on testing data..................")
    test_loss, test_acc, test_pred_labels, test_pred_soft_labels  = test(student_model, test_red_loader, criterion, test_logits)
    print(f"Testing loss: {test_loss} Test accuracy:  {test_acc}")
    
    # print(test_pred_labels.shape)
    return train_pred_labels, val_pred_labels, test_pred_labels, train_pred_soft_labels, val_pred_soft_labels, test_pred_soft_labels

def ensemble_models(train_pred_lists, val_pred_lists, test_pred_lists):
    
    train_pred = np.array([np.argmax(np.bincount(np.array(train_pred_lists)[:,i])) for i in range(len(train_pred_lists[1]))])
    val_pred = np.array([np.argmax(np.bincount(np.array(val_pred_lists)[:,i])) for i in range(len(val_pred_lists[1]))])
    test_pred = np.array([np.argmax(np.bincount(np.array(test_pred_lists)[:,i])) for i in range(len(test_pred_lists[1]))])
    
    return train_pred, val_pred, test_pred


### Step four: Creating multiple 1-o-1 teacher student

Teacher models have learning rates 2e-5, 3e-5, 5e-5 as given in paper

Learning rate = 2e-5

In [None]:
teacher_model_11 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
train_loader_11, val_loader_11, test_loader_11, train_labels_11, val_labels_11, test_labels_11, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 1000, 138, 139, tokenizer_base_uncased)
train_logits_11, val_logits_11, test_logits_11 = fine_tune(teacher_model_11, train_loader_11, val_loader_11, test_loader_11, lr=2e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/rte/", checkpoint_file="teacher_model_11.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Checkpoint accessing...........
Resuming training from epoch 5
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.5683453237410072


In [None]:
student_11 = MLP(0.1).to(device)
student_11_optim = torch.optim.Adam(student_11.parameters(), lr=1e-4)
train_pred_labels_student_11, val_pred_labels_student_11, test_pred_labels_student_11, train_pred_soft_labels_student_11, val_pred_soft_labels_student_11, test_pred_soft_labels_student_11 = train_student(student_11, train_logits_11, val_logits_11, test_logits_11,train_loader_11, val_loader_11, test_loader_11, student_optimizer=student_11_optim, lr=1e-4, epoch=20, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/rte/", checkpoint_file="student_11.pth" )

Training...........
No saved checkpoints to resume
Epoch 0   Train loss: 0.04557694941759109    Train accuracy: 0.508
Epoch 10   Train loss: 0.03147113218903542    Train accuracy: 0.981
Evaluating on training data..................
1000 1
Training loss: 0.03117045173048973 Train accuracy:  0.983
Evaluating on validation data..................
138 1
Validation loss: 0.04261698195899742 Validation accuracy:  0.7101449275362319
Evaluating on testing data..................
139 1
Testing loss: 0.0451439914943503 Test accuracy:  0.6258992805755396


Learning Rate = 3e-5

In [None]:
teacher_model_12 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
train_loader_12, val_loader_12, test_loader_12, train_labels_12, val_labels_12, test_labels_12, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 1000, 138, 139, tokenizer_base_uncased)
train_logits_12, val_logits_12, test_logits_12 = fine_tune(teacher_model_12, train_loader_12, val_loader_12, test_loader_12, lr=3e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/rte/", checkpoint_file="teacher_model_12.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

No saved checkpoints to resume
Epoch 1: Train accuracy = 0.551 Validation accuracy = 0.5289855072463768
Epoch 2: Train accuracy = 0.731 Validation accuracy = 0.5797101449275363
Epoch 3: Train accuracy = 0.906 Validation accuracy = 0.5652173913043478
Epoch 4: Train accuracy = 0.954 Validation accuracy = 0.5942028985507246
Epoch 5: Train accuracy = 0.986 Validation accuracy = 0.6159420289855072
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.5899280575539568


In [None]:
student_12 = MLP(0.1).to(device)
student_12_optim = torch.optim.Adam(student_12.parameters(), lr=1e-4)
train_pred_labels_student_12, val_pred_labels_student_12, test_pred_labels_student_12, train_pred_soft_labels_student_12, val_pred_soft_labels_student_12, test_pred_soft_labels_student_12 = train_student(student_12, train_logits_12, val_logits_12, test_logits_12,train_loader_12, val_loader_12, test_loader_12, student_optimizer=student_12_optim, lr=1e-4, epoch=20, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/rte/", checkpoint_file="student_12.pth" )

Training...........
No saved checkpoints to resume
Epoch 0   Train loss: 0.04449286979436874    Train accuracy: 0.508
Epoch 10   Train loss: 0.02267818224430084    Train accuracy: 0.985
Evaluating on training data..................
1000 1
Training loss: 0.022349762618541717 Train accuracy:  0.991
Evaluating on validation data..................
138 1
Validation loss: 0.047240701706513115 Validation accuracy:  0.6086956521739131
Evaluating on testing data..................
139 1
Testing loss: 0.04741810851817509 Test accuracy:  0.5899280575539568


Learning Rate = 5e-5

In [None]:
teacher_model_13 = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
train_loader_13, val_loader_13, test_loader_13, train_labels_13, val_labels_13, test_labels_13, train_subset, val_subset, test_subset = get_data(train_dataset, val_dataset, test_dataset, 1000, 138, 139, tokenizer_base_uncased)
train_logits_13, val_logits_13, test_logits_13 = fine_tune(teacher_model_13, train_loader_13, val_loader_13, test_loader_13, lr=5e-5, num_epochs=5, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/rte/", checkpoint_file="teacher_model_13.pth")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

No saved checkpoints to resume
Epoch 1: Train accuracy = 0.5 Validation accuracy = 0.463768115942029
Epoch 2: Train accuracy = 0.626 Validation accuracy = 0.572463768115942
Epoch 3: Train accuracy = 0.822 Validation accuracy = 0.572463768115942
Epoch 4: Train accuracy = 0.928 Validation accuracy = 0.6014492753623188
Epoch 5: Train accuracy = 0.964 Validation accuracy = 0.6594202898550725
Evaluating on training data............
Evaluating on validation data............
Evaluating on testing data............
Test accuracy = 0.5179856115107914


In [None]:
student_13 = MLP(0.1).to(device)
student_13_optim = torch.optim.Adam(student_13.parameters(), lr=1e-4)
train_pred_labels_student_13, val_pred_labels_student_13, test_pred_labels_student_13, train_pred_soft_labels_student_13, val_pred_soft_labels_student_13, test_pred_soft_labels_student_13 = train_student(student_13, train_logits_13, val_logits_13, test_logits_13,train_loader_13, val_loader_13, test_loader_13, student_optimizer=student_13_optim, lr=1e-4, epoch=20, checkpoint_dir="/content/drive/MyDrive/SMAI_project/checkpoints/rte/", checkpoint_file="student_13.pth" )

Training...........
No saved checkpoints to resume
Epoch 0   Train loss: 0.045628379344940184    Train accuracy: 0.503
Epoch 10   Train loss: 0.022301084905862807    Train accuracy: 0.983
Evaluating on training data..................
1000 1
Training loss: 0.021385007053613663 Train accuracy:  0.999
Evaluating on validation data..................
138 1
Validation loss: 0.04508689564207326 Validation accuracy:  0.6159420289855072
Evaluating on testing data..................
139 1
Testing loss: 0.04509318217956763 Test accuracy:  0.6115107913669064


### Step five: Performing majority voting ensemble on student models

In [None]:
import itertools
train_pred_lists = []
train_pred_lists.append(list(itertools.chain.from_iterable(train_pred_labels_student_11)))
train_pred_lists.append(list(itertools.chain.from_iterable(train_pred_labels_student_12)))
train_pred_lists.append(list(itertools.chain.from_iterable(train_pred_labels_student_13)))

val_pred_lists = []
val_pred_lists.append(list(itertools.chain.from_iterable(val_pred_labels_student_11)))
val_pred_lists.append(list(itertools.chain.from_iterable(val_pred_labels_student_12)))
val_pred_lists.append(list(itertools.chain.from_iterable(val_pred_labels_student_13)))

test_pred_lists = []
test_pred_lists.append(list(itertools.chain.from_iterable(test_pred_labels_student_11)))
test_pred_lists.append(list(itertools.chain.from_iterable(test_pred_labels_student_12)))
test_pred_lists.append(list(itertools.chain.from_iterable(test_pred_labels_student_13)))

train_pred, val_pred, test_pred = ensemble_models(train_pred_lists, val_pred_lists, test_pred_lists)

In [None]:
from sklearn.metrics import accuracy_score
train_acc = accuracy_score(train_labels_1, train_pred)
val_acc = accuracy_score(val_labels_1, val_pred)
test_acc = accuracy_score(test_labels_1, test_pred)

print(f"Train accuracy:  {train_acc}")
print(f"Validation accuracy:  {val_acc}")
print(f"Test accuracy:  {test_acc}")

Train accuracy:  0.482
Validation accuracy:  0.5652173913043478
Test accuracy:  0.4748201438848921
