# Setup


In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# import os
# os.chdir('/content/drive/MyDrive/Projects/Distillation')

In [3]:
# !pip install datasets --quiet
# !pip install evaluate --quiet

In [4]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

cuda


In [5]:
import random
import numpy as np

# Ensure experiment reproducibility: seed the Python, NumPy and PyTorch RNGs
# with the same value and switch on the cuDNN deterministic flag.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Dataset



[HateXplain](https://huggingface.co/datasets/Hate-speech-CNERG/hatexplain) is a benchmark dataset for hate speech detection.   
Each record in the dataset includes four fields:
- id: A unique identifier for the post.
- annotators: A list of annotations made by 3 different annotators. Each annotator entry includes:  
  - label: The label given to the post. 0: hatespeech, 1: normal, 2: offensive.
  - annotator_id: A unique identifier for the annotator.
  - target: The target of the post.
- rationales: A list of binary arrays. 1 indicates a token that can justify the label assigned by the annotator.

- post_tokens: A list of tokens (words) of the post.

In [6]:
from datasets import load_dataset

# Load every split of HateXplain from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True lets the dataset repository's loading script run on this
# machine — confirm the source is trusted (and consider pinning a revision) before re-running.
dataset = load_dataset("Hate-speech-CNERG/hatexplain", trust_remote_code=True)

In [7]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'annotators', 'rationales', 'post_tokens'],
        num_rows: 15383
    })
    validation: Dataset({
        features: ['id', 'annotators', 'rationales', 'post_tokens'],
        num_rows: 1922
    })
    test: Dataset({
        features: ['id', 'annotators', 'rationales', 'post_tokens'],
        num_rows: 1924
    })
})


In [8]:
# Example
print(dataset['train'][0])

{'id': '23107796_gab', 'annotators': {'label': [0, 2, 2], 'annotator_id': [203, 204, 233], 'target': [['Hindu', 'Islam'], ['Hindu', 'Islam'], ['Hindu', 'Islam', 'Other']]}, 'rationales': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'post_tokens': ['u', 'really', 'think', 'i', 'would', 'not', 'have', 'been', 'raped', 'by', 'feral', 'hindu', 'or', 'muslim', 'back', 'in', 'india', 'or', 'bangladesh', 'and', 'a', 'neo', 'nazi', 'would', 'rape', 'me', 'as', 'well', 'just', 'to', 'see', 'me', 'cry']}


# Tokenizer and Teacher Model (BERT)

In [9]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the teacher model.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Class index <-> class name mappings stored in the model config.
ID2LABEL = dict(enumerate(["hate_speech", "normal", "offensive"]))
LABEL2ID = dict(zip(ID2LABEL.values(), ID2LABEL.keys()))
NUM_LABELS = len(ID2LABEL)

# Teacher: pretrained BERT encoder with a sequence-classification head.
# NOTE(review): the warning printed below reports that the classifier weights are newly
# initialised, and no teacher fine-tuning step is visible in this notebook — confirm the
# teacher is trained on HateXplain before it is used as the distillation target.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS, id2label=ID2LABEL, label2id=LABEL2ID
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Preprocessing



- Use the tokenizer from the pretrained BERT model (bert-base-uncased) to tokenize input text.

- Use majority vote to select the most common label as the final label.

- Create PyTorch DataLoaders.

In [10]:
# from collections import Counter
# from transformers import DataCollatorWithPadding
# from torch.utils.data import DataLoader

# """
# Use DataCollatorWithPadding. Dynamic padding, save memory
# """


# def preprocess_function(example):
#     """
#     Preprocesses a dataset example by tokenizing text and applying majority vote on annotator labels.
#     """
#     texts = " ".join(example["post_tokens"])
#     output = tokenizer(texts, max_length=128, truncation=True)  # padding is False

#     labels = example["annotators"]["label"]
#     output["label"] = Counter(labels).most_common(1)[0][0]

#     return output


# def create_dataloaders(dataset, batch_size):
#     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#     train_dataset = dataset["train"].shuffle(seed=42)
#     eval_dataset = dataset["validation"].shuffle(seed=42)
#     test_dataset = dataset["test"].shuffle(seed=42)

#     train_loader = DataLoader(
#         train_dataset,
#         batch_size=batch_size,
#         shuffle=True,
#         collate_fn=data_collator
#     )

#     eval_loader = DataLoader(
#         eval_dataset,
#         batch_size=batch_size,
#         collate_fn=data_collator
#     )

#     test_loader = DataLoader(
#         test_dataset,
#         batch_size=batch_size,
#         collate_fn=data_collator
#     )
#     return train_loader, eval_loader, test_loader


# dataset_preprocessed = dataset.map(preprocess_function)  # batched is False
# dataset_preprocessed = dataset_preprocessed.remove_columns(["id", "annotators", "rationales", "post_tokens"])

In [11]:
from collections import Counter
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

"""
Use DataCollatorWithPadding. Dynamic padding, save memory
"""


def preprocess_function(examples):
    """
    Tokenize a batch of posts and attach one majority-vote label per post.

    Params:
        examples: A batch as passed by `dataset.map(..., batched=True)`; the
            "post_tokens" and "annotators" columns are read.

    Returns:
        The tokenizer output with an additional "label" entry holding, for each
        post, the label chosen by most annotators (on a tie, the label that
        appears first among the annotators wins).
    """
    # Re-assemble every post into a single whitespace-separated string.
    texts = []
    for tokens in examples["post_tokens"]:
        texts.append(" ".join(tokens))

    # Truncate only; padding is left to the data collator at batch time.
    encoded = tokenizer(texts, truncation=True, max_length=128)

    majority_labels = []
    for annotator in examples["annotators"]:
        (top_label, _votes), = Counter(annotator["label"]).most_common(1)
        majority_labels.append(top_label)
    encoded["label"] = majority_labels

    return encoded


def create_dataloaders(dataset, batch_size):
    """
    Build the train / validation / test DataLoaders with dynamic padding.

    Params:
        dataset: Mapping with "train", "validation" and "test" splits.
        batch_size: Number of examples per batch for all three loaders.

    Returns:
        (train_loader, eval_loader, test_loader). Every split is shuffled once
        with a fixed seed; only the train loader additionally reshuffles
        through the DataLoader.
    """
    # Pads each batch to the length of its longest sequence.
    collate = DataCollatorWithPadding(tokenizer=tokenizer)

    splits = {
        split_name: dataset[split_name].shuffle(seed=42)
        for split_name in ("train", "validation", "test")
    }
    shared_kwargs = dict(batch_size=batch_size, collate_fn=collate)

    train_loader = DataLoader(splits["train"], shuffle=True, **shared_kwargs)
    eval_loader = DataLoader(splits["validation"], **shared_kwargs)
    test_loader = DataLoader(splits["test"], **shared_kwargs)

    return train_loader, eval_loader, test_loader


# Tokenize and label every split in batches, then drop the raw columns so that only the
# tokenizer outputs and `label` remain.
dataset_preprocessed = (
    dataset
    .map(preprocess_function, batched=True)
    .remove_columns(["id", "annotators", "rationales", "post_tokens"])
)

In [12]:
# Example dataset_preprocessed
print(dataset_preprocessed)
print("-- Example --")
print(dataset_preprocessed['train'][0])

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 15383
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1922
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1924
    })
})
-- Example --
{'input_ids': [101, 1057, 2428, 2228, 1045, 2052, 2025, 2031, 2042, 15504, 2011, 18993, 7560, 2030, 5152, 2067, 1999, 2634, 2030, 7269, 1998, 1037, 9253, 6394, 2052, 9040, 2033, 2004, 2092, 2074, 2000, 2156, 2033, 5390, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 2}


In [13]:
# # Example train_loader
# train_loader, eval_loader, test_loader = create_dataloaders(dataset_preprocessed, batch_size=32)

# for batch in train_loader:
#     for k, v in batch.items():
#         print(f"{k}: {v.shape}")   
#     break

# Model




> The student has the same general architecture as BERT.
> - The token-type embeddings and the pooler are removed while the number of layers is reduced by a factor of 2.
> - Most of the operations used in the Transformer architecture—linear layer and layer normalisation—are highly optimized in modern linear algebra frameworks.
> - Our investigations showed that variations on the last dimension of the tensor (hidden size dimension) have a smaller impact on computation efficiency (for a fixed parameters budget) than variations on other factors like the number of layers.
> - Thus we focus on reducing the number of layers.
>
> — Sanh, Victor, et al. *DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter.* arXiv:1910.01108 (2019)

**Note:** unlike the DistilBERT student described above, the student built in this notebook keeps the token-type embeddings and the pooler; only the number of encoder layers is halved.

## Teacher Model (BERT)

In [14]:
# from transformers import AutoModelForSequenceClassification
# help(AutoModelForSequenceClassification.from_pretrained)

# from transformers import PretrainedConfig
# help(PretrainedConfig)

In [15]:
teacher_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
teacher_model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "hate_speech",
    "1": "normal",
    "2": "offensive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "hate_speech": 0,
    "normal": 1,
    "offensive": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.51.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

## Student Model (DistilBERT)

- The student model has the same configuration as the teacher model, but with half the number of layers.
- Initialize the layers of student model. Copy one out of two encoder layers, and all other layers from the teacher model.





In [17]:
from transformers.models.bert.modeling_bert import BertConfig


def build_student_model(teacher_model):
    """
    Create a student with the teacher's architecture but half as many hidden layers.

    The student config is rebuilt with the teacher's own config class, so the
    helper is not tied to one specific model family. No weights are copied
    here; the student is constructed from the config only.

    Params:
        teacher_model: Model exposing `.config` (with `to_dict()` and a
            `from_dict()` classmethod) whose class can be instantiated from a config.

    Returns:
        A new instance of the teacher's class built from the halved config.
        The teacher's own config is left untouched.
    """
    config_dict = teacher_model.config.to_dict()
    config_dict['num_hidden_layers'] //= 2  # half the number of hidden layers
    student_config = type(teacher_model.config).from_dict(config_dict)
    return type(teacher_model)(student_config)


def init_student_layers(teacher, student, use_layer = "odd"):
    """
    Initialize the layers of student model.
    Copy one out of two encoder layers, and all other layers from the teacher model.

    Params:
        teacher: The teacher model.
        student: The student model.
        use_layer: Which teacher encoder layers to copy, by 0-based index into
            `teacher.bert.encoder.layer`: "odd" copies indices 1, 3, 5, ...
            and "even" copies indices 0, 2, 4, ... (for a 12-layer teacher:
            {1, 3, 5, 7, 9, 11} or {0, 2, 4, 6, 8, 10}).

    Returns:
        The same `student` object, with its weights overwritten in place.

    Raises:
        ValueError: If `use_layer` is neither "odd" nor "even", or if the
            teacher does not have enough layers of the requested parity to
            initialize every student layer.
    """
    if use_layer not in ("odd", "even"):
        raise ValueError(f'use_layer must be "odd" or "even", got {use_layer!r}')

    # Copy one out of two encoder layers
    teacher_encoder_layers = teacher.bert.encoder.layer
    student_encoder_layers = student.bert.encoder.layer

    # Derive the indices from the teacher's real depth instead of assuming 12 layers.
    first_idx = 1 if use_layer == "odd" else 0
    layer_indices = range(first_idx, len(teacher_encoder_layers), 2)
    if len(layer_indices) < len(student_encoder_layers):
        raise ValueError(
            f"teacher provides {len(layer_indices)} {use_layer} layers but the student "
            f"has {len(student_encoder_layers)} encoder layers to initialize"
        )
    for student_layer, layer_idx in zip(student_encoder_layers, layer_indices):
        student_layer.load_state_dict(teacher_encoder_layers[layer_idx].state_dict())

    # Copy all other layers
    student.bert.embeddings.load_state_dict(teacher.bert.embeddings.state_dict())
    student.bert.pooler.load_state_dict(teacher.bert.pooler.state_dict())
    student.classifier.load_state_dict(teacher.classifier.state_dict())

    return student


def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable


# student_model = build_student_model(teacher_model)
# print(student_model)
# student_model = init_student_layers(teacher_model, student_model, use_layer="odd")
# print()
# print('Teacher parameters :', count_parameters(teacher_model))
# print('Student parameters :', count_parameters(student_model))
# print('Compression ratio:', count_parameters(student_model) / count_parameters(teacher_model))

# Loss Function

## Weighted Loss
Let $\alpha$, $\beta$ be weighting factors, $0 < \alpha, \beta < 1$ and $\alpha + \beta < 1$,
$$\mathcal{L} = \alpha \mathcal{L}_\text{cls} + \beta \mathcal{L}_\text{KD} + (1- \alpha - \beta)\mathcal{L}_{\text{cos}}$$

## Softmax
The softmax function converts a vector of raw scores (logits) $\pmb{z} = [z^{(1)}, z^{(2)}, \cdots, z^{(k)}]$ into a probability distribution over $k$ classes. The temperature parameter $T > 0$ controls the smoothness of the resulting probability distribution.
$$
\text{softmax}(\pmb{z}, T)^{(j)} = \frac{ \exp ( z^{(j)}  / T) }{\sum_{q=1}^{k} \exp ( z^{(q)} / T )}
$$
$T=1$, standard softmax;  
$T < 1$, sharper distribution, more confident predictions (the largest logits dominate);  
$T > 1$, smoother distribution, more uncertainty (probabilities are more uniform).

## Cross-Entropy Loss
Let $\pmb{p}$ be the target probability distribution and $\pmb{q}$ be the input probability distribution, both over $k$ classes. The Cross-Entropy measures the difference between these two distributions as:
$$
H(\pmb{p},\pmb{q})= - \sum_{j=1}^{k} \pmb{p}^{(j)} \log \pmb{q}^{(j)}
$$

## KL Divergence
$$D_{\mathrm{KL}}(\pmb{p} \,\|\, \pmb{q}) = \sum_{j=1}^{k} \pmb{p}^{(j)} \log \frac{\pmb{p}^{(j)}}{\pmb{q}^{(j)}} = \sum_{j=1}^{k} \pmb{p}^{(j)} (\log \pmb{p}^{(j)} - \log \pmb{q}^{(j)})$$

$$
H(\pmb{p}, \pmb{q}) = H(\pmb{p}) + D_{\mathrm{KL}}(\pmb{p} \,\|\, \pmb{q})
$$
Since $H(\pmb{p})$ is fixed, minimizing cross-entropy is equivalent to minimizing the KL divergence between $\pmb{p}$ and $\pmb{q}$.

## Classification Loss
$$\mathcal{L}_\text{cls} = H( \pmb{y}_\text{true}, \pmb{y}_\text{pred\_student} )$$

## Knowledge Distillation Loss
Both cross-entropy and KL divergence can be used for knowledge distillation loss, as they result in the same gradients and optimization updates.

KL divergence is often preferred, since it equals zero when the student's output distribution exactly matches the teacher's, whereas cross-entropy includes a constant offset — the entropy $H(\pmb{p})$ of the teacher's output distribution. This offset does not affect the optimization process, but because it varies from batch to batch it introduces noise into the loss value, making training curves less interpretable.

The multiplication by $T^2$ corrects for the gradient scaling effect introduced by the temperature during backpropagation.

$$\mathcal{L}_\text{KD} = H(\text{softmax}(\pmb{z}_{\text{teacher}}, T), \text{softmax}(\pmb{z}_{\text{student}}, T)) \cdot T^2 $$

$$\mathcal{L}_\text{KD} = D_{\mathrm{KL}}(\text{softmax}(\pmb{z}_{\text{teacher}}, T) \,\|\, \text{softmax}(\pmb{z}_{\text{student}}, T)) \cdot T^2 $$

In PyTorch, `F.cross_entropy(input, target)` expects logits as `input`, and class indices or class probabilities as `target`.   
`F.kl_div(input, target)` expects log-probabilities as `input`, and probabilities as `target`.

## Cosine Embedding Loss
$$\mathcal{L}_\text{cos} = 1 - \text{cosine\_similarity}(\pmb{z}_\text{student}, \pmb{z}_\text{teacher})$$

In [18]:
import torch.nn as nn
import torch.nn.functional as F

class KDLoss(nn.Module):
    """
    Temperature-scaled KL-divergence loss between teacher and student logits.
    """

    def __init__(self):
        super().__init__()

    def forward(self, logits_student, logits_teacher, T):
        """
        Params:
            logits_student: The logits of the student model.
            logits_teacher: The logits of the teacher model.
            T: The temperature parameter.

        Returns:
            KL(teacher || student) on the temperature-softened distributions,
            summed and divided by the batch size ('batchmean'), times T squared.
        """
        # kl_div takes log-probabilities as input and probabilities as target.
        student_log_probs = F.log_softmax(logits_student / T, dim=-1)
        teacher_probs = F.softmax(logits_teacher / T, dim=-1)
        divergence = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')

        return divergence * T * T


# # Loss functions
# criterion_cls = nn.CrossEntropyLoss()  # Classification Loss
# criterion_kd = KDLoss()  # Knowledge Distillation Loss
# criterion_cos = nn.CosineEmbeddingLoss()  # Cosine Embedding Loss

# Trainer

In [19]:
import evaluate

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(preds, labels):
    """
    Score predictions against reference labels.

    Params:
        preds: Predicted class ids.
        labels: Reference class ids.

    Returns:
        (accuracy, macro-averaged F1).
    """
    shared = {"predictions": preds, "references": labels}
    accuracy_result = accuracy_metric.compute(**shared)
    f1_result = f1_metric.compute(average="macro", **shared)
    return accuracy_result["accuracy"], f1_result["f1"]

In [20]:
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
import os
from torch.optim.lr_scheduler import ReduceLROnPlateau


class KDTrainer:
    """
    Knowledge-distillation training loop for a teacher/student pair.

    Every training step combines three losses computed on the models' logits:
    cross-entropy against the labels, the temperature-scaled KL divergence
    against the teacher (`KDLoss`) and a cosine embedding loss. They are
    weighted by `alpha`, `beta` and `1 - alpha - beta` respectively. The
    student is evaluated after each epoch and all scalars are written to
    TensorBoard under `logs/{save_name}`.

    Batches are moved to the module-level `DEVICE`; the caller is expected to
    have placed both models on that device.
    """

    def __init__(self, teacher_model, student_model, train_loader, eval_loader, optimizer, scheduler=None, save_name="exp1"):
        """
        Params:
            teacher_model: Model whose logits are the distillation target. It is
                only run in eval mode and without gradients.
            student_model: Model being trained.
            train_loader: Iterable of training batches (dicts of tensors with a "labels" entry).
            eval_loader: Iterable of evaluation batches with the same layout.
            optimizer: Optimizer stepped once per training batch.
            scheduler: Optional LR scheduler. A `ReduceLROnPlateau` is stepped once
                per epoch with the eval F1; any other scheduler is stepped once per batch.
            save_name: Used for the TensorBoard log directory and the checkpoint file name.
        """
        self.teacher_model = teacher_model
        self.student_model = student_model
        self.train_loader = train_loader
        self.eval_loader = eval_loader
        self.optimizer = optimizer
        self.scheduler = scheduler

        self.criterion_cls = None
        self.criterion_kd = None
        self.criterion_cos = None
        self._init_criterions()

        self.save_name = save_name
        self.writer = SummaryWriter(log_dir=f"logs/{save_name}")  # Tensorboard writer

    def _init_criterions(self):
        """Instantiate the classification, distillation and cosine loss modules."""
        self.criterion_cls = nn.CrossEntropyLoss()
        self.criterion_kd = KDLoss()
        self.criterion_cos = nn.CosineEmbeddingLoss()

    @staticmethod
    def _check_hyperparameters(T, alpha, beta):
        """Reject a non-positive temperature or loss weights that do not form a convex combination."""
        if T <= 0:
            raise ValueError(f"T must be positive, got {T}")
        # Small tolerance so that float sums such as 0.7 + 0.3 are accepted.
        if alpha < 0 or beta < 0 or alpha + beta > 1 + 1e-9:
            raise ValueError(
                f"alpha and beta must be non-negative with alpha + beta <= 1, "
                f"got alpha={alpha}, beta={beta}"
            )

    def train(self, num_epochs, T=1, alpha=0.5, beta=0.3):
        """
        Run the full training loop.

        Params:
            num_epochs: Number of passes over `train_loader`.
            T: Distillation temperature (must be > 0).
            alpha: Weight of the classification loss.
            beta: Weight of the distillation loss. The cosine loss receives
                `1 - alpha - beta`.

        Returns:
            (train_histories, eval_histories): two lists with one dict per epoch,
            as returned by `train_one_epoch` and `evaluate`.

        Raises:
            ValueError: If `T <= 0`, a weight is negative, or `alpha + beta > 1`
                (which would give the cosine loss a negative weight).
        """
        self._check_hyperparameters(T, alpha, beta)

        train_histories, eval_histories = [], []

        try:
            for epoch in range(num_epochs):
                print(f"\n-- Epoch {epoch + 1}/{num_epochs} --")

                train_history = self.train_one_epoch(T, alpha, beta)
                eval_history = self.evaluate()

                # Plateau schedulers are driven by the epoch-level eval F1.
                if self.scheduler is not None and isinstance(self.scheduler, ReduceLROnPlateau):
                    self.scheduler.step(eval_history["eval_f1"])

                train_histories.append(train_history)
                eval_histories.append(eval_history)

                # print results
                print(f"Train\tLoss: {train_history['train_loss']:.4f}")
                print(f"\tcls: {train_history['train_loss_cls']:.4f}")
                print(f"\tkd: {train_history['train_loss_kd']:.4f}")
                print(f"\tcos: {train_history['train_loss_cos']:.4f}")
                print(f"Eval\tLoss: {eval_history['eval_loss']:.4f}")
                print(f"\tAccuracy: {eval_history['eval_accuracy']:.4f}")
                print(f"\tF1: {eval_history['eval_f1']:.4f}")

                # Log to TensorBoard
                self.writer.add_scalar("Loss/train", train_history['train_loss'], epoch)
                self.writer.add_scalar("Loss/train_cls", train_history['train_loss_cls'], epoch)
                self.writer.add_scalar("Loss/train_kd", train_history['train_loss_kd'], epoch)
                self.writer.add_scalar("Loss/train_cos", train_history['train_loss_cos'], epoch)
                self.writer.add_scalar("Loss/eval", eval_history['eval_loss'], epoch)
                self.writer.add_scalar("Metrics/accuracy", eval_history['eval_accuracy'], epoch)
                self.writer.add_scalar("Metrics/f1", eval_history['eval_f1'], epoch)
        finally:
            # Close the TensorBoard writer even when training is interrupted or fails,
            # so the scalars logged so far are not left in an open writer.
            self.writer.close()

        return train_histories, eval_histories

    def train_one_epoch(self, T, alpha, beta):
        """
        Train the student for one pass over `train_loader`.

        Params:
            T: Distillation temperature.
            alpha: Weight of the classification loss.
            beta: Weight of the distillation loss.

        Returns:
            Dict with the sample-weighted mean of the total loss and of each
            component: "train_loss", "train_loss_cls", "train_loss_kd", "train_loss_cos".
        """
        self.teacher_model.eval()
        self.student_model.train()

        total = 0
        total_loss, total_loss_cls, total_loss_kd, total_loss_cos = 0, 0, 0, 0

        for batch in tqdm(self.train_loader, desc="Training", leave=False):
            # move batch data to device
            batch = {k: v.to(DEVICE) for k, v in batch.items()}

            # clear grad
            self.optimizer.zero_grad()

            # forward (the teacher only provides targets, so no graph is built for it)
            outputs_student = self.student_model(**batch)
            with torch.no_grad():
                outputs_teacher = self.teacher_model(**batch)

            # compute loss
            loss_cls = self.criterion_cls(outputs_student.logits, batch["labels"])
            loss_kd = self.criterion_kd(outputs_student.logits, outputs_teacher.logits, T)
            # A target of +1 asks CosineEmbeddingLoss to pull the two logit vectors together;
            # it is created directly on the logits' device to avoid a host-to-device copy per step.
            cos_target = torch.ones(
                outputs_teacher.logits.size(0), device=outputs_teacher.logits.device
            )
            loss_cos = self.criterion_cos(
                outputs_teacher.logits,
                outputs_student.logits,
                cos_target
            )
            loss = alpha * loss_cls + beta * loss_kd + (1.0 - alpha - beta) * loss_cos

            # backward
            loss.backward()
            self.optimizer.step()
            # Step-based schedulers advance once per batch; plateau schedulers are handled in train().
            if self.scheduler is not None and not isinstance(self.scheduler, ReduceLROnPlateau):
                self.scheduler.step()

            # total loss (weighted by batch size so a smaller last batch is averaged correctly)
            batch_size = batch["labels"].size(0)
            total += batch_size
            total_loss += loss.item() * batch_size
            total_loss_cls += loss_cls.item() * batch_size
            total_loss_kd += loss_kd.item() * batch_size
            total_loss_cos += loss_cos.item() * batch_size

        # average loss
        history = {
            "train_loss": total_loss / total,
            "train_loss_cls": total_loss_cls / total,
            "train_loss_kd": total_loss_kd / total,
            "train_loss_cos": total_loss_cos / total,
        }

        return history

    def evaluate(self):
        """
        Evaluate the student on `eval_loader`.

        Returns:
            Dict with "eval_loss" (sample-weighted mean cross-entropy),
            "eval_accuracy" and "eval_f1" (as computed by `compute_metrics`).
        """
        self.student_model.eval()

        total = 0
        total_loss = 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in tqdm(self.eval_loader, desc="Evaluating", leave=False):
                # move batch data to device
                batch = {k: v.to(DEVICE) for k, v in batch.items()}

                # forward
                outputs = self.student_model(**batch)

                # loss
                loss = self.criterion_cls(outputs.logits, batch["labels"])

                # total loss
                batch_size = batch["labels"].size(0)
                total += batch_size
                total_loss += loss.item() * batch_size

                preds = outputs.logits.argmax(dim=-1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(batch["labels"].cpu().numpy())

        # average loss
        eval_loss = total_loss / total

        accuracy, f1 = compute_metrics(all_preds, all_labels)

        history = {
            "eval_loss": eval_loss,
            "eval_accuracy": accuracy,
            "eval_f1": f1
        }

        return history

    def save_model(self):
        """Save the student's state dict to `models/{save_name}.pt`, creating the directory if needed."""
        save_path = f"models/{self.save_name}.pt"
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        torch.save(self.student_model.state_dict(), save_path)
        print(f"Student model saved to: {save_path}")


# Train

In [21]:
import torch.optim as optim
from transformers import get_scheduler

# Hyper-parameters of this single distillation run.
LEARNING_RATE = 3e-5
NUM_EPOCHS = 3
T = 2  # distillation temperature
BATCH_SIZE = 32
ALPHA = 0.7  # weight of the classification loss
BETA = 0.2  # weight of the distillation loss; the cosine loss gets 1 - ALPHA - BETA
# Used for the TensorBoard log directory and the checkpoint file name (BETA is not encoded).
save_name = f"A{ALPHA:.2f}_T{T}_LR{LEARNING_RATE}_NE{NUM_EPOCHS}"

# Create dataloaders
train_loader, eval_loader, test_loader = create_dataloaders(dataset_preprocessed, batch_size=BATCH_SIZE)

# Init student model
# NOTE(review): no fine-tuning of teacher_model is visible before this cell, and its
# classifier head was reported as newly initialised when it was loaded — confirm the
# teacher is trained before its logits are used as distillation targets.
student_model = build_student_model(teacher_model)
student_model = init_student_layers(teacher_model, student_model, use_layer="odd")

# To device
teacher_model = teacher_model.to(DEVICE)
student_model = student_model.to(DEVICE)

print('Teacher parameters:', count_parameters(teacher_model))
print('Student parameters:', count_parameters(student_model))
compression_ratio = count_parameters(student_model) / count_parameters(teacher_model)
print(f'Compression ratio: {compression_ratio:.4f}')

# Only the student's parameters are handed to the optimizer.
optimizer = optim.AdamW(student_model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
# KDTrainer steps non-plateau schedulers once per batch, hence epochs * batches total steps.
scheduler = get_scheduler(
    name="cosine",  # alternative: "linear"
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps = NUM_EPOCHS * len(train_loader)
)
# scheduler = ReduceLROnPlateau(
#     optimizer,
#     mode="max",
#     factor=0.5,
#     patience=1,
#     threshold=0.001,
#     verbose=True
# )

kdtrainer = KDTrainer(
    teacher_model=teacher_model,
    student_model=student_model,
    train_loader=train_loader,
    eval_loader=eval_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    save_name=save_name
)

# The per-epoch histories are also printed and logged by the trainer, so they are discarded here.
_, _ = kdtrainer.train(num_epochs=NUM_EPOCHS, T=T, alpha=ALPHA, beta=BETA)

kdtrainer.save_model()

Teacher parameters: 109484547
Student parameters: 66957315
Compression ratio: 0.6116

-- Epoch 1/3 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.6359
	cls: 0.8352
	kd: 0.1776
	cos: 0.1578
Eval	Loss: 0.7695
	Accuracy: 0.6727
	F1: 0.6693

-- Epoch 2/3 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5534
	cls: 0.6896
	kd: 0.2816
	cos: 0.1436
Eval	Loss: 0.7476
	Accuracy: 0.6774
	F1: 0.6701

-- Epoch 3/3 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5037
	cls: 0.6016
	kd: 0.3400
	cos: 0.1459
Eval	Loss: 0.7373
	Accuracy: 0.6821
	F1: 0.6731
Student model saved to: models/A0.70_T2_LR3e-05_NE3.pt


In [None]:
# !tensorboard --logdir=logs

# Grid Search

In [21]:
# Grid-search space. Each (alpha, beta) pair gives the weights of the classification and
# distillation losses; the cosine loss receives the remainder 1 - alpha - beta.
ALPHA_BETAS = [
    (1/3, 1/3),
    (0.5, 0.3),
    (0.7, 0.2)
]
TEMPERATURES = [1, 2]
LEARNING_RATES = [5e-5, 3e-5, 1e-5]
# NOTE(review): NUM_EPOCHS and BATCH_SIZE rebind names set by the single-run training cell;
# NUM_EPOCHS changes from an int to a list, so re-running that earlier cell after this one
# would pick up the list.
NUM_EPOCHS = [5]
BATCH_SIZE = 32

In [22]:
import torch.optim as optim
from transformers import get_scheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from itertools import product
import pandas as pd

# One result row per finished configuration.
configs = []

# Exhaustive search over loss weights x temperature x learning rate x epoch count.
for alpha_beta, T, lr, num_epochs in product(ALPHA_BETAS, TEMPERATURES, LEARNING_RATES, NUM_EPOCHS):
    alpha, beta = alpha_beta
    print(f"\nRunning config: alpha={alpha:.2f}, beta={beta:.2f}, T={T}, lr={lr}, num_epochs={num_epochs}")
    # Names the TensorBoard run and the checkpoint file (beta is not encoded; every alpha
    # in ALPHA_BETAS is paired with exactly one beta).
    save_name = f"A{alpha:.2f}_T{T}_LR{lr}_NE{num_epochs}"

    # Create dataloaders
    train_loader, eval_loader, test_loader = create_dataloaders(dataset_preprocessed, batch_size=BATCH_SIZE)

    # Init student model (a fresh student is rebuilt from the teacher for every configuration)
    student_model = build_student_model(teacher_model)
    student_model = init_student_layers(teacher_model, student_model, use_layer="odd")

    # To device
    teacher_model = teacher_model.to(DEVICE)
    student_model = student_model.to(DEVICE)

    # Fixed: the argument separator was a full-width comma (U+FF0C), which is a syntax error.
    optimizer = optim.AdamW(student_model.parameters(), lr=lr, weight_decay=0.01)
    # scheduler = get_scheduler(
    #     name="linear",
    #     optimizer=optimizer,
    #     num_warmup_steps=0,
    #     num_training_steps=num_epochs * len(train_loader)
    # )
    # mode="max" because KDTrainer steps this scheduler with the eval F1 after every epoch.
    # NOTE(review): `verbose` is reported as deprecated in recent PyTorch releases — confirm
    # against the installed version.
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode="max",
        factor=0.5,
        patience=1,
        threshold=0.001,
        verbose=True
    )

    kdtrainer = KDTrainer(
        teacher_model=teacher_model,
        student_model=student_model,
        train_loader=train_loader,
        eval_loader=eval_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        save_name=save_name
    )

    _, eval_metrics = kdtrainer.train(
        num_epochs=num_epochs,
        T=T,
        alpha=alpha,
        beta=beta
    )

    kdtrainer.save_model()

    # The recorded score is the F1 of the last epoch, not the best epoch.
    f1 = eval_metrics[-1]["eval_f1"]
    configs.append({
        "alpha": alpha,
        "beta": beta,
        "T": T,
        "lr": lr,
        "num_epochs": num_epochs,
        "eval_f1": f1
    })

    # Rewrite the CSV after every configuration so partial results survive an interrupted run.
    df = pd.DataFrame(configs)
    df.to_csv("grid_results.csv", index=False)




Running config: alpha=0.33, beta=0.33, T=1, lr=5e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3593
	cls: 0.9135
	kd: 0.0637
	cos: 0.1006
Eval	Loss: 0.8581
	Accuracy: 0.6472
	F1: 0.6495

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3324
	cls: 0.8041
	kd: 0.0998
	cos: 0.0931
Eval	Loss: 0.8216
	Accuracy: 0.6733
	F1: 0.6605

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3098
	cls: 0.7062
	kd: 0.1328
	cos: 0.0903
Eval	Loss: 0.8256
	Accuracy: 0.6571
	F1: 0.6499

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.2875
	cls: 0.6074
	kd: 0.1656
	cos: 0.0894
Eval	Loss: 0.8222
	Accuracy: 0.6696
	F1: 0.6481

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.2675
	cls: 0.5239
	kd: 0.1915
	cos: 0.0871
Eval	Loss: 0.8311
	Accuracy: 0.6550
	F1: 0.6477
Student model saved to: models/A0.33_T1_LR5e-05_NE5.pt

Running config: alpha=0.33, beta=0.33, T=1, lr=3e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3612
	cls: 0.9298
	kd: 0.0567
	cos: 0.0971
Eval	Loss: 0.8408
	Accuracy: 0.6795
	F1: 0.6648

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3343
	cls: 0.8133
	kd: 0.0954
	cos: 0.0941
Eval	Loss: 0.8268
	Accuracy: 0.6790
	F1: 0.6749

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3161
	cls: 0.7357
	kd: 0.1214
	cos: 0.0911
Eval	Loss: 0.8089
	Accuracy: 0.6800
	F1: 0.6651

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.2950
	cls: 0.6429
	kd: 0.1519
	cos: 0.0904
Eval	Loss: 0.8260
	Accuracy: 0.6582
	F1: 0.6460

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.2758
	cls: 0.5595
	kd: 0.1784
	cos: 0.0895
Eval	Loss: 0.8291
	Accuracy: 0.6639
	F1: 0.6609
Student model saved to: models/A0.33_T1_LR3e-05_NE5.pt

Running config: alpha=0.33, beta=0.33, T=1, lr=1e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3736
	cls: 0.9963
	kd: 0.0322
	cos: 0.0922
Eval	Loss: 0.8855
	Accuracy: 0.6467
	F1: 0.6472

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3471
	cls: 0.8712
	kd: 0.0726
	cos: 0.0975
Eval	Loss: 0.8380
	Accuracy: 0.6785
	F1: 0.6628

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3361
	cls: 0.8229
	kd: 0.0895
	cos: 0.0958
Eval	Loss: 0.8314
	Accuracy: 0.6743
	F1: 0.6669

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3269
	cls: 0.7842
	kd: 0.1030
	cos: 0.0935
Eval	Loss: 0.8214
	Accuracy: 0.6774
	F1: 0.6648

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3165
	cls: 0.7388
	kd: 0.1177
	cos: 0.0930
Eval	Loss: 0.8091
	Accuracy: 0.6795
	F1: 0.6699
Student model saved to: models/A0.33_T1_LR1e-05_NE5.pt

Running config: alpha=0.33, beta=0.33, T=2, lr=5e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3612
	cls: 0.9230
	kd: 0.0608
	cos: 0.0997
Eval	Loss: 0.8480
	Accuracy: 0.6738
	F1: 0.6651

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3338
	cls: 0.8101
	kd: 0.0977
	cos: 0.0936
Eval	Loss: 0.8458
	Accuracy: 0.6540
	F1: 0.6598

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3109
	cls: 0.7136
	kd: 0.1284
	cos: 0.0906
Eval	Loss: 0.8332
	Accuracy: 0.6639
	F1: 0.6653

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.2827
	cls: 0.5926
	kd: 0.1660
	cos: 0.0896
Eval	Loss: 0.8267
	Accuracy: 0.6623
	F1: 0.6597

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.2710
	cls: 0.5415
	kd: 0.1828
	cos: 0.0886
Eval	Loss: 0.8404
	Accuracy: 0.6618
	F1: 0.6609
Student model saved to: models/A0.33_T2_LR5e-05_NE5.pt

Running config: alpha=0.33, beta=0.33, T=2, lr=3e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3632
	cls: 0.9367
	kd: 0.0547
	cos: 0.0981
Eval	Loss: 0.8706
	Accuracy: 0.6514
	F1: 0.6556

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3361
	cls: 0.8229
	kd: 0.0913
	cos: 0.0940
Eval	Loss: 0.8475
	Accuracy: 0.6660
	F1: 0.6648

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3176
	cls: 0.7428
	kd: 0.1181
	cos: 0.0919
Eval	Loss: 0.8306
	Accuracy: 0.6634
	F1: 0.6625

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.2956
	cls: 0.6477
	kd: 0.1480
	cos: 0.0911
Eval	Loss: 0.8342
	Accuracy: 0.6535
	F1: 0.6477

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.2762
	cls: 0.5639
	kd: 0.1742
	cos: 0.0906
Eval	Loss: 0.8432
	Accuracy: 0.6478
	F1: 0.6441
Student model saved to: models/A0.33_T2_LR3e-05_NE5.pt

Running config: alpha=0.33, beta=0.33, T=2, lr=1e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3746
	cls: 1.0018
	kd: 0.0305
	cos: 0.0915
Eval	Loss: 0.8795
	Accuracy: 0.6374
	F1: 0.6266

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3480
	cls: 0.8757
	kd: 0.0707
	cos: 0.0977
Eval	Loss: 0.8486
	Accuracy: 0.6785
	F1: 0.6698

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3357
	cls: 0.8230
	kd: 0.0884
	cos: 0.0955
Eval	Loss: 0.8141
	Accuracy: 0.6774
	F1: 0.6698

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3267
	cls: 0.7828
	kd: 0.1028
	cos: 0.0946
Eval	Loss: 0.8369
	Accuracy: 0.6660
	F1: 0.6676

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3149
	cls: 0.7344
	kd: 0.1163
	cos: 0.0940
Eval	Loss: 0.8118
	Accuracy: 0.6707
	F1: 0.6622
Student model saved to: models/A0.33_T2_LR1e-05_NE5.pt

Running config: alpha=0.50, beta=0.30, T=1, lr=5e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4923
	cls: 0.8692
	kd: 0.1113
	cos: 0.1216
Eval	Loss: 0.8005
	Accuracy: 0.6717
	F1: 0.6622

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4458
	cls: 0.7435
	kd: 0.1716
	cos: 0.1129
Eval	Loss: 0.7708
	Accuracy: 0.6764
	F1: 0.6719

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4022
	cls: 0.6227
	kd: 0.2283
	cos: 0.1117
Eval	Loss: 0.7899
	Accuracy: 0.6686
	F1: 0.6635

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3578
	cls: 0.4983
	kd: 0.2860
	cos: 0.1142
Eval	Loss: 0.8078
	Accuracy: 0.6582
	F1: 0.6468

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3224
	cls: 0.4001
	kd: 0.3317
	cos: 0.1143
Eval	Loss: 0.8210
	Accuracy: 0.6592
	F1: 0.6542
Student model saved to: models/A0.50_T1_LR5e-05_NE5.pt

Running config: alpha=0.50, beta=0.30, T=1, lr=3e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4924
	cls: 0.8722
	kd: 0.1063
	cos: 0.1223
Eval	Loss: 0.8014
	Accuracy: 0.6764
	F1: 0.6595

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4483
	cls: 0.7511
	kd: 0.1650
	cos: 0.1164
Eval	Loss: 0.7759
	Accuracy: 0.6759
	F1: 0.6558

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4142
	cls: 0.6557
	kd: 0.2098
	cos: 0.1168
Eval	Loss: 0.7676
	Accuracy: 0.6779
	F1: 0.6680

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3732
	cls: 0.5406
	kd: 0.2649
	cos: 0.1174
Eval	Loss: 0.7921
	Accuracy: 0.6748
	F1: 0.6674

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3427
	cls: 0.4543
	kd: 0.3062
	cos: 0.1184
Eval	Loss: 0.8173
	Accuracy: 0.6498
	F1: 0.6408
Student model saved to: models/A0.50_T1_LR3e-05_NE5.pt

Running config: alpha=0.50, beta=0.30, T=1, lr=1e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5122
	cls: 0.9338
	kd: 0.0704
	cos: 0.1206
Eval	Loss: 0.8112
	Accuracy: 0.6665
	F1: 0.6571

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4634
	cls: 0.7940
	kd: 0.1396
	cos: 0.1223
Eval	Loss: 0.7936
	Accuracy: 0.6701
	F1: 0.6573

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4451
	cls: 0.7428
	kd: 0.1655
	cos: 0.1201
Eval	Loss: 0.8103
	Accuracy: 0.6592
	F1: 0.6535

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4258
	cls: 0.6916
	kd: 0.1863
	cos: 0.1205
Eval	Loss: 0.7730
	Accuracy: 0.6738
	F1: 0.6698

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4157
	cls: 0.6614
	kd: 0.2025
	cos: 0.1212
Eval	Loss: 0.7738
	Accuracy: 0.6696
	F1: 0.6633
Student model saved to: models/A0.50_T1_LR1e-05_NE5.pt

Running config: alpha=0.50, beta=0.30, T=2, lr=5e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4913
	cls: 0.8678
	kd: 0.1103
	cos: 0.1215
Eval	Loss: 0.7864
	Accuracy: 0.6722
	F1: 0.6647

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4439
	cls: 0.7400
	kd: 0.1699
	cos: 0.1145
Eval	Loss: 0.7809
	Accuracy: 0.6649
	F1: 0.6525

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4026
	cls: 0.6255
	kd: 0.2231
	cos: 0.1147
Eval	Loss: 0.7948
	Accuracy: 0.6774
	F1: 0.6727

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3595
	cls: 0.5071
	kd: 0.2759
	cos: 0.1157
Eval	Loss: 0.8215
	Accuracy: 0.6561
	F1: 0.6509

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3359
	cls: 0.4409
	kd: 0.3074
	cos: 0.1161
Eval	Loss: 0.8187
	Accuracy: 0.6602
	F1: 0.6573
Student model saved to: models/A0.50_T2_LR5e-05_NE5.pt

Running config: alpha=0.50, beta=0.30, T=2, lr=3e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4942
	cls: 0.8772
	kd: 0.1031
	cos: 0.1233
Eval	Loss: 0.8114
	Accuracy: 0.6774
	F1: 0.6666

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4482
	cls: 0.7515
	kd: 0.1626
	cos: 0.1183
Eval	Loss: 0.7804
	Accuracy: 0.6774
	F1: 0.6682

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4137
	cls: 0.6559
	kd: 0.2069
	cos: 0.1183
Eval	Loss: 0.7659
	Accuracy: 0.6811
	F1: 0.6680

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3729
	cls: 0.5435
	kd: 0.2573
	cos: 0.1199
Eval	Loss: 0.7970
	Accuracy: 0.6779
	F1: 0.6700

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3420
	cls: 0.4574
	kd: 0.2971
	cos: 0.1209
Eval	Loss: 0.8094
	Accuracy: 0.6655
	F1: 0.6527
Student model saved to: models/A0.50_T2_LR3e-05_NE5.pt

Running config: alpha=0.50, beta=0.30, T=2, lr=1e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5124
	cls: 0.9330
	kd: 0.0711
	cos: 0.1231
Eval	Loss: 0.8178
	Accuracy: 0.6644
	F1: 0.6585

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4650
	cls: 0.7992
	kd: 0.1356
	cos: 0.1234
Eval	Loss: 0.7924
	Accuracy: 0.6753
	F1: 0.6636

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4459
	cls: 0.7462
	kd: 0.1612
	cos: 0.1222
Eval	Loss: 0.7722
	Accuracy: 0.6769
	F1: 0.6618

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4284
	cls: 0.6972
	kd: 0.1845
	cos: 0.1221
Eval	Loss: 0.7774
	Accuracy: 0.6696
	F1: 0.6632

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4050
	cls: 0.6361
	kd: 0.2083
	cos: 0.1225
Eval	Loss: 0.7739
	Accuracy: 0.6670
	F1: 0.6623
Student model saved to: models/A0.50_T2_LR1e-05_NE5.pt

Running config: alpha=0.70, beta=0.20, T=1, lr=5e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.6320
	cls: 0.8272
	kd: 0.1900
	cos: 0.1498
Eval	Loss: 0.7704
	Accuracy: 0.6764
	F1: 0.6469

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5540
	cls: 0.6874
	kd: 0.2971
	cos: 0.1345
Eval	Loss: 0.7522
	Accuracy: 0.6805
	F1: 0.6694

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4720
	cls: 0.5391
	kd: 0.4049
	cos: 0.1368
Eval	Loss: 0.7855
	Accuracy: 0.6701
	F1: 0.6540

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3932
	cls: 0.3958
	kd: 0.5115
	cos: 0.1385
Eval	Loss: 0.7862
	Accuracy: 0.6696
	F1: 0.6579

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3284
	cls: 0.2781
	kd: 0.5987
	cos: 0.1397
Eval	Loss: 0.8351
	Accuracy: 0.6655
	F1: 0.6540
Student model saved to: models/A0.70_T1_LR5e-05_NE5.pt

Running config: alpha=0.70, beta=0.20, T=1, lr=3e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.6382
	cls: 0.8382
	kd: 0.1788
	cos: 0.1573
Eval	Loss: 0.7599
	Accuracy: 0.6847
	F1: 0.6773

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5595
	cls: 0.6979
	kd: 0.2840
	cos: 0.1421
Eval	Loss: 0.7518
	Accuracy: 0.6889
	F1: 0.6837

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4956
	cls: 0.5825
	kd: 0.3683
	cos: 0.1423
Eval	Loss: 0.7421
	Accuracy: 0.6816
	F1: 0.6796

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4254
	cls: 0.4534
	kd: 0.4675
	cos: 0.1452
Eval	Loss: 0.7802
	Accuracy: 0.6602
	F1: 0.6531

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3539
	cls: 0.3241
	kd: 0.5610
	cos: 0.1479
Eval	Loss: 0.8173
	Accuracy: 0.6655
	F1: 0.6613
Student model saved to: models/A0.70_T1_LR3e-05_NE5.pt

Running config: alpha=0.70, beta=0.20, T=1, lr=1e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.6684
	cls: 0.8953
	kd: 0.1244
	cos: 0.1684
Eval	Loss: 0.7821
	Accuracy: 0.6602
	F1: 0.6451

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5872
	cls: 0.7488
	kd: 0.2378
	cos: 0.1553
Eval	Loss: 0.7494
	Accuracy: 0.6852
	F1: 0.6703

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5532
	cls: 0.6884
	kd: 0.2825
	cos: 0.1489
Eval	Loss: 0.7297
	Accuracy: 0.6878
	F1: 0.6737

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5223
	cls: 0.6314
	kd: 0.3273
	cos: 0.1484
Eval	Loss: 0.7377
	Accuracy: 0.6811
	F1: 0.6661

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4847
	cls: 0.5629
	kd: 0.3780
	cos: 0.1513
Eval	Loss: 0.7748
	Accuracy: 0.6566
	F1: 0.6475
Student model saved to: models/A0.70_T1_LR1e-05_NE5.pt

Running config: alpha=0.70, beta=0.20, T=2, lr=5e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.6319
	cls: 0.8268
	kd: 0.1894
	cos: 0.1529
Eval	Loss: 0.7787
	Accuracy: 0.6717
	F1: 0.6659

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5551
	cls: 0.6903
	kd: 0.2910
	cos: 0.1368
Eval	Loss: 0.7431
	Accuracy: 0.6847
	F1: 0.6558

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4771
	cls: 0.5516
	kd: 0.3866
	cos: 0.1369
Eval	Loss: 0.7735
	Accuracy: 0.6759
	F1: 0.6552

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3790
	cls: 0.3761
	kd: 0.5087
	cos: 0.1403
Eval	Loss: 0.8059
	Accuracy: 0.6639
	F1: 0.6528

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3382
	cls: 0.3019
	kd: 0.5641
	cos: 0.1401
Eval	Loss: 0.8586
	Accuracy: 0.6556
	F1: 0.6515
Student model saved to: models/A0.70_T2_LR5e-05_NE5.pt

Running config: alpha=0.70, beta=0.20, T=2, lr=3e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.6373
	cls: 0.8376
	kd: 0.1764
	cos: 0.1572
Eval	Loss: 0.7429
	Accuracy: 0.6816
	F1: 0.6696

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5591
	cls: 0.6982
	kd: 0.2804
	cos: 0.1431
Eval	Loss: 0.7410
	Accuracy: 0.6774
	F1: 0.6663

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4959
	cls: 0.5845
	kd: 0.3623
	cos: 0.1430
Eval	Loss: 0.7773
	Accuracy: 0.6571
	F1: 0.6478

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4072
	cls: 0.4256
	kd: 0.4723
	cos: 0.1483
Eval	Loss: 0.7797
	Accuracy: 0.6733
	F1: 0.6601

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.3655
	cls: 0.3492
	kd: 0.5301
	cos: 0.1504
Eval	Loss: 0.8050
	Accuracy: 0.6707
	F1: 0.6592
Student model saved to: models/A0.70_T2_LR3e-05_NE5.pt

Running config: alpha=0.70, beta=0.20, T=2, lr=1e-05, num_epochs=5

-- Epoch 1/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.6695
	cls: 0.8965
	kd: 0.1251
	cos: 0.1693
Eval	Loss: 0.7891
	Accuracy: 0.6649
	F1: 0.6460

-- Epoch 2/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5883
	cls: 0.7512
	kd: 0.2347
	cos: 0.1548
Eval	Loss: 0.7717
	Accuracy: 0.6639
	F1: 0.6604

-- Epoch 3/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5540
	cls: 0.6903
	kd: 0.2793
	cos: 0.1492
Eval	Loss: 0.7371
	Accuracy: 0.6811
	F1: 0.6717

-- Epoch 4/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.5221
	cls: 0.6321
	kd: 0.3235
	cos: 0.1499
Eval	Loss: 0.7430
	Accuracy: 0.6785
	F1: 0.6714

-- Epoch 5/5 --


Training:   0%|          | 0/481 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/61 [00:00<?, ?it/s]

Train	Loss: 0.4884
	cls: 0.5719
	kd: 0.3651
	cos: 0.1511
Eval	Loss: 0.7436
	Accuracy: 0.6779
	F1: 0.6681
Student model saved to: models/A0.70_T2_LR1e-05_NE5.pt
