# Knowledge Distillation (KD)

## Install dependencies

In [1]:
# Upgrade pip itself, then install the libraries this notebook uses.
# Only `requests` is pinned (2.31.0); every other package resolves to whatever
# pip selects at install time.
# NOTE(review): consider pinning the remaining packages for reproducible runs.
%pip install --upgrade pip
%pip install torch transformers datasets accelerate evaluate scikit-learn requests==2.31.0

Collecting requests==2.31.0
  Downloading requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
INFO: pip is looking at multiple versions of datasets to determine which version is compatible with other requirements. This could take a while.
Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl.metadata (19 kB)
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Downloading requests-2.31.0-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: requests, datasets
  Attempting uninstall: requests
    Found existing installation: requests 2.32.3
    Uninstalling requests-2.32.3:
      Successfully uninstalled requests-2.32.3
  Attempting uninstall: datasets
    Found 

## Import libraries

In [2]:
import evaluate
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import (
	load_dataset,
	load_metric,
)
from transformers import (
	DefaultDataCollator,
	TrainingArguments,
	Trainer,
	AutoTokenizer,
	AutoConfig,
	AutoModelForSequenceClassification,
)


In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

## Load dataset

- The `plus` configuration is the subset that also contains out-of-scope training examples.
- `clinc_oos` is an intent-classification dataset with 150 in-scope intent classes (150 examples each) plus one out-of-scope class, which is why the label count computed below is 151.
- An *intent* is what the user wants to achieve with a message. For example, the intent of "What is the weather today?" is to get the weather forecast.

In [4]:
# Load the "plus" configuration of clinc_oos and display the resulting splits.
dataset = load_dataset("clinc_oos", "plus")
dataset

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 5500
    })
})

In [5]:
# Peek at the first training example (a dict with 'text' and 'intent').
sample = dataset["train"][0]
sample

{'text': 'what expression would i use to say i love you if i were an italian',
 'intent': 61}

In [6]:
# `intents` is the label feature of the training split; int2str turns the
# integer class id of the sample back into its intent name.
intents = dataset["train"].features["intent"]
intent = intents.int2str(sample["intent"])
intent

'translate'

In [7]:
# Number of distinct intent classes; reused when loading teacher and student.
num_labels = intents.num_classes
num_labels

151

## Load models

### Load teacher model

In [8]:
# Checkpoint of the already fine-tuned teacher.
teacher_card = "transformersbook/bert-base-uncased-finetuned-clinc"

# Load the teacher with a classification head sized for this dataset,
# then move it to the selected device.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
	teacher_card,
	num_labels=num_labels,
)
teacher_model = teacher_model.to(device)

### Load student model and tokenizer

In [9]:
# Build real id <-> name mappings from the dataset's label feature.
# (`intents.names` alone is a list and `intents.str2int` is a method, so
# neither can be stored in the model config as-is.)
id2label = dict(enumerate(intents.names))
label2id = {name: idx for idx, name in id2label.items()}

student_card = "distilbert-base-uncased"

# Store the mappings in the config so the saved student reports intent names
# instead of generic LABEL_<n> placeholders.
student_config = AutoConfig.from_pretrained(
	student_card,
	num_labels=num_labels,
	id2label=id2label,
	label2id=label2id,
)

# The classification head is freshly initialised; it is trained during distillation.
student_model = AutoModelForSequenceClassification.from_pretrained(
	student_card,
	config=student_config,
).to(device)
student_tokenizer = AutoTokenizer.from_pretrained(student_card)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize the data

In [10]:
def tokenize_fn(batch):
  """Tokenize the ``text`` field of a batch with the student tokenizer.

  Truncation is enabled; no padding is requested here.
  """
  texts = batch["text"]
  return student_tokenizer(texts, truncation=True)

In [11]:
# Tokenize every split in batches. The raw `text` column is dropped because it
# is no longer needed once the token ids exist, and `intent` is renamed to
# `labels` so the trainer can detect the target column automatically.
tokenized_dataset = dataset.map(
	tokenize_fn,
	batched=True,
	remove_columns=["text"],
).rename_column("intent", "labels")
tokenized_dataset


Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 5500
    })
})

------------------------------------------------------------------------

## Define custom KD Trainer

**Create new Loss function**

We will subclass `Trainer` and override its `compute_loss()` method so that the loss includes the knowledge distillation term $L_{KD}$.

$L_{KD}$ (knowledge distillation loss) is the KL (Kullback-Leibler) divergence between the teacher's and the student's predictions.

KL divergence measures how much the student's probability distribution differs from the teacher's distribution, which serves as the expected (reference) distribution.

`KL (P||Q) = Σ P(x) * log(P(x)/Q(x))`



In [12]:
class KnowledgeDistillationTrainer(Trainer):
	"""Trainer that fits a student model to both the labels and a teacher.

	The loss combines the student's own cross-entropy loss with the
	temperature-scaled KL divergence between the teacher's and the student's
	softened class distributions:

		loss = (1 - lambda_param) * CE + lambda_param * T**2 * KL(teacher || student)

	Args:
		teacher_model: Model that produces the soft targets. It is moved to the
			training device, put in eval mode and only run without gradients.
		student_model: Model to train; forwarded to ``Trainer`` as ``model``.
		temperature: Positive softmax temperature ``T`` applied to both logits.
		lambda_param: Weight of the distillation term; the cross-entropy term
			is weighted by ``1 - lambda_param``.
		*args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.

	Raises:
		ValueError: If ``teacher_model``, ``temperature`` or ``lambda_param`` is
			missing, or if ``temperature`` is not positive.
	"""

	def __init__(
		self,
		teacher_model=None,
		student_model=None,
		temperature=None,
		lambda_param=None,
		*args,
		**kwargs,
	):
		# Fail fast with a clear message instead of a TypeError/AttributeError
		# deep inside the first training step.
		if teacher_model is None:
			raise ValueError("teacher_model is required for knowledge distillation")
		if temperature is None or temperature <= 0:
			raise ValueError("temperature must be a positive number")
		if lambda_param is None:
			raise ValueError("lambda_param is required")

		super().__init__(model=student_model, *args, **kwargs)
		self.teacher_model = teacher_model
		self.student_model = student_model
		self.temperature = temperature
		self.lambda_param = lambda_param
		self.loss_function = nn.KLDivLoss(reduction="batchmean")
		# Keep the teacher on the device the Trainer uses for the student, and
		# freeze its behaviour (no dropout) for the whole run.
		self.teacher_model.to(self.args.device)
		self.teacher_model.eval()

	def _calculate_weighted_loss(self, loss_ce, distillation_loss):
		"""Blend cross-entropy and distillation loss using ``lambda_param``."""
		return (1.0 - self.lambda_param) * loss_ce + self.lambda_param * distillation_loss

	def compute_loss(self, student_model, inputs, return_outputs=False, num_items_in_batch=None):
		"""Return the distillation loss for one batch.

		Args:
			student_model: The model handed over by ``Trainer`` for this step.
				This is the object that must be called (it may be a wrapped
				version of the model given to ``__init__``).
			inputs: Batch of model inputs. The student's cross-entropy loss is
				read from ``student_output.loss``, so the batch is expected to
				contain labels.
			return_outputs: When true, also return the student's outputs.
			num_items_in_batch: Accepted for compatibility with Trainer versions
				that pass it; not used here.

		Returns:
			The scalar loss, or ``(loss, student_output)`` if ``return_outputs``.
		"""
		# Forward pass of the student (with gradients).
		student_output = student_model(**inputs)
		student_logits = student_output.logits

		# Cross-entropy of the student against the hard labels.
		loss_ce = student_output.loss

		# The teacher only provides targets: skip autograd bookkeeping, and do
		# not pass labels since its own loss is never used.
		teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}
		with torch.no_grad():
			teacher_logits = self.teacher_model(**teacher_inputs).logits

		# Soft targets: teacher probabilities and student log-probabilities,
		# both softened by the temperature (KLDivLoss expects log-probs as input).
		soft_teacher = F.softmax(teacher_logits / self.temperature, dim=-1)
		soft_student = F.log_softmax(student_logits / self.temperature, dim=-1)

		# Rescale by T^2 so the gradient magnitude of the soft-target term stays
		# comparable to the hard-label term when the temperature changes.
		distillation_loss = self.loss_function(soft_student, soft_teacher) * (self.temperature ** 2)

		loss = self._calculate_weighted_loss(loss_ce, distillation_loss)
		return (loss, student_output) if return_outputs else loss


## Training

### Set up metrics

We use `np.argmax()` to pick the most confident class prediction and compare it against the ground-truth label.

In [13]:
# Accuracy metric loaded through the `evaluate` library.
accuracy_score = evaluate.load("accuracy")


def compute_metrics(eval_pred):
	"""Compute accuracy from a ``(predictions, labels)`` evaluation pair.

	The predicted class is the argmax over axis 1 of the predictions.
	"""
	scores, references = eval_pred
	predicted_ids = np.argmax(scores, axis=1)
	return accuracy_score.compute(predictions=predicted_ids, references=references)

### Set up Training Arguments

In [21]:
batch_size = 48
finetuned_student = "distilbert-base-uncased-finetuned-clinc-student"

student_training_args = TrainingArguments(
	output_dir=f"./{finetuned_student}",
	overwrite_output_dir=True,
	# Evaluate and checkpoint once per epoch. Only `eval_strategy` is passed:
	# it is the current name of the deprecated `evaluation_strategy`, and
	# supplying both fails on versions that know only one of the two names.
	eval_strategy="epoch",
	save_strategy="epoch",
	learning_rate=2e-5,
	per_device_train_batch_size=batch_size,
	per_device_eval_batch_size=batch_size,
	num_train_epochs=5,
	weight_decay=0.01,
	# Mixed precision needs a CUDA device; fall back to full precision on CPU
	# instead of failing when the arguments are created.
	fp16=torch.cuda.is_available(),
	# Requires matching eval/save strategies (both "epoch" above).
	load_best_model_at_end=True,
)



### Set up trainer and execute

In [15]:
# Wire the student, the teacher and the distillation hyperparameters into the
# custom trainer. No data_collator is passed, so batching is left to the
# Trainer's default for the supplied tokenizer.
trainer = KnowledgeDistillationTrainer(
	teacher_model=teacher_model,
	student_model=student_model,
	temperature=2.0,
	lambda_param=0.5,
	args=student_training_args,
	train_dataset=tokenized_dataset["train"],
	eval_dataset=tokenized_dataset["validation"],
	tokenizer=student_tokenizer,
	compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.911391,0.707742
2,2.258900,1.096824,0.852903
3,2.258900,0.715859,0.900968
4,1.025400,0.567396,0.919677
5,0.607900,0.527229,0.920645


TrainOutput(global_step=1590, training_loss=1.2544941116428976, metrics={'train_runtime': 376.424, 'train_samples_per_second': 202.564, 'train_steps_per_second': 4.224, 'total_flos': 414689637990180.0, 'train_loss': 1.2544941116428976, 'epoch': 5.0})

## Evaluate
We can evaluate the model on the test set.

trainer.evaluate(tokenized_dataset['test'])

## Let's compare the teacher and student models
We will compare the two models on size and inference time.

First we save the teacher and student models, then we compute each model's size in MB.

In [24]:
# my_fine_tuned_model = "/content/drive/MyDrive/models/fine-tuned-sms-generation-model"


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [25]:
# Same device selection as at the top of the notebook.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)


def save_teacher_model():
  """Write the teacher model to the relative directory ``teacher_model``."""
  teacher_model.save_pretrained("teacher_model")


def save_student_model():
  """Save the student model and its tokenizer under ``finetuned_student``.

  Afterwards the student is moved (back) to ``device``.
  """
  for artifact in (student_model, student_tokenizer):
    artifact.save_pretrained(finetuned_student)
  student_model.to(device)


In [26]:
# Persist both models to disk so their parameter counts and file sizes can be
# compared in the following cells.
save_teacher_model()
save_student_model()

In [27]:
from transformers import AutoConfig, AutoModelForSequenceClassification
import os

def compute_parameters(model_path):
  """Load the sequence-classification model at ``model_path`` and return its parameter count."""
  return AutoModelForSequenceClassification.from_pretrained(model_path).num_parameters()

In [28]:
# Read from the same relative directory that save_teacher_model() wrote to,
# so the cell does not depend on the working directory being /content (Colab).
teacher_model_parameters = compute_parameters(model_path="./teacher_model")
print("Teacher Model: ", teacher_model_parameters)

Teacher Model:  109598359


In [30]:
# Reuse `finetuned_student` (the directory save_student_model() wrote to)
# instead of a hard-coded absolute Colab path.
student_model_parameters = compute_parameters(model_path=f"./{finetuned_student}")
print("Student Model: ", student_model_parameters)

Student Model:  67069591


In [31]:
# Relative change in parameter count, student vs. teacher, as a percentage
# (negative means the student is smaller).
parameter_delta = student_model_parameters - teacher_model_parameters
decrease = parameter_delta / teacher_model_parameters
print(decrease * 100)

-38.804201438818986


In [33]:
# List the saved student directory with sizes in MB.
# NOTE(review): the absolute /content path assumes the notebook's working
# directory is /content (the models were saved to relative paths) — confirm.
!ls /content/distilbert-base-uncased-finetuned-clinc-student -al --block-size=MB

total 270MB
drwxr-xr-x 8 root root   1MB Jul 18 19:05 .
drwxr-xr-x 1 root root   1MB Jul 18 18:14 ..
drwxr-xr-x 2 root root   1MB Jul 18 18:21 checkpoint-1272
drwxr-xr-x 2 root root   1MB Jul 18 18:22 checkpoint-1590
drwxr-xr-x 2 root root   1MB Jul 18 18:12 checkpoint-318
drwxr-xr-x 2 root root   1MB Jul 18 18:13 checkpoint-636
drwxr-xr-x 2 root root   1MB Jul 18 18:14 checkpoint-954
-rw-r--r-- 1 root root   1MB Jul 18 19:05 config.json
-rw-r--r-- 1 root root 269MB Jul 18 19:05 model.safetensors
drwxr-xr-x 4 root root   1MB Jul 18 18:16 runs
-rw-r--r-- 1 root root   1MB Jul 18 19:05 special_tokens_map.json
-rw-r--r-- 1 root root   1MB Jul 18 19:05 tokenizer_config.json
-rw-r--r-- 1 root root   1MB Jul 18 19:05 tokenizer.json
-rw-r--r-- 1 root root   1MB Jul 18 19:05 vocab.txt


In [34]:
# List the saved teacher directory with sizes in MB.
# NOTE(review): the absolute /content path assumes the notebook's working
# directory is /content (the model was saved to a relative path) — confirm.
!ls /content/teacher_model -al --block-size=MB

total 439MB
drwxr-xr-x 2 root root   1MB Jul 18 18:14 .
drwxr-xr-x 1 root root   1MB Jul 18 18:14 ..
-rw-r--r-- 1 root root   1MB Jul 18 19:05 config.json
-rw-r--r-- 1 root root 439MB Jul 18 19:05 model.safetensors


In [35]:
# Fetch the single row first instead of materialising the entire `text` and
# `intent` columns just to read one element from each.
example = dataset['train'][101]
print(example['text'])
print(example['intent'])


complete a transaction from savings to checking of $20000
133


# We will measure the total time of repeated inferences on the same input

In [36]:
# Time 100 teacher inferences on one input, after a short warm-up.
from transformers import pipeline
import time

# Load from the relative directory written by save_teacher_model(), so the
# cell does not depend on the working directory being /content (Colab).
# No `device` is passed, so the pipeline decides where the model runs.
pipe = pipeline("text-classification", model="./teacher_model", tokenizer='bert-base-uncased')

# Row-first indexing avoids loading the whole `text` column.
sample_input = dataset['train'][101]['text']

# Warm-up runs are excluded from the measurement.
for _ in range(10):
  _ = pipe(sample_input)

# perf_counter() is a monotonic, high-resolution clock intended for timing;
# time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for _ in range(100):
  _ = pipe(sample_input)
total_time_teacher_model = time.perf_counter() - start
print("Total time to process 100 requests for Teacher Model: ", total_time_teacher_model)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Total time to process 100 requests for Teacher Model:  2.745138645172119


In [37]:
# Time 100 student inferences on the same input, after a short warm-up.
# Load from the directory save_student_model() wrote to (`finetuned_student`)
# rather than a hard-coded absolute Colab path.
# No `device` is passed, so the pipeline decides where the model runs.
pipe = pipeline("text-classification", model=f"./{finetuned_student}", tokenizer="distilbert-base-uncased")

# Row-first indexing avoids loading the whole `text` column.
sample_input = dataset['train'][101]['text']

# Warm-up runs are excluded from the measurement.
for _ in range(10):
  _ = pipe(sample_input)

# perf_counter() is a monotonic, high-resolution clock intended for timing;
# time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for _ in range(100):
  _ = pipe(sample_input)
total_time_student_model = time.perf_counter() - start

print("Total time to process 100 requests for Student Model: ", total_time_student_model)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Total time to process 100 requests for Student Model:  2.4203176498413086


In [38]:
# Fraction of the teacher's total inference time saved by the student,
# printed as a percentage.
time_saved = total_time_teacher_model - total_time_student_model
decrease_in_time = time_saved / total_time_teacher_model
print(decrease_in_time * 100)

11.832589800230087
