In [1]:
!pip install transformers datasets transformers[torch] accelerate>=0.20.11

In [25]:
import os
import time

from torchsummary import summary
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from tqdm.auto import tqdm
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import swin_t, swin_b

from datasets import load_dataset, load_metric
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoConfig, AutoModelForImageClassification, pipeline, AutoFeatureExtractor

#### Функции для замера моделей

In [3]:
def get_model_size(model):
  dummy_input = torch.randn(1, 3, 224, 224)
  size = sum(torch.nn.utils.parameters_to_vector(model.parameters()).size() * 4) / (1024 * 1024)
  return size


def print_size_of_model(model, label=""):
    torch.save(model.state_dict(), "temp.p")
    size=os.path.getsize("temp.p")
    print("model: ",label,' \t','Size (MB):', round(size/1024/1024, 2))
    os.remove('temp.p')


def performance_test(model, criterion, test_loader, device):
    model.to(device)
    model.eval()
    test_loss = 0
    total = 0
    correct_top1=0
    correct_top5=0
    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            test_loss += criterion(outputs, labels).item()

            # Top-1 и Top-5 accuracy
            _, predicted = outputs.topk(5, dim=1)
            total += labels.size(0)
            correct_top1 += predicted[:, 0].eq(labels).sum().item()
            correct_top5 += predicted.eq(labels.view(-1, 1)).sum().item()

        test_loss /= len(test_loader)
        top1_accuracy = correct_top1 / total
        top5_accuracy = correct_top5 / total

        print(f'Loss: {test_loss:.2f}, acc@1: {top1_accuracy}, acc@5: {top5_accuracy}')

## Создадим кастомный Trainer для дистилляции знаний

---



1. Определим гиперпараметры α and T

α - на сколько больше мы хотим ориентироваться на предсказания модели-учителя, и меньше на модель-студента  
T - как сильно должно быть сглажено распределение вероятностей классов

2. В качестве модели-учителя будем использовать BERT-base.

3. Новая лосс-функция будет совмещать в себе кросс-энтропию и лосс дистилляции

Чтобы добавить наши гиперпараметры достаточно создать класс TrainingArguments и включить их в него как атрибуты

In [4]:
class KnowledgeDistillationTrainingArguments(TrainingArguments):
  def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
    super().__init__(*args, **kwargs)

    self.alpha = alpha
    self.temperature = temperature

# Напишем лосс-функцию для дистилляции знаний
Создадим наследника класса Trainer и перепишем compute_loss()



In [27]:
class KnowledgeDistillationTrainer(Trainer):
  def __init__(self, *args, feature_extractor=None, teacher_model=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.teacher_model = teacher_model
    self.feature_extractor=feature_extractor

  def compute_loss(self, model, inputs, return_outputs=False):
    #Extract cross-entropy loss and logits from student
    inputs = self.feature_extractor(images=inputs, return_tensors="pt")
    outputs_student = model(**inputs)
    loss_ce = outputs_student.loss
    logits_student = outputs_student.logits

    # Extract logits from teacher
    outputs_teacher = self.teacher_model(**inputs)
    logits_teacher = outputs_teacher.logits

    #Computing distillation loss by Softening probabilities
    loss_fct = nn.KLDivLoss(reduction="batchmean")
    #The reduction=batchmean argument in nn.KLDivLoss() specifies that we average the losses over the batch dimension.
    loss_kd = self.args.temperature ** 2 * loss_fct(
                F.log_softmax(logits_student / self.args.temperature, dim=-1),
                F.softmax(logits_teacher / self.args.temperature, dim=-1))

    # Return weighted student loss
    loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
    return (loss, outputs_student) if return_outputs else loss


## Выбираем модель-ученика

Как выбрать подходящую модель-ученика?
1. Меньшая модель чем учитель чтобы уменьшить объем занимаемой памяти и увеличить RPS

2. Дистилляция знаний работает лучше, когда модель-учитель и ученик одного типа (BERT и RoBERTa могут иметь разную длинну эмебддингов на выходе, что создает сложности для ученика мимикрировать под учителя)

В качестве примера на роль модели-ученика возьмем DistilBERT.

### Загрузка датасета

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/drive/MyDrive/compression/archive.zip ./
# !mkdir data
!unzip archive.zip -d data/

In [36]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

test_dataset = datasets.ImageFolder(Path('data/'), transform=transform, target_transform=int())

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [38]:
test_dataset.labels

AttributeError: ignored

# Определим метрику которой будем замерять точность

In [11]:
accuracy_score = load_metric("accuracy")

def compute_metrics(pred):
  predictions, labels = pred
  predictions = np.argmax(predictions, axis=1)
  return accuracy_score.compute(predictions=predictions, references=labels)

  accuracy_score = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

В этой функции прогнозы от головы модели поступают в форме логитов, поэтому мы используем функцию np.argmax(), чтобы найти наиболее достоверный прогноз класса и сравнить его с ground truth меткой.

# Определим аргументы для тренировки

In [12]:
batch_size = 48
finetuned_student_ckpt = "swin-student"

In [13]:
student_training_args = KnowledgeDistillationTrainingArguments(
    output_dir=finetuned_student_ckpt, evaluation_strategy = "epoch",
    num_train_epochs=1, learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, alpha=1, weight_decay=0.01)

## Давайте инициализируем модель ученика, но перед этим предоставим ей словари с каждым намерением и его идентификатором.

In [14]:
swin_ckpt = "microsoft/swin-tiny-patch4-window7-224"
pipe = pipeline("image-classification", model=swin_ckpt)

# id2label = pipe.model.config.id2label
# label2id = pipe.model.config.label2id

Downloading (…)lve/main/config.json:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/113M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [26]:
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-base-patch4-window7-224")

Downloading (…)rocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]



In [15]:
num_labels = len(test_dataset.classes)
student_config = (AutoConfig.from_pretrained(swin_ckpt, num_labels=num_labels,
                                            #  id2label=id2label,
                                            #  label2id=label2id
                                             ))

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def student_init():
  return (AutoModelForImageClassification.from_pretrained(swin_ckpt, config=student_config).to(device))

## Загрузим предобученные веса модели-учителя и начнем дообучение модели-ученика

In [20]:
teacher_checkpoint = "microsoft/swin-base-patch4-window7-224"
teacher_model = AutoModelForImageClassification.from_pretrained(teacher_checkpoint, num_labels=num_labels).to(device)

In [28]:
# Начинаем fine-tuning ученика
swin_trainer = KnowledgeDistillationTrainer(model_init=student_init,
                                            feature_extractor=feature_extractor,
        teacher_model=teacher_model, args=student_training_args,
        train_dataset=test_dataset, # eval_dataset=test_dataset,
        compute_metrics=compute_metrics)

swin_trainer.train()

TypeError: ignored

## Сравним модели учителя и ученика



Сохраним модели учителя и ученика, а затем вычислим размеры моделей в MB.

In [None]:
teacher_model.save_pretrained("teacher_model")
swin_trainer.save_model('student_model')

raw_student = AutoModelForImageClassification.from_pretrained(swin_ckpt, config=student_config)
raw_student.save_pretrained("raw_student_model")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.

In [None]:
def compute_parameters(model_path):
  model = AutoModelForSequenceClassification.from_pretrained(model_path)
  parameters = model.num_parameters()
  return parameters

In [None]:
teacher_model_parameters = compute_parameters(model_path="/content/teacher_model")
print("Teacher Model: ", teacher_model_parameters)

Teacher Model:  109598359


In [None]:
student_model_parameters = compute_parameters(model_path="/content/student_model")
print("Student Model: ", student_model_parameters)

Student Model:  67069591


In [None]:
decrease = (teacher_model_parameters - student_model_parameters) / teacher_model_parameters
print(f'Модель студента имеет на {decrease*100:.2f} % меньше параметров, чем модель учителя')

Модель студента имеет на 38.80 % меньше параметров, чем модель учителя


In [None]:
!ls /content/student_model -al --block-size=MB

total 270MB
drwxr-xr-x 2 root root   1MB Sep 27 14:52 .
drwxr-xr-x 1 root root   1MB Sep 27 14:52 ..
-rw-r--r-- 1 root root   1MB Sep 27 14:52 config.json
-rw-r--r-- 1 root root 269MB Sep 27 14:52 pytorch_model.bin
-rw-r--r-- 1 root root   1MB Sep 27 14:52 special_tokens_map.json
-rw-r--r-- 1 root root   1MB Sep 27 14:52 tokenizer_config.json
-rw-r--r-- 1 root root   1MB Sep 27 14:52 tokenizer.json
-rw-r--r-- 1 root root   1MB Sep 27 14:52 training_args.bin
-rw-r--r-- 1 root root   1MB Sep 27 14:52 vocab.txt


In [None]:
!ls /content/teacher_model -al --block-size=MB

total 439MB
drwxr-xr-x 2 root root   1MB Sep 27 14:52 .
drwxr-xr-x 1 root root   1MB Sep 27 14:52 ..
-rw-r--r-- 1 root root   1MB Sep 27 14:52 config.json
-rw-r--r-- 1 root root 439MB Sep 27 14:52 pytorch_model.bin


Выполним замер средней скорости инференса у обоих моделей на одинаковых входных данных

In [None]:
sample_input = clinc['train']['text'][101]

print(clinc['train']['text'][101])
print(clinc['train']['intent'][101])

complete a transaction from savings to checking of $20000
133


In [None]:
pipe = pipeline("text-classification", model="/content/teacher_model", tokenizer='bert-base-uncased')

#WARMUP
for _ in range(10):
  _ = pipe(sample_input)

#INFERENCE
start = time.time()
for _ in range(100):
  _ = pipe(sample_input)
total_time_teacher_model = time.time() - start
print("Общее время обработки 100 запросов моделью-учителем:", total_time_teacher_model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Общее время обработки 100 запросов моделью-учителем: 14.024583339691162


In [None]:
from tqdm.notebook import tqdm

# ПОДСЧЕТ МЕТРИК
data_test_X = clinc['test']['text'][::50]
data_test_y = clinc['test']['intent'][::50]
model_preds = []

for i in tqdm(data_test_X):
    model_preds.append(label2id[pipe(i)[0]['label']])

accuracy_score.compute(predictions=model_preds, references=data_test_y)

  0%|          | 0/110 [00:00<?, ?it/s]

{'accuracy': 0.8363636363636363}

In [None]:
pipe = pipeline("text-classification", model="/content/student_model", tokenizer="distilbert-base-uncased")

#WARMUP
for _ in range(10):
  _ = pipe(sample_input)

#INFERENCE
start = time.time()
for _ in range(100):
  _ = pipe(sample_input)
total_time_student_model = time.time()-start

print("Общее время обработки 100 запросов моделью-студентом:", total_time_student_model)

Общее время обработки 100 запросов моделью-студентом: 7.051018714904785


In [None]:
decrease_in_time = (total_time_teacher_model - total_time_student_model) / total_time_teacher_model

print(f'Модель студента классифицирует быстрее на {decrease_in_time*100:.2f} %')

Модель студента классифицирует быстрее на 49.72 %


In [None]:
# ПОДСЧЕТ МЕТРИК
data_test_X = clinc['test']['text'][::50]
data_test_y = clinc['test']['intent'][::50]
model_preds = []

for i in tqdm(data_test_X):
    model_preds.append(label2id[pipe(i)[0]['label']])

accuracy_score.compute(predictions=model_preds, references=data_test_y)

  0%|          | 0/110 [00:00<?, ?it/s]

{'accuracy': 0.5727272727272728}

In [None]:
# Raw student-model
pipe = pipeline("text-classification", model="/content/raw_student_model", tokenizer="distilbert-base-uncased")

# ПОДСЧЕТ МЕТРИК
data_test_X = clinc['test']['text'][::50]
data_test_y = clinc['test']['intent'][::50]
model_preds = []

for i in tqdm(data_test_X):
    model_preds.append(label2id[pipe(i)[0]['label']])

accuracy_score.compute(predictions=model_preds, references=data_test_y)

  0%|          | 0/110 [00:00<?, ?it/s]

{'accuracy': 0.0}

In [None]:
model_preds