In [None]:
from typing import Any
from transformers import BertForMaskedLM, BertTokenizer, BertConfig
from transformers.models.bert.modeling_bert import BertEncoder, BertModel, BertPreTrainedModel
from torch.nn import Module
from torch import Tensor

In [None]:
# Teacher checkpoint directory, named once so the tokenizer and the model are always
# loaded from the same place.
TEACHER_CHECKPOINT = '/var/projetos/Jupyterhubstorage/victor.silva/HelBERTModel/Modelos/PreTreinamento/HelBERT-uncased-fs/checkpoint-epoca-6'

tokenizer = BertTokenizer.from_pretrained(TEACHER_CHECKPOINT)
# Load the teacher WITH its masked-LM head. The student is later reloaded as BertForMaskedLM
# and used for fill-mask, so the head has to be part of what gets distilled and saved; a
# headless BertModel teacher would leave the student's head untrained.
# NOTE(review): assumes this pre-training checkpoint stores masked-LM head weights — confirm.
modelo = BertForMaskedLM.from_pretrained(TEACHER_CHECKPOINT)

In [None]:
def visualize_children(
    object : Any,
    level : int = 0,
) -> None:
    """
    Prints the type name of (object), then recursively the type names of its children.

    Each line is indented by three spaces per depth (level) and prefixed with that depth,
    so the output reads as an ordered tree.

    Args:
        object: Root to inspect. Anything exposing a callable ``children()`` that returns an
            iterable is descended into; any other value is printed as a leaf.
        level: Current depth in the tree; callers normally leave it at 0.

    Returns:
        None. The tree is written to standard output.
    """
    print(f"{'   ' * level}{level}- {type(object).__name__}")
    # An object is a leaf only when it has no callable children(). Checking this explicitly,
    # instead of wrapping the loop in a bare `except: pass`, lets genuine failures (and
    # KeyboardInterrupt) propagate rather than silently truncating the printed tree.
    list_children = getattr(object, "children", None)
    if not callable(list_children):
        return
    for child in list_children():
        visualize_children(child, level + 1)

# Print the module tree of the loaded model to inspect its structure.
visualize_children(modelo)

In [None]:
def distill_bert_weights(
    teacher: Module,
    student: Module,
    layer_selection: str = 'odd',
) -> None:
    """
    Recursively copies the weights of the (teacher) into the (student).

    This function is meant to be first called on a BertModel or BertFor... model and then
    calls itself on every child of that model. Every part is copied in full except the
    encoder, for which each student layer receives the weights of one selected teacher layer.

    Args:
        teacher: Module to copy weights from.
        student: Module with the same structure as (teacher), except that its encoder may
            have fewer layers. Modified in place.
        layer_selection: Which teacher encoder layer feeds student layer i:
            'odd'   -> teacher layer 2 * i + 1 (default),
            'even'  -> teacher layer 2 * i,
            'first' -> teacher layer i,
            'last'  -> the final len(student layers) teacher layers, in order.

    Raises:
        ValueError: If (layer_selection) is not one of the names above.
        IndexError: If the selection points outside the teacher's encoder layers; raised
            before any encoder layer is copied.
    """
    if layer_selection not in ('odd', 'even', 'first', 'last'):
        raise ValueError(
            f"Unknown layer_selection {layer_selection!r}; expected 'odd', 'even', 'first' or 'last'"
        )
    # If the part is an entire BERT model or a BertFor..., unpack and iterate
    if isinstance(teacher, BertModel) or type(teacher).__name__.startswith('BertFor'):
        for teacher_part, student_part in zip(teacher.children(), student.children()):
            distill_bert_weights(teacher_part, student_part, layer_selection)
    # Else if the part is an encoder, copy only the selected layers
    elif isinstance(teacher, BertEncoder):
        # The first child of the encoder is taken to be its container of layers
        teacher_encoding_layers = list(next(teacher.children()))
        student_encoding_layers = list(next(student.children()))
        n_teacher = len(teacher_encoding_layers)
        n_student = len(student_encoding_layers)
        if layer_selection == 'odd':
            teacher_indices = [2 * i + 1 for i in range(n_student)]
        elif layer_selection == 'even':
            teacher_indices = [2 * i for i in range(n_student)]
        elif layer_selection == 'first':
            teacher_indices = list(range(n_student))
        else:  # 'last'
            teacher_indices = list(range(n_teacher - n_student, n_teacher))
        # Validate up front: a negative index would silently wrap around, and a too-large one
        # would otherwise fail only after some student layers had already been overwritten.
        for teacher_index in teacher_indices:
            if not 0 <= teacher_index < n_teacher:
                raise IndexError(
                    f"layer_selection {layer_selection!r} needs teacher layer {teacher_index}, "
                    f"but the teacher encoder has {n_teacher} layers"
                )
        for student_layer, teacher_index in zip(student_encoding_layers, teacher_indices):
            student_layer.load_state_dict(teacher_encoding_layers[teacher_index].state_dict())
            # Report which teacher layer was copied
            print(teacher_index)
    # Else the part is a head or something else, copy the state_dict
    else:
        student.load_state_dict(teacher.state_dict())

In [None]:
def distill_bert(
    teacher_model: BertPreTrainedModel,
) -> BertPreTrainedModel:
    """
    Builds a student from a BERT (teacher_model), in the spirit of DistilBERT.

    The student is an instance of the same class as the teacher, created from the teacher's
    configuration with `num_hidden_layers` floor-divided by 2. Its weights are then filled in
    by distill_bert_weights, which decides which teacher encoder layer each student layer
    receives and copies every other part as-is. The student's configuration is printed
    before the student is returned.
    """
    # Dictionary form of the teacher configuration
    student_settings = teacher_model.config.to_dict()
    # The student keeps half of the teacher's hidden layers (rounded down)
    student_settings['num_hidden_layers'] = student_settings['num_hidden_layers'] // 2
    student_config = BertConfig.from_dict(student_settings)
    # Same model class as the teacher, freshly constructed from the reduced configuration
    model_class = type(teacher_model)
    student_model = model_class(student_config)
    # Fill the new student with the teacher's weights
    distill_bert_weights(teacher=teacher_model, student=student_model)
    # Show the resulting configuration, then hand the student back
    print(student_model.config)
    return student_model

In [None]:
# Build the student (half the hidden layers) from the loaded model.
student = distill_bert(modelo)

In [None]:
import torch
from torch.nn import CrossEntropyLoss, CosineEmbeddingLoss

def distillation_loss(
    teacher_logits : Tensor,
    student_logits : Tensor,
    labels : Tensor,
    temperature : float = 1.0,
) -> Tensor:
    """
    The distillation loss for distilling a BERT-like model.

    Averages three terms:
      1. classification loss: cross-entropy between the raw (student_logits) and (labels);
      2. teacher-student loss: cross-entropy between the temperature-scaled student logits
         and the temperature-softened teacher probabilities (soft targets);
      3. cosine loss: 1 - cosine similarity between the softened teacher and student
         probability vectors.

    Args:
        teacher_logits: Unnormalised teacher scores; the softmax is taken over dimension 1.
        student_logits: Unnormalised student scores, same shape as (teacher_logits).
        labels: Targets accepted by CrossEntropyLoss for (student_logits).
        temperature: Divisor applied to both sets of logits before the softmax; 1 by default.

    Returns:
        The mean of the three terms.
    """
    # Temperature scaling. The student logits are kept unnormalised for the cross-entropy
    # terms because CrossEntropyLoss applies log-softmax itself; feeding it probabilities
    # would apply the softmax twice and flatten the loss.
    scaled_student_logits = student_logits / temperature
    teacher_probs = (teacher_logits / temperature).softmax(1)
    student_probs = scaled_student_logits.softmax(1)
    # Classification loss (problem-specific loss) on the raw, unscaled student logits
    loss = CrossEntropyLoss()(student_logits, labels)
    # CrossEntropy teacher-student loss: teacher probabilities are the (soft) targets
    loss = loss + CrossEntropyLoss()(scaled_student_logits, teacher_probs)
    # Cosine loss; the all-ones target is created on the same device as the inputs so the
    # function also works when the logits live on an accelerator
    same_direction = torch.ones(teacher_probs.size(0), device=teacher_probs.device)
    loss = loss + CosineEmbeddingLoss()(teacher_probs, student_probs, same_direction)
    # Average the loss and return it
    loss = loss / 3
    return loss

In [None]:
# Single output directory, named once so the student weights and the tokenizer files
# always land in the same place.
STUDENT_MODEL_DIR = "/var/projetos/Jupyterhubstorage/victor.silva/HelBERTModel/Modelos/PreTreinamento/distilHelBERT-base-camadas-impares"

student.save_pretrained(STUDENT_MODEL_DIR)
tokenizer.save_pretrained(STUDENT_MODEL_DIR)

In [None]:
# Directory holding the saved student. Defined inside this cell so the reload can run on
# its own, and so the model and the tokenizer are always read from the same place.
STUDENT_MODEL_DIR = "/var/projetos/Jupyterhubstorage/victor.silva/HelBERTModel/Modelos/PreTreinamento/distilHelBERT-base-camadas-impares"

# Reload the student as a masked-LM model together with its tokenizer.
# NOTE(review): if the saved checkpoint has no masked-LM head weights (e.g. it was saved from
# a plain BertModel), transformers presumably initialises that head randomly here — confirm,
# since fill-mask predictions would then be meaningless until the head is trained.
student = BertForMaskedLM.from_pretrained(STUDENT_MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(STUDENT_MODEL_DIR)

In [None]:
from transformers import pipeline
# Wrap the student model and its tokenizer in a fill-mask pipeline.
fill = pipeline("fill-mask", model=student, tokenizer=tokenizer)

In [None]:
# Run the fill-mask pipeline on a sample phrase whose first token is masked.
fill("[MASK] da licitacao")