# Как устроены модели в Hugging Face на примере классификатора


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Load the tokenizer and the sequence-classification model from the same Hub
# checkpoint, so the token ids match what the model was trained on.
sst2_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(sst2_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(sst2_checkpoint)

In [None]:
# Encode one sentence; return_tensors='pt' asks for PyTorch tensors that can
# be unpacked straight into the model call.
text = 'i love you'
model_input = tokenizer(text, return_tensors='pt')
model_input

In [None]:
# Forward pass with autograd tracking disabled, then softmax along dim=1 to
# turn the logits into probabilities for the single input sentence.
with torch.inference_mode():
    class_logits = model(**model_input).logits
    probas = torch.softmax(class_logits, dim=1)

# Plain nested Python list so the values display without tensor formatting.
probas.tolist()

In [None]:
# Index -> label-name mapping stored in the model config; use it to interpret
# which position of the probabilities shown above corresponds to which class.
model.config.id2label

# Текстовые генеративные модели в Hugging Face


In [None]:
from transformers import AutoModelWithLMHead

In [None]:
# Load a causal language model and its tokenizer from the same Hub checkpoint.
# AutoModelForCausalLM is the supported replacement for the deprecated
# AutoModelWithLMHead, which emits a FutureWarning on from_pretrained().
rugpt_checkpoint = 'sberbank-ai/rugpt3small_based_on_gpt2'
tokenizer = AutoTokenizer.from_pretrained(rugpt_checkpoint)
model = AutoModelForCausalLM.from_pretrained(rugpt_checkpoint)

In [None]:
# Starting text for generation and its tokenizer output as PyTorch tensors.
prompt = 'привет'
model_input = tokenizer(
    prompt,
    return_tensors='pt',
)
model_input

In [None]:
# Tokenize the current prompt and run one forward pass without autograd;
# the model output exposes the raw next-token scores as `.logits`.
encoded_prompt = tokenizer(prompt, return_tensors='pt')
with torch.inference_mode():
    model_output = model(**encoded_prompt)
logits = model_output.logits
logits

In [None]:
# Greedy choice of the next token: take the scores at the last position of the
# first sequence in the batch, normalize them, and keep the most probable id.
last_position_logits = logits[0, -1]
probas = torch.softmax(last_position_logits, dim=0)
next_token_id = int(probas.argmax())
next_token_id

In [None]:
# Append the token chosen in the previous step. Use the computed id rather than
# a literal copied from one particular run, so the cell stays correct when the
# prompt or the model changes.
# NOTE: `+=` mutates `prompt`, so re-running this cell appends the token again.
prompt += tokenizer.decode(next_token_id)
prompt

In [None]:
# Greedy decoding by hand: repeatedly ask the model for the most probable next
# token and append its text to the prompt.
prompt = 'привет'
max_new_tokens = 15
for step in range(max_new_tokens):
    # Tokenize the growing prompt once per step and reuse it for the forward pass.
    inputs = tokenizer(prompt, return_tensors='pt')
    with torch.inference_mode():
        # Scores for the token following the last position of the only sequence.
        logits = model(**inputs).logits[0, -1]
    probas = logits.softmax(dim=0)
    next_token_id = probas.argmax().item()
    prompt += tokenizer.decode(next_token_id)
prompt

https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin

https://huggingface.co/docs/transformers/internal/generation_utils

In [None]:
# Let the library drive the decoding loop: generate() with its default
# decoding settings, limited to 15 new tokens.
inputs = tokenizer('привет', return_tensors='pt')
generated_token_ids = model.generate(**inputs, max_new_tokens=15)
# One decoded string per generated sequence (the prompt is part of each sequence).
context_with_response = tokenizer.batch_decode(generated_token_ids)
context_with_response

# Как обучать

https://huggingface.co/docs/transformers/tasks/language_modeling

https://github.com/tinkoff-ai/pycon-chit-chat/blob/main/notebooks/lm_training.ipynb


# Методы декодирования

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

# Dialogue LM: loaded with AutoModelForCausalLM, the supported replacement for
# the deprecated AutoModelWithLMHead.
dialog_checkpoint = 'tinkoff-ai/ruDialoGPT-medium'
tokenizer = AutoTokenizer.from_pretrained(dialog_checkpoint)
model = AutoModelForCausalLM.from_pretrained(dialog_checkpoint)
# The whole dialogue is passed as one string; the @@ПЕРВЫЙ@@ / @@ВТОРОЙ@@ markers
# presumably separate speaker turns (confirm against the model card).
inputs = tokenizer('@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@', return_tensors='pt')
generated_token_ids = model.generate(
    **inputs,
    top_k=10,  # sample only among the k most likely tokens
    top_p=0.95,  # nucleus sampling: keep the smallest set of tokens whose probabilities sum to >= p
    num_beams=3,  # number of beams for beam search
    num_return_sequences=3,  # how many candidates to return
    do_sample=True,  # sample from the distribution instead of always taking the argmax
    no_repeat_ngram_size=2,  # n-grams of this size may occur only once in a text
    temperature=1.0,  # raise for more diverse responses, lower for more conservative ones
    repetition_penalty=1.2,  # > 1.0 penalizes tokens that were already generated
    length_penalty=0.0001,  # exponent on sequence length in the beam score: > 0 favors longer, < 0 favors shorter
    eos_token_id=50257,  # token id at which generation stops
    max_new_tokens=40  # upper bound on the number of generated tokens
)
context_with_response = [tokenizer.decode(sample_token_ids) for sample_token_ids in generated_token_ids]
context_with_response

# Crossencoder

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Response-quality cross-encoder: context and candidate response are scored
# together as a single input string.
quality_checkpoint = 'tinkoff-ai/response-quality-classifier-large'
tokenizer = AutoTokenizer.from_pretrained(quality_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(quality_checkpoint)
inputs = tokenizer(
    '[CLS]привет[SEP]привет![SEP]как дела?[RESPONSE_TOKEN]норм, у тя как?',
    max_length=128,
    truncation=True,  # make truncation to max_length explicit instead of relying on the warned fallback
    add_special_tokens=False,  # special tokens are already written into the string by hand
    return_tensors='pt',
)
with torch.inference_mode():
    logits = model(**inputs).logits
    # Independent sigmoid per output; tensors produced under inference_mode
    # carry no autograd state, so no .detach() is needed before .numpy().
    probas = torch.sigmoid(logits)[0].cpu().numpy()
# The two outputs are unpacked as relevance and specificity scores.
relevance, specificity = probas
relevance, specificity

# Toxicity classifier

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Toxicity classifier for a response given its dialogue context.
toxicity_checkpoint = 'tinkoff-ai/response-toxicity-classifier-base'
tokenizer = AutoTokenizer.from_pretrained(toxicity_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(toxicity_checkpoint)
inputs = tokenizer(
    '[CLS]привет[SEP]привет![SEP]как дела?[RESPONSE_TOKEN]норм, у тя как?',
    max_length=128,
    truncation=True,  # make truncation to max_length explicit instead of relying on the warned fallback
    add_special_tokens=False,  # special tokens are already written into the string by hand
    return_tensors='pt',
)
with torch.inference_mode():
    logits = model(**inputs).logits
    # Softmax (not sigmoid): the classes in model.config.id2label are treated as
    # mutually exclusive, matching how the second toxicity example in this
    # notebook normalizes the logits of the same model.
    probas = torch.softmax(logits, dim=1)[0].cpu().numpy()
probas

In [None]:
# Same classifier on an abusive response, to compare the class probabilities.
inputs = tokenizer(
    '[CLS]привет[SEP]привет![SEP]как дела?[RESPONSE_TOKEN]иди нахуй',
    max_length=128,
    truncation=True,  # make truncation to max_length explicit instead of relying on the warned fallback
    add_special_tokens=False,  # special tokens are already written into the string by hand
    return_tensors='pt',
)
with torch.inference_mode():
    logits = model(**inputs).logits
    # Tensors produced under inference_mode carry no autograd state, so no
    # .detach() is needed before .numpy().
    probas = torch.softmax(logits, dim=1)[0].cpu().numpy()
probas.tolist()

In [None]:
# Index -> label-name mapping of the toxicity classifier; use it to read the
# positions of the probability vectors shown above.
model.config.id2label