In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# with open('/content/drive/My Drive/foo.txt', 'w') as f:
#   f.write('Hello Google Drive!')

# Как устроены модели в хаггинфейсе на примере классификатора


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

In [5]:
text = 'i love you'
model_input = tokenizer(
    text, return_tensors='pt'
)
model_input

{'input_ids': tensor([[ 101, 1045, 2293, 2017,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [6]:
with torch.inference_mode():
    probas = torch.softmax(model(**model_input).logits, dim=1)

probas.numpy().tolist()

[[0.0001343627372989431, 0.9998656511306763]]

In [7]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

# Текстовые генеративные модели в хаггинфейсе


In [8]:
from transformers import AutoModelWithLMHead

In [9]:
tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')
model = AutoModelWithLMHead.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
prompt = 'привет'
model_input = tokenizer(prompt, return_tensors='pt')
model_input

{'input_ids': tensor([[960, 577]]), 'attention_mask': tensor([[1, 1]])}

In [11]:
with torch.inference_mode():
    model_output = model(**tokenizer(prompt, return_tensors='pt'))
logits = model_output.logits
logits

tensor([[[-7.9739, -9.1519, -8.5353,  ..., -8.2743, -8.5468, -8.3682],
         [-8.0564, -8.2150, -7.3547,  ..., -7.7663, -7.8596, -8.0142]]])

In [12]:
probas = logits[0, -1].softmax(dim=0)
next_token_id = probas.argmax().item()
next_token_id

16

In [13]:
prompt += tokenizer.decode(16)
prompt

'привет,'

In [14]:
prompt = 'привет'
max_new_tokens = 15
for i in range(max_new_tokens):
    inputs = tokenizer(prompt, return_tensors='pt')
    with torch.inference_mode():
        logits = model(**tokenizer(prompt, return_tensors='pt')).logits[0, -1]
    probas = logits.softmax(dim=0)
    next_token_id = probas.argmax().item()
    prompt += tokenizer.decode(next_token_id)
prompt

'привет, я тут, в Москве, в гостях у одного человека, который мне'

https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin

https://huggingface.co/docs/transformers/internal/generation_utils

In [15]:
inputs = tokenizer('привет', return_tensors='pt')
generated_token_ids = model.generate(
    **inputs,
    max_new_tokens=15
)
context_with_response = [tokenizer.decode(sample_token_ids) for sample_token_ids in generated_token_ids]
context_with_response

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['привет, я тут, в Москве, в гостях у одного человека, который мне']

# Как обучать

https://huggingface.co/docs/transformers/tasks/language_modeling

https://github.com/tinkoff-ai/pycon-chit-chat/blob/main/notebooks/lm_training.ipynb


# Методы декодирования

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')
model = AutoModelWithLMHead.from_pretrained('tinkoff-ai/ruDialoGPT-medium')
inputs = tokenizer('@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@', return_tensors='pt')
generated_token_ids = model.generate(
    **inputs,
    top_k=10,  # sample one of k most likely
    top_p=0.95,  # sample from those most likely which some >= p
    num_beams=3,  # num beams for beam search
    num_return_sequences=3,  # how many candidates to return
    do_sample=True,  # do sample or greedy search
    no_repeat_ngram_size=2,  # n grams of this n must not repeat in a text
    temperature=1.0,  # make this value higher to get more interesting responses
    repetition_penalty=1.2,  # make this value higher to fight with repetition 
    length_penalty=0.0001,  # < 1 for short texts, > 1 for long
    eos_token_id=50257,  # when to stop
    max_new_tokens=40  # how many tokens to generate
)
context_with_response = [tokenizer.decode(sample_token_ids) for sample_token_ids in generated_token_ids]
context_with_response

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:50257 for open-end generation.


['@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@нормально,а у тебя?как день прошел?что делаешь?у меня тоже день не задался,но я исправлюсь 😊😉🤗👍🏻',
 '@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@нормально, а у тебя? Как сам? Я вот на работу собираюсь, так что не скучай! 👋🏻😉😁😊😂😘�',
 '@@ПЕРВЫЙ@@ привет @@ВТОРОЙ@@ привет @@ПЕРВЫЙ@@ как дела? @@ВТОРОЙ@@нормально, а у тебя как? Как сам? Что нового? А то я волнуюсь за тебя. 🙃👍🏻🤣💪�']

# Crossencoder

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/response-quality-classifier-large')
model = AutoModelForSequenceClassification.from_pretrained('tinkoff-ai/response-quality-classifier-large')
inputs = tokenizer('[CLS]привет[SEP]привет![SEP]как дела?[RESPONSE_TOKEN]норм, у тя как?', max_length=128, add_special_tokens=False, return_tensors='pt')
with torch.inference_mode():
    logits = model(**inputs).logits
    probas = torch.sigmoid(logits)[0].cpu().detach().numpy()
relevance, specificity = probas
relevance, specificity

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(0.79521185, 0.4289922)

# Toxicity classifier

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/response-toxicity-classifier-base')
model = AutoModelForSequenceClassification.from_pretrained('tinkoff-ai/response-toxicity-classifier-base')
inputs = tokenizer('[CLS]привет[SEP]привет![SEP]как дела?[RESPONSE_TOKEN]норм, у тя как?', max_length=128, add_special_tokens=False, return_tensors='pt')
with torch.inference_mode():
    logits = model(**inputs).logits
    probas = torch.sigmoid(logits)[0].cpu().detach().numpy()
probas

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


array([0.9970234 , 0.15969415, 0.07131252, 0.3618246 ], dtype=float32)

In [19]:
inputs = tokenizer('[CLS]привет[SEP]привет![SEP]как дела?[RESPONSE_TOKEN]иди нахуй', max_length=128, add_special_tokens=False, return_tensors='pt')
with torch.inference_mode():
    logits = model(**inputs).logits
    probas = torch.softmax(logits, dim=1)[0].cpu().detach().numpy()
probas.tolist()

[0.0009251515730284154,
 0.0036098435521125793,
 0.9868594408035278,
 0.008605570532381535]

In [20]:
model.config.id2label

{0: 'ok', 1: 'risks', 2: 'severe_toxic', 3: 'toxic'}