# Requirenments installation



In [1]:
from IPython.display import clear_output

In [2]:
!pip3 install urllib3==1.25.4
!pip3 install transformers==2.8.0
clear_output()

## Prepare data

In [3]:
data_path = "drive/My Drive/data.txt"
!ls "$data_path"

'drive/My Drive/data.txt'


In [4]:
with open(data_path, "r") as file:
    text = file.read()

In [5]:
valid_size = 15

In [6]:
topics = []
all_essays = []
for line in text.split("</s>"):
    if "Тема:" in line and "Стихотворение:" in line:
        essay_text = line.split("Стихотворение:")
        if len(essay_text) == 2:
            topic = essay_text[0].replace("<s>", " ").replace("</s>", " ").strip()
            essay_text = essay_text[1].replace("<s>", " ").replace("</s>", " ").strip()
            essay_text = f"Стихотворение: {essay_text}"
            essay_res = f"<s>{topic}\n{essay_text}</s>"
            all_essays.append(essay_res)
            topics.append(topic)

In [7]:
import numpy as np
import random

In [8]:
random.seed(1234)
np.random.seed(1234)

In [9]:
unique_topics = list(set(topics))

In [10]:
len(unique_topics)

183

In [11]:
valid_topics = []

In [12]:
for _ in range(valid_size):
    # Use randint for more speed (on big lists it is faster)
    idx = np.random.randint(0, len(unique_topics))
    valid_topics.append(unique_topics[idx])

In [13]:
import nltk


train = []
valid = []
for topic, essay in zip(topics, all_essays):
    is_train = True
    for valid_topic in valid_topics:
        if (
            nltk.edit_distance(valid_topic, topic[:len(valid_topic)]) < 20 or
            nltk.edit_distance(valid_topic[:len(topic)], topic) < 20 or
            nltk.edit_distance(valid_topic[len(topic):], topic) < 20 or
            nltk.edit_distance(valid_topic, topic[len(valid_topic):]) < 20
            ):
            is_train = False
    if is_train:
        train.append(essay)
    else:
        valid.append(essay)

In [14]:
len(train), len(valid)

(168, 40)

In [15]:
with open("train.txt", "w") as file:
    file.write("\n".join(train))

In [16]:
with open("valid.txt", "w") as file:
    file.write("\n".join(valid))

# Model finetuning

In [17]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

In [18]:
# Model taken from https://github.com/sberbank-ai/ru-gpts
data_path = "train.txt"
batch_size = 1
train_batch_size = 1
num_train_epochs = 5
gradient_accumulation_steps = 1
model_path = 'sberbank-ai/rugpt3small_based_on_gpt2'
learning_rate = 5e-5
adam_epsilon = 1e-8
logging_steps = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):

        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)

        directory, filename = os.path.split(file_path)

        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            text = f.read()

        tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

        for i in range(0, len(tokenized_text) - block_size + 1, block_size): 
            self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i: i + block_size]))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

In [20]:
def train(train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    """ Train the model """
    
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=train_batch_size, collate_fn=collate
    )

    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=t_total
    )
    
    # Check if saved optimizer or scheduler exist
    if (
            model_path
            and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
            and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(model_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(num_train_epochs), desc="Epoch", disable=False
    )
    
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            inputs = inputs.to(device)
            labels = labels.to(device)
            model.train()
            outputs =  model(inputs, labels=labels)
            loss = outputs[0]  

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1


    return global_step, tr_loss / global_step

In [21]:
 config = AutoConfig.from_pretrained(model_path)
 tokenizer = AutoTokenizer.from_pretrained(model_path)

 model = AutoModelWithLMHead.from_pretrained(
            model_path,
            from_tf=bool(".ckpt" in model_path),
            config=config,
        )
 model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50264, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [22]:
train_dataset = TextDataset(tokenizer, file_path=data_path, block_size=1024)

In [23]:
global_step, tr_loss = train(train_dataset, model, tokenizer)


Iteration:   1% 1/92 [00:00<01:12,  1.25it/s]
Iteration:   2% 2/92 [00:01<01:06,  1.36it/s]
Iteration:   3% 3/92 [00:02<01:02,  1.43it/s]
Iteration:   4% 4/92 [00:02<00:59,  1.49it/s]
Iteration:   5% 5/92 [00:03<00:57,  1.53it/s]
Iteration:   7% 6/92 [00:03<00:54,  1.56it/s]
Iteration:   8% 7/92 [00:04<00:53,  1.59it/s]
Iteration:   9% 8/92 [00:05<00:52,  1.60it/s]
Iteration:  10% 9/92 [00:05<00:51,  1.61it/s]
Iteration:  11% 10/92 [00:06<00:50,  1.62it/s]
Iteration:  12% 11/92 [00:06<00:49,  1.63it/s]
Iteration:  13% 12/92 [00:07<00:49,  1.63it/s]
Iteration:  14% 13/92 [00:08<00:48,  1.63it/s]
Iteration:  15% 14/92 [00:08<00:47,  1.63it/s]
Iteration:  16% 15/92 [00:09<00:47,  1.63it/s]
Iteration:  17% 16/92 [00:09<00:46,  1.64it/s]
Iteration:  18% 17/92 [00:10<00:45,  1.64it/s]
Iteration:  20% 18/92 [00:11<00:45,  1.64it/s]
Iteration:  21% 19/92 [00:11<00:44,  1.63it/s]
Iteration:  22% 20/92 [00:12<00:44,  1.62it/s]
Iteration:  23% 21/92 [00:13<00:43,  1.63it/s]
Iteration:  24% 22/92 

# Poetry generation

In [35]:
def generate_poetry(model):
    prompt_text = input("Context > ")
    
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    generated_sequences = []

    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=100 + len(encoded_prompt[0]),
        temperature=0.7,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1.0,
        do_sample=True,
        num_return_sequences=1,
    )

    
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        # Remove all text after the stop token
        stop_token = '</s>'
        text = text[: text.find(stop_token) if stop_token else None]

        # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
        total_sequence = (
            prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
        )

        generated_sequences.append(total_sequence)

    return generated_sequences

In [38]:
print(generate_poetry(model))

Context > Я иду по лесу и вижу лису
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
['Я иду по лесу и вижу лису. Продрогшая — бьет по земле хвостом. Я увидел — глаза ослепли от страха...Что это? Стихи или только рассказ? А за ним новый рассказ-сказка!.. Грустные, грозовые, страстные, мчатся в песне весенней.Пристально слушает лиса.А я иду по лесу...Он похож на мир людей... Так! Он грустен, но жив!']
