# Fine-tune ruDialoGPT


## Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.0-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m46.2 MB/s[0m eta [36m0:00:0

In [3]:
import pandas as pd
import numpy as np
import re

import torch
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModelWithLMHead, PreTrainedTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split

from tqdm import tqdm

import logging

## Args for more hyperparametrs

In [4]:
class Args():
    def __init__(self):
        self.max_lenght = 128
        self.batch_size = 8
        self.num_epochs = 1
        self.lr = 5e-5 ## learning rate
        self.eps_AdamW = 1e-8
        self.sp1_token = '@@ПЕРВЫЙ@@'
        self.sp2_token = '@@ВТОРОЙ@@'
        self.sp1_token_id = 50257
        self.sp2_token_id = 50258
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

args = Args()
args.device

device(type='cuda')

## Data

In [5]:
df = pd.read_csv('/content/drive/MyDrive/tinkoff_sirius_nlp/data/data.csv')

In [6]:
df.shape

(548190, 4)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548190 entries, 0 to 548189
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   context_3  146064 non-null  object
 1   context_2  226774 non-null  object
 2   context_1  373677 non-null  object
 3   response   534430 non-null  object
dtypes: object(4)
memory usage: 16.7+ MB


## Data processing

In [8]:
def data_preprocessing(text: str) -> str:
    text = text.replace('\n', ' ').replace('__', '').replace('**', '').replace('||', '')
    text_without_urls = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text_without_english = re.sub(r'[a-zA-Z]', '', text_without_urls)
    text_without_spaces = text_without_english.strip()

    return text_without_spaces

In [9]:
df = df[df.isna().sum(axis=1) == 0]

In [10]:
df = df.applymap(data_preprocessing).dropna()

In [11]:
df.shape

(137714, 4)

In [12]:
df.to_csv('/content/drive/MyDrive/tinkoff_sirius_nlp/dataset_final.csv', index=False)

## Load Model

In [13]:
tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/ruDialoGPT-medium')
model = AutoModelWithLMHead.from_pretrained('tinkoff-ai/ruDialoGPT-medium')

Downloading (…)okenizer_config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

In [14]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50261, bias=False)
)

## Concatenation context and creating custom dataset
On this step we concate all context and response messages in one text that separate by special tokens between each of message.

In [15]:
def concate_turns(df: pd.DataFrame) -> list:
    examples = []
    for _, row in df.iterrows():
        context = str()

        for idx, col in enumerate(df.columns):
            if idx % 2 == 0:
                context += args.sp1_token + row[col]
            else:
                context += args.sp2_token + row[col]

        context += args.sp1_token

        examples.append(context)

    return examples


In [16]:
df.shape

(137714, 4)

In [17]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42 ,shuffle=True)

In [18]:
train_df.shape, test_df.shape

((110171, 4), (27543, 4))

In [19]:
context_examples_train = concate_turns(train_df)
context_examples_test = concate_turns(test_df)

In [20]:
import gc

del df
del train_df
del test_df

gc.collect()

76

In [21]:
class DialogueDataset(Dataset):
    def __init__(self, tokenizer: AutoTokenizer, data: list, max_lenght: int):
        self.data = data
        self.tokenizer = tokenizer
        self.max_lenght = max_lenght

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        context = self.data[idx]

        encoded_context = self.tokenizer.encode(
            context, max_length=self.max_lenght, truncation=True, padding='max_length', return_tensors="pt"
        )

        return encoded_context

In [22]:
train_dataset = DialogueDataset(tokenizer, context_examples_train, args.max_lenght)
train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
test_dataset = DialogueDataset(tokenizer, context_examples_test, args.max_lenght)
test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=True)

next(iter(train_dataloader)).shape

torch.Size([8, 1, 128])

## Fine-Tune

In [23]:
optimizer = AdamW(model.parameters(), lr=args.lr, eps=args.eps_AdamW)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) // args.batch_size * args.num_epochs
    )



In [None]:
#model.resize_token_embeddings(len(tokenizer))

In [24]:
model.to(args.device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 1024)
    (wpe): Embedding(2048, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50261, bias=False)
)

In [25]:
def evaluate(args, model: AutoModelWithLMHead, test_dataloader: DataLoader):
    model.eval()
    eval_loss = 0.0
    nb_eval_steps = 0

    for batch in test_dataloader:
            inputs = batch.to(args.device)
            labels = batch.clone().to(args.device)

            with torch.no_grad():
                outputs = model(inputs, labels=labels)
                lm_loss = outputs.loss
                eval_loss += lm_loss.item()
            nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    return perplexity.item(), eval_loss

In [26]:
train_loss = []

valid_loss = []
valid_perplexity = []

for epoch in range(args.num_epochs):

    pbar = tqdm(train_dataloader, desc=f"Training epoch {epoch + 1}/{args.num_epochs}", total=len(train_dataloader))
    sum_of_loss = 0.0

    for batch_idx, batch in enumerate(pbar):
        model.train()

        inputs = batch.to(args.device)
        labels = batch.clone().to(args.device)

        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss

        train_loss.append(loss.item())

        loss.backward()
        optimizer.step()

        if((batch_idx + 1) % (len(train_dataloader) // 3) == 0):

            perplexity, eval_loss = evaluate(args, model, test_dataloader)
            valid_perplexity.append(perplexity)
            valid_loss.append(eval_loss)

    output_dir = f'/content/drive/MyDrive/tinkoff_sirius_nlp/epoch{epoch + 1}_ruDialoGPT_dvach'
    model.save_pretrained(output_dir)

Training epoch 1/1: 100%|██████████| 13772/13772 [3:40:12<00:00,  1.04it/s] 


## Loss

valid loss in three measurements in learning time. Each of evaluate were after completed 1/3 of training dataset.

In [27]:
valid_loss

[1.6200730172555322, 1.5528745819169907, 1.492369023894211]

In [28]:
valid_perplexity

[5.053459167480469, 4.725033283233643, 4.447619915008545]

## Test generate

In [30]:
model = AutoModelWithLMHead.from_pretrained('/content/drive/MyDrive/tinkoff_sirius_nlp/epoch1_ruDialoGPT_dvach')



In [31]:
inputs = tokenizer('@@ПЕРВЫЙ@@Как стать самым умным человеком?@@ВТОРОЙ@@', return_tensors='pt')
generated_token_ids = model.generate(
    **inputs,
    top_k=10,
    top_p=0.95,
    num_beams=3,
    num_return_sequences=1,
    do_sample=True,
    no_repeat_ngram_size=2,
    temperature=1.2,
    repetition_penalty=1.2,
    length_penalty=1.0,
    eos_token_id=50257,
    max_new_tokens=40
)
context_with_response = [tokenizer.decode(sample_token_ids) for sample_token_ids in generated_token_ids]
context_with_response

Setting `pad_token_id` to `eos_token_id`:50257 for open-end generation.


['@@ПЕРВЫЙ@@Как стать самым умным человеком?@@ВТОРОЙ@@Учиться, учиться и ещё раз учиться.@@ПЕРВЫЙ@@']