<a href="https://colab.research.google.com/github/totminaekaterina/RUSSE-2022-Detoxification/blob/main/finetune_rugpt3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import argparse
import json
import random
from pprint import pprint
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          Trainer,
                          TrainingArguments,
                          TrainerCallback,
                          AdamW,
                          get_linear_schedule_with_warmup)

In [2]:
def set_random_seed(seed_val):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

In [3]:
def get_random_example(dataset):
    sample = dataset.sample()
    prompt = f'<|startoftext|>{sample["Sentence_1"].item()}<|sep|>'
    true_output = sample["Sentence_2"].item()
    return prompt, true_output

In [4]:
class SimplificationDataset(torch.utils.data.Dataset):
    def __init__(self, texts_list, tokenizer, gpt2_type="gpt2", max_length=1024):
        self.tokenizer = tokenizer

        texts_combined = []
        for input_text, out_text in texts_list:
            text_combined = f"<|startoftext|>{input_text}<|sep|>{out_text}<|endoftext|>"
            texts_combined.append(text_combined)
        self.encodings = tokenizer(texts_combined,
                              truncation=True,
                              max_length=max_length,
                              padding="max_length",
                              return_tensors="pt")

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["label"] = item["input_ids"]
        return item

In [5]:
class PrintExampleCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        prompt, true_output = get_random_example(valid)
        print(prompt.strip("<|startoftext|>").strip("<|sep|>"), true_output, sep="\n")
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        model.eval()
        with torch.no_grad():
            sample_outputs = model.generate(
                input_ids,
                do_sample=True,   
                top_k=50,
                max_length=MAX_LENGTH,
                top_p=0.95,
                temperature=0.9,
                num_return_sequences=1
            ).detach().cuda()
        model.train()

        for sample in sample_outputs:
            res = (tokenizer.decode(sample, skip_special_tokens=False)
                            .split("<|sep|>")[1]
                            .replace("<|pad|>", "")
                            .replace("<|endoftext|>", ""))
            print(res, "-" * 80, sep="\n")

In [6]:
TRAIN_PATH = "train_paraphrase_1.csv"
VALID_PATH = "dev_paraphrase_2_1.csv"
TEST_PATH = "test_paraphrase_2_2.csv"

In [7]:
train = pd.read_csv(TRAIN_PATH, encoding='utf-8-sig', sep=';')
valid = pd.read_csv(VALID_PATH, encoding='utf-8-sig', sep=';')
valid.drop(["id"], axis=1, inplace=True)
train.drop(["id"], axis=1, inplace=True)
valid.columns = ["Sentence_1", "Sentence_2"]

In [8]:
train.head()

Unnamed: 0,Sentence_1,Sentence_2
0,В жутком ДТП в Марокко заживо сгорели 10 человек.,В Марокко в ДТП десять человек сгорели заживо.
1,На сотрудников 'Трансаэро' завели дело о хищен...,Неназванных должностных лиц 'Трансаэро' заподо...
2,Однако я этого не заслуживал,Хотя я этого и не заслужил
3,Он со мной не разговаривает,Она не будет со мной разговаривать
4,кто несёт ответственность за жизнь и здоровье ...,Кто является ответственным за жизнь и здоровье...


# load model

In [9]:
model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# add special tokens

In [14]:
special_tokens = {
    "bos_token": "<|startoftext|>",
    "pad_token": "<|pad|>",
    "sep_token": "<|sep|>",
}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 768)

In [15]:
MAX_LENGTH = 200
DATA_COLS = ["Sentence_1", "Sentence_2"]
train_dataset = SimplificationDataset(train[DATA_COLS].values.tolist(), tokenizer, max_length=MAX_LENGTH)
valid_dataset = SimplificationDataset(valid[DATA_COLS].values.tolist(), tokenizer, max_length=MAX_LENGTH)

EPOCH_STEPS = len(train_dataset) // 8 // 1
EVAL_STEPS = EPOCH_STEPS
print(f"Total steps: {EPOCH_STEPS * 5}\nEvaluate and save every {EVAL_STEPS} steps.")

Total steps: 218745
Evaluate and save every 43749 steps.


In [20]:
training_args = TrainingArguments(
    output_dir="result_rugpt3small",
    logging_dir="logs_rugpt3small",
    logging_first_step=True,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    logging_steps=100,
    lr_scheduler_type="linear",
    warmup_steps=500,
    learning_rate=0.00005,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    weight_decay=0,
    fp16=False,
    seed=19,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [21]:
Path(training_args.output_dir).mkdir(exist_ok=True)
Path(training_args.logging_dir).mkdir(exist_ok=True)

In [22]:
pprint(training_args.to_dict())

{'adafactor': False,
 'adam_beta1': 0.9,
 'adam_beta2': 0.999,
 'adam_epsilon': 1e-08,
 'auto_find_batch_size': False,
 'bf16': False,
 'bf16_full_eval': False,
 'data_seed': None,
 'dataloader_drop_last': False,
 'dataloader_num_workers': 0,
 'dataloader_pin_memory': True,
 'ddp_bucket_cap_mb': None,
 'ddp_find_unused_parameters': None,
 'ddp_timeout': 1800,
 'debug': [],
 'deepspeed': None,
 'disable_tqdm': False,
 'do_eval': True,
 'do_predict': False,
 'do_train': False,
 'eval_accumulation_steps': None,
 'eval_delay': 0,
 'eval_steps': 43749,
 'evaluation_strategy': 'steps',
 'fp16': False,
 'fp16_backend': 'auto',
 'fp16_full_eval': False,
 'fp16_opt_level': 'O1',
 'fsdp': [],
 'fsdp_min_num_params': 0,
 'fsdp_transformer_layer_cls_to_wrap': None,
 'full_determinism': False,
 'gradient_accumulation_steps': 1,
 'gradient_checkpointing': False,
 'greater_is_better': None,
 'group_by_length': False,
 'half_precision_backend': 'auto',
 'hub_model_id': None,
 'hub_private_repo': False

In [23]:
with open("run_parameters.txt", "w") as f:
  pprint(training_args.to_dict(), f)

trainer = Trainer(
    model=model.to(device),
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[PrintExampleCallback],
)

trainer.train()

***** Running training *****
  Num examples = 349999
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 65625
  Number of trainable parameters = 125229312
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss
43749,0.221,0.221063


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 16
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


на каких землях располагаются леса
Леса располагаются на какихземлях чаще
Как называются земли на которых располагаются леса?
--------------------------------------------------------------------------------


Saving model checkpoint to result_rugpt3small\checkpoint-43749
Configuration saved in result_rugpt3small\checkpoint-43749\config.json
Model weights saved in result_rugpt3small\checkpoint-43749\pytorch_model.bin
tokenizer config file saved in result_rugpt3small\checkpoint-43749\tokenizer_config.json
Special tokens file saved in result_rugpt3small\checkpoint-43749\special_tokens_map.json
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=65625, training_loss=0.23686844018845332, metrics={'train_runtime': 35348.7541, 'train_samples_per_second': 29.704, 'train_steps_per_second': 1.857, 'total_flos': 1.071702537984e+17, 'train_loss': 0.23686844018845332, 'epoch': 3.0})

In [29]:
trainer.save_model('16_model_3')

Saving model checkpoint to 16_model_3
Configuration saved in 16_model_3\config.json
Model weights saved in 16_model_3\pytorch_model.bin
tokenizer config file saved in 16_model_3\tokenizer_config.json
Special tokens file saved in 16_model_3\special_tokens_map.json
