### Курсовая работа

**Этап 1. Обучение модели**

In [1]:
!pip install transformers deepspeed --quiet

[K     |████████████████████████████████| 4.7 MB 15.1 MB/s 
[K     |████████████████████████████████| 704 kB 71.6 MB/s 
[K     |████████████████████████████████| 120 kB 73.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 46.2 MB/s 
[K     |████████████████████████████████| 54 kB 1.9 MB/s 
[K     |████████████████████████████████| 108 kB 67.4 MB/s 
[K     |████████████████████████████████| 99 kB 8.2 MB/s 
[?25h  Building wheel for deepspeed (setup.py) ... [?25l[?25hdone
  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone


In [2]:
# запишем в файл и в дальнейшем запустим его -- необходимо из-за краша
# среды выполнения в colab
%%writefile train_model.py

import os
import pickle

# Environment for a single local process (rank 0 of a world of 1). The
# rendezvous variables are the ones deepspeed expects; LOCAL_RANK is also
# read by DDP.
os.environ.update({
    'MASTER_ADDR': 'localhost',
    'MASTER_PORT': '9994',
    'RANK': "0",
    'LOCAL_RANK': "0",
    'WORLD_SIZE': "1",
    # Explicitly enable tokenizer parallelism (intended for large input files).
    "TOKENIZERS_PARALLELISM": "true",
})

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import GPT2TokenizerFast
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel, PreTrainedTokenizerFast
from multiprocessing import Pool
from tqdm import tqdm
import gc
# Fix the torch RNG seed so runs are repeatable.
torch.manual_seed(42)

# Locations on the mounted Drive: where Trainer writes its output and the
# raw text file the dataset is built from.
MODEL_DIR =  './drive/MyDrive/nlp-project/model/mailqa'
train_path = './drive/MyDrive/nlp-project/Otvety.txt'

# Target device and the pretrained checkpoint that is fine-tuned.
device = 'cuda:0'
backbone = 'sberbank-ai/rugpt3small_based_on_gpt2'

# Tokenizer matching the backbone; created at import time because the
# module-level tokenize() helper reads this global.
tokenizer = GPT2TokenizerFast.from_pretrained(backbone, use_fast=True)

#необходимые функции для токенизации и датасета

def tokenize(text):
    """Turn one chunk of text into token ids using the module-level ``tokenizer``.

    Prints the chunk length first as a progress trace, then splits the text
    into tokens and maps those tokens to their ids.

    Args:
        text: the string to tokenize.

    Returns:
        The result of ``tokenizer.convert_tokens_to_ids`` for the chunk's tokens.
    """
    print(f'Tokenizing text length {len(text)}')
    pieces = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_ids(pieces)

# Custom dataset loader using multiprocessing to parallelize tokenization
class MailRuDataset(Dataset):
    """Dataset of fixed-length token-id blocks built from one text file.

    On first use the file is read, cut into character chunks, tokenized in a
    process pool with the module-level ``tokenize`` function, flattened and
    split into consecutive blocks of ``block_size`` ids (a trailing remainder
    shorter than ``block_size`` is dropped). The resulting list is pickled to
    ``cache_path``; later constructions load that pickle instead.

    Note: when a cache file exists it is used as-is -- ``file_path``,
    ``block_size``, ``chunk_size`` and ``num_workers`` are then ignored.

    Args:
        tokenizer: accepted for interface compatibility; it is not used here,
            tokenization goes through the module-level ``tokenize`` helper.
        file_path: path of the UTF-8 text file (undecodable bytes are ignored).
        block_size: number of token ids per sample.
        cache_path: where the pickled samples are stored / loaded from.
        chunk_size: number of characters handed to one tokenization task.
        num_workers: number of worker processes used for tokenization.
    """

    # Default cache location (the path that used to be hard-coded).
    DEFAULT_CACHE_PATH = './drive/MyDrive/nlp-project/cached_dataset'

    def __init__(self, tokenizer: "PreTrainedTokenizerFast", file_path: str, block_size: int = 1024,
                 cache_path: str = DEFAULT_CACHE_PATH, chunk_size: int = 2097152,
                 num_workers: int = 8):
        if os.path.exists(cache_path):
            print('Загрузка датасета из кеша')
            # SECURITY: unpickling can execute arbitrary code -- only load a
            # cache file that this script itself produced.
            with open(cache_path, 'rb') as cache_file:
                self.samples = pickle.load(cache_file)
            return

        self.samples = self._build_samples(file_path, block_size, chunk_size, num_workers)

        # Write to a temporary file and rename it into place, so an interrupted
        # run cannot leave a truncated cache that breaks every later load.
        tmp_path = cache_path + '.tmp'
        with open(tmp_path, 'wb') as cache_file:
            pickle.dump(self.samples, cache_file)
        os.replace(tmp_path, cache_path)

        print('Датасет загружен и закеширован на диск')

    @staticmethod
    def _build_samples(file_path, block_size, chunk_size, num_workers):
        """Read, tokenize and split the text file into blocks of ``block_size`` ids."""
        print('Чтение файла')
        with open(file_path, encoding='utf-8', errors='ignore') as data_file:
            data = data_file.read()

        print(f'Размер данных: {len(data)}')

        print('Разбиение файла данных')
        data_chunks = [data[i:i + chunk_size] for i in tqdm(range(0, len(data), chunk_size))]

        # Free the full text before tokenization to lower peak memory.
        del data
        gc.collect()

        print(f'Количество частей: {len(data_chunks)}')

        print('Старт токенизации')
        with Pool(num_workers) as p:
            tokenized_text = [token for tokens in p.map(tokenize, data_chunks) for token in tokens]
            # Let the workers exit normally before the context manager
            # terminates the pool.
            p.close()
            p.join()

        del data_chunks
        gc.collect()

        print(f'Размер токенизированного текста: {len(tokenized_text)}')

        print('Разбиение поблочно')
        samples = [tokenized_text[i:i + block_size]
                   for i in range(0, len(tokenized_text) - block_size + 1, block_size)]

        print(f'Число сэмплов: {len(samples)}')

        del tokenized_text
        gc.collect()

        return samples

    def __getitem__(self, idx):
        """Return the ``idx``-th block of token ids as a tensor."""
        return torch.tensor(self.samples[idx])

    def __len__(self):
        """Return the number of blocks."""
        return len(self.samples)


def load_dataset(train_path, tokenizer):
    """Build the training dataset and the matching causal-LM data collator.

    Args:
        train_path: path of the text file passed to ``MailRuDataset``.
        tokenizer: tokenizer handed to both the dataset and the collator.

    Returns:
        A ``(dataset, collator)`` tuple: a ``MailRuDataset`` with 1024-token
        blocks and a ``DataCollatorForLanguageModeling`` with ``mlm=False``.
    """
    dataset = MailRuDataset(tokenizer, train_path, block_size=1024)
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return dataset, collator

if __name__ == "__main__":
    # Build (or load from cache) the dataset and its collator.
    train_dataset, data_collator = load_dataset(train_path, tokenizer)

    # Load the pretrained backbone and move it to the training device.
    model = GPT2LMHeadModel.from_pretrained(backbone).to(device)

    training_args = TrainingArguments(
        output_dir=MODEL_DIR,
        num_train_epochs=1,
        logging_steps=50,
        save_steps=2000,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        warmup_steps=100,
        weight_decay=0.01,
        fp16=True,
        # Passing None falls back to the library default (all installed
        # integrations in transformers 4.x); the string "none" is what
        # actually disables experiment reporting.
        report_to="none",
        # Keep at most five checkpoints in output_dir.
        save_total_limit=5,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

Writing train_model.py


In [3]:
# Run the training script written by the previous cell as a separate process.
!python3 train_model.py


Downloading vocab.json: 100% 1.63M/1.63M [00:00<00:00, 3.27MB/s]
Downloading merges.txt: 100% 1.21M/1.21M [00:00<00:00, 2.45MB/s]
Downloading config.json: 100% 608/608 [00:00<00:00, 610kB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Загрузка датасета из кеша
Downloading pytorch_model.bin: 100% 526M/526M [00:08<00:00, 61.6MB/s]
^C
