In [1]:
from transformers import AutoTokenizer,AutoModelForCausalLM

model_path = "/mnt/ssd/models/rugpt3small_based_on_gpt2"


tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[2023-08-18 22:06:21,296] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
import os
import numpy as np
from transformers import AutoTokenizer


class BooksStorage:
    def __init__(self, tokenizer: AutoTokenizer, output_file: str, min_doc_len: int = 100):
        self.tokenizer = tokenizer
        self._output_file = output_file
        if os.path.isfile(self._output_file):
            os.remove(self._output_file)
        self._min_doc_len = min_doc_len
        self._tokens_datatype = np.int32
        self._tokens_datasize = 4
        self._base_length = 0

    def from_txt_files(self, folder: str, encoding: str = "utf8"):
        num = 0
        print(f"Strarting to process folder '{folder}':")
        
        for file in os.listdir(folder):
            if file.endswith(".txt"):
                print(f"Process file '{file}'...")
                file_path = os.path.join(folder, file)
                text = self._read_txt_file(file_path, encoding)
                self._process_text(text)
                num += 1
        print(f"Finish: {num} files processed, {self._base_length} tokens collected.")
    
    def _read_txt_file(self, file_path: str, encoding: str):
        with open(file_path, "r", encoding=encoding) as f:
            return(f.read())
        
    def _process_text(self, text: str):
        docs = []
        for part in text.split("\n"):
            if len(part) >= self._min_doc_len:
                docs.append(part)
        for doc_ids in tokenizer(docs, padding=False).input_ids:
            # print(doc_ids)
            self._write_to_output_file(doc_ids + [tokenizer.eos_token_id])

    def _write_to_output_file(self, tokens: list):
        part = np.array(tokens, dtype=self._tokens_datatype)
        with open(self._output_file, "ab") as f:
            part.tofile(f)
        self._base_length += len(part)
        

    def get_chunk(self, position: int, length: int) -> list[int]:
        return np.fromfile(
            self._output_file, 
            dtype = self._tokens_datatype, 
            offset = self._tokens_datasize * position,
            count = length
            )

    @property
    def length(self) -> int:
        return self._base_length



books_storage = BooksStorage(tokenizer, "temp/freud.data")
books_storage.from_txt_files("data/Freud", "windows-1251")

Strarting to process folder 'data/Freud':
Process file 'Freyd Zigmund. Analiz konechnyy i beskonechnyy.txt'...
Process file 'Freyd Zigmund. O narcizme.txt'...
Process file 'Freyd Zigmund. Stroki biografii.txt'...
Process file 'Freyd Zigmund. Znamenitye sluchai iz praktiki.txt'...
Process file 'Freyd Zigmund. O psihoanalize.txt'...
Process file 'Freyd Zigmund. Etot chelovek Moisey.txt'...
Process file 'Freyd Zigmund. Ocherk istorii psihoanaziza.txt'...
Process file 'Freyd Zigmund. Metapsihologicheskoe dopolnenie k ucheniyu o snovideniyah.txt'...
Process file 'Freyd Zigmund. Nedovolstvo kulturoy.txt'...
Process file 'Freyd Zigmund. Vospominanie vosproizvedenie i pererabotka.txt'...
Process file 'Freyd Zigmund. Ocherki po psihologii seksualnosti.txt'...
Process file 'Freyd Zigmund. Vvedenie V Psihoanaliz. Lekcii.txt'...
Process file 'Freyd Zigmund. Rebenka byut - k voprosu o proishozhdenii seksualnyh izvrascheniy.txt'...
Process file 'Freyd Zigmund. Psihopatologiya obydennoy zhizni.txt'..

In [59]:
from torch.utils.data import Dataset
import torch

class BooksDataset(Dataset):
    def __init__(self, 
    books_storage: BooksStorage,
    chunk_size: int,
    indexes: list[int]):
        self._books_storage = books_storage
        self._chunk_size = chunk_size
        self._indexes = indexes

    def __getitem__(self, index):
        target_index = self._indexes[index]
        position = target_index * self._chunk_size
        input_ids = self._books_storage.get_chunk(position, self._chunk_size)
        label_ids = [-100 if t == self._books_storage.tokenizer.eos_token_id else t for t in input_ids]
        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "labels": torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return len(self._indexes)

In [60]:
from random import sample


class BooksCollector:
    def __init__(self, 
    books_storage: BooksStorage, 
    chunk_size: int):
        self._books_storage = books_storage
        self._chunk_size = chunk_size
        self._chunks_number = self._books_storage.length // self._chunk_size

    @property
    def length(self):
        return self._chunks_number

    def train_test_split(self, test_part: float = 0.2, shuffle: bool = True):
        indexes = [i for i in range(self._chunks_number)]
        if shuffle:
            indexes = sample(indexes, k = self._chunks_number)
        split_position = int(self.length * test_part)
        test_dataset = BooksDataset(self._books_storage, self._chunk_size, indexes[:split_position])
        train_dataset = BooksDataset(self._books_storage, self._chunk_size, indexes[split_position:])
        return  train_dataset, test_dataset


books_collector = BooksCollector(books_storage, 2048)
train_dataset, test_dataset = books_collector.train_test_split()

In [61]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="temp",
    num_train_epochs = 5,
    evaluation_strategy = "epoch",
    per_device_train_batch_size = 2,
    per_device_eval_batch_size  = 2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()

  2%|▏         | 50/2430 [00:27<21:36,  1.84it/s]

KeyboardInterrupt: 

In [49]:
tokens = tokenizer("Психоаналитическое исследование с самого начала указывало").input_ids
output = model.generate(
    torch.tensor([tokens]).cuda(),
    max_new_tokens=40,
    return_output_length=True
)[0]

tokenizer.decode(output)

ValueError: The following `model_kwargs` are not used by the model: ['return_output_length'] (note: typos in the generate arguments will also show up in this list)