In [147]:
model_dir = ""
from transformers import AutoTokenizer,AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("../../models/rugpt3small_based_on_gpt2")
model = AutoModelForCausalLM.from_pretrained("../../models/rugpt3small_based_on_gpt2")


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file ../../models/rugpt3small_based_on_gpt2/config.json
Model config GPT2Config {
  "_name_or_path": "../../models/rugpt3small_based_on_gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 2048,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 2048,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.24.0",
  "use_cache":

In [149]:
import os
import numpy as np
from transformers import AutoTokenizer


class BooksStorage:
    def __init__(self, tokenizer: AutoTokenizer, output_file: str, min_doc_len: int = 100):
        self.tokenizer = tokenizer
        self._output_file = output_file
        if os.path.isfile(self._output_file):
            os.remove(self._output_file)
        self._min_doc_len = min_doc_len
        self._tokens_datatype = np.int32
        self._tokens_datasize = 4
        self._base_length = 0

    def from_txt_files(self, folder: str, encoding: str = "utf8"):
        num = 0
        print(f"Strarting to process folder '{folder}':")
        
        for file in os.listdir(folder):
            if file.endswith(".txt"):
                print(f"Process file '{file}'...")
                file_path = os.path.join(folder, file)
                text = self._read_txt_file(file_path, encoding)
                self._process_text(text)
                num += 1
        print(f"Finish: {num} files processed, {self._base_length} tokens collected.")
    
    def _read_txt_file(self, file_path: str, encoding: str):
        with open(file_path, "r", encoding=encoding) as f:
            return(f.read())
        
    def _process_text(self, text: str):
        docs = []
        for part in text.split("\n"):
            if len(part) >= self._min_doc_len:
                docs.append(part)
        for doc_ids in tokenizer(docs, padding=False).input_ids:
            # print(doc_ids)
            self._write_to_output_file(doc_ids + [tokenizer.eos_token_id])

    def _write_to_output_file(self, tokens: list):
        part = np.array(tokens, dtype=self._tokens_datatype)
        with open(self._output_file, "ab") as f:
            part.tofile(f)
        self._base_length += len(part)
        

    def get_chunk(self, position: int, length: int) -> list[int]:
        return np.fromfile(
            self._output_file, 
            dtype = self._tokens_datatype, 
            offset = self._tokens_datasize * position,
            count = length
            )

    @property
    def length(self) -> int:
        return self._base_length



books_storage = BooksStorage(tokenizer, "temp/freud.data")
books_storage.from_txt_files("data/Freud", "windows-1251")

Strarting to process folder 'data/Freud':
Process file 'Freyd Zigmund. Psihologiya bessoznatelnogo.txt'...
Process file 'Freyd Zigmund. Buduschee odnoy illyuzii.txt'...
Process file 'Freyd Zigmund. Fragment analiza isterii (Istoriya bolezni Dory).txt'...
Process file 'Freyd Zigmund. Analiz konechnyy i beskonechnyy.txt'...
Process file 'Freyd Zigmund. Infantilnoe vozvraschenie totema.txt'...
Process file 'Freyd Zigmund. Totem i tabu. Psihologiya pervobytnoy kultury i religii.txt'...
Process file 'Freyd Zigmund. My i smert.txt'...
Process file 'Freyd Zigmund. Moisey i monoteizm.txt'...
Process file 'Freyd Zigmund. Etot chelovek Moisey.txt'...
Process file 'Freyd Zigmund. Vvedenie v psihoanaliz (Lekcii 16-28 chast 3).txt'...
Process file 'Freyd Zigmund. O psihoanalize.txt'...
Process file 'Freyd Zigmund. Zabyvanie inostrannyh slov.txt'...
Process file 'Freyd Zigmund. Ya i Ono.txt'...
Process file 'Freyd Zigmund. Pechal i melanholiya.txt'...
Process file 'Freyd Zigmund. Metapsihologichesko

In [150]:
from torch.utils.data import Dataset
import torch

class BooksDataset(Dataset):
    def __init__(self, 
    books_storage: BooksStorage,
    chunk_size: int,
    indexes: list[int]):
        self._books_storage = books_storage
        self._chunk_size = chunk_size
        self._indexes = indexes

    def __getitem__(self, index):
        target_index = self._indexes[index]
        position = target_index * self._chunk_size
        input_ids = torch.tensor(self._books_storage.get_chunk(position, self._chunk_size), dtype=torch.long)
        return {
            "input_ids": input_ids,
            "labels": input_ids
        }

    def __len__(self):
        return len(self._indexes)

In [151]:
from random import sample


class BooksCollector:
    def __init__(self, 
    books_storage: BooksStorage, 
    chunk_size: int):
        self._books_storage = books_storage
        self._chunk_size = chunk_size
        self._chunks_number = self._books_storage.length // self._chunk_size

    @property
    def length(self):
        return self._chunks_number

    def train_test_split(self, test_part: float = 0.2, shuffle: bool = True):
        indexes = [i for i in range(self._chunks_number)]
        if shuffle:
            indexes = sample(indexes, k = self._chunks_number)
        split_position = int(self.length * test_part)
        test_dataset = BooksDataset(self._books_storage, self._chunk_size, indexes[:split_position])
        train_dataset = BooksDataset(self._books_storage, self._chunk_size, indexes[split_position:])
        return  train_dataset, test_dataset


books_collector = BooksCollector(books_storage, 100)
train_dataset, test_dataset = books_collector.train_test_split()

In [153]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="temp",
    num_train_epochs = 5,
    evaluation_strategy = "epoch",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size  = 16
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 19901
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6220
  Number of trainable parameters = 125231616


  0%|          | 0/6220 [00:00<?, ?it/s]

KeyboardInterrupt: 