In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Restrict the CUDA devices visible to this process to the one with index 1.
# NOTE(review): this only takes effect if it runs before anything initialises
# CUDA in the kernel — confirm no earlier cell touches the GPU.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [3]:
# Identifier of the pretrained checkpoint handed to both from_pretrained calls.
model_name = "ai-forever/rugpt2large"

# Tokenizer and model come from the same checkpoint; the two loads do not
# depend on each other.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [4]:
# Register the padding / start-of-text / end-of-text markers on the tokenizer.
# The pad -> bos -> eos order is kept on purpose.
# NOTE(review): presumably new token ids follow insertion order — confirm before reordering.
tokenizer.add_special_tokens(
    dict(
        pad_token='<|pad|>',
        bos_token='<|startoftext|>',
        eos_token='<|endoftext|>',
    )
)

# Resize the model's token embeddings to the tokenizer's current length.
# Left as the final expression so the notebook displays the returned value.
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 1280)

In [5]:
def load_dataset(file_path, tokenizer, block_size=256, overwrite_cache=False):
    """Build a ``TextDataset`` over a plain-text file.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Tokenizer passed through to ``TextDataset``.
        block_size: Forwarded as ``TextDataset``'s ``block_size``
            (defaults to 256).
        overwrite_cache: Forwarded as ``TextDataset``'s ``overwrite_cache``
            (defaults to ``False``, which matches the behaviour of calling
            ``TextDataset`` without it). Pass ``True`` to force the file to
            be re-tokenized instead of reusing a previously cached result,
            e.g. after the tokenizer's vocabulary has been changed.

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )

In [None]:
# Shell escape: run the gdown CLI with the file id below.
# NOTE(review): presumably this downloads the "aneki.txt" corpus that the next cell reads — confirm.
!gdown 18oCseBD3UpD2ode0TQqMHlcLMULNa1cZ

In [6]:
# Training corpus, given as a path relative to the working directory.
file_path = "aneki.txt"

# Build the training dataset with load_dataset's default block size.
train_dataset = load_dataset(file_path=file_path, tokenizer=tokenizer)



In [7]:
# Batch collator for the Trainer. mlm=False disables masked-language-modeling.
# NOTE(review): presumably this makes the collator emit causal-LM labels — confirm.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [8]:
training_args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir="./model_output",      # where the model is saved
    overwrite_output_dir=True,        # overwrite the directory if needed
    save_steps=500,                   # how often checkpoints are saved
    save_total_limit=2,               # maximum number of checkpoints kept
    # --- logging ---
    logging_dir="./logs",             # log directory
    logging_steps=100,
    prediction_loss_only=True,
    # --- schedule and batching ---
    num_train_epochs=3,               # number of epochs
    per_device_train_batch_size=4,    # batch size per device
    gradient_accumulation_steps=8,    # increases the effective batch size
    # --- optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=True,                        # use mixed precision
)

# Tie together the model, the training arguments, the dataset and the collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Final expression of the cell, so the notebook displays the value returned by
# train().
trainer.train()

Step,Training Loss
100,3.4881
200,3.0641
300,3.0263
400,2.9967
500,2.9176
600,2.8381
700,2.8422
800,2.8259
900,2.8175
1000,2.5772


TrainOutput(global_step=1353, training_loss=2.8445280177982664, metrics={'train_runtime': 2670.7288, 'train_samples_per_second': 16.232, 'train_steps_per_second': 0.507, 'total_flos': 4.71054907342848e+16, 'train_loss': 2.8445280177982664, 'epoch': 2.9958483254912815})

In [9]:
# Save the fine-tuned model and the tokenizer into the same directory.
trainer.save_model("./fine_tuned_rugpt2")
# Kept as the last expression so the notebook displays what save_pretrained returns.
tokenizer.save_pretrained("./fine_tuned_rugpt2")

('./fine_tuned_rugpt2/tokenizer_config.json',
 './fine_tuned_rugpt2/special_tokens_map.json',
 './fine_tuned_rugpt2/vocab.json',
 './fine_tuned_rugpt2/merges.txt',
 './fine_tuned_rugpt2/added_tokens.json')

In [10]:
from transformers import pipeline

In [11]:
# Reload the tokenizer and the model from the directory the fine-tuned
# artifacts were saved to; the two loads are independent of each other.
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_rugpt2")
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_rugpt2")

In [24]:
# Text-generation pipeline over the fine-tuned model and tokenizer.
# NOTE(review): device=0 is an index among the *visible* CUDA devices; with
# CUDA_VISIBLE_DEVICES = '1' set earlier in this notebook that is presumably
# physical GPU 1 — confirm.
generator = pipeline(
    "text-generation",
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
    device=0,
)

# Number of samples to generate; this is the single place to change it.
num_samples = 5

result = generator(
    "<|startoftext|>",                 # prompt: the start-of-text marker only
    max_length=100,                    # maximum length of the text
    num_return_sequences=num_samples,  # how many samples are requested
    do_sample=True,                    # enable stochastic generation
    temperature=1.0,                   # controls diversity
    top_k=50,                          # keep only the 50 most probable tokens
    top_p=0.9,                         # nucleus sampling: tokens whose cumulative probability is <= 0.9
)

# Print every sample followed by a separator line.
for i, sample in enumerate(result, start=1):
    print(f"\nSample {i}:\n{sample['generated_text']}\n")
    print('#' * 120)


Sample 1:
<|startoftext|>В одном из кафе.- Мадам, это же вы вчера продали мне торт "Наполеон"?- Да, а что?- А почему у вас тогда нет ни одного ребенка?!

Журналист спрашивает у кинозвезды, почему она до сих пор не замужем.- Понимаете, я не хочу выходить замуж, чтобы не портить свой гардероб.

- Как ты познакомился с моей женой?- Я проходил мимо и увидел, как она идет по улице, ну я и решил подойти и познакомиться

########################################################################################################################

Sample 2:
<|startoftext|>Об этом уже знают все, кроме тех, кто на это подписался.

- –î–∞ –≤—ã –ø—Ä–æ—Å—Ç–æ –ø@—Ä–Ω—É—Ö–