# Дообучение собственной модели для Botkisser-а

Пример датасета находится в файле `dataset.txt`; записи в нём отделяются друг от друга разделителем `---`.

In [1]:
import pandas as pd

file_path = "./dataset.txt"

# Read the whole file up front; samples are separated by the literal "---".
with open(file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

# Strip every sample and skip blank ones: a leading, trailing or doubled "---"
# would otherwise add empty rows, which pandas writes as blank CSV lines and
# reads back as NaN instead of text.
data = [pair.strip() for pair in raw_text.split("---") if pair.strip()]

# One sample per row in a single "text" column.
df = pd.DataFrame(data, columns=["text"])

print(df)

                                                text
0  <s>\n\nA brown fox jumped over the lazy dog\n\...
1                         <s>\n\nA silly cat\n\n</s>


In [2]:
# Persist the samples as CSV; the row index is left out so the file holds
# only the "text" column.
csv_file_path = "./dataset.csv"
df.to_csv(path_or_buf=csv_file_path, index=False)

In [3]:
# Settings for the fine-tuning run.

# Base checkpoint, training data location and where the result is written.
model_name = "sberbank-ai/rugpt3medium_based_on_gpt2"
train_file_path = "./dataset.csv"
output_dir = "./models/bk-custom"

# Values forwarded to TrainingArguments by the training cell.
num_train_epochs = 6
per_device_train_batch_size = 5
save_steps = 10_000
save_total_limit = 3

# Special tokens registered with the tokenizer before training.
special_tokens = dict(
    bos_token="<s>",
    eos_token="</s>",
    pad_token="[PAD]",
    sep_token="[SEP]",
)

In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import pandas as pd

# Load pre-trained model and tokenizer
print("Loading model and tokenizer...")

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Use special tokens: register them with the tokenizer, then resize the
# embedding matrix so it matches the enlarged vocabulary.
if special_tokens:
    print("Adding special tokens...")

    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))

# Load training data
print("Loading training data...")

train_data = pd.read_csv(train_file_path)
# read_csv turns empty cells into NaN; drop them so only real strings are tokenized.
train_text = train_data["text"].dropna().astype(str).tolist()

# Tokenize the training data.
# The parsed samples are tokenized rather than the raw CSV file, so the "text"
# header line and the CSV quoting never end up in the training tokens.
block_size = 128
train_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("\n".join(train_text)))

# Cut the token stream into consecutive blocks of at most block_size tokens.
train_blocks = [
    train_token_ids[start:start + block_size]
    for start in range(0, len(train_token_ids), block_size)
]
if not train_blocks:
    raise ValueError(f"No training text found in {train_file_path!r}")

# A short trailing block must be padded when batched with full blocks, which
# needs a pad token. Without one, drop the remainder, but never the only block.
if (
    len(train_blocks) > 1
    and len(train_blocks[-1]) < block_size
    and tokenizer.pad_token_id is None
):
    train_blocks.pop()


class _TokenBlockDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves blocks of token ids as 1-D long tensors."""

    def __init__(self, blocks):
        self.blocks = blocks

    def __len__(self):
        return len(self.blocks)

    def __getitem__(self, index):
        return torch.tensor(self.blocks[index], dtype=torch.long)


# Create a dataset and data collator for language modeling
print("Getting the dataset and trainer ready...")

train_dataset = _TokenBlockDataset(train_blocks)

# mlm=False selects causal language modeling instead of masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
print("Training model!!")

trainer.train()

# Save the fine-tuned model together with the tokenizer, so the added special
# tokens are available when the model is loaded again.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model done")

Выше — ячейка дообучения модели; результат сохраняется в `./models/bk-custom`.


# Запуск нейросети

In [7]:
# Settings
# Directory the fine-tuned model and tokenizer are loaded from in the next cell.
# NOTE(review): the training cell saves to "./models/bk-custom", while this
# points at "./models/bk-delta" — confirm this is the intended checkpoint.
output_dir = "./models/bk-delta"

In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

print("Loading model...")

# Restore the fine-tuned weights and the matching tokenizer from output_dir.
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

print("Getting device...")

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)

# Function to generate text based on a prompt
def generate_text(prompt, max_length=50, num_return_sequences=1):
    """Continue ``prompt`` with the loaded model and return the decoded text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text the model should continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_return_sequences: Number of continuations to produce. Values above
            1 turn on sampling, since the default greedy decoding in
            transformers is expected to reject more than one sequence.

    Returns:
        A single string when ``num_return_sequences`` is 1 (the default),
        otherwise a list with one string per generated sequence. Special
        tokens are stripped from the decoded text.
    """
    # Calling the tokenizer (instead of encode) also yields the attention mask.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    # Pass an explicit pad token so generate does not have to guess one;
    # fall back to the EOS token when the tokenizer has no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    generation_kwargs = {
        "attention_mask": encoded["attention_mask"],
        "max_length": max_length,
        "num_return_sequences": num_return_sequences,
        "pad_token_id": pad_token_id,
    }
    if num_return_sequences > 1:
        generation_kwargs["do_sample"] = True

    output_ids = model.generate(encoded["input_ids"], **generation_kwargs)

    # Decode every returned sequence rather than silently dropping all but the first.
    generated_texts = [
        tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output_ids
    ]
    if num_return_sequences == 1:
        return generated_texts[0]
    return generated_texts


Loading model...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Getting device...


In [22]:
print("Generating text...")

# Try the fine-tuned model on one sample: the prompt ends with an opening
# [ANS] tag, so the model continues from there.
prompt = "\n\n[USR] (Стуль) Счастье - это когда ты ешь мороженое, а потом идёшь в туалет и оно выходит в виде радуги. [/USR]\n[ANS]"
generated_text = generate_text(prompt=prompt, max_length=100, num_return_sequences=1)

# Show the label and the generated text on separate lines.
print("Generated Text:", generated_text, sep="\n")

Generating text...
Generated Text:


[USR] (Стуль) Счастье - это когда ты ешь мороженое, а потом идёшь в туалет и оно выходит в виде радуги. [/USR]
[ANS] счастье - это когда тебе дают много денег, а потом ты их крадёшь [/ANS]
[USR] (Стул) Счастье - это когда тебе дают много денег, а потом ты их крадёшь [/USR]
[ANS] счастье
