<a href="https://colab.research.google.com/github/wuabs/child-llm-assistant/blob/main/training/colab_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party libraries this notebook relies on. transformers,
# datasets and peft are imported directly in the next cell; accelerate and
# bitsandbytes are not imported here but are presumably needed for the
# device_map="auto" / 8-bit loading used when the model is created.
!pip install transformers datasets peft accelerate bitsandbytes


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model


In [None]:
# Identifier of the pretrained checkpoint; the same value is passed to
# from_pretrained for both the model and its tokenizer.
model_name = "sberbank-ai/rugpt3small_based_on_gpt2"

In [None]:
# Load the base model with 8-bit weights. The quantization settings are passed
# through BitsAndBytesConfig because handing load_in_8bit=True directly to
# from_pretrained is deprecated in recent transformers releases.
from transformers import BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The tokenization cell pads every example to a fixed length, which fails if
# the tokenizer has no padding token. Only fill it in when it is missing so a
# checkpoint that ships its own pad token is left untouched.
# NOTE: when EOS doubles as PAD, the LM collator masks EOS positions out of
# the loss together with the padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare the quantized model for training before adapters are attached.
model = prepare_model_for_kbit_training(model)




In [None]:
# LoRA config
# json is not imported by the notebook's import cell, so bring it into scope
# here; without it json.load below raises NameError.
import json

# Read the adapter hyperparameters from lora_config.json and pass them to
# LoraConfig as keyword arguments, so the file's keys must be valid
# LoraConfig parameter names.
with open("lora_config.json", "r", encoding="utf-8") as f:
    config_dict = json.load(f)
lora_config = LoraConfig(**config_dict)

# Wrap the base model with the LoRA adapters described by lora_config.
model = get_peft_model(model, lora_config)

In [None]:
# Dataset
# Load the plain-text corpus; examples are exposed under the "text" column.
raw_dataset = load_dataset("text", data_files="data/lm_dataset.txt")


def tokenize_function(examples):
    """Tokenize a batch of raw text examples.

    Args:
        examples: Mapping with a "text" entry holding the raw strings of the
            current batch.

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        512 tokens per example.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)


# batched=True hands the tokenizer many lines per call instead of one, and
# remove_columns drops the raw strings so only the token fields reach the
# data collator.
tokenized_dataset = raw_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)


In [None]:

# Training arguments
training_args = TrainingArguments(
    # Schedule: 3 passes over the data, 4 sequences per device per step,
    # gradients accumulated over 4 steps before each optimizer update.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Mixed-precision (fp16) training.
    fp16=True,
    # Reporting and checkpointing: log every 20 steps, checkpoint every 200
    # steps into output_dir, keeping at most one checkpoint on disk.
    logging_steps=20,
    save_steps=200,
    save_total_limit=1,
    output_dir="./models/lora-lm",
)


In [None]:
# Training
# mlm=False selects causal (next-token) language modelling rather than
# masked-token prediction when the collator builds labels.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

# Assemble the trainer from the adapter-wrapped model, the tokenized train
# split and the arguments defined above, then run the training loop.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
trainer.train()



In [None]:
# Save the model
# Write the trained model first and then the tokenizer into the same
# directory so they can be reloaded together.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./models/lora-lm")