In [None]:
import json

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [None]:
# Destination of the converted, chat-formatted dataset (a single JSON array).
output_json = "pirates_dataset.json"
# Raw input: one JSON object per line.
dataset_file = "pirates_dataset_dynamic.jsonl"

In [None]:
# Convert the raw JSONL file (one object with "prompt" and "response" keys per
# line) into a list of chat-format records and write it out as a JSON array.
system_prompt = "Ты — эксперт по 'Пиратам Карибского моря'. Отвечай чётко и по делу."
conversations = []

with open(dataset_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. an empty line at the end of the file) are not
            # valid JSON and would make json.loads raise; skip them.
            continue
        data = json.loads(line)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": data["prompt"]},
            {"role": "assistant", "content": data["response"]},
        ]
        conversations.append({"messages": messages})

# ensure_ascii=False keeps the Cyrillic text human-readable in the output file.
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(conversations, f, ensure_ascii=False, indent=4)

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Chat-formatted training data read by the dataset-loading cell.
dataset_path = "pirates_dataset.json"
# Hugging Face Hub identifier of the checkpoint that gets fine-tuned.
model_name = "Vikhrmodels/QVikhr-2.5-1.5B-Instruct-r"

In [None]:
# Load the tokenizer; fall back to the EOS token for padding when the
# checkpoint does not define a dedicated pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" already places the weights on the available device(s),
# so the model must not be moved again with .to(...) afterwards.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# LoRA adapters on the attention query/value projections only.
# task_type tells PEFT to wrap the model with its causal-LM specific class.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

In [None]:
# Load the chat-formatted JSON file and take its only split ("train") directly.
dataset = load_dataset("json", data_files=dataset_path, split="train")

def tokenize_data(examples, max_length=512):
    """Tokenise a batch of chat conversations into fixed-length sequences.

    Each conversation is rendered with the tokenizer's chat template,
    tokenised, truncated to ``max_length`` tokens and right-padded with the
    pad token so that every row has exactly ``max_length`` entries.

    Args:
        examples: Batch mapping with a ``"messages"`` key holding one list of
            ``{"role": ..., "content": ...}`` dicts per conversation.
        max_length: Fixed length of every returned sequence.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"``, each a list of
        ``max_length``-long integer lists (mask is 1 for real tokens, 0 for
        padding).
    """
    pad_id = tokenizer.pad_token_id
    input_ids_list = []
    attention_mask_list = []

    for messages in examples["messages"]:
        # The conversation already ends with the assistant's reply, so no
        # generation prompt is appended: add_generation_prompt=True would tack
        # an empty assistant header onto the end of every training sample.
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        # The rendered template already contains all special tokens.
        input_ids = list(tokenizer(text, add_special_tokens=False)["input_ids"])[:max_length]
        attention_mask = [1] * len(input_ids)

        padding_length = max_length - len(input_ids)
        if padding_length > 0:
            input_ids += [pad_id] * padding_length
            attention_mask += [0] * padding_length

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
    }

In [None]:
# Tokenise the conversations in batches and drop the original columns so that
# only the fields returned by tokenize_data remain.
tokenized_dataset = dataset.map(
    tokenize_data,
    remove_columns=dataset.column_names,
    batched=True,
)

In [None]:
# Training configuration, grouped by concern. Values are unchanged.
training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir="./vikhr_pirates",
    overwrite_output_dir=True,
    save_strategy="epoch",
    logging_steps=10,
    # --- batch geometry: 2 sequences x 8 accumulation steps per update ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=7,
    # --- optimiser and schedule ---
    optim="adamw_torch",
    learning_rate=1e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # --- hardware ---
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
    dataloader_pin_memory=False,
)

In [None]:
# Collator configured for causal language modelling (mlm=False).
lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire the LoRA-wrapped model, the arguments and the tokenised data together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=lm_collator,
)

In [None]:
# Run fine-tuning.
trainer.train()

# Save the trained model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("vikhr_pirates_finetuned")

In [None]:
def generate_response(
    model,
    tokenizer,
    prompt,
    max_length=150,
    system_prompt="Ты — эксперт по 'Пиратам Карибского моря'. Отвечай чётко и по делу.",
):
    """Generate the model's reply to a single user question.

    The question is wrapped in the tokenizer's chat template (optionally with
    a system message), so inference uses the same format as the training
    data, and only the newly generated tokens are decoded and returned.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; must provide a chat template.
        prompt: The user's question as plain text.
        max_length: Maximum number of tokens to generate for the reply
            (the prompt tokens are not counted against this budget).
        system_prompt: System message placed before the user turn. Pass
            ``None`` or ``""`` to omit it.

    Returns:
        The decoded reply text, without the prompt and without special tokens.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    # add_generation_prompt=True opens the assistant turn for the model to fill.
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The rendered template already contains the special tokens.
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=max_length, num_return_sequences=1
        )

    # generate() returns prompt + continuation; keep only the continuation.
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

In [None]:
# Load the two models that are compared below: the untouched base checkpoint
# and the same checkpoint with the trained LoRA adapter applied.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- baseline ---
base_model_name = "Vikhrmodels/QVikhr-2.5-1.5B-Instruct-r"
base_model = AutoModelForCausalLM.from_pretrained(base_model_name).to(device)
base_model.eval()  # inference only
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# --- fine-tuned ---
peft_model_path = "vikhr_pirates_finetuned"
# Read the saved adapter config to find which base checkpoint it belongs to.
config = PeftConfig.from_pretrained(peft_model_path)
ft_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path).to(device)
ft_model = PeftModel.from_pretrained(ft_model, peft_model_path).to(device)
ft_model.eval()  # inference only
ft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

In [None]:
# Questions put to both models.
test_prompts = [
    "Расскажи про вилла тернера",
    "Что за пират осминог"
]

# (heading, model, tokenizer) for each contender, in the order they are shown.
contenders = (
    ("\n>>> Ответ базовой модели:", base_model, base_tokenizer),
    ("\n>>> Ответ дообученной модели:", ft_model, ft_tokenizer),
)

print("\n=== Сравнение ответов на вопросы из кино ===\n")
for idx, prompt in enumerate(test_prompts, start=1):
    rule = "=" * 60
    print(f"\n{rule}\nВопрос {idx}: {prompt}\n{rule}")

    for heading, candidate_model, candidate_tokenizer in contenders:
        print(heading)
        answer = generate_response(candidate_model, candidate_tokenizer, prompt, max_length=150)
        print(answer)

    print("\n" + "-" * 80 + "\n")
