In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForLanguageModeling, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import wandb

In [None]:
# Hugging Face Hub id of the base checkpoint; the tokenizer is loaded from the
# same repository so its vocabulary matches the model loaded further down.
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Reuse the EOS token for padding when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Base model weights are loaded as float16 with automatic device placement.
# NOTE(review): TrainingArguments below also sets fp16=True; confirm that the
# installed PEFT version keeps the LoRA adapter weights in float32, otherwise
# mixed-precision training may fail while unscaling fp16 gradients.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

# LoRA adapter settings: rank 8, alpha 32, dropout 0.1, attached to the
# modules named q_proj and v_proj.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(base_model, lora_config)
print("LoRA trainable parameters:")
model.print_trainable_parameters()

In [None]:
# Local JSON files backing the "train" and "test" splits.
data_files = {"train": "merged_final_combined.json", "test": "test_ivanchyk.json"}

# Load both splits through the generic JSON builder.
dataset = load_dataset(path="json", data_files=data_files)

In [None]:
def format_example(example):
    """Build the single training string for one translation pair.

    Args:
        example: Mapping with a "source" entry (the sentence to translate)
            and a "target" entry (the reference translation, a string).

    Returns:
        A dict with one key, "text", holding the instruction, the source
        sentence, the "Answer:" marker and the target joined into one string.
    """
    source_sentence = example["source"]
    reference = example["target"]
    segments = [
        "Translate the following Ukrainian sentence into the Hutsul dialect:\n\n",
        f"{source_sentence}\n\n",
        "Answer:",
        " ",
        reference,
    ]
    return {"text": "".join(segments)}


In [None]:
# Replace every original column with the single "text" field produced by
# format_example; the columns to drop are taken from the train split.
raw_columns = dataset["train"].column_names
formatted_dataset = dataset.map(format_example, remove_columns=raw_columns)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of formatted examples.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example.
    """
    # NOTE(review): no EOS token is appended to the text explicitly; verify
    # whether the tokenizer adds one, or the model may not learn where to stop.
    return tokenizer(examples["text"], max_length=512, truncation=True)


# Encode all splits in batches, drop the raw text column, and expose the
# result as torch tensors.
tokenized_dataset = (
    formatted_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)
tokenized_dataset.set_format("torch")

In [None]:
# Collator for language modeling with masked-LM masking turned off (mlm=False).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

training_args = TrainingArguments(
    # Where checkpoints go; an existing directory is overwritten.
    output_dir="./llama-translation-lora-finetuned-s",
    overwrite_output_dir=True,
    # Schedule: 3 epochs, one sequence per device per step, gradients
    # accumulated over 64 steps (1 x 64 sequences per optimizer update).
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=64,
    # Optimizer settings and fp16 mixed precision.
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    # Logging every 10 steps, checkpoint every 100 steps, keep at most 3.
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    report_to=["wandb"],
)

In [None]:
class AveragingTrainer(Trainer):
    """Trainer that optimizes the per-token average cross-entropy.

    The loss returned by a Hugging Face causal-LM head is already averaged
    over the target tokens. An earlier version of this class divided that
    value again by the padded sequence length, which scaled every
    micro-batch's gradient by 1 / seq_len (so short sequences outweighed long
    ones under gradient accumulation) and distorted the logged train and eval
    losses. The loss is now used as-is.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the token-averaged loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch produced by the data collator.
            return_outputs: When True, also return the model outputs.
            **kwargs: Extra arguments supplied by the Trainer (for example
                ``num_items_in_batch`` in newer transformers releases); they
                are forwarded untouched so the base implementation can
                normalize correctly across gradient-accumulation steps.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple when ``return_outputs``
            is True.
        """
        # Delegate to the stock implementation: no extra division by sequence
        # length, and Trainer-provided kwargs are no longer silently dropped.
        return super().compute_loss(
            model, inputs, return_outputs=return_outputs, **kwargs
        )

# Assemble the trainer; the "test" split is passed for evaluation when present.
trainer = AveragingTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset.get("test"),
)

# Run fine-tuning.
trainer.train()

# Save the adapter-wrapped model first, then the tokenizer, to the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./llama-translation-lora-finetuned-s")