In [None]:
import os

import numpy as np
import torch
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, DataCollatorForLanguageModeling
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer

# Add the necessary global before loading the checkpoint
torch.serialization.add_safe_globals([np.core.multiarray._reconstruct])

2025-04-22 16:16:02.982306: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745338563.202853      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745338563.262077      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Hub identifier of the pretrained checkpoint; reused when loading the model.
model_name = "flax-community/papuGaPT2"

# Fetch the matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the base model with float16 weights and no quantization.
# weights_only=False is forwarded to the loader as-is.
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, weights_only=False
)

# The weights are loaded first and then moved onto the GPU.
model = model.to("cuda")

In [None]:
# Add LoRA adapters
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],  # GPT2-specific
    # GPT-2 implements these projections as Conv1D layers, which store their
    # weight as (fan_in, fan_out). PEFT documents fan_in_fan_out=True for this
    # case; stating it explicitly avoids relying on PEFT's runtime
    # auto-correction (and the warning it emits).
    fan_in_fan_out=True,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model so that only the injected adapter weights are trained.
model = get_peft_model(model, lora_config)

# Make sure the wrapped model (including the new adapter weights) is on the GPU.
model = model.to("cuda")
print(f"Model moved to device: {next(model.parameters()).device}")

In [None]:
# Read the dataset that was previously saved to disk (Kaggle input mount).
dataset = load_from_disk(
    "/kaggle/input/polish-jokes-tokenised-polish-jokes/tokenized_polish_jokes"
)

In [None]:
from transformers import TrainerCallback
import random

class JokeLoggerCallback(TrainerCallback):
    """Print a sampled generation at a fixed step interval during training.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt_list: Sequence of prompt strings; one is drawn at random for
            every sample.
        log_every: A sample is printed whenever ``state.global_step`` is a
            non-zero multiple of this value.
        max_new_tokens: Maximum number of new tokens generated per sample.
    """

    def __init__(self, tokenizer, prompt_list, log_every=500, max_new_tokens=40):
        self.tokenizer = tokenizer
        self.prompt_list = prompt_list
        self.log_every = log_every
        self.max_new_tokens = max_new_tokens

    def on_step_end(self, args, state, control, **kwargs):
        """Generate and print one sample when the current step is a logging step.

        Reads the model from ``kwargs["model"]``. The model is put into eval
        mode for the duration of the generation so that dropout does not
        perturb the sample, and its previous mode is restored afterwards even
        if generation raises.
        """
        if state.global_step == 0 or state.global_step % self.log_every != 0:
            return

        model = kwargs["model"]
        prompt = random.choice(self.prompt_list)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(model.device)

        # Remember the current mode so it can be restored after sampling.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=self.max_new_tokens,
                    do_sample=True,
                    top_k=50,
                    top_p=0.95,
                    temperature=1.0,
                    # Passed explicitly so generate() does not have to fall
                    # back to a default (and warn) on every call.
                    pad_token_id=self.tokenizer.pad_token_id,
                )
        finally:
            if was_training:
                model.train()

        print(f"\n--- Sample joke @ step {state.global_step} ---")
        print(self.tokenizer.decode(outputs[0], skip_special_tokens=True))
        print("-" * 50)

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Optimisation: one example per device per forward pass, with gradients
    # accumulated over 16 passes before each optimizer step.
    num_train_epochs=18,
    learning_rate=7e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    fp16=True,
    # Logging: every 10 steps, no external experiment tracker.
    logging_steps=10,
    report_to="none",
    # Checkpointing: save every 500 steps and keep at most 5 checkpoints.
    output_dir="./qlora_output",
    save_strategy="steps",
    save_steps=500,
    save_total_limit=5,
    # NOTE: per the Transformers docs, Trainer does not act on this field by
    # itself; it only has an effect when the value is forwarded to
    # trainer.train(resume_from_checkpoint=...).
    resume_from_checkpoint=True,
)

In [None]:
# Collator for causal language modelling (mlm=False disables masked-LM mode).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Polish joke openers; the logging callback picks one at random for each sample.
sample_prompts = [
    "Przychodzi baba do lekarza i mówi",
    "Dlaczego blondynka weszła do sklepu",
    "Jasiu pyta nauczycielkę",
    "Facet wchodzi do baru i widzi",
]

# Print a generated sample every 150 steps.
callbacks = [
    JokeLoggerCallback(
        tokenizer=tokenizer,
        prompt_list=sample_prompts,
        log_every=150,
    )
]

In [None]:
# Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    callbacks=callbacks
)

# Sanity check
print("Trainer device:", training_args.device)
print("CUDA available:", torch.cuda.is_available())
print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(0))

# Decide whether to resume. TrainingArguments.resume_from_checkpoint is only a
# carrier for the request: Trainer does not read it, so the checkpoint has to
# be handed to train() explicitly. Passing a bare True would raise when the
# output directory holds no checkpoint yet (e.g. the very first run), so the
# latest checkpoint is looked up here and training starts fresh if none exists.
resume_request = training_args.resume_from_checkpoint
if isinstance(resume_request, str):
    # An explicit checkpoint path was requested; use it as given.
    resume_checkpoint = resume_request
elif resume_request and os.path.isdir(training_args.output_dir):
    # Returns None when the directory contains no checkpoint sub-folder.
    resume_checkpoint = get_last_checkpoint(training_args.output_dir)
else:
    resume_checkpoint = None
print("Resuming from checkpoint:", resume_checkpoint)

# Train
trainer.train(resume_from_checkpoint=resume_checkpoint)

# Save
trainer.save_model("polish-joke-gpt-lora")