# Install Requirements

In [1]:
%%capture
%pip install unsloth transformers datasets accelerate torch evaluate bert_score rouge_score bitsandbytes
%pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Login to Hugging Face

In [None]:
from huggingface_hub import login

# Replace with your Hugging Face token
hf_token = ""
login(token=hf_token)

In [3]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model_alias = model_name.split('/')[-1].strip()
trainer_output_dir = f"/kaggle/working/{model_alias}_output"
trainer_log_dir = f"/kaggle/working/{model_alias}_logs"
savepath = f"/kaggle/working/custom-{model_alias}"

datapath = "/kaggle/input/springerjournal-450tk-0-7cosine/"

print("Save path:\t", savepath)
print("Log path:\t", trainer_log_dir)
print("Output path:\t", trainer_output_dir)

Save path:	 /kaggle/working/custom-Llama-3.2-1B-Instruct
Log path:	 /kaggle/working/Llama-3.2-1B-Instruct_logs
Output path:	 /kaggle/working/Llama-3.2-1B-Instruct_output


# Import and Pre-Configure Model

In [4]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [5]:
import torch

print("CUDA is available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("CUDA device name:", torch.cuda.get_device_name(0))
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

CUDA is available: True
CUDA device count: 1
CUDA device name: Tesla P100-PCIE-16GB


In [6]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer
from datasets import Dataset
import pandas as pd
import evaluate
from torch.utils.data import DataLoader
from tqdm import tqdm

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-05-20 11:39:55.606492: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747741195.828426      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747741195.894452      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


# Load Pre-Trained Model

In [7]:
max_seq_length = 512
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=torch.float16,
    load_in_4bit=True,
    token=hf_token
)

# Configure LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407
)

==((====))==  Unsloth 2025.5.6: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth 2025.5.6 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [8]:
# Define custom chat template
chat_template = "{% for message in messages %}"
chat_template += "{% if message['role'] == 'user' %}"
chat_template += "<|start|>user: {{ message['content'] }} <|end|>"
chat_template += "{% elif message['role'] == 'assistant' %}"
chat_template += "<|start|>assistant: {{ message['content'] }} <|end|>"
chat_template += "{% endif %}"
chat_template += "{% endfor %}"

def preprocess_function(examples):
    texts = []
    for abstract, title in zip(examples["abstract"], examples["title"]):
        if pd.notna(abstract) and pd.notna(title):
            abstract = str(abstract).strip()
            title = str(title).strip()
            if abstract and title:
                messages = [
                    {"role": "user", "content": f"Generate a concise and informative title based on this abstract:\n{abstract}"},
                    {"role": "assistant", "content": title}
                ]
                # Use custom chat template
                text = tokenizer.apply_chat_template(messages, chat_template=chat_template, tokenize=False)
                texts.append(text)
    return {"text": texts}

# Load Dataset

In [9]:
train_df = pd.read_csv(datapath + "train.csv").reset_index(drop=True)
val_df = pd.read_csv(datapath + "val.csv").reset_index(drop=True)
test_df = pd.read_csv(datapath + "test.csv").reset_index(drop=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["abstract", "title"])
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=["abstract", "title"])
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=["abstract", "title"])

# Filter out None values
tokenized_train = tokenized_train.filter(lambda x: x["text"] is not None)
tokenized_val = tokenized_val.filter(lambda x: x["text"] is not None)
tokenized_test = tokenized_test.filter(lambda x: x["text"] is not None)

Map:   0%|          | 0/1783 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1783 [00:00<?, ? examples/s]

Filter:   0%|          | 0/223 [00:00<?, ? examples/s]

Filter:   0%|          | 0/223 [00:00<?, ? examples/s]

# Configure Training Parameters

In [10]:
training_args = TrainingArguments(
    output_dir=trainer_output_dir,
    save_total_limit=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir=trainer_log_dir,
    logging_steps=200,
    fp16=True,
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    args=training_args,
    max_seq_length=max_seq_length,
    dataset_text_field="text",
    packing=False
)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/1783 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/223 [00:00<?, ? examples/s]

# Start Training

In [11]:
print("\033[36mStarting training...\033[0m")
trainer.train()
print("\033[33mTraining complete!\033[0m")

[36mStarting training...[0m


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,783 | Num Epochs = 3 | Total steps = 669
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,2.3209,2.267602
2,2.2066,2.249699
3,2.1742,2.245591


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


[33mTraining complete![0m


# Save Trained Model

In [12]:
model.save_pretrained(savepath)
tokenizer.save_pretrained(savepath)

('/kaggle/working/custom-Llama-3.2-1B-Instruct/tokenizer_config.json',
 '/kaggle/working/custom-Llama-3.2-1B-Instruct/special_tokens_map.json',
 '/kaggle/working/custom-Llama-3.2-1B-Instruct/tokenizer.json')

# Clear VRAM

In [13]:
try:
    import gc
    del trainer
    del model
    torch.cuda.empty_cache()
    gc.collect()
except Exception as e:
    print(e)

# Load Trained Model

In [14]:
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=savepath,
    max_seq_length=max_seq_length,
    dtype=torch.float16,
    load_in_4bit=True
)
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.5.6: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [15]:
# Function to preprocess test dataset for inference
def collate_fn(batch):
    texts = []
    labels = []
    for item in batch:
        text = item["text"]
        # Split on the assistant delimiter based on the chat template
        user_part = text.rsplit("<|start|>assistant:", 1)[0].strip()
        assistant_part = text.rsplit("<|start|>assistant:", 1)[1].split("<|end|>")[0].strip()
        texts.append(user_part)
        labels.append(assistant_part)
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_seq_length, return_tensors="pt").to("cuda")
    return {"inputs": inputs, "labels": labels}

# Run Inference

In [16]:
eval_dataloader = DataLoader(tokenized_test, batch_size=8, collate_fn=collate_fn)

predictions = []
references = []

# Run inference
for batch in tqdm(eval_dataloader):
    with torch.no_grad():
        inputs = batch["inputs"]
        outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
        pred_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # Extract only the assistant part from predictions
        pred_texts = [text.split("<|start|>assistant:")[-1].split("<|end|>")[0].strip() if "<|start|>assistant:" in text else text.strip() for text in pred_texts]
        predictions.extend(pred_texts)
        references.extend(batch["labels"])

100%|██████████| 28/28 [04:15<00:00,  9.13s/it]


# Evaluate

In [17]:
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=predictions, references=references)
bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")

from IPython.display import clear_output
clear_output()

# Print results
print("ROUGE:", rouge_scores)
print("BERTScore (averaged):")
print("  Precision:", sum(bert_scores["precision"]) / len(bert_scores["precision"]))
print("  Recall:", sum(bert_scores["recall"]) / len(bert_scores["recall"]))
print("  F1:", sum(bert_scores["f1"]) / len(bert_scores["f1"]))

ROUGE: {'rouge1': 0.11718465749782493, 'rouge2': 0.07512881197739413, 'rougeL': 0.10089812601250422, 'rougeLsum': 0.10207272408029315}
BERTScore (averaged):
  Precision: 0.751530661176673
  Recall: 0.8757518567311924
  F1: 0.8080637078114155
