In [1]:
# Install Requirements
%pip install unsloth transformers datasets accelerate torch evaluate bert_score rouge_score bitsandbytes
%pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting unsloth
  Downloading unsloth-2025.5.7-py3-none-any.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting unsloth_zoo>=2025.5.8 (from unsloth)
  Downloading unsloth_zoo-2025.5.8-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.21-py3-none-any.whl.metadata (10 kB)
Collecting trl!=0.15.0,!=0.9

In [None]:
from huggingface_hub import login
import os
import torch
from unsloth import FastLanguageModel
from datasets import Dataset
import pandas as pd
import evaluate
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import AdamW

# Enable CUDA debugging
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

# Login to Hugging Face
hf_token = ""
login(token=hf_token)

# Set paths
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model_alias = model_name.split('/')[-1].strip()
savepath = f"/kaggle/working/custom-{model_alias}"
datapath = "/kaggle/input/springerjournal-450tk-0-7cosine/"

print("Save path:\t", savepath)
print("Data path:\t", datapath)

# Pre-Configure
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

print("CUDA is available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("CUDA device name:", torch.cuda.get_device_name(0))
print("GPU memory available:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

# Load Pre-Trained Model
max_seq_length = 512
device_map = {"": "cuda:0"}
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=torch.float16,  # Required by unsloth
    load_in_4bit=False,  # Full fine-tuning
    device_map=device_map,
    token=hf_token,
    use_gradient_checkpointing=True  # Memory efficiency
)

# Fix padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    print("Set pad_token to eos_token:", tokenizer.pad_token)

# Define chat template
chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}<|start|>user: {{ message['content'] }} <|end|>{% elif message['role'] == 'assistant' %}<|start|>assistant: {{ message['content'] }} <|end|>{% endif %}{% endfor %}"
def preprocess_function(examples):
    texts = []
    for abstract, title in zip(examples["abstract"], examples["title"]):
        if pd.notna(abstract) and pd.notna(title):
            abstract = str(abstract).strip()
            title = str(title).strip()
            if abstract and title:
                messages = [
                    {"role": "user", "content": f"Generate a concise and informative title based on this abstract:\n{abstract}"},
                    {"role": "assistant", "content": title}
                ]
                text = tokenizer.apply_chat_template(messages, chat_template=chat_template, tokenize=False)
                texts.append(text)
            else:
                print(f"Skipping empty abstract or title: abstract='{abstract}', title='{title}'")
        else:
            print("Skipping NaN abstract or title")
    return {"text": texts}

# Load and clean dataset
train_df = pd.read_csv(datapath + "train.csv").dropna().reset_index(drop=True)
val_df = pd.read_csv(datapath + "val.csv").dropna().reset_index(drop=True)
test_df = pd.read_csv(datapath + "test.csv").dropna().reset_index(drop=True)
print("Train dataset size:", len(train_df))
print("Validation dataset size:", len(val_df))
print("Test dataset size:", len(test_df))

# Verify dataset
print("Sample train data:", train_df.head())
print("NaN in train:", train_df.isnull().sum())
print("NaN in val:", val_df.isnull().sum())
print("NaN in test:", test_df.isnull().sum())

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["abstract", "title"])
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=["abstract", "title"])
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=["abstract", "title"])

# Filter empty values
tokenized_train = tokenized_train.filter(lambda x: x["text"] is not None and len(x["text"].strip()) > 0)
tokenized_val = tokenized_val.filter(lambda x: x["text"] is not None and len(x["text"].strip()) > 0)
tokenized_test = tokenized_test.filter(lambda x: x["text"] is not None and len(x["text"].strip()) > 0)
print("Tokenized train dataset size:", len(tokenized_train))
print("Tokenized validation dataset size:", len(tokenized_val))
print("Tokenized test dataset size:", len(tokenized_test))

# Print sample tokenized data
print("Sample tokenized train:", tokenized_train[0]["text"])

# Tokenize and retain text field for test dataset
def tokenize_function(examples):
    encodings = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
        return_tensors="pt"  # Return PyTorch tensors
    )
    encodings["text"] = examples["text"]  # Retain text field
    return encodings

# Tokenize train and val without retaining text
tokenized_train = tokenized_train.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_val = tokenized_val.map(tokenize_function, batched=True, remove_columns=["text"])
# Tokenize test with text field retained
tokenized_test = tokenized_test.map(tokenize_function, batched=True)

# Custom collate function for training
def train_collate_fn(batch):
    input_ids = torch.stack([torch.tensor(item["input_ids"]).squeeze() for item in batch])
    attention_mask = torch.stack([torch.tensor(item["attention_mask"]).squeeze() for item in batch])
    labels = input_ids.clone()  # For language modeling, labels are the same as input_ids
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Custom Training Loop
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
model.train()
num_epochs = 3
gradient_accumulation_steps = 2
step = 0

print("\033[36mStarting custom training loop...\033[0m")
for epoch in range(num_epochs):
    train_loader = DataLoader(tokenized_train, batch_size=1, shuffle=True, collate_fn=train_collate_fn)
    for i, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
        try:
            inputs = {k: v.to("cuda") for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            if loss is not None and torch.isfinite(loss):
                loss = loss / gradient_accumulation_steps
                loss.backward()
                if (i + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    step += 1
                    print(f"Step {step}, Loss: {loss.item() * gradient_accumulation_steps}")
            else:
                print(f"Skipping batch due to invalid loss: {loss}")
        except Exception as e:
            print(f"Error in batch: {e}")
            continue

# Save model
model.save_pretrained(savepath)
tokenizer.save_pretrained(savepath)

# Clear VRAM
import gc
del model, optimizer
torch.cuda.empty_cache()
gc.collect()

# Load for inference
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=savepath,
    max_seq_length=max_seq_length,
    dtype=torch.float16,
    load_in_4bit=True,  # Quantize for inference
    device_map=device_map
)
FastLanguageModel.for_inference(model)

# Inference
def inference_collate_fn(batch):
    texts = []
    labels = []
    for item in batch:
        text = item["text"]  # Text field is retained
        user_part = text.rsplit("<|start|>assistant:", 1)[0].strip()
        assistant_part = text.rsplit("<|start|>assistant:", 1)[1].split("<|end|>")[0].strip()
        texts.append(user_part)
        labels.append(assistant_part)
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_seq_length, return_tensors="pt").to("cuda")
    return {"inputs": inputs, "labels": labels}

eval_dataloader = DataLoader(tokenized_test, batch_size=4, collate_fn=inference_collate_fn)  # Reduced batch size
predictions = []
references = []

for batch in tqdm(eval_dataloader, desc="Inference"):
    try:
        with torch.no_grad():
            inputs = batch["inputs"]
            # Validate inputs
            print(f"Input IDs shape: {inputs['input_ids'].shape}, Attention Mask shape: {inputs['attention_mask'].shape}")
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                use_cache=True,
                do_sample=False,  # Use greedy decoding to avoid multinomial
                num_beams=1,
                temperature=1.0,
                top_p=1.0
            )
            pred_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            pred_texts = [text.split("<|start|>assistant:")[-1].split("<|end|>")[0].strip() if "<|start|>assistant:" in text else text.strip() for text in pred_texts]
            predictions.extend(pred_texts)
            references.extend(batch["labels"])
    except Exception as e:
        print(f"Error in inference batch: {e}")
        continue

# Evaluate
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
rouge_scores = rouge.compute(predictions=predictions, references=references)
bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")

from IPython.display import clear_output
clear_output()

print("ROUGE:", rouge_scores)
print("BERTScore (averaged):")
print("  Precision:", sum(bert_scores["precision"]) / len(bert_scores["precision"]))
print("  Recall:", sum(bert_scores["recall"]) / len(bert_scores["recall"]))
print("  F1:", sum(bert_scores["f1"]) / len(bert_scores["f1"]))

ROUGE: {'rouge1': 0.10894446493150708, 'rouge2': 0.07079686901715211, 'rougeL': 0.09573865159647102, 'rougeLsum': 0.09711396706329989}
BERTScore (averaged):
  Precision: 0.7858892607047419
  Recall: 0.8820227601068437
  F1: 0.8311356249946116
