In [1]:
# Install Requirements
%pip install transformers datasets accelerate torch evaluate bert_score rouge_score bitsandbytes


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metad

In [None]:
from huggingface_hub import login
import os
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers.trainer import Trainer
from torch.cuda.amp import autocast
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Login to Hugging Face
hf_token = ""
login(token=hf_token)

# Set paths
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model_alias = model_name.split('/')[-1].strip()
trainer_output_dir = f"/kaggle/working/{model_alias}_output"
trainer_log_dir = f"/kaggle/working/{model_alias}_logs"
savepath = f"/kaggle/working/custom-{model_alias}"
datapath = "/kaggle/input/springerjournal-450tk-0-7cosine/"

print("Save path:\t", savepath)
print("Log path:\t", trainer_log_dir)
print("Output path:\t", trainer_output_dir)

# Pre-Configure
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("CUDA is available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("CUDA device name:", torch.cuda.get_device_name(0))
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
import gc
gc.collect()

# Define 4-bit quantization config for training
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# Load Pre-Trained Model with 4-bit Quantization
max_seq_length = 512
device_map = {"": "cuda:0"}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map=device_map,
    token=hf_token
)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Define LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Modules to apply LoRA
    lora_dropout=0.05,  # Dropout for LoRA layers
    bias="none",  # Bias configuration
    task_type="CAUSAL_LM"  # Task type for causal language modeling
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Verify trainable parameters

# Fix padding token issue
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    print("Set pad_token to eos_token:", tokenizer.pad_token)

# Define custom chat template
chat_template = "{% for message in messages %}"
chat_template += "{% if message['role'] == 'user' %}"
chat_template += "<|start|>user: {{ message['content'] }} <|end|>"
chat_template += "{% elif message['role'] == 'assistant' %}"
chat_template += "<|start|>assistant: {{ message['content'] }} <|end|>"
chat_template += "{% endif %}"
chat_template += "{% endfor %}"

def preprocess_function(examples):
    texts = []
    for abstract, title in zip(examples["abstract"], examples["title"]):
        if pd.notna(abstract) and pd.notna(title):
            abstract = str(abstract).strip()
            title = str(title).strip()
            if abstract and title:
                messages = [
                    {"role": "user", "content": f"Generate a concise and informative title based on this abstract:\n{abstract}"},
                    {"role": "assistant", "content": title}
                ]
                text = tokenizer.apply_chat_template(messages, chat_template=chat_template, tokenize=False)
                texts.append(text)
    return {"text": texts}

# Load Dataset
train_df = pd.read_csv(datapath + "train.csv").reset_index(drop=True)
val_df = pd.read_csv(datapath + "val.csv").reset_index(drop=True)
test_df = pd.read_csv(datapath + "test.csv").reset_index(drop=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["abstract", "title"])
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=["abstract", "title"])
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=["abstract", "title"])

# Filter out None values
tokenized_train = tokenized_train.filter(lambda x: x["text"] is not None)
tokenized_val = tokenized_val.filter(lambda x: x["text"] is not None)
tokenized_test = tokenized_test.filter(lambda x: x["text"] is not None)

# Tokenize for Training
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_seq_length)

tokenized_train = tokenized_train.map(tokenize_function, batched=True)
tokenized_val = tokenized_val.map(tokenize_function, batched=True)
tokenized_test = tokenized_test.map(tokenize_function, batched=True)

# Custom Trainer to handle autocast
class CustomTrainer(Trainer):
    def training_step(self, model, inputs, num_items_in_batch):
        model.train()
        inputs = self._prepare_inputs(inputs)
        with torch.amp.autocast('cuda'):
            loss = self.compute_loss(model, inputs)
        self.accelerator.backward(loss)
        return loss.detach() / self.args.gradient_accumulation_steps

# Configure Training Parameters
training_args = TrainingArguments(
    output_dir=trainer_output_dir,
    save_total_limit=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir=trainer_log_dir,
    logging_steps=200,
    fp16=True,
    report_to="none",
    optim="adamw_8bit"
)

trainer = CustomTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=lambda data: {
        "input_ids": torch.stack([torch.tensor(f["input_ids"]) for f in data]),
        "attention_mask": torch.stack([torch.tensor(f["attention_mask"]) for f in data]),
        "labels": torch.stack([torch.tensor(f["input_ids"]) for f in data])
    }
)

# Start Training
print("\033[36mStarting training...\033[0m")
trainer.train()
print("\033[33mTraining complete!\033[0m")

# Save Trained Model
model.save_pretrained(savepath)
tokenizer.save_pretrained(savepath)

# Clear VRAM
try:
    import gc
    del trainer
    del model
    torch.cuda.empty_cache()
    gc.collect()
except Exception as e:
    print(e)

# Load Trained Model for Inference with 4-bit Quantization
model = AutoModelForCausalLM.from_pretrained(
    savepath,
    quantization_config=quant_config,
    device_map=device_map
)
tokenizer = AutoTokenizer.from_pretrained(savepath)

# Function to preprocess test dataset for inference
def collate_fn(batch):
    texts = []
    labels = []
    for item in batch:
        text = item["text"]
        user_part = text.rsplit("<|start|>assistant:", 1)[0].strip()
        assistant_part = text.rsplit("<|start|>assistant:", 1)[1].split("<|end|>")[0].strip()
        texts.append(user_part)
        labels.append(assistant_part)
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_seq_length, return_tensors="pt").to("cuda")
    return {"inputs": inputs, "labels": labels}

# Run Inference
eval_dataloader = DataLoader(tokenized_test, batch_size=8, collate_fn=collate_fn)

predictions = []
references = []

for batch in tqdm(eval_dataloader):
    with torch.no_grad():
        with torch.amp.autocast('cuda'):
            inputs = batch["inputs"]
            outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
            pred_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            pred_texts = [text.split("<|start|>assistant:")[-1].split("<|end|>")[0].strip() if "<|start|>assistant:" in text else text.strip() for text in pred_texts]
            predictions.extend(pred_texts)
            references.extend(batch["labels"])

# Evaluate
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=predictions, references=references)
bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")

from IPython.display import clear_output
clear_output()

# Print results
print("ROUGE:", rouge_scores)
print("BERTScore (averaged):")
print("  Precision:", sum(bert_scores["precision"]) / len(bert_scores["precision"]))
print("  Recall:", sum(bert_scores["recall"]) / len(bert_scores["recall"]))
print("  F1:", sum(bert_scores["f1"]) / len(bert_scores["f1"]))

ROUGE: {'rouge1': 0.12279784357413287, 'rouge2': 0.08207388126586282, 'rougeL': 0.10929451158102316, 'rougeLsum': 0.11105000485172684}
BERTScore (averaged):
  Precision: 0.8017162945238464
  Recall: 0.8838060471509069
  F1: 0.8406263651334651
