# INSTALL REQUIREMENTS

In [1]:
%%capture
%pip install transformers datasets accelerate torch evaluate bert_score rouge_score bitsandbytes

In [2]:
model_name = "openai-community/gpt2"
model_alias = model_name.split('/')[-1].strip()
trainer_output_dir = f"/kaggle/working/{model_alias}_output"
trainer_log_dir = f"/kaggle/working/{model_alias}_logs"
savepath = f"/kaggle/working/custom-{model_alias}"

datapath = "/kaggle/input/springer-journal-final/"

print("Save path:\t",savepath)
print("Log path:\t", trainer_log_dir)
print("Output path:\t",trainer_output_dir)

Save path:	 /kaggle/working/custom-gpt2
Log path:	 /kaggle/working/gpt2_logs
Output path:	 /kaggle/working/gpt2_output


## IMPORT AND PRE-CONFIGURE MODEL

In [3]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [4]:
import torch

print("CUDA is available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("CUDA device name:", torch.cuda.get_device_name(0))
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

CUDA is available: True
CUDA device count: 1
CUDA device name: Tesla P100-PCIE-16GB


In [5]:
import evaluate
import pandas as pd
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig
)

# Load pre-trained model

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    inputs = tokenizer(examples["abstract"], padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(examples["title"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = labels["input_ids"]
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]
    return inputs

## LOAD DATASET

In [8]:
train_df = pd.read_csv(datapath + "train.csv").reset_index(drop=True)
val_df = pd.read_csv(datapath + "val.csv").reset_index(drop=True)
test_df = pd.read_csv(datapath + "test.csv").reset_index(drop=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/45607 [00:00<?, ? examples/s]

Map:   0%|          | 0/5701 [00:00<?, ? examples/s]

Map:   0%|          | 0/5701 [00:00<?, ? examples/s]

## CONFIGURE TRAINING PARAMETERS

In [9]:
training_args = TrainingArguments(
    fp16=False,
    bf16=True,
    output_dir=trainer_output_dir,
    save_total_limit=2,
    eval_strategy="epoch", # "no"/"epoch" to disable/enable validation
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir=trainer_log_dir,
    logging_steps=200,
    report_to="none"
)

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT-2 không dùng masked language modeling (MLM)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,  # Using validation dataset for evaluation
    data_collator=data_collator,
)

## START TRAINING

In [10]:
print("\033[36mStarting training...\033[0m")
trainer.train()
print("\033[33mTraining complete!\033[0m")

[36mStarting training...[0m


Epoch,Training Loss,Validation Loss
1,3.1581,3.08423
2,3.0762,3.035247
3,3.0827,3.021196


[33mTraining complete![0m


# Save trained model

In [11]:
model.save_pretrained(savepath)
tokenizer.save_pretrained(savepath)

('/kaggle/working/custom-gpt2/tokenizer_config.json',
 '/kaggle/working/custom-gpt2/special_tokens_map.json',
 '/kaggle/working/custom-gpt2/vocab.json',
 '/kaggle/working/custom-gpt2/merges.txt',
 '/kaggle/working/custom-gpt2/added_tokens.json',
 '/kaggle/working/custom-gpt2/tokenizer.json')

# Clear VRAM

In [12]:
try:
    import gc

    del trainer
    del model  # If you explicitly defined it outside the trainer
    torch.cuda.empty_cache()  # Clears PyTorch's CUDA memory cache
    gc.collect()  # Forces Python garbage collection
except Exception as e:
    print(e)

# Load trained model


In [13]:
quantization_conf = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(savepath,quantization_config=quantization_conf, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(savepath)

In [14]:
# Function to preprocess dataset correctly
def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch]).to("cuda")
    labels = torch.tensor([item["labels"] for item in batch]).to("cuda")
    return {"input_ids": input_ids, "labels": labels}

eval_dataloader = DataLoader(tokenized_test, batch_size=8, collate_fn=collate_fn)

# Run inference

In [15]:
predictions = []
references = []

# Run inference
for batch in tqdm(eval_dataloader):
    with torch.no_grad():
        outputs = model.generate(batch["input_ids"])
        
        pred_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ref_texts = tokenizer.batch_decode(batch["labels"].tolist(), skip_special_tokens=True)

        predictions.extend(pred_texts)
        references.extend(ref_texts)

  0%|          | 0/713 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|          | 1/713 [00:00<07:29,  1.58it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-

# Evaluating

In [16]:
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

rouge_scores = rouge.compute(predictions=predictions, references=references)
bert_scores = bertscore.compute(predictions=predictions, references=references, lang="en")

from IPython.display import clear_output
clear_output()

# Print results
print("ROUGE:", rouge_scores)
print("BERTScore (averaged):")
print("  Precision:", sum(bert_scores["precision"]) / len(bert_scores["precision"]))
print("  Recall:", sum(bert_scores["recall"]) / len(bert_scores["recall"]))
print("  F1:", sum(bert_scores["f1"]) / len(bert_scores["f1"]))

ROUGE: {'rouge1': 0.09502812237092614, 'rouge2': 0.049090479364315334, 'rougeL': 0.0756607779070333, 'rougeLsum': 0.07564772786231289}
BERTScore (averaged):
  Precision: 0.7976889918507494
  Recall: 0.881455287172635
  F1: 0.8373807188168553
