In [1]:
!pip uninstall simpletransformers -y
!pip install -U transformers datasets peft bitsandbytes accelerate google-api-python-client google-auth-httplib2 google-auth-oauthlib


from google.colab import drive
from google.colab import auth
from datasets import Dataset, Features, Value
from googleapiclient.discovery import build
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, BartForConditionalGeneration, BartTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer
from transformers import (
    MT5ForConditionalGeneration, MT5Tokenizer,
    LEDForConditionalGeneration, LEDTokenizer,
    PegasusForConditionalGeneration, PegasusTokenizer
)
import json
import torch
import os
import google.auth


Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Authenticate the Colab user with Google, then mount Google Drive at
# /content/drive so files under it can be addressed by ordinary paths.
auth.authenticate_user()
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# Build a Google Drive v3 API client from the default credentials, requesting
# the full Drive scope. The notebook uses the Drive API to free up space.
creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/drive"])
drive_service = build('drive', 'v3', credentials=creds)

In [5]:
#training arguments
def get_training_args(output_dir):
    """Return the Seq2SeqTrainingArguments shared by every fine-tuning run.

    Args:
        output_dir: Directory passed as ``output_dir``; ``logging_dir`` is set
            to its ``logs`` subdirectory.

    Returns:
        A ``Seq2SeqTrainingArguments`` instance that evaluates and saves once
        per epoch and reloads the checkpoint with the best ``eval_loss``.
    """
    settings = {
        # Where things are written.
        "output_dir": output_dir,
        "logging_dir": os.path.join(output_dir, "logs"),
        "logging_steps": 100,
        # Batching: 8 per device with 2 accumulation steps.
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 8,
        "gradient_accumulation_steps": 2,
        # Optimisation schedule.
        "num_train_epochs": 4,
        "learning_rate": 5e-6,
        "lr_scheduler_type": "linear",
        "label_smoothing_factor": 0.1,
        # Mixed precision: fp16 only when a CUDA device is available.
        "fp16": torch.cuda.is_available(),
        "bf16": False,
        # Evaluation / checkpointing, both once per epoch.
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "save_total_limit": 2,
        "load_best_model_at_end": True,
        "metric_for_best_model": "eval_loss",
        "predict_with_generate": True,
        # Keep every column of the tokenized dataset.
        "remove_unused_columns": False,
    }
    return Seq2SeqTrainingArguments(**settings)

In [6]:
# Load the dataset from the JSON file on Google Drive.
dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/role_aware_squad.json",
    field=None
)

# Work on at most the first 1000 examples. The size is clamped to the dataset
# length so that a smaller file does not make select() raise an IndexError.
full_train = dataset["train"]
subset_size = min(1000, len(full_train))
subset = full_train.select(range(subset_size))

# Split into train (80%), val (10%), test (10%): hold out 20%, then halve it.
train_val = subset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_val["train"]
val_test = train_val["test"].train_test_split(test_size=0.5, seed=42)

eval_dataset = val_test["train"]  # validation during training
test_dataset = val_test["test"]   # final testing

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# format preprocessed dataset
def gen(ds):
    """Yield one flat record per (example, role) pair.

    Args:
        ds: Iterable of examples, each with an ``"original_context"`` entry
            and a ``"role_contexts"`` mapping keyed by role name.

    Yields:
        Dicts with keys ``"role"``, ``"context"`` (the original context) and
        ``"masked_context"`` (the role-specific context), in the order
        EMPLOYER, EMPLOYEE, CUSTOMER for each example.
    """
    role_names = ("EMPLOYER", "EMPLOYEE", "CUSTOMER")
    for example in ds:
        for role_name in role_names:
            record = {}
            record["role"] = role_name
            record["context"] = example["original_context"]
            record["masked_context"] = example["role_contexts"][role_name]
            yield record

# Every column of the flattened (role, context, masked_context) records is a string.
features = Features({
    column: Value("string")
    for column in ("role", "context", "masked_context")
})

# Expand each split into one row per (example, role) pair via gen().
format_train_dataset = Dataset.from_generator(
    lambda: gen(train_dataset),
    features=features,
)
format_eval_dataset = Dataset.from_generator(
    lambda: gen(eval_dataset),
    features=features,
)


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
#preprocess function to create prompts and labels
def make_preprocess_function(tokenizer, model_name):
    """Create a batched preprocessing function bound to one tokenizer/model.

    Args:
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and accepting
            ``text_target=`` for tokenizing labels.
        model_name: Pretrained model name. Names containing ``"led"`` get a
            1024-token input limit and a ``global_attention_mask``; all other
            models use a 512-token input limit.

    Returns:
        A function mapping a batch with ``"role"``, ``"context"`` and
        ``"masked_context"`` columns to the tokenizer output plus ``"labels"``
        (and ``"global_attention_mask"`` for LED).
    """
    is_led = "led" in model_name
    max_input_length = 1024 if is_led else 512
    max_target_length = 512

    def preprocess_function(examples):
        inputs = [
            f"Please mask all PERSON, ORG, LOCATION, and DATE entities from the following text, considering the role {role}:\n{context}"
            for role, context in zip(examples["role"], examples["context"])
        ]
        targets = examples["masked_context"]

        # Inputs and labels are padded to fixed lengths here, so every field
        # produced by this function has a uniform length per example.
        model_inputs = tokenizer(
            inputs, max_length=max_input_length, truncation=True, padding="max_length"
        )

        # `text_target=` already selects target-side tokenization; the
        # deprecated `as_target_tokenizer()` context manager is not needed.
        labels = tokenizer(
            text_target=targets, max_length=max_target_length, truncation=True, padding="max_length"
        )

        # Replace padding ids with -100 in the labels.
        pad_id = tokenizer.pad_token_id
        model_inputs["labels"] = [
            [(token if token != pad_id else -100) for token in label_seq]
            for label_seq in labels["input_ids"]
        ]

        if is_led:
            # Global attention on the first token only.
            model_inputs["global_attention_mask"] = [
                [1] + [0] * (len(seq) - 1) for seq in model_inputs["input_ids"]
            ]

        return model_inputs

    return preprocess_function


In [9]:
#train model
def train_model(model_cls, tokenizer_cls, pretrained_name, output_dir):
    """Fine-tune one pretrained seq2seq model and save it to ``output_dir``.

    Loads the tokenizer and model, tokenizes the module-level
    ``format_train_dataset`` / ``format_eval_dataset``, trains with the
    arguments from ``get_training_args(output_dir)``, then writes the model
    and tokenizer to ``output_dir``.

    Args:
        model_cls: Model class providing ``from_pretrained``.
        tokenizer_cls: Tokenizer class providing ``from_pretrained``.
        pretrained_name: Name of the pretrained checkpoint to start from; also
            used to select model-specific preprocessing.
        output_dir: Directory for checkpoints and the final saved model.
    """
    import gc

    tokenizer = tokenizer_cls.from_pretrained(pretrained_name)
    model = model_cls.from_pretrained(pretrained_name).to(device)
    model.gradient_checkpointing_enable()

    preprocess_function = make_preprocess_function(tokenizer, pretrained_name)
    tokenized_train_dataset = format_train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=format_train_dataset.column_names,
    )
    tokenized_eval_dataset = format_eval_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=format_eval_dataset.column_names,
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

    # `processing_class` replaces the deprecated `tokenizer` argument of the
    # trainer constructor.
    trainer = Seq2SeqTrainer(
        model=model,
        processing_class=tokenizer,
        args=get_training_args(output_dir),
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
        data_collator=data_collator
    )

    # Collect unreachable objects first so that cached GPU memory still held
    # by garbage (e.g. from a previously trained model) can be released.
    gc.collect()
    torch.cuda.empty_cache()
    trainer.train()

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"✅ Saved model to {output_dir}\n")


In [None]:
#list of used models and configurations
# Each entry provides: a display name, the model/tokenizer classes, the
# pretrained checkpoint name, and the Drive directory to save the result to.
model_configs = [
    {
        "name": "BART-Base",
        "model_cls": BartForConditionalGeneration,
        "tokenizer_cls": BartTokenizer,
        "pretrained": "facebook/bart-base",
        "out_dir": "/content/drive/MyDrive/models/role-aware-rag/bart-base",
    },
    {
        "name": "DistilBART",
        "model_cls": BartForConditionalGeneration,
        "tokenizer_cls": BartTokenizer,
        "pretrained": "sshleifer/distilbart-cnn-12-6",
        "out_dir": "/content/drive/MyDrive/models/role-aware-rag/distilbart",
    },
    {
        "name": "T5-Base",
        "model_cls": T5ForConditionalGeneration,
        "tokenizer_cls": T5Tokenizer,
        "pretrained": "t5-base",
        "out_dir": "/content/drive/MyDrive/models/role-aware-rag/t5-base",
    },
    {
        "name": "LED-Base",
        "model_cls": LEDForConditionalGeneration,
        "tokenizer_cls": LEDTokenizer,
        "pretrained": "allenai/led-base-16384",
        "out_dir": "/content/drive/MyDrive/models/role-aware-rag/led-base",
    },
]


In [None]:
import os
import shutil
from googleapiclient.discovery import build
import google.auth

#remove checkpoints from google drive
def delete_local_checkpoints(output_dir="/content/drive/MyDrive/models/role-aware-rag"):
    """Delete every ``checkpoint-*`` subdirectory directly inside ``output_dir``.

    Only directories whose name starts with ``"checkpoint-"`` are removed;
    regular files and other directories are left alone. If ``output_dir``
    does not exist (or is not a directory) there is nothing to clean up and
    the function returns without raising.

    Args:
        output_dir: Directory to scan for checkpoint folders.
    """
    import os
    import shutil

    if not os.path.isdir(output_dir):
        return

    for name in os.listdir(output_dir):
        if not name.startswith("checkpoint-"):
            continue
        path = os.path.join(output_dir, name)
        if os.path.isdir(path):
            shutil.rmtree(path)

#empty google drive trash due to limited space
def empty_google_drive_trash():
    """Empty the Google Drive trash through the Drive v3 API.

    Obtains default Google credentials with the Drive scope, builds a Drive
    client, and executes the ``files().emptyTrash()`` request.
    """
    drive_scopes = ["https://www.googleapis.com/auth/drive"]
    credentials, _project = google.auth.default(scopes=drive_scopes)
    client = build('drive', 'v3', credentials=credentials)

    request = client.files().emptyTrash()
    request.execute()


def full_model_cleanup(out_dir):
    """Free Drive space after a training run.

    Removes the ``checkpoint-*`` directories under ``out_dir`` and then
    empties the Google Drive trash.

    Args:
        out_dir: Output directory of the model whose checkpoints are removed.
    """
    delete_local_checkpoints(out_dir)
    empty_google_drive_trash()

In [None]:
#train all models and clean space from google drive
for cfg in model_configs:
    # Fine-tune this model, then remove its checkpoints and empty the Drive
    # trash before moving on to the next one.
    train_model(
        model_cls=cfg["model_cls"],
        tokenizer_cls=cfg["tokenizer_cls"],
        pretrained_name=cfg["pretrained"],
        output_dir=cfg["out_dir"],
    )
    full_model_cleanup(out_dir=cfg["out_dir"])



tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,1.6263,1.474193
2,1.4522,1.455947
3,1.4438,1.449013
4,1.4383,1.448335


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
There were missing keys in the checkpoint model loaded: ['led.encoder.embed_tokens.weight', 'led.decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Saved model to /content/drive/MyDrive/models/role-aware-rag/led-base

🧹 Deleting checkpoint folders...
✅ Deleted: /content/drive/MyDrive/models/role-aware-rag/led-base/checkpoint-450
✅ Deleted: /content/drive/MyDrive/models/role-aware-rag/led-base/checkpoint-600
🗑️ Emptying Google Drive trash...
✅ Trash emptied.
