<a href="https://colab.research.google.com/github/vvikasreddy/JargonAI/blob/main/ngp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
from torch.utils.data import DataLoader, Dataset



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
# model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [9]:
# loading a dataset, for sanity
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")



In [11]:
# displaying a dataset

billsum = billsum.train_test_split(test_size=0.2)
# billsum["train"][0]

In [14]:
# Dataset class for preprocessing
class CustomDataset(Dataset):
    def __init__(self, tokenizer, input_texts, target_texts, max_input_length=512, max_target_length=128):
        self.tokenizer = tokenizer
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_encoding = self.tokenizer(
            self.input_texts[idx],
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target_encoding = self.tokenizer(
            self.target_texts[idx],
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": input_encoding["input_ids"].squeeze(),
            "attention_mask": input_encoding["attention_mask"].squeeze(),
            "labels": target_encoding["input_ids"].squeeze(), # labels used only for training
            "reference" : self.target_texts[idx] #  reference only used for eval
    }


In [5]:
# storing the values in the list.

input_texts = [billsum["train"][i]["text"] for i in range(100)]
# len(billsum["train"]
target_texts = [billsum["train"][i]["summary"] for i in range(100)]
# target_texts = [i["summary"] for i in billsum["train"]]  # Replace with actual target summaries
len(input_texts)

100

In [6]:
dataset = CustomDataset(tokenizer, input_texts, target_texts)

# initializing the dataloader
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [7]:
# move to cuda, if available cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)


  0%|          | 0/13 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
100%|██████████| 13/13 [00:09<00:00,  1.40it/s]


Epoch 1/3, Loss: 3.2835468512315016


100%|██████████| 13/13 [00:05<00:00,  2.21it/s]


Epoch 2/3, Loss: 3.0101493872129


100%|██████████| 13/13 [00:08<00:00,  1.62it/s]


Epoch 3/3, Loss: 2.9095636147719164


('fine_tuned_t5/tokenizer_config.json',
 'fine_tuned_t5/special_tokens_map.json',
 'fine_tuned_t5/spiece.model',
 'fine_tuned_t5/added_tokens.json',
 'fine_tuned_t5/tokenizer.json')

In [None]:

from tqdm import tqdm
# 6. Training loop
epochs = 3
model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        epoch_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(dataloader)}")



## Evaludation code

In [16]:
# loading the val loader

# Replace with your test input and target texts
val_input_texts = [billsum["train"][i]["text"] for i in range(100)]
# len(billsum["train"]
val_tgt_texts = [billsum["train"][i]["summary"] for i in range(100)]

dataset = CustomDataset(tokenizer, input_texts, target_texts)

# initializing the dataloader
val_dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [20]:
predictions = []
references = []

with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model.generate(
          input_ids=input_ids,
          attention_mask=attention_mask,
          max_new_tokens=128 , # or whatever number you want,return_dict_in_generate=True,  # Ensures the output is a dictionary-like object
            return_dict_in_generate=True,
            output_scores=True,            # Includes scores in the output
        output_logits=True             # Includes scores (logits) in the output
      )

        # Decode predictions and references
        predictions.extend(tokenizer.batch_decode(outputs[0], skip_special_tokens=True))
        references.extend(batch["reference"])

Evaluating: 100%|██████████| 7/7 [00:19<00:00,  2.76s/it]


In [28]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def BLEU_score_fast(references, predictions, weights=(0.5, 0.5, 0, 0)):

  """
  Averages the BLEU score over all the translations

  args:
    references: represents acutal translations
    translations: represents predicted translations
  returns:
    average BLEU score
  """
  tokenized_references = [[ref.split()] for ref in references]
  tokenized_predictions = [pred.split() for pred in predictions]

  # Calculate BLEU score
  bleu_score = corpus_bleu(tokenized_references, tokenized_predictions, weights=weights)
  return bleu_score

bleu_score = BLEU_score_fast(references, predictions)
print(f"BLEU Score: {bleu_score}")


BLEU Score: 0.6684908605885546


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


['this is a test', 'another example']