### Install Dependencies

In [1]:
!pip install transformers
!pip install evaluate
!pip install rouge
!pip install langchain
!pip install accelerate
!pip install bitsandbytes
!pip install sentencepiece
!pip install -U nlp

!pip install evaluate
!pip install dill==0.3.5.1

Collecting dill (from evaluate)
  Using cached dill-0.3.7-py3-none-any.whl (115 kB)
Installing collected packages: dill
  Attempting uninstall: dill
    Found existing installation: dill 0.3.5.1
    Uninstalling dill-0.3.5.1:
      Successfully uninstalled dill-0.3.5.1
Successfully installed dill-0.3.7
Collecting dill==0.3.5.1
  Using cached dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
Installing collected packages: dill
  Attempting uninstall: dill
    Found existing installation: dill 0.3.7
    Uninstalling dill-0.3.7:
      Successfully uninstalled dill-0.3.7
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
multiprocess 0.70.15 requires dill>=0.3.7, but you have dill 0.3.5.1 which is incompatible.[0m[31m
[0mSuccessfully installed dill-0.3.5.1


In [2]:
import torch
import json
import torch.nn as nn
import nlp
import pandas as pd
import numpy as np
import transformers
import evaluate
import dataclasses

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import warnings
warnings.filterwarnings("ignore")

### Downloading the Model

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

In [4]:
# Save the model's parameters to a file
model_id = "model_large"                             # give your model name to avoid confusion

model_path = model_id
torch.save(model.state_dict(), model_path)

# Check the size of the saved model file
import os
model_file_size_bytes = os.path.getsize(model_path)
model_file_size_mb = model_file_size_bytes / (1024 * 1024)  # Convert to MB
model_file_size_gb = model_file_size_mb / 1024  # Convert to GB

print(f"Model Size: {model_file_size_gb:.4f} GB")

Model Size: 0.9224 GB


### Preprocessing the dataset (SQUAD)

In [5]:
# process the examples in input and target text format and the eos token at the end
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answers']['text'][0]
    return example

# tokenize the examples
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input_text'], pad_to_max_length=True, max_length=512)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], pad_to_max_length=True, max_length=16)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return encodings

In [6]:
# load train and validation split of squad
train_dataset  = nlp.load_dataset('squad', split=nlp.Split.TRAIN)
valid_dataset = nlp.load_dataset('squad', split=nlp.Split.VALIDATION)

train_dataset = train_dataset.select(range(30000))
valid_dataset = valid_dataset.select(range(30000))

# map add_eos_to_examples function to the dataset example wise
train_dataset = train_dataset.map(add_eos_to_examples)
# map convert_to_features batch wise
train_dataset = train_dataset.map(convert_to_features, batched=True)

valid_dataset = valid_dataset.map(add_eos_to_examples, load_from_cache_file=False)
valid_dataset = valid_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)


# set the tensor type and the columns which the dataset should return
columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask']
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/10570 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [7]:
len(train_dataset), len(valid_dataset)

(30000, 10570)

In [8]:
@dataclass
class T2TDataCollator():
  def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
    """
    Take a list of samples from a Dataset and collate them into a batch.
    Returns:
    A dictionary of tensors
    """

    input_ids = torch.stack([example['input_ids'] for example in batch])
    lm_labels = torch.stack([example['decoder_input_ids'] for example in batch])
    lm_labels[lm_labels[:, :] == 0] = -100
    attention_mask = torch.stack([example['attention_mask'] for example in batch])
    decoder_attention_mask = torch.stack([example['decoder_attention_mask'] for example in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': lm_labels,
        'decoder_attention_mask': decoder_attention_mask
    }

### Training the model

In [9]:
!pip install rouge_score
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments


# Define training args
training_args = Seq2SeqTrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        num_train_epochs=1,
        learning_rate=3e-4,
        fp16=False,
        logging_steps=100,
        # evaluation_strategy="steps",
        # save_strategy="steps",
        # eval_steps=200,
        # save_steps=200,
        output_dir="t5_base_finetune_v1.0",
        push_to_hub=True,

)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),

)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!



/content/t5_base_finetune_v1.0 is already a clone of https://huggingface.co/vimal52/t5_base_finetune_v1.0. Make sure you pull the latest changes with `repo.git_pull()`.


In [12]:
trainer.train().metrics

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,0.1206
200,0.0097
300,0.01
400,0.0094
500,0.0082
600,0.0109
700,0.0098
800,0.0061
900,0.0064
1000,0.0078


Step,Training Loss
100,0.1206
200,0.0097
300,0.01
400,0.0094
500,0.0082
600,0.0109
700,0.0098
800,0.0061
900,0.0064
1000,0.0078


{'train_runtime': 11554.4908,
 'train_samples_per_second': 2.596,
 'train_steps_per_second': 0.649,
 'total_flos': 2.054272057344e+16,
 'train_loss': 0.005430561415354411,
 'epoch': 1.0}

In [13]:
HUGGING_FACE_USER_NAME = "vimal52"
model_name = "t5_base_finetune_v1.0"

model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", use_auth_token=True)

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vimal52/t5_base_finetune_v1.0/commit/0be46d942156d665a063873dc5c02b45fd3de186', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='0be46d942156d665a063873dc5c02b45fd3de186', pr_url=None, pr_revision=None, pr_num=None)

### Evalution

In [None]:
DEVICE = "cuda:0"
Q_LEN = 256
def get_answer(question, context):
  # input_text = "question: %s  context: %s" % (question, context)
  # inputs = tokenizer([input_text])

  inputs = tokenizer(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)

  input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
  attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

  outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)


  return tokenizer.decode(outputs[0], skip_special_tokens=True)

context = "Manuel have created RuPERTa-base with the support of HF-Transformers and Google"
question = "Who has supported Manuel?"

get_answer(question, context)

In [15]:
questions = ["What is the Invoice number?",
             "What is the Invoice date?",
             "What is the total amount?",
             "State the From address of Invoice?",
             "State the To address of Invoice?",
             "What is the order number?"
             ]

context = "[Document(page_content='Invoice\n\nInvoice Number\n\nINV-3337\n\nFrom: DEMO - Sliced Invoices Suite 5A-1204 123 Somewhere Street Your City AZ 12345 admin@slicedinvoices.com\n\nOrder Number\n\n12345\n\nInvoice Date\n\nJanuary 25, 2016\n\nDue Date\n\nJanuary 31, 2016\n\nTotal Due\n\n$93.50\n\nTo: Test Business 123 Somewhere St Melbourne, VIC 3000 test@test.com\n\nP aid\n\nHrs/Qty\n\nService\n\nRate/Price\n\nAdjust\n\nSub Total\n\nWeb Design This is a sample description...\n\n1.00\n\n$85.00\n\n0.00%\n\n$85.00\n\nSub Total\n\n$85.00\n\nTax\n\n$8.50\n\nTotal\n\n$93.50\n\nANZ Bank ACC # 1234 1234 BSB # 4321 432\n\nPayment is due within 30 days from date of invoice. Late payment is subject to fees of 5% per month. Thanks for choosing DEMO - Sliced Invoices | admin@slicedinvoices.com Page 1/1', metadata={'source': 'invoice.pdf'})]"
for qn in questions:
  print(get_answer(qn,context))


question'Invoice Invoice Number INV-3337 From: DEMO - Slice
question_content='Invoice Invoice Number INV-3337 From: DEMO
question='Invoice Invoice Number INV-3337 From: DEMO -
question/content='Invoice Invoice Number INV-3337 From: DEMO
question/content='Invoice Invoice Number INV-3337 From: DEMO
question_content='Invoice Invoice Number INV-3337 From: DEMO


In [16]:
questions = ["What is the capital of France?",
             "How many days are there I a week?",
             "What is the largest planet in our solar system?",
             "What is the tallest mountain in the world?",
             "What is the main language in India?",
             "Who is the author of Harry Potter book series?"
             ]

context = " "
for qn in questions:
  print(get_answer(qn,context))

question: What is the capital of France? 
question: How many days are there I a week? 
question: What is the largest planet in our solar system? 
question: What is the tallest mountain in the world? 
question: What is the main language in India? 
question: Who is the author of Harry Potter book series? 


In [17]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [18]:
!pip install pynvml
import psutil
import pynvml


# RAM Usage
ram = psutil.virtual_memory()
ram_usage = ram.used / (1024 * 1024 * 1024)
print("RAM Usage:", ram_usage, "GB")
print()

# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:',(round(torch.cuda.memory_allocated(0)/1024**3,1) + (round(torch.cuda.memory_cached(0)/1024**3,1))), 'GB')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')
print()

# Disk Usage
disk_usage = psutil.disk_usage('/')
print("Disk Usage:", (disk_usage.used / (1024 * 1024 * 1024)-24.2), "GB")


Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m30.7/53.1 kB[0m [31m734.9 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m884.2 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.0
RAM Usage: 3.6988449096679688 GB

Using device: cuda
Tesla T4
Memory Usage: 9.2 GB
Allocated: 2.8 GB
Cached:    6.4 GB

Disk Usage: 47.464291381835935 GB
