In [2]:
!pip install transformers
!pip install evaluate
!pip install rouge
!pip install langchain
!pip install accelerate
!pip install bitsandbytes
!pip install sentencepiece
!pip install -U nlp
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install rouge_score
!pip install evaluate
!pip install dill==0.3.5.1

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, safetensors, transformers
Successfully installed safetensors-0.3.2 tokenizers-0.13.3 transformers-4.31.0
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━

In [3]:
import torch
import json
import torch.nn as nn
import nlp
import pandas as pd
import numpy as np
import transformers
import evaluate
import dataclasses

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import warnings
warnings.filterwarnings("ignore")

In [4]:

from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)


In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base",
                                               quantization_config=bnb_config,
                                               device_map="auto")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
# Save the model's parameters to a file
model_id = "model_large"                             # give your model name to avoid confusion

model_path = model_id
torch.save(model.state_dict(), model_path)

# Check the size of the saved model file
import os
model_file_size_bytes = os.path.getsize(model_path)
model_file_size_mb = model_file_size_bytes / (1024 * 1024)  # Convert to MB
model_file_size_gb = model_file_size_mb / 1024  # Convert to GB

print(f"Model Size: {model_file_size_gb:.4f} GB")

Model Size: 0.3075 GB


### Data Pre processing

In [7]:
# process the examples in input and target text format and the eos token at the end
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answers']['text'][0]
    return example

# tokenize the examples
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input_text'], pad_to_max_length=True, max_length=512)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], pad_to_max_length=True, max_length=512)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

In [8]:
# load train and validation split of squad
train_dataset  = nlp.load_dataset('squad', split=nlp.Split.TRAIN)
valid_dataset = nlp.load_dataset('squad', split=nlp.Split.VALIDATION)

train_dataset = train_dataset.select(range(30000))
valid_dataset = valid_dataset.select(range(30000))

# map add_eos_to_examples function to the dataset example wise
train_dataset = train_dataset.map(add_eos_to_examples)
# map convert_to_features batch wise
train_dataset = train_dataset.map(convert_to_features, batched=True)

valid_dataset = valid_dataset.map(add_eos_to_examples, load_from_cache_file=False)
valid_dataset = valid_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)


# set the tensor type and the columns which the dataset should return
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

Downloading:   0%|          | 0.00/5.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.24k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.75 MiB, post-processed: Unknown sizetotal: 119.27 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/408a8fa46a1e2805445b793f1022e743428ca739a34809fce872f0c7f17b44ab...


Downloading:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/408a8fa46a1e2805445b793f1022e743428ca739a34809fce872f0c7f17b44ab. Subsequent calls will reuse this data.


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30000 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [9]:
len(train_dataset), len(valid_dataset)

(30000, 10570)

In [10]:
@dataclass
class T2TDataCollator():
  def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
    """
    Take a list of samples from a Dataset and collate them into a batch.
    Returns:
    A dictionary of tensors
    """

    input_ids = torch.stack([example['input_ids'] for example in batch])
    lm_labels = torch.stack([example['decoder_input_ids'] for example in batch])
    lm_labels[lm_labels[:, :] == 0] = -100
    attention_mask = torch.stack([example['attention_mask'] for example in batch])
    decoder_attention_mask = torch.stack([example['decoder_attention_mask'] for example in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': lm_labels,
        'decoder_attention_mask': decoder_attention_mask
    }

### QLoRa

In [11]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [12]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

model.gradient_checkpointing_enable()
# prepare int-8 model for training
model = prepare_model_for_kbit_training(model)

# Define LoRA Config
lora_config = LoraConfig(r=8,
                        lora_alpha=64,
                        target_modules=["q", "v"],
                        # lora_dropout=0.05,
                        bias="none",
                        task_type="SEQ_2_SEQ_LM"
)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3560841867092814


### Model Training

In [13]:

import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [15]:
# !pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments


# Define training args
training_args = Seq2SeqTrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        num_train_epochs=1,
        learning_rate=3e-4,
        fp16=False,
        logging_steps=100,
        # evaluation_strategy="steps",
        # save_strategy="steps",
        # eval_steps=200,
        # save_steps=200,
        output_dir="t5_base_finetune_QLoRa_v3.0",
        push_to_hub=True,

)
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),

)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


Cloning https://huggingface.co/vimal52/t5_base_finetune_QLoRa_v3.0 into local empty directory.


In [17]:
trainer.train().metrics

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,9.3763
200,6.3797
300,6.186
400,6.1249
500,6.0367
600,5.9898
700,5.968
800,5.9046
900,5.8963
1000,5.8629


{'train_runtime': 10424.5691,
 'train_samples_per_second': 2.878,
 'train_steps_per_second': 0.18,
 'total_flos': 1.323154538496e+16,
 'train_loss': 6.097485416666666,
 'epoch': 1.0}

In [18]:
HUGGING_FACE_USER_NAME = "vimal52"
model_name = "t5_base_finetune_QLoRa_v3.0"

model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", use_auth_token=True)

adapter_model.bin:   0%|          | 0.00/3.59M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vimal52/t5_base_finetune_QLoRa_v3.0/commit/5cb0430e429ed074b087e832fb8c465798373e9a', commit_message='Upload model', commit_description='', oid='5cb0430e429ed074b087e832fb8c465798373e9a', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
def get_answer(question, context):
  input_text = "question: %s  context: %s" % (question, context)
  features = tokenizer([input_text], return_tensors='pt')

  output = model.generate(input_ids=features['input_ids'],
               attention_mask=features['attention_mask'])

  return tokenizer.decode(output[0], skip_special_tokens=True)

context = "Manuel have created RuPERTa-base with the support of HF-Transformers and Google"
question = "Who has supported Manuel?"

get_answer(question, context)

': context context:: Manuel: context context:  Ru: Ru:: of the'

In [20]:
questions = ["What is the Invoice number?",
             "What is the Invoice date?",
             "What is the total amount?",
             "State the From address of Invoice?",
             "State the To address of Invoice?",
             "What is the order number?"
             ]

context = "[Document(page_content='Invoice\n\nInvoice Number\n\nINV-3337\n\nFrom: DEMO - Sliced Invoices Suite 5A-1204 123 Somewhere Street Your City AZ 12345 admin@slicedinvoices.com\n\nOrder Number\n\n12345\n\nInvoice Date\n\nJanuary 25, 2016\n\nDue Date\n\nJanuary 31, 2016\n\nTotal Due\n\n$93.50\n\nTo: Test Business 123 Somewhere St Melbourne, VIC 3000 test@test.com\n\nP aid\n\nHrs/Qty\n\nService\n\nRate/Price\n\nAdjust\n\nSub Total\n\nWeb Design This is a sample description...\n\n1.00\n\n$85.00\n\n0.00%\n\n$85.00\n\nSub Total\n\n$85.00\n\nTax\n\n$8.50\n\nTotal\n\n$93.50\n\nANZ Bank ACC # 1234 1234 BSB # 4321 432\n\nPayment is due within 30 days from date of invoice. Late payment is subject to fees of 5% per month. Thanks for choosing DEMO - Sliced Invoices | admin@slicedinvoices.com Page 1/1', metadata={'source': 'invoice.pdf'})]"
for qn in questions:
  print(get_answer(qn, context))


12:: In::Do: context:: In:: In_::_
Payment the In: context::'Docucument'= In' In' In
: the amount: context context: In: context: context: In In: In context
i sentence of context context: context: context context: In' In context:' In
 the In context address: context:: context: context context: context:':'
' context: context:: context: In: In' In In: In: In


In [21]:
questions = ["What is the capital of France?",
             "How many days are there I a week?",
             "What is the largest planet in our solar system?",
             "What is the tallest mountain in the world?",
             "What is the main language in India?",
             "Who is the author of Harry Potter book series?"
             ]

context = ""
for qn in questions:
  print(get_answer(qn,context))

 is the capital of the capital of? of the is? is the capital:: context
 How many How How many days are days What I? How How How How How How How
planetary the largest planet in the largest planet in the solar: the: What What the What
The tallest the mountain in the tall the tallest: mountain the tall is tall? tall
context in is in in the context: What main is in the context: What is is the
context the author the author of the the is the the author Who is the: context the:


In [22]:
!pip install pynvml
import psutil
import pynvml


# RAM Usage
ram = psutil.virtual_memory()
ram_usage = ram.used / (1024 * 1024 * 1024)
print("RAM Usage:", ram_usage, "GB")
print()

# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:',(round(torch.cuda.memory_allocated(0)/1024**3,1) + (round(torch.cuda.memory_cached(0)/1024**3,1))), 'GB')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')
print()

# Disk Usage
disk_usage = psutil.disk_usage('/')
print("Disk Usage:", (disk_usage.used / (1024 * 1024 * 1024)-24.2), "GB")

Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.0
RAM Usage: 4.111698150634766 GB

Using device: cuda
Tesla T4
Memory Usage: 2.4 GB
Allocated: 0.4 GB
Cached:    2.0 GB

Disk Usage: 2.9147804260253913 GB
