### Install Dependencies

In [1]:
!pip install transformers
!pip install evaluate
!pip install rouge
!pip install langchain
!pip install accelerate
!pip install bitsandbytes
!pip install sentencepiece
!pip install -U nlp
!pip install -q -U git+https://github.com/huggingface/peft.git

!pip install evaluate
!pip install dill==0.3.5.1

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.2 MB/s[0m eta [36m0:00:0

In [2]:
import torch
import json
import torch.nn as nn
import nlp
import pandas as pd
import numpy as np
import transformers
import evaluate
import dataclasses

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import warnings
warnings.filterwarnings("ignore")

### Downloading the Model

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base",
                                               load_in_8bit=True,
                                               device_map="auto")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Preprocessing the dataset (SQUAD)

In [4]:
# process the examples in input and target text format and the eos token at the end
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answers']['text'][0]
    return example

# tokenize the examples
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input_text'], pad_to_max_length=True, max_length=512)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], pad_to_max_length=True, max_length=512)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

In [5]:
# load train and validation split of squad
train_dataset  = nlp.load_dataset('squad', split=nlp.Split.TRAIN)
valid_dataset = nlp.load_dataset('squad', split=nlp.Split.VALIDATION)

# map add_eos_to_examples function to the dataset example wise
train_dataset = train_dataset.map(add_eos_to_examples)
# map convert_to_features batch wise
train_dataset = train_dataset.map(convert_to_features, batched=True)

valid_dataset = valid_dataset.map(add_eos_to_examples, load_from_cache_file=False)
valid_dataset = valid_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)


# set the tensor type and the columns which the dataset should return
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

Downloading:   0%|          | 0.00/5.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.24k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.75 MiB, post-processed: Unknown sizetotal: 119.27 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/408a8fa46a1e2805445b793f1022e743428ca739a34809fce872f0c7f17b44ab...


Downloading:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/408a8fa46a1e2805445b793f1022e743428ca739a34809fce872f0c7f17b44ab. Subsequent calls will reuse this data.


  0%|          | 0/87599 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [6]:
len(train_dataset), len(valid_dataset)

(87599, 10570)

In [7]:
@dataclass
class T2TDataCollator():
  def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
    """
    Take a list of samples from a Dataset and collate them into a batch.
    Returns:
    A dictionary of tensors
    """

    input_ids = torch.stack([example['input_ids'] for example in batch])
    lm_labels = torch.stack([example['decoder_input_ids'] for example in batch])
    lm_labels[lm_labels[:, :] == 0] = -100
    attention_mask = torch.stack([example['attention_mask'] for example in batch])
    decoder_attention_mask = torch.stack([example['decoder_attention_mask'] for example in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': lm_labels,
        'decoder_attention_mask': decoder_attention_mask
    }

### LORA

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(r=8,
                        lora_alpha=32,
                        #  target_modules=["q", "v"],
                        lora_dropout=0.05,
                        bias="none",
                        task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3560841867092814


In [10]:
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    # return accuracy.compute(predictions=pred, references=labels)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}



Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
import transformers

# needed for gpt-neo-x tokenizer
# tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        # auto_find_batch_size=True,
        num_train_epochs=10,
        warmup_steps=100,
        max_steps=100,
        learning_rate=1e-3,
        fp16=True,
        logging_steps=10,
        output_dir="outputs",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [18]:
trainer.train()

Step,Training Loss
10,11.8927
20,11.7438
30,12.2321
40,12.1982
50,12.2769
60,11.9631
70,12.3634
80,11.9952
90,12.4635
100,12.3015


TrainOutput(global_step=100, training_loss=12.143039169311523, metrics={'train_runtime': 257.5303, 'train_samples_per_second': 1.553, 'train_steps_per_second': 0.388, 'total_flos': 274990104576000.0, 'train_loss': 12.143039169311523, 'epoch': 0.0})

In [19]:
def get_answer(question, context):
  input_text = "question: %s  context: %s" % (question, context)
  features = tokenizer([input_text], return_tensors='pt')

  output = model.generate(input_ids=features['input_ids'],
               attention_mask=features['attention_mask'])

  return tokenizer.decode(output[0])

context = "Manuel have created RuPERTa-base with the support of HF-Transformers and Google"
question = "Who has supported Manuel?"

get_answer(question, context)

'<pad> HF-Transformers and Google</s>'

In [20]:
questions = ["What is the Invoice number?",
             "What is the Invoice date?",
             "What is the total amount?",
             "State the From address of Invoice?",
             "State the To address of Invoice?",
             "What is the order number?"
             ]

context = "[Document(page_content='Invoice\n\nInvoice Number\n\nINV-3337\n\nFrom: DEMO - Sliced Invoices Suite 5A-1204 123 Somewhere Street Your City AZ 12345 admin@slicedinvoices.com\n\nOrder Number\n\n12345\n\nInvoice Date\n\nJanuary 25, 2016\n\nDue Date\n\nJanuary 31, 2016\n\nTotal Due\n\n$93.50\n\nTo: Test Business 123 Somewhere St Melbourne, VIC 3000 test@test.com\n\nP aid\n\nHrs/Qty\n\nService\n\nRate/Price\n\nAdjust\n\nSub Total\n\nWeb Design This is a sample description...\n\n1.00\n\n$85.00\n\n0.00%\n\n$85.00\n\nSub Total\n\n$85.00\n\nTax\n\n$8.50\n\nTotal\n\n$93.50\n\nANZ Bank ACC # 1234 1234 BSB # 4321 432\n\nPayment is due within 30 days from date of invoice. Late payment is subject to fees of 5% per month. Thanks for choosing DEMO - Sliced Invoices | admin@slicedinvoices.com Page 1/1', metadata={'source': 'invoice.pdf'})]"
for qn in questions:
  print(get_answer(context,qn))


<pad> INV-3337</s>
<pad> January 25, 2016</s>
<pad> $93.50</s>
<pad> 12345</s>
<pad> 12345</s>
<pad> 12345</s>


In [21]:
questions = ["What is the capital of France?",
             "How many days are there I a week?",
             "What is the largest planet in our solar system?",
             "What is the tallest mountain in the world?",
             "What is the main language in India?",
             "Who is the author of Harry Potter book series?"
             ]

context = ""
for qn in questions:
  print(get_answer(context,qn))

<pad> savoie</s>
<pad> 6</s>
<pad> venus</s>
<pad> saab</s>
<pad> Hindi</s>
<pad> Harry Potter</s>
