In [None]:
%%capture
!pip install pytesseract transformers datasets rouge-score nltk tensorboard py7zr accelerate evaluate --upgrade

In [None]:
%%capture
!sudo apt-get install git-lfs --yes

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="google/flan-t5-base"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [None]:
dataset = load_dataset("json", data_files="./dollyFinal.json")

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-df2e7f125c520bd9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-df2e7f125c520bd9/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['data', 'response'],
        num_rows: 3279
    })
})

In [None]:
# train_indices, test_indices = train_test_split(range(len(dataset['train'])), test_size=0.1, random_state=42)

# train_ds = dataset['train'].select(train_indices).map(lambda example: {k: example[k] for k in desired_columns})
# test_ds = dataset['train'].select(test_indices).map(lambda example: {k: example[k] for k in desired_columns})

# dataset['train'] = train_ds
# dataset['test'] = test_ds

In [None]:
# from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = dataset["train"].map(lambda x: tokenizer(x["data"], truncation=True), batched=True, remove_columns=["data", "response"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = dataset["train"].map(lambda x: tokenizer(x["response"], truncation=True), batched=True, remove_columns=["data", "response"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/3279 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/3279 [00:00<?, ? examples/s]

Max target length: 512


In [None]:
dataset['train'][0]

{'data': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney. When did Virgin Australia start operating?",
 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.'}

In [None]:
def preprocess_function(sample,padding="max_length"):
    # # add prefix to the input for t5
    inputs = sample["data"]

    # # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["response"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["data", "response"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/3279 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id
# repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

model_name = "8_8_False_5e-5_5_500"
drive_path ="/content/drive/MyDrive/Sprinklr_Internship/Models/{model_name}"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=drive_path,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    fp16=True, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=2,
    # logging & evaluation strategies
    logging_dir=f"{drive_path}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    # hub_strategy="every_save",
    # hub_model_id=repository_id,
    # hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["train"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start training
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0,,27.0331,15.3272,26.1498,26.2798,8.369015
2,0.0,,27.0331,15.3272,26.1498,26.2798,8.369015


TrainOutput(global_step=3280, training_loss=0.0, metrics={'train_runtime': 3179.7721, 'train_samples_per_second': 2.062, 'train_steps_per_second': 1.032, 'total_flos': 4490638717353984.0, 'train_loss': 0.0, 'epoch': 2.0})

In [None]:
trainer.evaluate()

In [None]:
# Save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

In [None]:
from transformers import pipeline
from random import randrange        

# load model and tokenizer from huggingface hub with pipeline
summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)

# select a random test sample
sample = dataset['test'][randrange(len(dataset["test"]))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")

# summarize dialogue
res = summarizer(sample["dialogue"])

print(f"flan-t5-base summary:\n{res[0]['summary_text']}")

Your max_length is set to 200, but your input_length is only 191. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)


dialogue: 
Nathalie: have you thought about the holiday?
Pauline: me & tony are into greece really
Jacob: anywhere warm and sunny. greece cool
Anthony: greece is warm sunny and cheapish
Nathalie: i guess cob we ok w that
Jacob: sure thing
Pauline: so august as we said?
Jacob: thats the thing. we need to be back by aug 10
Anthony: what?? why??
Nathalie: sis wedding
Pauline: your lil sis getting married?!? lol
Jacob: she's not little. seen her tony?
Anthony: worth a look?
Nathalie: shut up assholes. shes my sister for fucks sake
Pauline: idiots
Jacob: come one just kidding. we love you
Anthony: we have no choice XD
---------------
flan-t5-base summary:
Nathalie, Pauline, Anthony and Anthony are going to Greece for a holiday in August. They need to be back by August 10 because of their sister's wedding.


### GitHub Code

In [1]:
!git config --global user.email "tusharbokade003@gmail.com"
!git config --global user.name "Tushar Bokade"

In [2]:
token = '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'
username = 'suspense1441'
repo = 'Sprinklr'

In [3]:
!git clone https://{token}@github.com/{username}/{repo}

Cloning into 'Sprinklr'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 28 (delta 12), reused 11 (delta 4), pack-reused 0[K
Unpacking objects: 100% (28/28), 1.92 MiB | 1.62 MiB/s, done.


In [4]:
%cd 'Sprinklr'

/content/Sprinklr


In [None]:
!ls

26May.ipynb  AbstractiveQA.ipynb  dolly.json		 drqa.ipynb
29May.ipynb  dollyFinal.json	  DollyPreprocess.ipynb  FLAN-T5-BASE.ipynb


In [None]:
%cd ..

/content


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd ..

/content


In [None]:
!mv /content/drive/MyDrive/Colab /content/Sprinklr/FLAN-T5-BASE.ipynb

mv: cannot stat '/drive/MyDrive/Colab': No such file or directory


In [None]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [None]:
!git add --all

In [None]:
!git commit -a -m "Rouge"

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [None]:
!git remote -v

In [None]:
!git push origin main

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!mv /content/drive/MyDrive/Colab Notebooks/FLAN-T5-BASE.ipynb /content/Sprinklr