In [3]:
%%capture
!pip install transformers datasets nltk tensorboard py7zr accelerate evaluate --upgrade

In [None]:
%%capture
!sudo apt-get install git-lfs --yes

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="google/flan-t5-base"

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [5]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [6]:
dataset = load_dataset("json", data_files="//kaggle/input/combined-data/data.json")

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-7f10e0875a17ad20/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-7f10e0875a17ad20/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['response', 'data'],
        num_rows: 9207
    })
})

In [8]:
# train_indices, test_indices = train_test_split(range(len(dataset['train'])), test_size=0.1, random_state=42)

# train_ds = dataset['train'].select(train_indices).map(lambda example: {k: example[k] for k in desired_columns})
# test_ds = dataset['train'].select(test_indices).map(lambda example: {k: example[k] for k in desired_columns})

# dataset['train'] = train_ds
# dataset['test'] = test_ds

In [9]:
# from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = dataset["train"].map(lambda x: tokenizer(x["data"], truncation=True), batched=True, remove_columns=["data", "response"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = dataset["train"].map(lambda x: tokenizer(x["response"], truncation=True), batched=True, remove_columns=["data", "response"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/9207 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/9207 [00:00<?, ? examples/s]

Max target length: 512


In [10]:
dataset['train'][0]

{'response': 'France',
 'data': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. In what country is Normandy located?'}

In [11]:
def preprocess_function(sample,padding="max_length"):
    # # add prefix to the input for t5
    inputs = sample["data"]

    # # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["response"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["data", "response"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/9207 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [12]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.translate.bleu_score import corpus_bleu
nltk.download("punkt")

# Metric
metric = evaluate.load("bleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(pred):
#     print("YO")
#     print(f"pred: {pred}")
    # references = pred.label_ids
    # predictions = pred.predictions.argmax(-1)
    predictions, references = pred
#     print(f"references: {references}")
#     print(f"predictions: {predictions}")

    # Some simple post-processing
    # predictions, references = postprocess_text(predictions, references)
    
    # Convert the token indices back to strings
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     print(f"decoded_predictions: {decoded_predictions}")
    references = np.where(references != -100, references, tokenizer.pad_token_id)
    decoded_references = tokenizer.batch_decode(references, skip_special_tokens=True)
#     print(f"decoded_references: {decoded_references}")

    decoded_predictions, decoded_references = postprocess_text(decoded_predictions, decoded_references)
    
    # # Calculate BLEU score
    # bleu = corpus_bleu([[ref] for ref in references], predictions)
    
    # return {"bleu": bleu}
    # predictions = [tokenizer.decode(pred) for pred in predictions]
    # references = [[tokenizer.decode(ref)] for ref in references]

    
    

    # Compute BLEU score
    bleu_score = corpus_bleu(decoded_references, decoded_predictions)

    return {'bleu_score': bleu_score}

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [15]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [18]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback

# Hugging Face repository id
# repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

# model_name = "1_1_T_T_5e-5_2_500_D"
# drive_path =f"/content/drive/MyDrive/Sprinklr_Internship/Models/{model_name}"
model_name = "Base_5e-3_5_MB"
drive_path = "model/"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=drive_path,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=12,
    predict_with_generate=True,
    fp16=True, # Overflows with fp16
    learning_rate=5e-3,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_dir=f"{drive_path}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=0,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    # hub_strategy="every_save",
    # hub_model_id=repository_id,
    # hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["train"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=compute_metrics,
)

In [None]:
# Start training
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
trainer.evaluate()

In [None]:
# Save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

In [None]:
from transformers import pipeline
from random import randrange        

# load model and tokenizer from huggingface hub with pipeline
summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)

# select a random test sample
sample = dataset['test'][randrange(len(dataset["test"]))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")

# summarize dialogue
res = summarizer(sample["dialogue"])

print(f"flan-t5-base summary:\n{res[0]['summary_text']}")

Your max_length is set to 200, but your input_length is only 191. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)


dialogue: 

Nathalie: have you thought about the holiday?

Pauline: me & tony are into greece really

Jacob: anywhere warm and sunny. greece cool

Anthony: greece is warm sunny and cheapish

Nathalie: i guess cob we ok w that

Jacob: sure thing

Pauline: so august as we said?

Jacob: thats the thing. we need to be back by aug 10

Anthony: what?? why??

Nathalie: sis wedding

Pauline: your lil sis getting married?!? lol

Jacob: she's not little. seen her tony?

Anthony: worth a look?

Nathalie: shut up assholes. shes my sister for fucks sake

Pauline: idiots

Jacob: come one just kidding. we love you

Anthony: we have no choice XD

---------------

flan-t5-base summary:

Nathalie, Pauline, Anthony and Anthony are going to Greece for a holiday in August. They need to be back by August 10 because of their sister's wedding.


### GitHub Code

In [3]:
!git config --global user.email "tusharbokade003@gmail.com"
!git config --global user.name "Tushar Bokade"

In [4]:
token = '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'
username = 'suspense1441'
repo = 'Sprinklr'

In [5]:
!git clone https://{token}@github.com/{username}/{repo}

Cloning into 'Sprinklr'...

remote: Enumerating objects: 31, done.[K

remote: Counting objects: 100% (31/31), done.[K

remote: Compressing objects: 100% (26/26), done.[K

remote: Total 31 (delta 13), reused 13 (delta 4), pack-reused 0[K

Unpacking objects: 100% (31/31), 1.93 MiB | 2.28 MiB/s, done.


In [6]:
%cd 'Sprinklr'

/content/Sprinklr


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
%cd ..

/content


In [10]:
!cp /content/drive/MyDrive/Sprinklr_Internship/Colab\ Files/3.ipynb /content/Sprinklr/FLAN-T5-BASE.ipynb

cp: cannot stat '/content/drive/MyDrive/Sprinklr_Internship/Colab Files/3.ipynb': No such file or directory


In [None]:
!git status

On branch main

Your branch is up to date with 'origin/main'.



nothing to commit, working tree clean


In [None]:
!git add --all

In [None]:
!git commit -a -m "Rouge"

On branch main

Your branch is up to date with 'origin/main'.



nothing to commit, working tree clean


In [None]:
!git remote -v

In [None]:
!git push origin main

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [2]:
!mv /content/drive/MyDrive/Colab\ Notebooks/3.ipynb /content/drive/MyDrive/Sprinklr_Internship/Colab\ Files

In [None]:
function ClickConnect(){
console.log("Working"); 
document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click();
}
timerId = setInterval(ClickConnect,1000000000)
clearInterval(timerId);