In [1]:
# Install the notebook's dependencies: transformers (with the torch extra),
# datasets, ROUGE / BERTScore tooling, nltk, py7zr, accelerate and evaluate.
# tensorboard is left commented out at the end of the first command.
# NOTE(review): no versions are pinned, so a later re-run may install
# different releases than the ones shown in the captured output below.
!pip install pytesseract transformers[torch] datasets rouge-score nltk py7zr --upgrade #tensorboard
!pip install accelerate -U
!pip install bert_score
!pip install evaluate

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Collecting transformers[torch]
  Downloading transformers-4.36.0-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr
  Downloading py7zr-0.20.8-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/2

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Identifiers of the dialogue-summarization dataset and the model checkpoint.
dataset_id = "samsum"
model_id = "google/flan-t5-base"

# Fetch the dataset splits and the tokenizer that belongs to the checkpoint.
dataset = load_dataset(dataset_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [4]:
import json
from datasets import concatenate_datasets
from datasets.dataset_dict import DatasetDict

# Instruction prepended to every dialogue before tokenization.
INSTRUCTION_PREFIX = "Summarize the following dialogue. \n"


def update_dataset(example, index=None):
    """Prefix one example's dialogue with the summarization instruction.

    Args:
        example: A single dataset row; its "dialogue" field is overwritten.
        index: Unused. Accepted only so that callers using
            ``map(..., with_indices=True)`` keep working.

    Returns:
        The same ``example`` object with the prefixed "dialogue".
    """
    # Read the text from the row itself instead of indexing a hard-coded split
    # of the global ``dataset``; this lets a single definition serve every
    # split and removes the need for a second, shadowing definition.
    example["dialogue"] = INSTRUCTION_PREFIX + example["dialogue"]
    return example


# Build the instruction-prefixed train/test splits used by the rest of the notebook.
ds = DatasetDict()
for split_name in ("train", "test"):
    ds[split_name] = dataset[split_name].map(update_dataset)


# Measure the longest tokenized dialogue and the longest tokenized summary over
# train + test; preprocess_function uses these as its max_length values.
combined_splits = concatenate_datasets([ds["train"], ds["test"]])
text_columns = ["dialogue", "summary"]

tokenized_inputs = combined_splits.map(
    lambda batch: tokenizer(batch["dialogue"], truncation=True),
    batched=True,
    remove_columns=text_columns,
)
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])

tokenized_targets = combined_splits.map(
    lambda batch: tokenizer(batch["summary"], truncation=True),
    batched=True,
    remove_columns=text_columns,
)
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])

def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of dialogues and summaries into model inputs and labels.

    The dialogues are used as-is: the instruction text was already prepended
    when ``ds`` was built, so no prompt is added here. (Author's note: wrapping
    the instruction in <I></I> or <sep></sep> markers made little difference.)

    Args:
        sample: Batch providing "dialogue" and "summary" columns.
        padding: Padding strategy forwarded to the tokenizer.

    Returns:
        The tokenized dialogues with an added "labels" entry that holds the
        token ids of the tokenized summaries.
    """
    dialogues = list(sample["dialogue"])

    model_inputs = tokenizer(dialogues, max_length=max_source_length, padding=padding, truncation=True)
    target_encoding = tokenizer(
        text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True
    )

    label_ids = target_encoding["input_ids"]
    if padding == "max_length":
        # With fixed-length padding, replace every pad id in the labels by -100.
        pad_id = tokenizer.pad_token_id
        label_ids = [[-100 if token == pad_id else token for token in row] for row in label_ids]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize every split in `ds` and drop the raw text / id columns afterwards.
# NOTE(review): the original author marked this step with "the problem is here"
# without further detail - confirm whether an open issue remains.
tokenized_dataset = ds.map(
    preprocess_function,
    batched=True,
    remove_columns=["dialogue", "summary", "id"],
)


Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/15551 [00:00<?, ? examples/s]

Map:   0%|          | 0/15551 [00:00<?, ? examples/s]

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [5]:
# Show the first instruction-prefixed test dialogue.
ds["test"][0]["dialogue"]

"Summarize the following dialogue. \nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye"

In [6]:
# Compare the token ids of the first two tokenized test examples.
first_ids, second_ids = tokenized_dataset["test"][:2]["input_ids"]
first_ids == second_ids

False

In [7]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSeq2SeqLM

# Download the "punkt" data used for sentence splitting.
nltk.download("punkt")

# Load the pretrained seq2seq checkpoint and the ROUGE metric.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Reformat decoded texts so that each sentence sits on its own line.

    Every string is stripped of surrounding whitespace, split into sentences
    with ``sent_tokenize`` and re-joined with newline characters.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of lists holding the reformatted strings.
    """
    def _one_sentence_per_line(text):
        return "\n".join(sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels

def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may also arrive as a tuple, in which case its first
            element is used.

    Returns:
        Dict with the ROUGE scores scaled to 0-100 and rounded to 4 decimals,
        plus "gen_len": the mean number of non-padding tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded. Labels carry it where padding was masked during
    # preprocessing, and predictions can contain it as well (this notebook
    # sanitizes `trainer.predict` output for the same reason), so replace it
    # with the pad id in both arrays before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Count tokens after sanitizing so that -100 filler is not mistaken for
    # generated content.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [8]:
from transformers import DataCollatorForSeq2Seq

# Id used to pad the labels inside each collated batch; it matches the -100
# that preprocess_function writes in place of padding tokens.
label_pad_token_id = -100

# Collator used by the trainer to assemble batches; sequence lengths are
# padded to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8
)

In [10]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Output directory / hub id built from the checkpoint name (the part after the
# "/") and the dataset id.
repository_id = "-".join([model_id.split("/")[1], dataset_id])

training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    # Optimization
    learning_rate=5e-5,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=False,
    # Evaluation: generate text so compute_metrics can score decoded summaries
    predict_with_generate=True,
    evaluation_strategy="epoch",
    # Checkpointing
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Logging
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    report_to="tensorboard",
    # Hub settings (pushing is switched off)
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Wire the model, training arguments, collator, datasets and metric function
# into a trainer.
# NOTE(review): the test split is also used as the per-epoch evaluation set.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [11]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.4677,1.387752,46.4152,22.8374,38.7482,42.7404,17.384615
2,1.3813,1.383245,46.6159,22.9825,39.069,42.8873,17.384615


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=1842, training_loss=1.41731051798105, metrics={'train_runtime': 1151.6711, 'train_samples_per_second': 25.584, 'train_steps_per_second': 1.599, 'total_flos': 2.017569063252787e+16, 'train_loss': 1.41731051798105, 'epoch': 2.0})

In [None]:
# trainer.evaluate()

In [None]:
# {'eval_loss': 1.065560221672058,
#  'eval_rouge1': 41.055,
#  'eval_rouge2': 18.0998,
#  'eval_rougeL': 35.6308,
#  'eval_rougeLsum': 37.4082,
#  'eval_gen_len': 18.845333333333333,
#  'eval_runtime': 373.4175,
#  'eval_samples_per_second': 4.017,
#  'eval_steps_per_second': 1.004,
#  'epoch': 2.0}

In [12]:
def my_tokenize(examples):
    """Tokenize a batch of dialogues for inference.

    Padding is enabled and inputs are truncated to at most 512 tokens.
    """
    return tokenizer(
        examples["dialogue"],
        padding=True,
        truncation=True,
        max_length=512,
    )


# Tokenized copy of the instruction-prefixed test split, used for prediction.
td = ds["test"].map(my_tokenize, batched=True)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [13]:
# Generate summaries for the tokenized test split with a length cap of 100.
ps = trainer.predict(td, max_length=100)

In [14]:
import numpy as np

# Positions equal to -100 in the generated ids cannot be decoded; swap them for
# the pad token, which is then dropped by `skip_special_tokens=True`.
prediction_ids = np.where(ps.predictions != -100, ps.predictions, tokenizer.pad_token_id)

# Decode all sequences in one call. `max_length` is not a decoding option (the
# length cap was already applied in `trainer.predict`), so it is not passed here.
predictions = tokenizer.batch_decode(prediction_ids, skip_special_tokens=True)

# Reference summaries from the original test split, in the same order as `predictions`.
references = list(dataset["test"]["summary"])

In [15]:
# Spot-check a handful of generated summaries.
for sample_idx in (0, 1, 2, 10, 11, 12):
    print(predictions[sample_idx])

Amanda can't find Betty's number. Amanda will ask Larry. He called Betty last time they were at the park together. Hannah doesn't know Larry well. Amanda will text Larry.
Eric and Rob are watching a stand-up of Eric's. Eric and Rob will watch some of his stand-ups on youtube.
Lenny wants to buy two pairs of purple trousers. Bob recommends the first pair or the third pair.
Wanda and Gina are going to make a party on Friday.
Martin won two cinema tickets online. He wrote a short review on Facebook and will see the new film with Redford till the end of the week.
Charlee is in class. She is preparing a performance in Portuguese. The writer is Mroek.


In [16]:
from rouge_score import rouge_scorer, scoring

def compute_rouge(predictions, references):
    """Score predictions against references with stemmed ROUGE-1/2/L.

    Each (prediction, reference) pair is scored individually, the scores are
    pooled with a ``BootstrapAggregator``, and the F-measure of every metric's
    ``mid`` estimate is reported as a percentage.

    Args:
        predictions: Generated summaries.
        references: Reference summaries, paired with ``predictions`` by position.

    Returns:
        Dict mapping each ROUGE type to its aggregated F-measure times 100.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()

    for prediction, reference in zip(predictions, references):
        # The reference is passed first, then the prediction.
        aggregator.add_scores(scorer.score(reference, prediction))

    final_scores = {}
    for rouge_type, aggregate in aggregator.aggregate().items():
        final_scores[rouge_type] = aggregate.mid.fmeasure * 100
    return final_scores

# Score the decoded test predictions against the reference summaries and print
# the resulting ROUGE percentages.
rouge_scores = compute_rouge(predictions, references)
print(rouge_scores)


{'rouge1': 49.70866457051946, 'rouge2': 24.80475757667293, 'rougeL': 41.053256049299115}


In [17]:
# from evaluate import load
# bertscore = load("bertscore")

# results = bertscore.compute(predictions=predictions, references=references, lang="en")

In [18]:
# Write one prediction per line, then download the file through Colab.
output_path = "flan_t5_base_samsum_raw.txt"
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(string + '\n' for string in predictions)

from google.colab import files
files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>