# Mount Drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installation and import libraries

In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m112.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m75.0 MB/s[0m et

In [None]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [None]:
!nvidia-smi

Wed Oct  4 12:47:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from transformers import pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, TrainingArguments, Trainer

import matplotlib.pyplot as plt
from datasets import load_dataset, load_metric
import pandas as pd
import torch

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Check GPU

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

# Load Dataset

In [None]:
dataset_samsum = load_dataset("samsum")
dataset_samsum

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:
split_length = [len(dataset_samsum[split])for split in dataset_samsum]
split_length

[14732, 819, 818]

In [None]:
print(f"Features: {dataset_samsum['train'].column_names}")

Features: ['id', 'dialogue', 'summary']


# Load pegasus-cnn_dailymail Model from Hugging face

In [None]:
model_name = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pegasus_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

# Check default Dialogue and Summary Without Training

In [None]:
print("\nDialogue:")
print(dataset_samsum["test"][1]["dialogue"])

print("\nSummary:")
print(dataset_samsum["test"][1]["summary"])


Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [None]:
dialogue = dataset_samsum['test'][0]['dialogue']
dialogue

"Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye"

In [None]:
pipe = pipeline('summarization', model = model_name)
pipe_out = pipe(dialogue)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


In [None]:
print(pipe_out[0]['summary_text'].replace(".<n>", ".\n "))

Amanda: Ask Larry Amanda: He called her last time we were at the park together .
 Hannah: I'd rather you texted him .
 Amanda: Just text him .


# ROUGE score before train

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]


def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

        # Finally, we decode the generated texts,
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]

        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]


        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [None]:
rouge_metric = load_metric('rouge')

score = calculate_metric_on_test_ds(dataset_samsum['test'], rouge_metric, pegasus_model, tokenizer, column_text = 'dialogue', column_summary='summary', batch_size=8)



  rouge_metric = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

100%|██████████| 103/103 [19:02<00:00, 11.09s/it]


In [None]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = ['pegasus'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.0155,0.0003,0.015451,0.015475


# **Model Train 1 - 3 epoch**

## Convert into numerical form

In [None]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 128, truncation = True )

    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
dataset_samsum_pt['train'][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.',
 'input_ids': [12195,
  151,
  125,
  7091,
  3659,
  107,
  842,
  119,
  245,
  181,
  152,
  10508,
  151,
  7435,
  147,
  12195,
  151,
  125,
  131,
  267,
  650,
  119,
  3469,
  29344,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [12195, 7091, 3659, 111, 138, 650, 10508, 181, 3469, 107, 1]}

## Training 3 epoch

In [None]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model = pegasus_model)

In [None]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.34.0', '0.23.0')

In [None]:

print(torch.__version__)
print(torch.version.cuda)


2.0.1+cu118
11.8


In [None]:
torch.cuda.empty_cache()

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(output_dir="/content/drive/MyDrive/Custom_Pagasus/custom-pegasus-model",
                                   num_train_epochs=4, warmup_steps=500,per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                   weight_decay=0.01, logging_steps=10,evaluation_strategy='steps',eval_steps=500,
                                   save_steps=1e6,gradient_accumulation_steps=16, learning_rate=1e-4, report_to="tensorboard")


In [None]:
trainer = Trainer(model=pegasus_model, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])

## Tensor Board

In [1]:
%load_ext tensorboard

In [6]:
# %reload_ext tensorboard

In [7]:
%tensorboard --logdir /content/drive/MyDrive/Custom_Pagasus/custom-pegasus-model/runs

Reusing TensorBoard on port 6006 (pid 1737), started 0:01:11 ago. (Use '!kill 1737' to kill it.)

<IPython.core.display.Javascript object>

## Trail codes GPT

In [None]:
# # Visualize training history
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# import matplotlib.pyplot as plt
# import numpy as np

# pegasus_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# history = pegasus_model.fit(dataset_samsum_pt["train"]["dialogue"], dataset_samsum_pt["train"]["summary"], dataset_samsum_pt["validation"], epochs=7, batch_size=10, verbose=0  )

In [None]:
# # Initialize empty lists to store loss values
# train_losses = []
# val_losses = []

# # Define the training loop
# def compute_loss(input_ids, attention_mask, labels):
#     # Forward pass and compute loss (modify this as needed for your specific dataset)
#     outputs = pegasus_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#     loss = outputs.loss
#     return loss.item()

# # Training loop with loss collection
# for epoch in range(training_args.num_train_epochs):
#     for batch in trainer.get_train_dataloader():
#         # Ensure that each batch contains input_ids, attention_mask, and labels
#         input_ids = batch["input_ids"]
#         attention_mask = batch["attention_mask"]
#         labels = batch["labels"]

#         train_loss = compute_loss(input_ids, attention_mask, labels)
#         train_losses.append(train_loss)

#     # Validation loop with loss collection
#     for batch in trainer.get_eval_dataloader():
#         # Ensure that each batch contains input_ids, attention_mask, and labels
#         input_ids = batch["input_ids"]
#         attention_mask = batch["attention_mask"]
#         labels = batch["labels"]

#         val_loss = compute_loss(input_ids, attention_mask, labels)
#         val_losses.append(val_loss)


You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: ignored

## Train

In [None]:
# Train
history = trainer.train()

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,1.593,1.44716
1000,1.4878,1.378822
1500,1.2584,1.35186
2000,1.3066,1.342812
2500,1.2455,1.334174
3000,1.1666,1.339762
3500,1.1243,1.335814


In [None]:
history


TrainOutput(global_step=3680, training_loss=1.379250509324281, metrics={'train_runtime': 11780.6071, 'train_samples_per_second': 5.002, 'train_steps_per_second': 0.312, 'total_flos': 2.210989387977523e+16, 'train_loss': 1.379250509324281, 'epoch': 4.0})

In [None]:
# Optionally, evaluate the model
results = trainer.evaluate()

## Error Train

In [None]:
# Train --- keep for error sample ---
# history = trainer.train()

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,1.593,1.44716
1000,1.4955,1.377701
1500,1.2522,1.353995
2000,1.2779,1.343041
2500,1.2389,1.33518
3000,1.1525,1.343702
3500,1.105,1.342037


FailedPreconditionError: ignored

## Check Machine conditions

In [None]:
!nvidia-smi


Wed Oct  4 16:06:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    31W /  70W |  14877MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import psutil

# Get memory usage in bytes
memory_info = psutil.virtual_memory()

# Print memory usage information
print(f"Total Memory: {memory_info.total} bytes")
print(f"Available Memory: {memory_info.available} bytes")
print(f"Used Memory: {memory_info.used} bytes")
print(f"Memory Usage Percentage: {memory_info.percent}%\n\n")
print(f"CPU Cores: {psutil.cpu_count(logical=False)} (Physical Cores)")
print(f"CPU Threads: {psutil.cpu_count(logical=True)} (Logical Cores)")
print(f"CPU Percent: {psutil.cpu_percent(0.1)}%")


Total Memory: 13613260800 bytes
Available Memory: 6933676032 bytes
Used Memory: 6328504320 bytes
Memory Usage Percentage: 49.1%


CPU Cores: 1 (Physical Cores)
CPU Threads: 2 (Logical Cores)
CPU Percent: 52.6%


## Plot Graph

In [None]:
# Print the keys (column names) in history.metrics
print(history.metrics.keys())


dict_keys(['train_runtime', 'train_samples_per_second', 'train_steps_per_second', 'total_flos', 'train_loss', 'epoch'])


In [None]:
print(results.keys())


dict_keys(['eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])


In [None]:
import matplotlib.pyplot as plt

# Extract the training and validation losses from the training history
train_losses = history.metrics["train_loss"]
eval_losses = results["eval_loss"]

# Extract the number of steps for training and evaluation
train_steps = list(range(0, len(train_losses) * training_args.logging_steps, training_args.logging_steps))
eval_steps = list(range(0, len(eval_losses) * training_args.eval_steps, training_args.eval_steps))

# Plot the training and validation loss curves
plt.figure(figsize=(16, 8))
plt.plot(train_steps, train_losses, label="Training Loss", marker='o', linestyle='-')
plt.plot(eval_steps, eval_losses, label="Validation Loss", marker='o', linestyle='-')

plt.title("Training and Validation Loss Curves")
plt.xlabel("Training Steps")
plt.ylabel("Loss")
plt.legend(['Train', 'Validation'], loc='upper left')
plt.grid(True)

# Show the plot
plt.show()


TypeError: ignored

## ROUGE score after train

In [None]:
rouge_metric = load_metric('rouge')

score = calculate_metric_on_test_ds(
    dataset_samsum['test'], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary')

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )


##Save New Trained model

In [None]:
# Save model
trainer.save_model("/content/drive/MyDrive/Custom_Pagasus/custom-model")

In [None]:
# # Save tokenizer
# tokenizer.save_pretrained("/content/drive/MyDrive/Custom_Pagasus/custom-tokenizer")

## Plot graph GPT

In [None]:
# Create a figure and plot both curves on the same graph
plt.figure(figsize=(16, 10))  # Adjust the figure size as needed
epochs = range(1, len(train_losses) + 1)
plt.plot(epochs, train_losses, label='Training Loss', marker='o')
plt.plot(epochs, val_losses, label='Validation Loss', marker='o')
plt.title('Training and Validation Loss Curves')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.grid(True)

# Show the plot
plt.show()

TypeError: ignored

<Figure size 1600x1000 with 0 Axes>

## Test New Trained model

In [3]:
%cd /content/drive/MyDrive/Custom_Pagasus

[Errno 2] No such file or directory: '/content/drive/MyDrive/Custom_Pagasus'
/content


In [None]:
pwd

'/content/drive/.shortcut-targets-by-id/1tcti9tUj6O_6zpXs7Zdx_DDLB6Xuk1hT/Custom_Pagasus'

In [None]:
ls

[0m[01;34mcustom-model[0m/  [01;34mcustom-pegasus-model[0m/


In [None]:
model_name = "custom-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
dataset_samsum = load_dataset("samsum")

In [None]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

pipe = pipeline("summarization", model=custom_model,tokenizer=tokenizer)

In [None]:
sample_text = dataset_samsum["test"][0]["dialogue"]
print("\nDialogue:")
sample_text


Dialogue:


"Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye"

In [None]:
reference = dataset_samsum["test"][0]["summary"]
print("\nReference Summary:")
reference


Reference Summary:


"Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."

In [None]:
print("\nNew Trained Model Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)



New Trained Model Summary:
Amanda can't find Betty's number. Larry called her the last time they were at the park together. Hannah wants Amanda to text him instead.


# **Model Train 4 - 7 epoch**

## Load New Trained Model

In [None]:
%cd /content/drive/MyDrive/Samsum_Pagasus_New_Train

/content/drive/MyDrive/Samsum_Pagasus_New_Train


In [None]:
pwd

'/content/drive/MyDrive/Samsum_Pagasus_New_Train'

In [None]:
ls

[0m[01;34mnew-train-tokenizer[0m/  [01;34mpegasus-samsum[0m/  [01;34mpegasus-samsum-model-new-train[0m/


In [None]:
pegasus_model_train = AutoModelForSeq2SeqLM.from_pretrained("pegasus-samsum-model-new-train").to(device)
tokenizer = AutoTokenizer.from_pretrained("new-train-tokenizer")

## Convert to numerical form

In [None]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 128, truncation = True )

    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [None]:
dataset_samsum_pt['train'][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.',
 'input_ids': [12195,
  151,
  125,
  7091,
  3659,
  107,
  842,
  119,
  245,
  181,
  152,
  10508,
  151,
  7435,
  147,
  12195,
  151,
  125,
  131,
  267,
  650,
  119,
  3469,
  29344,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [12195, 7091, 3659, 111, 138, 650, 10508, 181, 3469, 107, 1]}

## Training 4 epoch

In [None]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model = pegasus_model_train)

In [None]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.33.2', '0.23.0')

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, TrainingArguments, Trainer

trainer_args = TrainingArguments(output_dir='/content/drive/MyDrive/Samsum_Pagasus_New_Train/pegasus-samsum',
                                 num_train_epochs=4, warmup_steps=500,per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                 weight_decay=0.01, logging_steps=10,evaluation_strategy='steps',eval_steps=500,
                                 save_steps=1e6,gradient_accumulation_steps=16, learning_rate=1e-4, report_to="tensorboard")


In [None]:
trainer = Trainer(model=pegasus_model_train, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])

In [None]:
trainer.train()

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,1.2124,1.37608
1000,1.2464,1.3754
1500,1.0903,1.363006
2000,1.2144,1.362134
2500,1.2179,1.349139
3000,1.195,1.348549
3500,1.1569,1.346475


TrainOutput(global_step=3680, training_loss=1.217842554786931, metrics={'train_runtime': 11575.6479, 'train_samples_per_second': 5.091, 'train_steps_per_second': 0.318, 'total_flos': 2.210989387977523e+16, 'train_loss': 1.217842554786931, 'epoch': 4.0})

## ROUGE score after train

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]


def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

        # Finally, we decode the generated texts,
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]

        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]


        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [None]:
rouge_metric = load_metric('rouge')

score = calculate_metric_on_test_ds(
    dataset_samsum['test'], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary')

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )


  rouge_metric = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

100%|██████████| 410/410 [16:12<00:00,  2.37s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.01832,0.000285,0.018213,0.018252


## Save New Trained Model

In [None]:
## Save model
pegasus_model_train.save_pretrained("/content/drive/MyDrive/Samsum_Pagasus_New_Train/pegasus-samsum-model-new-train")

In [None]:
## Save tokenizer
tokenizer.save_pretrained("/content/drive/MyDrive/Samsum_Pagasus_New_Train/new-train-tokenizer")

('/content/drive/MyDrive/Samsum_Pagasus_New_Train/new-train-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Samsum_Pagasus_New_Train/new-train-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Samsum_Pagasus_New_Train/new-train-tokenizer/tokenizer.json')

## Test New Trained model

In [None]:
%cd /content/drive/MyDrive/Samsum_Pagasus_New_Train

/content/drive/MyDrive/Samsum_Pagasus_New_Train


In [None]:
pwd

'/content/drive/MyDrive/Samsum_Pagasus_New_Train'

In [None]:
ls

[0m[01;34mnew-train-tokenizer[0m/  [01;34mpegasus-samsum[0m/  [01;34mpegasus-samsum-model-new-train[0m/


In [None]:
tokenizer = AutoTokenizer.from_pretrained("new-train-tokenizer")

In [None]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

pipe = pipeline("summarization", model="pegasus-samsum-model-new-train",tokenizer=tokenizer)


In [None]:
sample_text = dataset_samsum["test"][0]["dialogue"]
print("\nDialogue:")
sample_text


Dialogue:


"Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye"

In [None]:
reference = dataset_samsum["test"][0]["summary"]
print("\nReference Summary:")
reference


Reference Summary:


"Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."

In [None]:
print("\nNew Trained Model Summary (epoch 3):")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)



New Trained Model Summary:
Amanda can't find Betty's number. Larry called her last time they were at the park together. Hannah wants Amanda to text Larry. Amanda will text Larry.


In [None]:
print("\nNew Trained Model Summary (epoch 7):")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)



New Trained Model Summary (epoch 4):
Amanda can't find Betty's number. Larry called her the last time they were at the park together. Hannah wants Amanda to text Larry instead.
