<a href="https://colab.research.google.com/github/ziadtarek66/agoga/blob/main/t5_summarize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score
!pip install datasets transformers

Collecting transformers
  Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.1
    Uninstalling transformers-4.40.1:
      Successfully uninstalled transformers-4.40.1
Successfully installed transformers-4.40.2
Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_6

In [2]:
# Imports

import torch
import pprint
import evaluate
import numpy as np
from transformers import AutoTokenizer
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [3]:
pp = pprint.PrettyPrinter()

In [4]:
# Prepare Dataset

dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.32M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2224 [00:00<?, ? examples/s]

In [5]:
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)

In [6]:
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

In [7]:
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})


In [8]:
# Dataset Analysis

def find_longest_length(dataset):
    """
    Find the longest article and summary in the entire training set.
    """
    max_length = 0
    counter_4k = 0
    counter_2k = 0
    counter_1k = 0
    counter_500 = 0
    for text in dataset:
        corpus = [
            word for word in text.split()
        ]
        if len(corpus) > 4000:
            counter_4k += 1
        if len(corpus) > 2000:
            counter_2k += 1
        if len(corpus) > 1000:
            counter_1k += 1
        if len(corpus) > 500:
            counter_500 += 1
        if len(corpus) > max_length:
            max_length = len(corpus)
    return max_length, counter_4k, counter_2k, counter_1k, counter_500

longest_article_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(dataset_train['Articles'])
print(f"Longest article length: {longest_article_length} words")
print(f"Artciles larger than 4000 words: {counter_4k}")
print(f"Artciles larger than 2000 words: {counter_2k}")
print(f"Artciles larger than 1000 words: {counter_1k}")
print(f"Artciles larger than 500 words: {counter_500}")
longest_summary_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(dataset_train['Summaries'])
print(f"Longest summary length: {longest_summary_length} words")
print(f"Summaries larger than 4000 words: {counter_4k}")
print(f"Summaries larger than 2000 words: {counter_2k}")
print(f"Summaries larger than 1000 words: {counter_1k}")
print(f"Summaries larger than 500 words: {counter_500}")

Longest article length: 3408 words
Artciles larger than 4000 words: 0
Artciles larger than 2000 words: 6
Artciles larger than 1000 words: 20
Artciles larger than 500 words: 343
Longest summary length: 1706 words
Summaries larger than 4000 words: 0
Summaries larger than 2000 words: 0
Summaries larger than 1000 words: 6
Summaries larger than 500 words: 15


In [9]:
def find_avg_sentence_length(dataset):
    """
    Find the average sentence in the entire training set.
    """
    sentence_lengths = []
    for text in dataset:
        corpus = [
            word for word in text.split()
        ]
        sentence_lengths.append(len(corpus))
    return sum(sentence_lengths)/len(sentence_lengths)

avg_article_length = find_avg_sentence_length(dataset_train['Articles'])
print(f"Average article length: {avg_article_length} words")
avg_summary_length = find_avg_sentence_length(dataset_train['Summaries'])
print(f"Averrage summary length: {avg_summary_length} words")

Average article length: 378.4097807757167 words
Averrage summary length: 164.96233839235526 words


In [10]:
# Configurations

MODEL = 't5-base'
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 10
OUT_DIR = 'results_t5base'
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.

In [11]:
# Tokenization

tokenizer = T5Tokenizer.from_pretrained(MODEL)



spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Function to convert text data into model inputs and targets

def preprocess_function(examples):
    inputs = [f"summarize: {article}" for article in examples['Articles']]
    model_inputs = tokenizer(
        inputs,
        max_length = MAX_LENGTH,
        truncation = True,
        padding = 'max_length'
    )

    # Set up the tokenizer for targets

    targets = [summary for summary in examples['Summaries']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length = MAX_LENGTH,
            truncation = True,
            padding ='max_length'
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [13]:
# Apply the function to the whole dataset

tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/1779 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/445 [00:00<?, ? examples/s]



In [14]:
# Model

model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

222,903,552 total parameters.
222,903,552 training parameters.


In [15]:
# ROUGE Metric

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [17]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [18]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0001,
    dataloader_num_workers=2  # Adjust the number of workers
)


In [19]:
tokenizer.save_pretrained(OUT_DIR)

('results_t5base/tokenizer_config.json',
 'results_t5base/special_tokens_map.json',
 'results_t5base/spiece.model',
 'results_t5base/added_tokens.json')

In [20]:
!zip -r {OUT_DIR} {OUT_DIR}

  adding: results_t5base/ (stored 0%)
  adding: results_t5base/special_tokens_map.json (deflated 85%)
  adding: results_t5base/added_tokens.json (deflated 83%)
  adding: results_t5base/tokenizer_config.json (deflated 94%)
  adding: results_t5base/spiece.model (deflated 48%)


# **Inference**





In [21]:
# Download data.

!wget "https://www.dropbox.com/scl/fi/561r8pfhem4lu70hf438q/inference_data.zip?rlkey=aedt2saqmmp3a67qc4o34k04y&dl=1" -O inference_data.zip

--2024-05-08 13:34:42--  https://www.dropbox.com/scl/fi/561r8pfhem4lu70hf438q/inference_data.zip?rlkey=aedt2saqmmp3a67qc4o34k04y&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.18, 2620:100:6017:18::a27d:212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc11371138ed97699432590d5336.dl.dropboxusercontent.com/cd/0/inline/CSgmljI_ahRvl_uKKRqctOPd6hpZq-AuucQK1S3SbICwmKNWa0EQnbVZLqMUHWxgguNAXMm0Q0Fpk0ozSSwedDeXWfQ7GULILq4yFCr44LZOZDwQkrSycYPVEzsu6w7b0LAe48IzgAlGT6JlsUxtcCRP/file?dl=1# [following]
--2024-05-08 13:34:43--  https://uc11371138ed97699432590d5336.dl.dropboxusercontent.com/cd/0/inline/CSgmljI_ahRvl_uKKRqctOPd6hpZq-AuucQK1S3SbICwmKNWa0EQnbVZLqMUHWxgguNAXMm0Q0Fpk0ozSSwedDeXWfQ7GULILq4yFCr44LZOZDwQkrSycYPVEzsu6w7b0LAe48IzgAlGT6JlsUxtcCRP/file?dl=1
Resolving uc11371138ed97699432590d5336.dl.dropboxusercontent.com (uc11371138ed97699432590d5336.dl.dropboxusercontent.com).

In [22]:
!unzip inference_data.zip

Archive:  inference_data.zip
  inflating: inference_data/file_1.txt  
  inflating: inference_data/file_2.txt  


In [23]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

import glob

In [24]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = "path/to/your/model/directory"
model_checkpoint = "checkpoint-4450"

model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

OSError: Incorrect path_or_model_id: 'path/to/your/model/directory'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [25]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )

    # Generate the summary
    summary_ids = model.generate(
        inputs,
        max_length=50,
        num_beams=num_beams,
        # early_stopping=True,
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [26]:
for file_path in glob.glob('inference_data/*.txt'):
    file = open(file_path)
    text = file.read()
    summary = summarize_text(text, model, tokenizer)
    pp.pprint(summary)
    print('-'*75)

("the chatGPT company will get its third CEO in three days. it's another major "
 'shakeup to the balance of power over artificial intelligence.')
---------------------------------------------------------------------------
('the leader of one of the world’s most influential AI companies, openAI, was '
 'fired Friday night by the startup’s board in a surprise move. within about '
 "48 hours, he'd been hired to run a")
---------------------------------------------------------------------------
