# Text Summarization with T5 using Hugging Face Transformers (TensorFlow)

In [2]:
import tensorflow as tf
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
import tensorflow_datasets as tfds
import time
import os
import re

import logging
logging.basicConfig(level=logging.ERROR)

### Params

In [4]:
# Number of examples per batch (passed to padded_batch further down).
BATCH_SIZE = 8

# Buffer size passed to Dataset.shuffle(); the "SHUFFEL" spelling is used throughout the notebook.
SHUFFEL_SIZE = 1024

# Learning rate handed to the Adam optimizer.
learning_rate = 3e-5

# Name of the pretrained checkpoint loaded for both the tokenizer and the model.
model_size = "t5-small"

## Define Pretrained Model and Tokenizer

In [5]:
# Load the tokenizer and the conditional-generation model from the same checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model_size)

model = TFT5ForConditionalGeneration.from_pretrained(model_size)

# If the checkpoint config carries task-specific settings, merge its "summarization"
# entry (or nothing, if that key is absent) into the model config.
task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))

# Pad token id; used later to mark padding positions in the labels.
pad_token_id = tokenizer.pad_token_id

In [6]:
# Adam optimizer using the notebook-level learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# from_logits=True: the loss is applied to the model's raw prediction scores.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Running training metrics; updated inside train_step and reset at the start of each epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# Running validation metrics; updated inside val_step and reset at the start of each epoch.
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

# Print the layer/parameter overview of the loaded model.
model.summary()

Model: "tf_t5for_conditional_generation"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (TFSharedEmbeddings)  multiple                  16449536  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  18881280  
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  25176064  
Total params: 60,506,880
Trainable params: 60,506,880
Non-trainable params: 0
_________________________________________________________________


## Load Dataset

In [8]:
from pathlib import Path
import re
import logging
logging.basicConfig(level=logging.ERROR)

# NOTE(review): `tokenizer` and `pad_token_id` are re-created here with the same
# expressions used in the model-definition cell; presumably so this data-loading cell
# can run on its own — otherwise redundant.
tokenizer = T5Tokenizer.from_pretrained(model_size)
pad_token_id = tokenizer.pad_token_id
# Task prefix prepended to every article before tokenization.
prefix = "summarize: "

def transfrom(x):
    """Drop the first "; "-delimited field of a line and unwrap one pair of single quotes.

    The line is split on "; ", the first field is discarded and the remaining
    fields are re-joined with single spaces. The greedy pattern ``'(.*)'`` then
    removes the first and the last single quote found on a line, keeping the
    text between them.

    Args:
        x: One raw line of text.

    Returns:
        The cleaned string; empty if the line contains no "; " separator.
    """
    _, *remaining_fields = x.split("; ")
    rejoined = " ".join(remaining_fields)
    return re.sub("'(.*)'", r"\1", rejoined)


def tokenize_articles(text):
    """Encode one article, with the task prefix prepended, for the encoder.

    Args:
        text: The article text.

    Returns:
        A pair ``(input_ids, attention_mask)``; both are the tokenizer's tensors
        passed through ``tf.squeeze``. The tokenizer is asked for a maximum
        length of 512 with padding to that length.
    """
    encoded = tokenizer.encode_plus(
        (prefix + text),
        return_tensors="tf",
        max_length=512,
        pad_to_max_length=True,
    )
    input_ids = tf.squeeze(encoded['input_ids'])
    attention_mask = tf.squeeze(encoded['attention_mask'])
    return input_ids, attention_mask
        
def tokenize_highlights(text):
    """Encode one highlight and derive decoder inputs and shifted labels from it.

    Args:
        text: The highlight (reference summary) text.

    Returns:
        A triple ``(y, y_ids, lm_labels)`` where ``y`` is the squeezed token-id
        tensor, ``y_ids`` is ``y`` without its last element, and ``lm_labels`` is
        ``y`` without its first element, with every pad-token position replaced
        by -100.
    """
    encoded = tokenizer.encode(text, return_tensors="tf", max_length=150, pad_to_max_length=True)
    y = tf.squeeze(encoded)

    # Decoder inputs drop the final token; labels are the same sequence shifted by one.
    y_ids = y[:-1]
    shifted = y[1:]
    is_padding = tf.equal(shifted, pad_token_id)
    lm_labels = tf.where(is_padding, -100, tf.identity(shifted))

    return y, y_ids, lm_labels


def get_data(name):
    """Read and clean the article and highlight lines of one data split.

    Args:
        name: Split directory name under ``../data`` (e.g. "train").

    Returns:
        A pair ``(articles, highlights)``: two lists of strings, one entry per
        line of the respective file, each right-stripped and passed through
        ``transfrom``.
    """
    article_path = "../data/%s/articles_german" % name
    highlights_path = "../data/%s/highlights_german" % name

    # Context managers close both files deterministically; the bare open() calls
    # used before left the handles open until garbage collection.
    with open(article_path) as article_file:
        articles = [transfrom(line.rstrip()) for line in article_file]
    with open(highlights_path) as highlights_file:
        highlights = [transfrom(line.rstrip()) for line in highlights_file]
    return articles, highlights
    
    
def get_tokinized_ds(articles, highlights):
    """Tokenize all articles and highlights and collect the results column-wise.

    Args:
        articles: Iterable of article strings.
        highlights: Iterable of highlight strings.

    Returns:
        A 5-tuple of lists ``(x, x_mask, y, y_ids, y_labels)``: the two outputs of
        ``tokenize_articles`` per article followed by the three outputs of
        ``tokenize_highlights`` per highlight.
    """
    # Articles are tokenized first, then highlights.
    encoded_articles = [tokenize_articles(article) for article in articles]
    encoded_highlights = [tokenize_highlights(highlight) for highlight in highlights]

    x = [ids for ids, _ in encoded_articles]
    x_mask = [mask for _, mask in encoded_articles]

    y = [full for full, _, _ in encoded_highlights]
    y_ids = [decoder_in for _, decoder_in, _ in encoded_highlights]
    y_labels = [labels for _, _, labels in encoded_highlights]

    return x, x_mask, y, y_ids, y_labels

def get_translated_ds(name):
    """Load the split called `name` from disk and return its tokenized form.

    Returns whatever ``get_tokinized_ds`` returns for the articles and
    highlights read by ``get_data(name)``.
    """
    return get_tokinized_ds(*get_data(name))

In [9]:
# Read and tokenize the "train" split (../data/train/articles_german and highlights_german).
train = get_translated_ds("train")

In [10]:
# Build a dataset from the (x, x_mask, y, y_ids, y_labels) lists returned by get_translated_ds.
train_ds = tf.data.Dataset.from_tensor_slices(train)

In [12]:
skip = False
drive_path = "../data"
if not skip:
    def save_tfrecord_to_bucket(features_dataset, gdrive_folder, file_name):
        """Serialize every dataset element as a tf.train.Example into one TFRecord file.

        Args:
            features_dataset: Iterable of 5-tuples (x, x_mask, y, y_ids, y_labels).
            gdrive_folder: Directory the file is written to.
            file_name: File name without the ".tfrecord" extension.
        """
        def _int64_feature(values):
            # Wrap a sequence of integers as an int64-list feature.
            return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

        target_path = f"{gdrive_folder}/{file_name}.tfrecord"
        with tf.compat.v1.python_io.TFRecordWriter(target_path) as record_writer:
            for x, x_mask, y, y_ids, y_labels in features_dataset:
                example = tf.train.Example(
                    features=tf.train.Features(
                        feature={
                            'x': _int64_feature(x),
                            'x_mask': _int64_feature(x_mask),
                            'y': _int64_feature(y),
                            'y_ids': _int64_feature(y_ids),
                            'y_labels': _int64_feature(y_labels),
                        }
                    )
                )
                record_writer.write(example.SerializeToString())
        print(f"Saved {file_name}.")

    save_tfrecord_to_bucket(train_ds, drive_path, "train_cnn_daily_mail")

Saved train_cnn_daily_mail.


In [70]:
# NOTE(review): `val` is not assigned in any cell visible here — presumably built like `train` via get_translated_ds("val"); confirm.
len(val[0])

13368

In [71]:
# Wrap each split in a tf.data.Dataset.
# NOTE(review): `val` and `test` are not assigned in any cell visible here, so this cell
# relies on kernel state from elsewhere — confirm where they are built.
val_ds = tf.data.Dataset.from_tensor_slices(val)
train_ds = tf.data.Dataset.from_tensor_slices(train)
test_ds = tf.data.Dataset.from_tensor_slices(test)

In [75]:
def write_ds(ds, filename):
    """Write dataset `ds` to `filename` using tf.data.experimental.TFRecordWriter.

    NOTE(review): presumably `ds` must already yield serialized records (scalar
    strings) for this writer — confirm against the TensorFlow docs. This function
    is not called in any cell visible here.
    """
    writer = tf.data.experimental.TFRecordWriter(filename)
    writer.write(ds)
    
# NOTE(review): in the recorded run this call raised AttributeError because the installed
# TensorFlow has no tf.data.experimental.save; presumably a newer release is needed — confirm.
tf.data.experimental.save(val_ds, "val.tfrecord")

AttributeError: module 'tensorflow._api.v2.data.experimental' has no attribute 'save'

In [77]:
# train_ds = tf.data.Dataset.from_tensor_slices(train)\
#     .map(map_func)\
#     .shuffle(SHUFFEL_SIZE)\
#     .padded_batch(BATCH_SIZE, padded_shapes=([512],[512],[149],[149]))\
#     .prefetch(tf.data.experimental.AUTOTUNE)

# Shuffle, pad-batch and prefetch the validation set.
# NOTE(review): `val_ds` is reassigned from itself, so re-running this cell batches an
# already-batched dataset. The four padded_shapes entries also assume 4-component
# elements — confirm against how `val` was built.
val_ds = val_ds\
    .shuffle(SHUFFEL_SIZE)\
    .padded_batch(BATCH_SIZE, padded_shapes=([512],[512],[149],[149]))\
    .prefetch(tf.data.experimental.AUTOTUNE)

# test_ds = tf.data.Dataset.from_tensor_slices(get_translated_ds("test"))\
# .shuffle(SHUFFEL_SIZE)\
# .padded_batch(BATCH_SIZE, padded_shapes=([512],[512],[149],[149]))\
# .prefetch(tf.data.experimental.AUTOTUNE)

In [17]:
# Print a single element of val_ds as a sanity check.
for i in val_ds.take(1):
    print(i)

(<tf.Tensor: shape=(2, 512), dtype=int32, numpy=
array([[21603,    10,     3, ...,  6711,    15,    10],
       [    1,     1,     1, ...,     1,     1,     1]], dtype=int32)>, <tf.Tensor: shape=(2, 149), dtype=int32, numpy=
array([[    3,     7,  1427,    21,  6216,     6,   266,     3,  8104,
        15091,    77,    64,     3,    17,  6125,  2558,    77,     6,
           67,    16,   177,  1283,    49,    64,   943,    49,     3,
         8375,    35,    16,  4183,     7,    64,   814,    35,   193,
         5453,    51,    67,    90,    77,   210,   232,     3, 12228,
           15,     6,  2213,   115,   183,  9996,     3,    51,  3185,
          172,     5,    21,  6216,     6,    74,    35,   873,  5808,
           17,     7,  4350,     3,  8682,   760,   630,  2572,  3186,
          615,     6,     3, 28875,    15,  6575,  2995,   177,     3,
          157,    60,   115,     7,     5,    21,  6216,     6,    67,
          403,     3,     7,   152,    67,   839,     3, 26548,  

## Define Train and Validation Step

In [None]:
@tf.function
def train_step(input_ids, input_mask, y_ids, lm_labels):
    """Run one optimization step on a batch and update the training metrics.

    Args:
        input_ids: Encoder token ids.
        input_mask: Encoder attention mask.
        y_ids: Decoder input ids.
        lm_labels: Target ids aligned with the decoder outputs; positions equal
            to -100 are treated as padding and excluded from loss and accuracy.

    Side effects:
        Applies gradients to ``model`` through ``optimizer`` and updates
        ``train_loss`` / ``train_accuracy``.
    """
    # https://github.com/huggingface/transformers/blob/master/examples/summarization/bart/finetune.py

    # The targets come from `lm_labels`; the previous body indexed a name `y` that is
    # not a parameter of this function. -100 is not a valid class index for the sparse
    # loss, so those positions get a placeholder id of 0 and a weight of zero.
    label_mask = tf.not_equal(lm_labels, -100)
    safe_labels = tf.where(label_mask, lm_labels, tf.zeros_like(lm_labels))
    label_weights = tf.cast(label_mask, tf.float32)

    with tf.GradientTape() as tape:
        # prediction_scores: (bs, 150, 32128)
        # decoder_past_key_value_states: (bs, 512, 512), (bs, 8, 150, 64)
        # z: (bs, 512, 512)
        predictions, _, _ = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, lm_labels=lm_labels, training=True)
        loss = loss_object(safe_labels, predictions, sample_weight=label_weights)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(safe_labels, predictions, sample_weight=label_weights)

In [None]:
@tf.function
def val_step(input_ids, input_mask, y_ids, lm_labels):
    """Evaluate one validation batch and update the validation metrics.

    Args:
        input_ids: Encoder token ids.
        input_mask: Encoder attention mask.
        y_ids: Decoder input ids.
        lm_labels: Target ids aligned with the decoder outputs; positions equal
            to -100 are treated as padding and excluded from loss and accuracy.

    Side effects:
        Updates ``val_loss`` / ``val_accuracy``; no gradients are computed.
    """
    # https://github.com/huggingface/transformers/blob/master/examples/summarization/bart/finetune.py

    # The targets come from `lm_labels`; the previous body indexed a name `y` that is
    # not a parameter of this function. -100 is not a valid class index for the sparse
    # loss, so those positions get a placeholder id of 0 and a weight of zero.
    label_mask = tf.not_equal(lm_labels, -100)
    safe_labels = tf.where(label_mask, lm_labels, tf.zeros_like(lm_labels))
    label_weights = tf.cast(label_mask, tf.float32)

    predictions, _, _ = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, lm_labels=lm_labels, training=False)
    v_loss = loss_object(safe_labels, predictions, sample_weight=label_weights)

    val_loss(v_loss)
    val_accuracy(safe_labels, predictions, sample_weight=label_weights)

## Train

In [None]:
# Training loop: one optimization step per training batch, plus one validation batch
# and a progress line every `log_interval` steps.
# NOTE(review): `strategy` and `len_train` are not defined in any cell visible here, and
# the loop unpacks 4-component batches from `train_ds` — confirm the cells that set
# these up have been run.
EPOCHS = 1
log_interval = 200
for epoch in range(EPOCHS):
    # reset metrics
    train_loss.reset_states()
    train_accuracy.reset_states()

    val_loss.reset_states()
    val_accuracy.reset_states()

    val_batches = iter(val_ds)

    start_time = time.time()
    for i, (input_ids, input_mask, y_ids, labels) in enumerate(train_ds):
        # training
        # strategy.run takes the step function and its arguments; calling the step
        # first (as before) handed its None return value to strategy.run.
        strategy.run(train_step, args=(input_ids, input_mask, y_ids, labels))

        # validation
        if i % log_interval == 0:
            try:
                val_batch = next(val_batches)
            except StopIteration:
                # Validation data ran out before the epoch ended: start over.
                val_batches = iter(val_ds)
                val_batch = next(val_batches)
            x_val, x_mask_val, y_val, y_label = val_batch
            strategy.run(val_step, args=(x_val, x_mask_val, y_val, y_label))
            elapsed = time.time() - start_time
            # Only one batch has been processed when the first line is logged (i == 0).
            batches_timed = log_interval if i else 1
            print('| epoch {:3d} | [{:5d}/{:5d}] | '
                  'ms/batch {:5.2f} | '
                  'train acc {:5.2f} | val acc {:5.2f} |'
                  'loss {:5.2f} | val loss {:5.2f}'.format(
                    epoch, i, int(len_train/BATCH_SIZE),
                    elapsed * 1000 / batches_timed,
                    train_accuracy.result() * 100, val_accuracy.result() * 100, 
                    train_loss.result(),  val_loss.result()))
            start_time = time.time()



UnavailableError: ignored

## Evaluate

### Define Rouge Score

In [None]:
from rouge_score import rouge_scorer
from rouge_score import scoring

class RougeScore:
    '''
    Accumulates ROUGE scores over (target, prediction) pairs and reports aggregates.

    mostly from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/evaluation/metrics.py 
    '''
    
    def __init__(self, score_keys=None)-> None:
        """Create the scorer and an empty aggregator.

        Args:
            score_keys: ROUGE variants to compute. Defaults to
                ["rouge1", "rouge2", "rougeLsum"] when None.
        """
        super().__init__()
        # Store caller-supplied keys too; previously only the default branch set the
        # attribute, so any explicit value raised AttributeError on the next line.
        if score_keys is None:
            self.score_keys = ["rouge1", "rouge2", "rougeLsum"]
        else:
            self.score_keys = list(score_keys)
        
        self.scorer = rouge_scorer.RougeScorer(self.score_keys)
        self.aggregator = scoring.BootstrapAggregator()
        
        
    @staticmethod
    def prepare_summary(summary):
        """Normalize a summary string before scoring.

        Bytes input is decoded as UTF-8. A newline is inserted after each " . "
        sentence separator so that rougeLsum is computed correctly.
        """
        # Make sure the summary is not bytes-type
        if isinstance(summary, bytes):
            summary = summary.decode("utf-8")
        # Add newlines between sentences so that rougeLsum is computed correctly.
        return summary.replace(" . ", " .\n")
    
    def __call__(self, target, prediction):
        """Score one (target, prediction) pair and add it to the aggregator.

        Args:
            target: reference summary string.
            prediction: generated summary string.

        Returns:
            None; read the aggregated scores with ``result()``.
        """
        target = self.prepare_summary(target)
        prediction = self.prepare_summary(prediction)
        
        self.aggregator.add_scores(self.scorer.score(target=target, prediction=prediction))

        return 
    
    def reset_states(self):
        """Discard all scores accumulated so far."""
        # Kept for backward compatibility; nothing in this class reads it.
        self.rouge_list = []
        # A fresh aggregator is what actually clears the accumulated scores.
        self.aggregator = scoring.BootstrapAggregator()

    def result(self):
        """Aggregate the collected scores, print them and return the mid F-measures.

        Returns:
            Dict mapping each score key to its mid F-measure multiplied by 100.
        """
        result = self.aggregator.aggregate()
        
        for key in self.score_keys:
            score_text = "%s = %.2f, 95%% confidence [%.2f, %.2f]"%(
                key,
                result[key].mid.fmeasure*100,
                result[key].low.fmeasure*100,
                result[key].high.fmeasure*100
            )
            print(score_text)
        
        return {key: result[key].mid.fmeasure*100 for key in self.score_keys}

### Compute Rouge Score

In [None]:
# Generate summaries for the first batches of test_ds, score them against the
# references and keep a printable record of each pair.
predictions = []
rouge_score = RougeScore()
for i, (input_ids, input_mask, y) in enumerate(test_ds):
    summaries = model.generate(input_ids=input_ids, attention_mask=input_mask)

    pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
    real = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in y]
    
    for pred_sent, real_sent in zip(pred, real):
        # RougeScore.__call__ takes (target, prediction); pass by keyword so the
        # reference and the generated text cannot be swapped as they were positionally.
        rouge_score(target=real_sent, prediction=pred_sent)
        predictions.append("pred sentence: " + pred_sent + "\n\n real sentence: " + real_sent)
        
    if i > 100:
        # otherwise it will take ages
        break


rouge_score.result()

rouge1 = 36.45, 95% confidence [35.93, 36.91]
rouge2 = 16.47, 95% confidence [16.02, 16.93]
rougeLsum = 34.05, 95% confidence [33.53, 34.55]


{'rouge1': 36.4493063524424,
 'rouge2': 16.471916808516053,
 'rougeLsum': 34.052609516834295}

### Predict some Sentences

In [None]:
# Show the first ten prediction/reference pairs, each framed by separator lines.
for pred in predictions[:10]:
    print("------", pred, "------", sep="\n")

------
pred sentence: A cave photographer, john spies, 59, captured the sheer magnificence of the vast, yet intricate, underground wonderland . xe bang fai river caves feature imposing stalagmitemade of mineral deposits . the cave is only able to be safely accessed during the dry season from november to april and during this time the water is clear and deep with a rich green hue . in 2008, an expedition led to the mysterious caves being 

 real sentence: the tham khoun ex cave has 15km of spectacular caves waiting to be explore by kayak . explorers can witness the incredible caverns, lake and even the vibrant forest at the entrance . cave photographer john spies captured the labyrinthine chambers to unfold the mystery .
------
------
pred sentence: houston couple, who have not been named, were arguing in the parking lot of an apartment complex . husband admitted to police that during the heat of the argument he tried to drive off in his green chevrolet pickup truck when his wife grabbe