# T5 TPU Tensorflow 

In [None]:
# Colab magic: select the TensorFlow 2.x runtime.
%tensorflow_version 2.x
# NOTE(review): unpinned install. Later cells pass `pad_to_max_length` and `lm_labels`,
# which presumably tie this notebook to an older transformers release -- consider pinning.
!pip install transformers



### Imports

We'll only be importing the components that we'll use during this tutorial: TensorFlow itself, plus the model-specific tokenizer and the TensorFlow T5 model classes from `transformers`. The tokenizer will manage the pre-processing of our data.

In [None]:
import tensorflow as tf
print(tf.__version__)

import os
from transformers import ( 
    T5Tokenizer, 
    TFT5Model, 
    TFT5ForConditionalGeneration
)

2.2.0


In [None]:
# Not referenced elsewhere in the visible notebook; the batch size actually used
# comes from BATCH_SIZE_PER_REPLICA / GLOBAL_BATCH_SIZE defined further down.
BATCH_SIZE = 8

# Not referenced elsewhere in the visible notebook (note the spelling of the name).
SHUFFEL_SIZE = 1024

# Learning rate intended for the Adam optimizer created under the strategy scope.
learning_rate = 3e-5

# Checkpoint name handed to T5Tokenizer.from_pretrained and
# TFT5ForConditionalGeneration.from_pretrained.
model_size = "t5-base"

# Pre-processing

### Importing the data

The raw articles and highlights are read from text files stored in a mounted Google Drive folder. As we are using a TPU, we do not have access to our local filesystem during training; we therefore use a Google Cloud Platform bucket to store the serialized data.

**You will not be able to use our bucket for this notebook. Please create your own and replace the string corresponding to the bucket.**

The data is handled exactly the same way as in the previous tutorial.

In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive; get_data reads the article/highlight files from below this path.
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [None]:
from pathlib import Path
import re
import logging
logging.basicConfig(level=logging.ERROR)

# Tokenizer matching the checkpoint named by `model_size`.
tokenizer = T5Tokenizer.from_pretrained(model_size)
# Padding id; tokenize_highlights swaps it for -100 in the label tensor.
pad_token_id = tokenizer.pad_token_id
# Task prefix for the summarization inputs.
prefix = "summarize: "

def transfrom(x):
    """Clean one raw line of the dataset files.

    The first "; "-separated field is dropped, the remaining fields are joined
    with single spaces, and the single quotes spanning from the first to the
    last quote in the result (greedy match) are removed.
    """
    _, *fields = x.split("; ")
    joined = " ".join(fields)
    return re.sub("'(.*)'", r"\1", joined)


def tokenize_articles(text):
    """Encode one article into encoder inputs.

    The module-level task ``prefix`` is prepended to ``text`` and the result is
    passed to the module-level ``tokenizer`` with ``max_length=512`` and
    ``pad_to_max_length=True``.

    Args:
        text: the article text.

    Returns:
        A tuple ``(input_ids, attention_mask)``, each passed through ``tf.squeeze``.
    """
    # Use the module-level `prefix`: `model` is only created much later in the
    # notebook, so `model.config.prefix` raises NameError on a fresh kernel.
    encoded = tokenizer.encode_plus(prefix + text, return_tensors="tf", max_length=512, pad_to_max_length=True)
    return tf.squeeze(encoded['input_ids']), tf.squeeze(encoded['attention_mask'])
        
def tokenize_highlights(text):
    """Encode one highlight and derive the shifted decoder tensors.

    Returns a tuple ``(y, y_ids, lm_labels)``:
      * ``y``         -- the squeezed token ids requested with max_length=150,
      * ``y_ids``     -- ``y`` without its last position,
      * ``lm_labels`` -- ``y`` without its first position, with every
                         ``pad_token_id`` entry replaced by -100.
    """
    encoded = tokenizer.encode(text, return_tensors="tf", max_length=150, pad_to_max_length=True)
    y = tf.squeeze(encoded)

    y_ids = y[:-1]
    shifted = y[1:]
    is_padding = tf.equal(shifted, pad_token_id)
    lm_labels = tf.where(is_padding, -100, tf.identity(shifted))

    return y, y_ids, lm_labels


def get_data(name):
    """Read the articles and highlights of one split from the mounted Drive folder.

    Args:
        name: sub-directory of ``/gdrive/My Drive/cnn_daily_mail`` to read
            (the notebook calls this with "val" and "test").

    Returns:
        ``(articles, highlights)``: two lists of strings, one entry per line of
        the respective file, each right-stripped and passed through ``transfrom``.
    """
    article_path = "/gdrive/My Drive/cnn_daily_mail/%s/articles_german" % name
    highlights_path = "/gdrive/My Drive/cnn_daily_mail/%s/highlights_german" % name

    # Context managers close the files deterministically; the bare open() calls
    # used previously left both handles open.
    with open(article_path) as article_file:
        articles = [transfrom(line.rstrip()) for line in article_file]
    with open(highlights_path) as highlights_file:
        highlights = [transfrom(line.rstrip()) for line in highlights_file]
    return articles, highlights
    
    
def get_tokinized_ds(articles, highlights):
    """Tokenize every article and highlight.

    Returns five parallel lists ``(x, x_mask, y, y_ids, y_labels)``: the two
    outputs of ``tokenize_articles`` per article, followed by the three outputs
    of ``tokenize_highlights`` per highlight.
    """
    # Articles are tokenized first, then highlights, as before.
    encoded_articles = [tokenize_articles(article) for article in articles]
    x = [ids for ids, _ in encoded_articles]
    x_mask = [mask for _, mask in encoded_articles]

    encoded_highlights = [tokenize_highlights(highlight) for highlight in highlights]
    y = [full for full, _, _ in encoded_highlights]
    y_ids = [inputs for _, inputs, _ in encoded_highlights]
    y_labels = [labels for _, _, labels in encoded_highlights]

    return x, x_mask, y, y_ids, y_labels

def get_translated_ds(name):
    """Load the split called ``name`` and return its tokenized form.

    Equivalent to feeding the result of ``get_data(name)`` to ``get_tokinized_ds``.
    """
    return get_tokinized_ds(*get_data(name))

In [None]:
# Tokenize the validation and test splits; each result is the 5-tuple of lists
# (x, x_mask, y, y_ids, y_labels) returned by get_tokinized_ds.
val = get_translated_ds("val")
test = get_translated_ds("test")

In [None]:
# Wrap the tokenized tuples in tf.data datasets; the serialization cell iterates over these.
val_ds = tf.data.Dataset.from_tensor_slices(val)
test_ds = tf.data.Dataset.from_tensor_slices(test)


### Serialization

Here we are using [TFRecord alongside tf.Example](https://www.tensorflow.org/tutorials/load_data/tfrecord) as a way to read data efficiently. Feeding data to a TPU can very easily become a bottleneck; we therefore store our data in a file that can be used during training.

**Unless you change the bucket to your own, you will not be able to run this cell as we have not given public access to write on our public folder. If you change this cell to your own bucket in order to run it, you will have to change the URL from which you download the TFRecord to your bucket URL.**

In [None]:
# Set `skip` to False to (re)generate the TFRecord files under `drive_path`.
skip = True
drive_path = "/gdrive/My Drive/cnn_daily_mail"
if not skip:
    # Prepare tf.Examples and tf.Features and write them as TFRecords
    def save_tfrecord_to_bucket(features_dataset, gdrive_folder, file_name):
        """Serialize a dataset to ``{gdrive_folder}/{file_name}.tfrecord``.

        Args:
            features_dataset: iterable whose elements unpack into the five
                integer sequences (x, x_mask, y, y_ids, y_labels).
            gdrive_folder: directory the TFRecord file is written to.
            file_name: file name without the ``.tfrecord`` extension.
        """
        with tf.compat.v1.python_io.TFRecordWriter(f"{gdrive_folder}/{file_name}.tfrecord") as tfwriter:
            for train_feature in features_dataset:
                x, x_mask, y, y_ids, y_labels = train_feature
                # One int64 feature per tensor, keyed the same way the reader expects.
                feature_key_value_pair = {
                    'x': tf.train.Feature(int64_list=tf.train.Int64List(value=x)),
                    'x_mask': tf.train.Feature(int64_list=tf.train.Int64List(value=x_mask)),
                    'y': tf.train.Feature(int64_list=tf.train.Int64List(value=y)),
                    'y_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=y_ids)),
                    'y_labels': tf.train.Feature(int64_list=tf.train.Int64List(value=y_labels))
                }
                features = tf.train.Features(feature=feature_key_value_pair)
                example = tf.train.Example(features=features)

                tfwriter.write(example.SerializeToString())
        print(f"Saved {file_name}.")

    save_tfrecord_to_bucket(val_ds, drive_path, "val_cnn_daily_mail")
    save_tfrecord_to_bucket(test_ds, drive_path, "test_cnn_daily_mail")

    # `train_ds` was never created by the earlier cells (only val/test were), so
    # the final save raised NameError. Build it the same way as the other splits.
    # NOTE(review): assumes a "train" folder exists next to "val" and "test" -- confirm.
    train_ds = tf.data.Dataset.from_tensor_slices(get_translated_ds("train"))
    save_tfrecord_to_bucket(train_ds, drive_path, "train_cnn_daily_mail")


Saved val_cnn_daily_mail.
Saved test_cnn_daily_mail.


# Building the training system

## Strategy

We make use of TensorFlow's strategies, which handle the data distribution as well as the distributed training that happens on the devices available. In this example we'll be using a `TPUStrategy`, which is used to train on the cores of a TPU in a distributed manner.

In [None]:
# Resolve the TPU attached to this runtime, connect to it, then initialize the TPU system.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.91.63.162:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.91.63.162:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


<tensorflow.python.tpu.topology.Topology at 0x7f9fc963ae10>

In [None]:
# Distribution strategy bound to the TPU resolved above; later cells create the
# loss, metrics, model and optimizer inside its scope.
strategy = tf.distribute.experimental.TPUStrategy(tpu)
print("Number of accelerators: ", strategy.num_replicas_in_sync)

INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Number of accelerators:  8


## Loading the Dataset with the strategy

Here we define a batch size for each replica. The global batch size is this value multiplied by the number of replicas (6 × 8 = 48 here), which keeps it a multiple of 8 to best leverage the systolic array as defined in the [Google TPU performance guide](https://cloud.google.com/tpu/docs/performance-guide#rule_of_thumb_pick_efficient_values_for_batch_and_feature_dimensions).

In [None]:
# Examples processed by each replica per step.
BATCH_SIZE_PER_REPLICA = 6
# Examples per step across all replicas; used for batching and for loss averaging.
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
EPOCHS = 5

# Fixed feature lengths used when parsing the TFRecords; they mirror the
# max_length values (512 / 150) passed to the tokenizer during pre-processing.
MAX_ARTICLE_LEN = 512
MAX_HIGHLIGHT_LEN = 150

In [None]:
# Display the resulting global batch size.
GLOBAL_BATCH_SIZE

48

### Retrieving the TFRecord dataset

The TFRecord dataset is now entirely processed and ready to be used as input by our training loop. We load it, shuffle it and batch it.

In [None]:
# GCS location the TFRecord files are read from; replace it with your own bucket.
# NOTE(review): the serialization cell writes to `drive_path` on Google Drive, not to this
# bucket -- presumably the files are copied over separately; confirm.
bucket = "gs://tpu-bucket-cnn-daily-mail"

def get_tfrecord_dataset(drive_path, file_name):
    """Build the input pipeline for ``{drive_path}/{file_name}.tfrecord``.

    Each record is parsed into five fixed-length integer tensors, cast from
    int64 to int32, emitted as the tuple ``(x, x_mask, y, y_ids, y_labels)``,
    shuffled with a buffer of 100 and batched by ``GLOBAL_BATCH_SIZE``.
    """
    feature_spec = {
        'x': tf.io.FixedLenFeature([MAX_ARTICLE_LEN], tf.int64),
        'x_mask': tf.io.FixedLenFeature([MAX_ARTICLE_LEN], tf.int64),
        'y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN - 1], tf.int64),
        'y_labels': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN - 1], tf.int64),
    }

    # Taken from the TensorFlow models repository: https://github.com/tensorflow/models/blob/befbe0f9fe02d6bc1efb1c462689d069dae23af1/official/nlp/bert/input_pipeline.py#L24
    def decode_record(record, features):
        """Decodes a record to a TensorFlow example."""
        parsed = tf.io.parse_single_example(record, features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
        # so every int64 tensor is cast down; other dtypes pass through untouched.
        return {
            key: tf.cast(value, tf.int32) if value.dtype == tf.int64 else value
            for key, value in parsed.items()
        }

    def select_data_from_record(record):
        """Turn the parsed dict into the positional tuple the train/test steps unpack."""
        return (
            record['x'],
            record['x_mask'],
            record['y'],
            record['y_ids'],
            record['y_labels'],
        )

    return (
        tf.data.TFRecordDataset(f"{drive_path}/{file_name}.tfrecord")
        .map(lambda record: decode_record(record, feature_spec))
        .map(select_data_from_record)
        .shuffle(100)
        .batch(GLOBAL_BATCH_SIZE)
    )

train_dataset = get_tfrecord_dataset(bucket, "train_cnn_daily_mail")
# Dataset transformations return a new dataset rather than modifying in place,
# so the result has to be re-assigned; otherwise the prefetch is silently dropped.
train_dataset = train_dataset.prefetch(1024)

validation_dataset = get_tfrecord_dataset(bucket, "val_cnn_daily_mail")
test_dataset = get_tfrecord_dataset(bucket, "test_cnn_daily_mail")


There is an additional step here to distribute the dataset among the different TPU cores. We make use of a strategy method to do so.

Every item held in the dataset (which is a batched dataset) will now be split over the TPU cores. As the TPU we're using has 8 cores and our global batch is of size 48, every batch will be evenly split into per-replica batches of (48 / 8 =) 6 and distributed across the cores.

In [None]:
# Let the strategy distribute each batched dataset; the training loop iterates over these.
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
validation_dist_dataset = strategy.experimental_distribute_dataset(validation_dataset)

## Model creation

We create a function that will instantiate a new model when called.

In [None]:
def model_fn():
    """Instantiate a new T5 conditional-generation model from the `model_size` checkpoint."""
    pretrained_model = TFT5ForConditionalGeneration.from_pretrained(model_size)
    return pretrained_model

## Hyperparameters initialization

While in the strategy's scope, we define a sparse categorical crossentropy loss. We define a method `compute_loss` which will be called to compute the loss between the model's prediction and the expected result (or label).

In order to measure the accuracy during training and evaluation, we define two metrics which are both sparse categorical accuracy.

Finally, we initialize a model and create an optimizer object using the Adam optimizer.


In [None]:
with strategy.scope():
    # Reduction.NONE leaves the reduction to compute_loss, which scales by the
    # global batch size.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE, from_logits=True)

    def compute_loss(labels, predictions):
        """Return the unreduced cross-entropy summed and divided by GLOBAL_BATCH_SIZE.

        Args:
            labels: integer target ids.
            predictions: logits for the same positions (the loss uses from_logits=True).

        NOTE(review): the labels passed in by the steps are presumably
        (batch, seq_len), so the sum also runs over token positions, including
        padded ones -- confirm this is intended.
        """
        per_example_loss = loss_object(labels, predictions)
        return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

    test_loss_metric = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

    train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
    train_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('training_accuracy')
    
    model = model_fn()
    # Take the rate from the notebook-level `learning_rate` setting instead of
    # repeating the literal 3e-5, so the config cell is the single source of truth.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=892146080.0, style=ProgressStyle(descri…





If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


## Steps

We create two functions that will be called during the training and test steps. 

In [None]:
with strategy.scope():
    def train_step(inputs):
        """Run one optimization step on a replica's share of the batch.

        `inputs` is the 5-tuple yielded by the input pipeline. The step updates
        the model weights and the training loss/accuracy metrics and returns nothing.
        """
        encoder_ids, encoder_mask, target_ids, decoder_ids, decoder_labels = inputs
        # Targets are the full highlight ids without their first position.
        shifted_targets = target_ids[:, 1:]

        with tf.GradientTape() as tape:
            outputs = model(
                encoder_ids,
                attention_mask=encoder_mask,
                decoder_input_ids=decoder_ids,
                lm_labels=decoder_labels,
                training=True,
            )
            logits = outputs[0]  # Keep only the first model output (used as the predictions)
            loss = compute_loss(shifted_targets, logits)

        variables = model.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        train_loss_metric.update_state(loss)
        train_accuracy_metric.update_state(shifted_targets, logits)

    def test_step(inputs):
        """Evaluate a replica's share of the batch and update the test metrics."""
        encoder_ids, encoder_mask, target_ids, decoder_ids, decoder_labels = inputs
        shifted_targets = target_ids[:, 1:]

        outputs = model(
            encoder_ids,
            attention_mask=encoder_mask,
            decoder_input_ids=decoder_ids,
            lm_labels=decoder_labels,
            training=False,
        )
        logits = outputs[0]  # Keep only the first model output (used as the predictions)

        test_loss_metric.update_state(compute_loss(shifted_targets, logits))
        test_accuracy_metric.update_state(shifted_targets, logits)

## Training & Evaluation

Finally, using all the previously defined attributes, we create two traced tf.function which will execute the training and test steps in a distributed manner. There is no need for them to return anything as the metrics will directly be updated in the steps described beforehand.

We loop over the number of epochs, training the model and evaluating it at the end of each epoch.

In [None]:
from tqdm import tqdm

with strategy.scope():
    @tf.function
    def distributed_train_step(dataset):
        """Run `train_step` on every replica for one distributed batch."""
        strategy.run(train_step, args=(dataset,))
 

    @tf.function
    def distributed_test_step(dataset):
        """Run `test_step` on every replica for one distributed batch."""
        strategy.run(test_step, args=(dataset,))


    # Loop-invariant reporting interval, hoisted out of the epoch loop.
    print_every = 1000

    global_step = 0
    for epoch in range(EPOCHS):
        epoch_step = 0

        ### Training loop ###
        for tensor in tqdm(train_dist_dataset, desc="Training"):
            distributed_train_step(tensor)  

            global_step += 1
            epoch_step += 1

            if epoch_step % print_every == 0:
                # Read the metrics only when they are reported: calling .numpy()
                # on every step pulls the values back to the host each time,
                # although they are printed once per `print_every` steps.
                train_loss = train_loss_metric.result().numpy().astype(float)
                train_accuracy = train_accuracy_metric.result().numpy()
                print(f"Training step {epoch_step} Accuracy: {train_accuracy}, Training loss: {train_loss}")


        ### Test loop ###
        for tensor in tqdm(validation_dist_dataset, desc="Evaluating"):
            distributed_test_step(tensor)
            
        
        ### Output results ###
        test_accuracy = test_accuracy_metric.result().numpy()
        # Not printed; kept so the value can be inspected after the cell has run.
        test_loss = test_loss_metric.result().numpy()
        print(f'Epoch: [{epoch}] Validation accuracy = {test_accuracy}')

        ### Reset metrics ###
        # Metrics accumulate across steps, so they are cleared before the next epoch.
        test_loss_metric.reset_states()
        train_accuracy_metric.reset_states()
        train_loss_metric.reset_states()
        test_accuracy_metric.reset_states()

        

Training: 0it [00:00, ?it/s]

Instructions for updating:
renamed to `run`


Instructions for updating:
renamed to `run`
Training: 1000it [10:11,  2.11it/s]

Training step 1000 Accuracy: 0.6948304176330566, Training loss: 31.317729949951172


Training: 2000it [17:51,  2.20it/s]

Training step 2000 Accuracy: 0.7141307592391968, Training loss: 28.346038818359375


Training: 3000it [25:29,  2.19it/s]

Training step 3000 Accuracy: 0.723595142364502, Training loss: 26.952781677246094


Training: 4000it [33:09,  2.18it/s]

Training step 4000 Accuracy: 0.7301008105278015, Training loss: 26.04168701171875


Training: 5000it [40:46,  2.20it/s]

Training step 5000 Accuracy: 0.7349156737327576, Training loss: 25.38692855834961


Training: 5982it [49:56,  2.00it/s]
Evaluating: 279it [01:20,  3.47it/s]
Training: 0it [00:00, ?it/s]

Epoch: [0] Validation accuracy = 0.7656062245368958


Training: 1000it [07:39,  2.20it/s]

Training step 1000 Accuracy: 0.7605858445167542, Training loss: 21.97714614868164


Training: 2000it [15:18,  2.14it/s]

Training step 2000 Accuracy: 0.7610371112823486, Training loss: 21.90743637084961


Training: 3000it [22:56,  2.18it/s]

Training step 3000 Accuracy: 0.7617295980453491, Training loss: 21.813257217407227


Training: 4000it [30:35,  2.19it/s]

Training step 4000 Accuracy: 0.7626721262931824, Training loss: 21.69682502746582


Training: 5000it [38:14,  2.16it/s]

Training step 5000 Accuracy: 0.7635508179664612, Training loss: 21.592601776123047


Training: 5982it [45:45,  2.18it/s]
Evaluating: 279it [00:43,  6.47it/s]
Training: 0it [00:00, ?it/s]

Epoch: [1] Validation accuracy = 0.7739362716674805


Training: 1000it [07:42,  2.19it/s]

Training step 1000 Accuracy: 0.7699467539787292, Training loss: 20.833740234375


Training: 2000it [15:19,  2.19it/s]

Training step 2000 Accuracy: 0.7697618007659912, Training loss: 20.84115219116211


Training: 3000it [23:03,  2.15it/s]

Training step 3000 Accuracy: 0.769922137260437, Training loss: 20.804471969604492


Training: 4000it [30:46,  2.19it/s]

Training step 4000 Accuracy: 0.7704637050628662, Training loss: 20.73806381225586


Training: 5000it [38:24,  2.18it/s]

Training step 5000 Accuracy: 0.770969808101654, Training loss: 20.67778205871582


Training: 5982it [45:53,  2.17it/s]
Evaluating: 279it [00:42,  6.62it/s]
Training: 0it [00:00, ?it/s]

Epoch: [2] Validation accuracy = 0.778163492679596


Training: 1000it [07:39,  2.18it/s]

Training step 1000 Accuracy: 0.7751995325088501, Training loss: 20.200170516967773


Training: 2000it [15:18,  2.20it/s]

Training step 2000 Accuracy: 0.774742603302002, Training loss: 20.23051643371582


Training: 3000it [22:56,  2.19it/s]

Training step 3000 Accuracy: 0.774769127368927, Training loss: 20.21131706237793


Training: 4000it [30:37,  2.18it/s]

Training step 4000 Accuracy: 0.7751159071922302, Training loss: 20.16559600830078


Training: 5000it [38:20,  2.15it/s]

Training step 5000 Accuracy: 0.7754877805709839, Training loss: 20.121145248413086


Training: 5982it [45:57,  2.17it/s]
Evaluating: 279it [00:43,  6.38it/s]
Training: 0it [00:00, ?it/s]

Epoch: [3] Validation accuracy = 0.780969500541687


Training: 1000it [07:47,  2.10it/s]

Training step 1000 Accuracy: 0.7786505818367004, Training loss: 19.749431610107422


Training: 2000it [15:32,  2.14it/s]

Training step 2000 Accuracy: 0.7782138586044312, Training loss: 19.78450584411621


Training: 3000it [23:21,  2.16it/s]

Training step 3000 Accuracy: 0.778134822845459, Training loss: 19.780961990356445


Training: 4000it [31:08,  2.14it/s]

Training step 4000 Accuracy: 0.7784538865089417, Training loss: 19.744007110595703


Training: 5000it [38:55,  2.15it/s]

Training step 5000 Accuracy: 0.7787538170814514, Training loss: 19.709964752197266


Training: 5982it [46:34,  2.14it/s]
Evaluating: 279it [00:43,  6.40it/s]


Epoch: [4] Validation accuracy = 0.7828335762023926


In [None]:
from google.colab import auth
# Authenticate the Colab user -- presumably required so the kernel may write the
# checkpoint to the GCS bucket below; confirm.
auth.authenticate_user()

In [None]:
# Checkpoint location inside the bucket.
ckpt_file = os.path.join(bucket, "checkpoint.ckpt")

In [None]:
# Persist the trained weights to `ckpt_file`.
model.save_weights(ckpt_file) 


In [None]:
# Restore the weights from `ckpt_file` into the existing model object.
model.load_weights(ckpt_file)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9fbb8e6c18>