In [1]:
!pip install transformers
!pip install sentencepiece

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import tensorflow as tf
print(tf.__version__)

import os
from transformers import ( 
    T5Tokenizer, 
    TFT5Model, 
    TFT5ForConditionalGeneration
)

2.4.1


In [3]:
learning_rate = 3e-5

model_size = "t5-base"

EPOCHS = 1

MAX_ARTICLE_LEN = 512

MAX_HIGHLIGHT_LEN = 150

last_epoch = 0

In [4]:
GLOBAL_BATCH_SIZE = 5

In [5]:
class LanguageTokens:
    def __init__(self, tokenizer, tf_or_pt: str) -> None:
        super().__init__()
        self.en_de_prefix = tokenizer("summarize English to German: ", return_tensors=tf_or_pt).input_ids
        self.de_en_prefix = tokenizer("summarize German to English: ", return_tensors=tf_or_pt).input_ids
        self.en_en_prefix = tokenizer("summarize English to English: ", return_tensors=tf_or_pt).input_ids
        self.de_de_prefix = tokenizer("summarize German to German: ", return_tensors=tf_or_pt).input_ids

        if tf_or_pt == "tf":
            self.en_de_prefix = tf.reshape(self.en_de_prefix, (-1,))
            self.de_en_prefix = tf.reshape(self.de_en_prefix, (-1,))
            self.en_en_prefix = tf.reshape(self.en_en_prefix, (-1,))
            self.de_de_prefix = tf.reshape(self.de_de_prefix, (-1,))
        elif tf_or_pt == "pt":
            self.en_de_prefix = self.en_de_prefix.reshape(-1,)
            self.de_en_prefix = self.de_en_prefix.reshape(-1,)
            self.en_en_prefix = self.en_en_prefix.reshape(-1,)
            self.de_de_prefix = self.de_de_prefix.reshape(-1,)

        # check if last token is end of sequence token and remove it
        if self.en_de_prefix[-1] == 1:
            self.en_de_prefix = self.en_de_prefix[:-1]
            self.de_en_prefix = self.de_en_prefix[:-1]
            self.en_en_prefix = self.en_en_prefix[:-1]
            self.de_de_prefix = self.de_de_prefix[:-1]

        assert self.en_de_prefix.shape[0] == self.de_en_prefix.shape[0] == self.en_en_prefix.shape[0] == self.de_de_prefix.shape[0], "All perfixes must have the same size"
        self.prefix_size = self.en_de_prefix.shape[0]

In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")
language_tokens = LanguageTokens(tokenizer, "tf")
prefix_size = language_tokens.prefix_size
en_de_prefix = language_tokens.en_de_prefix
de_en_prefix = language_tokens.de_en_prefix
en_en_prefix = language_tokens.en_en_prefix
de_de_prefix = language_tokens.de_de_prefix

In [7]:
import numpy as np
from os import listdir

bucket = ".."

def get_tf_record_files(directory):
    file_list = []
    for item in listdir(directory):
        if item.split(".")[-1] == "tfrecord":
            file_list.append("{}/{}".format(directory, item))
    return file_list

def get_tfrecord_dataset(directory):
    features = {
        'ger_x': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-prefix_size], tf.int64),
        'ger_x_mask': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-prefix_size], tf.int64),
        'ger_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'ger_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),

        'en_x': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-prefix_size], tf.int64),
        'en_x_mask': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-prefix_size], tf.int64),
        'en_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'en_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
    }

    tf_records_list = get_tf_record_files(directory)
    print(tf_records_list)
    dataset = tf.data.TFRecordDataset(tf_records_list)

    # TensorFlow models repository: https://github.com/tensorflow/models/blob/befbe0f9fe02d6bc1efb1c462689d069dae23af1/official/nlp/bert/input_pipeline.py#L24
    def decode_record(record, features):
        """Decodes a record to a TensorFlow example."""
        example = tf.io.parse_single_example(record, features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, tf.int32)
            example[name] = t
        return example


    def select_data_from_record(record):
        i  = tf.random.uniform((1,),0,4,dtype=tf.int32)[0]
        
        if i == 0:
            return tf.concat([de_de_prefix, record['ger_x']], axis=0), tf.concat([tf.ones(prefix_size, dtype=tf.int32), record['ger_x_mask']], axis=0), record['ger_y'], record['ger_y_ids']
        elif i == 1:
            return tf.concat([en_de_prefix, record['en_x']], axis=0), tf.concat([tf.ones(prefix_size, dtype=tf.int32), record['en_x_mask']], axis=0), record['ger_y'], record['ger_y_ids']
        elif i == 2:
            return tf.concat([de_en_prefix, record['ger_x']], axis=0), tf.concat([tf.ones(prefix_size, dtype=tf.int32), record['ger_x_mask']], axis=0), record['en_y'], record['en_y_ids']
        else:
            return tf.concat([en_en_prefix, record['en_x']], axis=0), tf.concat([tf.ones(prefix_size, dtype=tf.int32), record['en_x_mask']], axis=0), record['en_y'], record['en_y_ids']

    dataset = dataset.map(lambda record: decode_record(record, features))
    dataset = dataset.map(select_data_from_record)
    dataset = dataset.shuffle(500000)
    return dataset.batch(GLOBAL_BATCH_SIZE)



train_dataset = get_tfrecord_dataset(bucket + "/data/cnn_daily_mail_train/")
train_dataset.prefetch(1024)

validation_dataset = get_tfrecord_dataset(bucket + "/data/cnn_daily_mail_val/")
# test_dataset = get_tfrecord_dataset(bucket + "/data/sueddeutsche_val/")


['../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-12.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-7.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-11.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-2.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-6.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-5.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-4.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-10.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-3.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-1.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-14.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-15.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual-8.tfrecord', '../data/cnn_daily_mail_train//cnn_daily_mail_multilingual

In [8]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE, from_logits=True)
@tf.function
def compute_loss(labels, predictions):
    per_example_loss = loss_object(labels, predictions)
    return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

test_loss_metric = tf.keras.metrics.Mean(name='test_loss')
test_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
train_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('training_accuracy')

model = TFT5ForConditionalGeneration.from_pretrained(model_size, output_attentions=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
model.summary()

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Model: "tf_t5for_conditional_generation"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (TFSharedEmbeddings)  multiple                  24674304  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  84954240  
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  113275008 
Total params: 222,903,552
Trainable params: 222,903,552
Non-trainable params: 0
_________________________________________________________________


In [9]:
@tf.function
def train_step(input_ids, input_mask, y, y_ids):
    with tf.GradientTape() as tape:
        logits = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, training=True)[0]             
        loss = compute_loss(y, logits)

    gradients = tape.gradient(loss, model.trainable_variables)

    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss_metric.update_state(loss)
    train_accuracy_metric.update_state(y, logits)

@tf.function
def val_step(input_ids, input_mask, y, y_ids):
    logits = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, training=False)[0]  
    t_loss = compute_loss(y, logits)

    test_loss_metric.update_state(t_loss)
    test_accuracy_metric.update_state(y, logits)

In [10]:
from tqdm.notebook import tqdm

training_loss_list = []
val_loss_list = []


global_step = 0
for epoch in range(EPOCHS):
    total_loss = 0.0
    training_steps = 10
    epoch_step = 0
    print_every = 1000

    ### Training loop ###
    for input_ids, input_mask, y, y_ids in tqdm(train_dataset, desc="Training"):
        train_step(input_ids, input_mask, y, y_ids)  

        train_loss = train_loss_metric.result().numpy().astype(float)
        training_loss_list.append(train_loss)
        train_accuracy = train_accuracy_metric.result().numpy()

        global_step += 1
        epoch_step += 1

        if epoch_step % print_every == 0:
            print(f"Training step {epoch_step} Accuracy: {train_accuracy}, Training loss: {train_loss}")


    ### Test loop ###
    for input_ids, input_mask, y, y_ids in tqdm(val_dataset, desc="Evaluating"):
        val_step(input_ids, input_mask, y, y_ids)


    ### Output results ###
    test_accuracy = test_accuracy_metric.result().numpy()
    test_loss = test_loss_metric.result().numpy()
    val_loss_list.append(test_loss)
    print(f'Epoch: [{epoch}] Validation loss = {test_loss}')

    ### Reset metrics ###
    test_loss_metric.reset_states()
    train_accuracy_metric.reset_states()
    train_loss_metric.reset_states()
    test_accuracy_metric.reset_states()
    epoch_step = 0

    ### savecheckpoint ###
    save_checkpoint("t5_cnn_daily_mail", epoch)

Training: 0it [00:00, ?it/s]

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fd9db97bf20> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fd9db97bf20> is not a module, class, method, function, traceback, frame, or code object

Training step 1000 Accuracy: 0.7289080023765564, Training loss: 218.87454223632812


KeyboardInterrupt: 