In [1]:
import tensorflow as tf
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
import tensorflow_datasets as tfds
import time
import os
import re


In [2]:
# Notebook-wide hyper-parameters.
# NOTE(review): the visible input pipeline batches with GLOBAL_BATCH_SIZE, not
# BATCH_SIZE — presumably BATCH_SIZE is consumed further down; confirm.
BATCH_SIZE = 8

# NOTE(review): not referenced in the visible cells (the pipeline calls
# shuffle(100) directly) — confirm whether this is still used.
SHUFFEL_SIZE = 1024

# Presumably the optimizer learning rate; the optimizer is not visible here.
learning_rate = 3e-5

# Checkpoint name passed to from_pretrained() for both tokenizer and model.
model_size = "t5-small"

In [3]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model_size)

# Load the pretrained conditional-generation model for the same checkpoint.
model = TFT5ForConditionalGeneration.from_pretrained(model_size)

# If the checkpoint config carries task-specific parameters, merge the
# "summarization" entry into the model config; a missing entry falls back to
# an empty dict, which makes the update a no-op.
task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))

# Padding token id; tokenize_highlights() uses it to mask padded label positions.
pad_token_id = tokenizer.pad_token_id

In [21]:
from pathlib import Path
import re
import logging
# Configure root logging at ERROR level.
logging.basicConfig(level=logging.ERROR)

# Same tokenizer / pad-id setup as the earlier cell, repeated here
# (presumably so this cell can be run without the model-loading cell).
tokenizer = T5Tokenizer.from_pretrained(model_size)
pad_token_id = tokenizer.pad_token_id

# Task prefixes, one per (article language -> summary language) direction.
# Each prefix is encoded once and flattened to a 1-D id tensor so that it can
# be concatenated in front of a tokenized article.
# NOTE(review): get_tfrecord_dataset() pairs every prefix with a mask of 8
# ones and articles are padded to 512-8 ids, so each prefix is assumed to
# encode to exactly 8 ids — confirm for the tokenizer version in use.
en_de_prefix = tf.reshape(tokenizer.encode("summarize: en_to_ger ", return_tensors="tf"), (-1,))
de_en_prefix = tf.reshape(tokenizer.encode("summarize: ger_to_en ", return_tensors="tf"), (-1,))
en_en_prefix = tf.reshape(tokenizer.encode("summarize: en_to_en ", return_tensors="tf"), (-1,))
de_de_prefix = tf.reshape(tokenizer.encode("summarize: ger_to_ger ", return_tensors="tf"), (-1,))

In [6]:
# Display the id of the end-of-sequence token (the recorded output is 1).
tokenizer.eos_token_id

1

In [7]:
# Sanity check of EOS handling. Per the recorded output, encode() does not
# append the EOS id (1) on its own; it only appears when "</s>" is written
# into the text, which is why transfrom()/transfrom_en() append "</s>".
print(tokenizer.encode("this is a sentences."))
print(tokenizer.encode("this is a sentences.</s>"))
# Round trip: decoding the ids (including the trailing EOS id) prints the
# plain sentence again in the recorded output.
print(tokenizer.decode([48, 19, 3, 9, 16513, 5, 1]))

[48, 19, 3, 9, 16513, 5]
[48, 19, 3, 9, 16513, 5, 1]
this is a sentences.


In [8]:
def transfrom(x):
    """Clean one raw line of the German data files.

    Everything up to and including the first "; " is dropped, the remaining
    "; "-separated pieces are re-joined with single spaces, the outermost pair
    of single quotes is removed, and the "</s>" end-of-sequence marker is
    appended.
    """
    # split() always yields at least one element, so the star-unpack is safe.
    _, *remainder = x.split("; ")
    unquoted = re.sub("'(.*)'", r"\1", " ".join(remainder))
    return f"{unquoted}</s>"

def transfrom_en(x):
    """Clean one raw line of the English data files.

    Removes the outermost pair of single quotes (greedy match between the
    first and the last quote on the line) and appends the "</s>"
    end-of-sequence marker.
    """
    unquoted = re.sub("'(.*)'", r"\1", x)
    return "".join((unquoted, "</s>"))

def tokenize_articles(text):
    """Tokenize one article and return ``(input_ids, attention_mask)``.

    The text is encoded with the module-level ``tokenizer`` and padded to
    512-8 ids; both returned tensors have their size-1 dimensions squeezed
    away.
    """
    encoded = tokenizer.encode_plus(
        text,
        return_tensors="tf",
        max_length=(512-8),
        pad_to_max_length=True,
    )
    input_ids = tf.squeeze(encoded['input_ids'])
    attention_mask = tf.squeeze(encoded['attention_mask'])
    return input_ids, attention_mask
        
def tokenize_highlights(text):
    """Tokenize one summary and derive decoder inputs and labels from it.

    Returns a tuple ``(y, y_ids, lm_labels)``:
      * ``y``         - the squeezed id tensor, padded to 150 ids,
      * ``y_ids``     - ``y`` without its last id,
      * ``lm_labels`` - ``y`` without its first id, with every position equal
                        to ``pad_token_id`` replaced by -100.
    """
    y = tf.squeeze(
        tokenizer.encode(text, return_tensors="tf", max_length=150, pad_to_max_length=True)
    )
    y_ids = y[:-1]
    shifted = y[1:]
    is_padding = tf.equal(shifted, pad_token_id)
    lm_labels = tf.where(is_padding, -100, shifted)

    return y, y_ids, lm_labels


def get_german_data(name):
    """Load and clean the German articles and highlights of one split.

    Args:
        name: Split directory under ``../data`` (the notebook uses "test",
            "val" and "train").

    Returns:
        ``(articles, highlights)``: two lists of strings, one entry per line
        of the respective file, each passed through ``transfrom`` after
        stripping trailing whitespace.

    Raises:
        OSError: if either file cannot be opened.
    """
    article_path = "../data/%s/articles_german" % name
    highlights_path = "../data/%s/highlights_german" % name

    # Context managers close the handles deterministically; the previous
    # open(...).readlines() left them to the garbage collector.
    # NOTE(review): the files are read with the platform default encoding —
    # confirm it matches the encoding of the German text.
    with open(article_path) as article_file:
        articles = [transfrom(line.rstrip()) for line in article_file]
    with open(highlights_path) as highlights_file:
        highlights = [transfrom(line.rstrip()) for line in highlights_file]
    return articles, highlights
  
def get_english_data(name):
    """Load and clean the English articles and highlights of one split.

    Args:
        name: Split directory under ``../data`` (the notebook uses "test",
            "val" and "train").

    Returns:
        ``(articles, highlights)``: two lists of strings, one entry per line
        of the respective file, each passed through ``transfrom_en`` after
        stripping trailing whitespace.

    Raises:
        OSError: if either file cannot be opened.
    """
    article_path = "../data/%s/article" % name
    highlights_path = "../data/%s/highlights" % name

    # Context managers close the handles deterministically; the previous
    # open(...).readlines() left them to the garbage collector.
    with open(article_path) as article_file:
        articles = [transfrom_en(line.rstrip()) for line in article_file]
    with open(highlights_path) as highlights_file:
        highlights = [transfrom_en(line.rstrip()) for line in highlights_file]
    return articles, highlights
    
def get_tokinized_ds(articles, highlights):
    """Tokenize parallel lists of articles and highlights.

    Every article goes through ``tokenize_articles`` and every highlight
    through ``tokenize_highlights``; the per-example results are regrouped
    into five column lists.

    Returns:
        ``(x, x_mask, y, y_ids, y_labels)``, each a list with one tensor per
        input text.
    """
    # Articles are tokenized first, then highlights.
    encoded_articles = [tokenize_articles(article) for article in articles]
    encoded_highlights = [tokenize_highlights(highlight) for highlight in highlights]

    x = [input_ids for input_ids, _ in encoded_articles]
    x_mask = [mask for _, mask in encoded_articles]

    y = [full for full, _, _ in encoded_highlights]
    y_ids = [decoder_ids for _, decoder_ids, _ in encoded_highlights]
    y_labels = [labels for _, _, labels in encoded_highlights]

    return x, x_mask, y, y_ids, y_labels

def get_translated_ds(name):
    """Load one split in both languages and tokenize it.

    Returns:
        ``(german, english)`` where each element is the 5-tuple produced by
        ``get_tokinized_ds`` for that language.
    """
    german_articles, german_highlights = get_german_data(name)
    english_articles, english_highlights = get_english_data(name)

    german_tokens = get_tokinized_ds(german_articles, german_highlights)
    english_tokens = get_tokinized_ds(english_articles, english_highlights)
    return german_tokens, english_tokens

In [9]:
# Load and tokenize the "test" split in both languages: (german, english).
test = get_translated_ds("test")

In [10]:
# Slice the nested (german, english) column lists into one dataset element per example.
test_ds = tf.data.Dataset.from_tensor_slices(test)

In [11]:
# Same two steps for the "val" split: tokenize both languages, then slice per example.
val = get_translated_ds("val")
val_ds = tf.data.Dataset.from_tensor_slices(val)

In [12]:
# NOTE(review): `skip` is not read anywhere in the visible cells — confirm it is still needed.
skip = False
# Directory the TFRecord files are written to.
drive_path = "../data"

# Serialize every example as a tf.train.Example and store them in one TFRecord file.
def save_tfrecord_to_bucket(features_dataset, gdrive_folder, file_name):
    """Write a paired German/English dataset to ``{gdrive_folder}/{file_name}.tfrecord``.

    Each element of ``features_dataset`` must be a pair of 5-tuples
    ``(x, x_mask, y, y_ids, y_labels)`` - German first, English second. Every
    column is stored as an int64 list feature, keyed with a ``ger_`` / ``en_``
    prefix. Prints a confirmation line once the file has been written.
    """
    german_keys = ('ger_x', 'ger_x_mask', 'ger_y', 'ger_y_ids', 'ger_y_labels')
    english_keys = ('en_x', 'en_x_mask', 'en_y', 'en_y_ids', 'en_y_labels')

    with tf.compat.v1.python_io.TFRecordWriter(f"{gdrive_folder}/{file_name}.tfrecord") as tfwriter:
        for german_columns, english_columns in features_dataset:
            # Explicit unpacking keeps the "exactly five columns per language" check.
            ger_x, ger_x_mask, ger_y, ger_y_ids, ger_y_labels = german_columns
            en_x, en_x_mask, en_y, en_y_ids, en_y_labels = english_columns
            keyed_columns = list(zip(german_keys, (ger_x, ger_x_mask, ger_y, ger_y_ids, ger_y_labels)))
            keyed_columns += zip(english_keys, (en_x, en_x_mask, en_y, en_y_ids, en_y_labels))

            feature_key_value_pair = {}
            for key, values in keyed_columns:
                int64_values = tf.train.Int64List(value=values)
                feature_key_value_pair[key] = tf.train.Feature(int64_list=int64_values)

            example = tf.train.Example(
                features=tf.train.Features(feature=feature_key_value_pair)
            )
            tfwriter.write(example.SerializeToString())
    print(f"Saved {file_name}.")

# Persist the tokenized test and validation splits as TFRecord files under drive_path.
save_tfrecord_to_bucket(test_ds, drive_path, "corss_lingual_test_cnn_daily_mail")
save_tfrecord_to_bucket(val_ds, drive_path, "corss_lingual_val_cnn_daily_mail")


Saved corss_lingual_test_cnn_daily_mail.
Saved corss_lingual_val_cnn_daily_mail.


In [13]:
# Tokenize the "train" split in both languages, slice it per example and
# persist it as a TFRecord file, mirroring the test/val cells above.
train = get_translated_ds("train")
train_ds = tf.data.Dataset.from_tensor_slices(train) 
save_tfrecord_to_bucket(train_ds, drive_path, "corss_lingual_train_cnn_daily_mail")

Saved corss_lingual_train_cnn_daily_mail.


In [23]:
import numpy as np
# Sequence lengths used to declare the fixed-size TFRecord features; they
# mirror the max_length values used when tokenizing (512-8 for articles,
# 150 for highlights).
MAX_ARTICLE_LEN = 512
MAX_HIGHLIGHT_LEN = 150
# Passed as the drive_path argument of get_tfrecord_dataset().
bucket = ""
# Batch size applied by get_tfrecord_dataset().
GLOBAL_BATCH_SIZE = 8

def get_tfrecord_dataset(drive_path, file_name):
    """Build a shuffled, batched dataset of cross-lingual summarization examples.

    Each stored record holds the same article/summary pair in German and in
    English. For every record one of four directions (de->de, en->de, de->en,
    en->en) is drawn at random; the matching task prefix is prepended to the
    article ids and the summary tensors of the target language are returned.

    Args:
        drive_path: Directory (or bucket prefix) containing the TFRecord
            file. An empty/falsy value falls back to ``"../data"``, which is
            where the file was always read from before.
        file_name: File name without the ``.tfrecord`` extension.

    Returns:
        A ``tf.data.Dataset`` of batches
        ``(input_ids, attention_mask, y, y_ids, y_labels)``, all int32, with
        ``GLOBAL_BATCH_SIZE`` examples per batch.

    Raises:
        ValueError: if the four task prefixes do not all have the same number
            of token ids (examples could not be batched together).
    """
    features = {
        'ger_x': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-8], tf.int64),
        'ger_x_mask': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-8], tf.int64),
        'ger_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'ger_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN - 1], tf.int64),
        'ger_y_labels': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN - 1], tf.int64),

        'en_x': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-8], tf.int64),
        'en_x_mask': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-8], tf.int64),
        'en_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'en_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN - 1], tf.int64),
        'en_y_labels': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN - 1], tf.int64),
    }

    # drive_path used to be ignored in favour of a hard-coded "../data".
    # Honour it now, but keep "../data" for the empty string that the
    # existing callers pass.
    data_dir = str(drive_path).rstrip("/") if drive_path else "../data"
    dataset = tf.data.TFRecordDataset(f"{data_dir}/{file_name}.tfrecord")

    # (task prefix, article language, summary language), in the same order as
    # the original i == 0..3 branches.
    directions = (
        (de_de_prefix, 'ger', 'ger'),
        (en_de_prefix, 'en', 'ger'),
        (de_en_prefix, 'ger', 'en'),
        (en_en_prefix, 'en', 'en'),
    )

    # The mask length is derived from the prefixes instead of a hard-coded 8,
    # and all prefixes must agree so that examples can be batched together.
    prefix_lengths = sorted({int(prefix.shape[0]) for prefix, _, _ in directions})
    if len(prefix_lengths) != 1:
        raise ValueError(
            f"Task prefixes must all have the same token length, got {prefix_lengths}."
        )
    prefix_len = prefix_lengths[0]

    # Taken from the TensorFlow models repository: https://github.com/tensorflow/models/blob/befbe0f9fe02d6bc1efb1c462689d069dae23af1/official/nlp/bert/input_pipeline.py#L24
    def decode_record(record, features):
        """Decodes a record to a TensorFlow example."""
        example = tf.io.parse_single_example(record, features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, tf.int32)
            example[name] = t
        return example

    def make_branch(record, prefix, source, target):
        """Return a callable that builds the example tuple for one direction."""
        def branch():
            return (
                tf.concat([prefix, record[f'{source}_x']], axis=0),
                tf.concat([tf.ones([prefix_len], dtype=tf.int32), record[f'{source}_x_mask']], axis=0),
                record[f'{target}_y'],
                record[f'{target}_y_ids'],
                record[f'{target}_y_labels'],
            )
        return branch

    def select_data_from_record(record):
        """Pick a random direction for this record and build its tensors."""
        # Dataset.map traces this function once, so a Python-side
        # np.random.randint() would freeze a single direction for the whole
        # dataset. Drawing the index with a TF op and dispatching with
        # tf.switch_case yields a fresh choice for every record.
        choice = tf.random.uniform([], minval=0, maxval=len(directions), dtype=tf.int32)
        branches = [make_branch(record, *direction) for direction in directions]
        # tuple(): Dataset.map treats a tuple as separate components.
        return tuple(tf.switch_case(choice, branches))

    dataset = dataset.map(lambda record: decode_record(record, features))
    dataset = dataset.map(select_data_from_record)
    dataset = dataset.shuffle(100)
    return dataset.batch(GLOBAL_BATCH_SIZE)

# Build the three input pipelines from the TFRecord files written earlier.
train_dataset = get_tfrecord_dataset(bucket, "corss_lingual_train_cnn_daily_mail")
# Dataset transformations return a new dataset instead of modifying in place;
# the result has to be re-bound, otherwise prefetching never takes effect.
train_dataset = train_dataset.prefetch(1024)

validation_dataset = get_tfrecord_dataset(bucket, "corss_lingual_val_cnn_daily_mail")
test_dataset = get_tfrecord_dataset(bucket, "corss_lingual_test_cnn_daily_mail")


In [25]:
# Pull a single batch from the training pipeline and print its tensors for inspection.
for d in train_dataset.take(1):
    print(d)

(<tf.Tensor: shape=(8, 512), dtype=int32, numpy=
array([[21603,    10,     3, ...,   324,  3348, 14519],
       [21603,    10,     3, ...,    35,    74,     3],
       [21603,    10,     3, ...,    18,  2703,   208],
       ...,
       [21603,    10,     3, ...,  7453,     6,     3],
       [21603,    10,     3, ...,  6394,    93, 12483],
       [21603,    10,     3, ...,     3, 18444, 22186]], dtype=int32)>, <tf.Tensor: shape=(8, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>, <tf.Tensor: shape=(8, 150), dtype=int32, numpy=
array([[19445,  1064,  2165, ...,     0,     0,     0],
       [ 2318,  3304,  1699, ...,    16,  7457,    11],
       [  736,  1273,  2074, ...,     0,     0,     0],
       ...,
       [ 8100,    52,  7923, ...,     0,     0,     0],
       [27292, 18707,    23, ...,  