# Cross Lingual summarization CNN Daily Mail Results
We will try out the trained t5 network from the tpu

In [1]:
import tensorflow as tf
import pandas as pd
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
import time
from rouge_score import rouge_scorer
from rouge_score import scoring

In [2]:
if not tf.config.list_physical_devices('GPU'):
    print("Change runtime to \"GPU runtime\" for faster computations")

In [3]:
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

## Params

In [4]:
BATCH_SIZE = 8

SHUFFEL_SIZE = 1024

learning_rate = 3e-5

model_size = "t5-base"

MAX_ARTICLE_LEN = 512

MAX_HIGHLIGHT_LEN = 150

## Model

In [5]:
model = TFT5ForConditionalGeneration.from_pretrained(model_size)

task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [6]:
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

model.summary()

Model: "tf_t5for_conditional_generation"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (TFSharedEmbeddings)  multiple                  24674304  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  84954240  
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  113275008 
Total params: 222,903,552
Trainable params: 222,903,552
Non-trainable params: 0
_________________________________________________________________


In [7]:
ckpt_file = "../models/t5_cnn_daily_mail-7.ckpt"
model.load_weights(ckpt_file)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fcc54c90da0>

## Dataset
We will load the translated CNN Daily Mail dataset from the tfrecords files

In [8]:
class LanguageTokens:
    def __init__(self, tokenizer, tf_or_pt: str) -> None:
        super().__init__()
        self.en_de_prefix = tokenizer("summarize English to German: ", return_tensors=tf_or_pt).input_ids
        self.de_en_prefix = tokenizer("summarize German to English: ", return_tensors=tf_or_pt).input_ids
        self.en_en_prefix = tokenizer("summarize English to English: ", return_tensors=tf_or_pt).input_ids
        self.de_de_prefix = tokenizer("summarize German to German: ", return_tensors=tf_or_pt).input_ids

        if tf_or_pt == "tf":
            self.en_de_prefix = tf.reshape(self.en_de_prefix, (-1,))
            self.de_en_prefix = tf.reshape(self.de_en_prefix, (-1,))
            self.en_en_prefix = tf.reshape(self.en_en_prefix, (-1,))
            self.de_de_prefix = tf.reshape(self.de_de_prefix, (-1,))
        elif tf_or_pt == "pt":
            self.en_de_prefix = self.en_de_prefix.reshape(-1,)
            self.de_en_prefix = self.de_en_prefix.reshape(-1,)
            self.en_en_prefix = self.en_en_prefix.reshape(-1,)
            self.de_de_prefix = self.de_de_prefix.reshape(-1,)

        # check if last token is end of sequence token and remove it
        if self.en_de_prefix[-1] == 1:
            self.en_de_prefix = self.en_de_prefix[:-1]
            self.de_en_prefix = self.de_en_prefix[:-1]
            self.en_en_prefix = self.en_en_prefix[:-1]
            self.de_de_prefix = self.de_de_prefix[:-1]

        assert self.en_de_prefix.shape[0] == self.de_en_prefix.shape[0] == self.en_en_prefix.shape[0] == self.de_de_prefix.shape[0], "All perfixes must have the same size"
        self.prefix_size = self.en_de_prefix.shape[0]

In [9]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")
language_tokens = LanguageTokens(tokenizer, "tf")
prefix_size = language_tokens.prefix_size
prefix_size

5

In [31]:
import numpy as np
from os import listdir
MAX_ARTICLE_LEN = 512
MAX_HIGHLIGHT_LEN = 150
GLOBAL_BATCH_SIZE = 8

def get_tf_record_files(directory):
    file_list = []
    for item in listdir(directory):
        if item.split(".")[-1] == "tfrecord":
            file_list.append("{}/{}".format(directory, item))
    return file_list

def get_tfrecord_dataset(folder):
    features = {
        'ger_x': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-prefix_size], tf.int64),
        'ger_x_mask': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-prefix_size], tf.int64),
        'ger_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'ger_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),

        'en_x': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-prefix_size], tf.int64),
        'en_x_mask': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-prefix_size], tf.int64),
        'en_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'en_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
    }
    
    dataset = tf.data.TFRecordDataset(get_tf_record_files(folder))

    # Taken from the TensorFlow models repository: https://github.com/tensorflow/models/blob/befbe0f9fe02d6bc1efb1c462689d069dae23af1/official/nlp/bert/input_pipeline.py#L24
    def decode_record(record, features):
        """Decodes a record to a TensorFlow example."""
        example = tf.io.parse_single_example(record, features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, tf.int32)
            example[name] = t
        return example

    def select_data_from_record(record):
        return [
            tf.concat([language_tokens.de_de_prefix, record['ger_x']], axis=0), tf.concat([tf.ones(prefix_size, dtype=tf.int32), record['ger_x_mask']], axis=0), record['ger_y'], record['ger_y_ids'],
            tf.concat([language_tokens.en_de_prefix, record['en_x']], axis=0), tf.concat([tf.ones(prefix_size, dtype=tf.int32), record['en_x_mask']], axis=0), record['ger_y'], record['ger_y_ids'],
            tf.concat([language_tokens.de_en_prefix, record['ger_x']], axis=0), tf.concat([tf.ones(prefix_size, dtype=tf.int32), record['ger_x_mask']], axis=0), record['en_y'], record['en_y_ids'],
            tf.concat([language_tokens.en_en_prefix, record['en_x']], axis=0), tf.concat([tf.ones(prefix_size, dtype=tf.int32), record['en_x_mask']], axis=0), record['en_y'], record['en_y_ids'],
        ]
    
    dataset = dataset.map(lambda record: decode_record(record, features))
    dataset = dataset.map(select_data_from_record)
    dataset = dataset.shuffle(100)
    return dataset.batch(GLOBAL_BATCH_SIZE)

root_folder = "../data/"
test_ds = get_tfrecord_dataset(root_folder + "cnn_daily_mail_test/")

In [32]:
def get_summaries(ds):
    for i in range(1,5):
        yield ds[(i-1)*4], ds[i*4-3], ds[i*4-2], ds[i*4-1]


for ds in test_ds.take(1):
    for i in get_summaries(ds):
        print(i[0].shape, i[1].shape, i[2].shape, i[3].shape)

(8, 512) (8, 512) (8, 150) (8, 150)
(8, 512) (8, 512) (8, 150) (8, 150)
(8, 512) (8, 512) (8, 150) (8, 150)
(8, 512) (8, 512) (8, 150) (8, 150)


## Evaluation
### Define Rouge Score

In [33]:
class RougeScore:
    '''
    mostly from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/evaluation/metrics.py 
    '''
    
    def __init__(self, score_keys=None)-> None:
        super().__init__()
        if score_keys is None:  
            self.score_keys = ["rouge1", "rouge2", "rougeLsum"]
        
        self.scorer = rouge_scorer.RougeScorer(self.score_keys)
        self.aggregator = scoring.BootstrapAggregator()
        
        
    @staticmethod
    def prepare_summary(summary):
            # Make sure the summary is not bytes-type
            # Add newlines between sentences so that rougeLsum is computed correctly.
            summary = summary.replace(" . ", " .\n")
            return summary
    
    def __call__(self, target, prediction):
        """Computes rouge score.''
        Args:
        targets: string
        predictions: string
        """

        target = self.prepare_summary(target)
        prediction = self.prepare_summary(prediction)
        
        self.aggregator.add_scores(self.scorer.score(target=target, prediction=prediction))

        return 
    
    def reset_states(self):
        self.rouge_list = []

    def result(self):
        result = self.aggregator.aggregate()
        
        for key in self.score_keys:
            score_text = "%s = %.2f, 95%% confidence [%.2f, %.2f]"%(
                key,
                result[key].mid.fmeasure*100,
                result[key].low.fmeasure*100,
                result[key].high.fmeasure*100
            )
            print(score_text)
        
        return {key: result[key].mid.fmeasure*100 for key in self.score_keys}

### Compute Summaries

In [34]:
predictions = []
start_time = time.time()

for i, ds_item in enumerate(test_ds): 
    for (input_ids, input_mask, y, y_ids) in get_summaries(ds_item):
        summaries = model.generate(
            input_ids=input_ids, 
            attention_mask=input_mask, 
            num_beams=4, 
            length_penalty=0.6,
            early_stopping=True, 
            max_length=150
        )

        articles = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in input_ids]

        pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
        real = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in y]
    
        for pred_sent, real_sent, article_sent in zip(pred, real, articles):
            predictions.append(str("article: " + article_sent + "\n\npred sentence: " + pred_sent + "\n\nreal sentence: " + real_sent))
    
    if (i % 10) == 0:
        elapsed = (time.time() - start_time) / 10
        print(i,": time genreate batch:", elapsed)
        start_time = time.time()
    if i > 4:
        # otherwise it will take ages
        break


# rouge_score.result()

0 : time genreate batch: 16.21683449745178


### Lets have a look at some of these predicted summaries

In [14]:
import numpy as np
len_predictions = len(predictions)

def get_random_prediction():
    return predictions[np.random.randint(len_predictions)]

In [15]:
print(get_random_prediction())

article: summarize German to English: Premierminister Boris Johnson treibt den Brexit-Streit mit dem britischen Parlament weiter auf die Spitze. Medienberichten zufolge ist er dazu bereit, sich über das vom Unterhaus verabschiedete Gesetz hinwegzusetzen, das einen No-Deal-Brexit verhindern soll. Nach dem Willen des Parlaments muss die britische Regierung die EU um eine Verlängerung der Austrittsfrist bitten, sollte es beim EU-Gipfel am 17./18. Oktober keine Einigung auf einen Brexit-Vertrag geben. Aus Downing Street verlautete nun jedoch, dass Johnson dieses Gesetz "sabotieren" wolle. Im engsten Umfeld des Premierministers sei man darauf vorbereitet, "die Kettensäge an alles anzulegen", was Johnsons Brexit-Kurs im Weg stehe, berichtete die Sunday Times. Johnson ist noch immer fest entschlossen, Großbritannien am 31. Oktober, dem bisherigen Austrittstermin, aus der Europäischen Union zu führen - und sei es ohne Abkommen. Sollte die Opposition ihn dann wegen Missachtung des Gesetzes vor 

In [16]:
print(get_random_prediction())

article: summarize English to English: "Today I announce my intention to apply Israeli sovereignty to the Jordan Valley and the North Dead Sea with the formation of the next government," Israeli Prime Minister Benjamin Netanyahu said. A week before Israels parliamentary elections, Prime Minister Benjamin Netanyahu has announced that, if he wins, he will annex the Jordan Valley. " Today I announce my intention to apply Israeli sovereignty to the Jordan Valley and the North Dead Sea with the formation of the next government, "Netanyahu said Tuesday. He showed a map that described the area as "Israel's eastern border." Netanyahu promised to take necessary steps immediately after the coalition was formed. He wanted to extend sovereignty to all Jewish settlements. The right-wing Likud politician spoke of a "historic opportunity" as US President Donald Trump announced his Middle East peace plan shortly after Israel's election. Netanyahu did not say whether he had colluded with Trump. However

In [17]:
print(get_random_prediction())

article: summarize English to German: Bearded Taliban leaders drinking tea on the terrace of Camp David on the eve of the anniversary of 9 / 11 - a notion that many US citizens find bizarre. The fact that the secret meeting that Trump made public only with his cancellation did not take place, however, is good news not only out of piety for the victims of the terrorist attacks in New York and Washington. Peace in Afghanistan will only be achieved through negotiation. Not even hawks like security adviser John Bolton, who rely on arms rather than words to resolve conflicts, dispute that. Many doubt, however, that Trump could have achieved a viable peace with a spectacular meeting on US soil - the danger would have been high that the US president, who is already half-way through his campaign, would have committed himself to a deal whose consequences are hardly foreseeable. Trump wants to prove to his base that he can bring the troops home from the Hindu Kush after 18 years. This is undoubt

In [18]:
print(get_random_prediction())

article: summarize German to English: Die USA hatten offenbar in Russland über Jahre einen Spion in der Kremlbürokratie platziert, der auch Zugang zu Präsident Wladimir Putin hatte. Dieser "Maulwurf" sei aber 2017 von der CIA abgezogen worden, das haben die New York Times und der Sender CNN am Dienstag veröffentlicht. Den Berichten nach war der Spion über Jahrzehnte in der russischen Regierungsverwaltung immer weiter aufgestiegen und hatte schließlich sogar die Möglichkeit, Akten auf Putins Schreibtisch zu fotografieren. Laut New York Times war er maßgebliche Quelle der Informationen, aus denen die US-Geheimdienste den Schluss zogen, dass Putin selbst die russische Einmischung in den US-Präsidentschaftswahlkampf 2016 begleitete. Die russische Zeitung Kommersant schrieb, es handle sich bei dem Maulwurf um einen damaligen Beamten im Moskauer Präsidialamt namens Oleg Smolenkow, der für einen der Chefberater Putins gearbeitet habe. Smolenkow habe sich bei einem Montenegro-Urlaub mit der Fa

In [19]:
print(get_random_prediction())

article: summarize German to English: Es ist gut, dass der Scharfmacher John Bolton nicht mehr Sicherheitsberater des US-Präsidenten ist. Die amerikanische Außenpolitik wird dadurch aber nicht vernünftiger. Es ist kein Verlust, dass John Bolton nicht mehr Sicherheitsberater des Präsidenten der Vereinigten Staaten ist. Bolton ist einer jener konservativen, patriotischen Politiker, die sich einst davor gedrückt haben, in den Krieg nach Vietnam zu gehen, deren Markenzeichen es später dann aber unter anderem war, dass sie sehr nonchalant darüber zu plaudern wussten, in welche Kriege man anderer Leute Kinder schicken könnte. Wo immer Bolton ein Problem in der Welt sah, erschien es ihm als Nagel, auf den Amerika den Hammer seines Militärs niedersausen lassen sollte. Ginge es nach Bolton, dann befänden sich die USA in diesem Moment vermutlich sowohl mit Nordkorea als auch mit Iran in einer militärischen Konfrontation. Insofern sollte man Donald Trump, der ja nicht viel Lob abbekommt oder verd

In [20]:
for i in range(10):
    print(get_random_prediction())

article: summarize English to German: On her twelfth visit, the Chancellor is experiencing the speed with which life is changing in China. After the official talks in Beijing, she visits a German-Chinese hospital - and talks students into their consciences. In Wuhan, the ATM is in front of the emergency room. That sounds rather sober, and that is exactly what it is. Anyone who falls ill or has an accident is first asked to pay in this hospital. In order to get an appointment with one of the doctors, he must not only feed his medical data into the vending machine before the consultation hour, but also prove his creditworthiness. More specifically, he needs to prove what his account is for. The extract, which he draws from the vending machine, he will present to the doctor, so that he (or she) decides on the appropriate treatment. Injection, surgery and care according to the financial situation - this is the situation at the German-Chinese Friendship Hospital in Wuhan. Now it would be wr

## Save results to text file

In [35]:
result_path = "../results/cnn_daily_mail_result.txt"
open(result_path, "w")
for pred in predictions:
    with open(result_path, "a") as file:
        file.write(pred + "\n")

## Load save File

In [36]:
data_points = []
result_path = "../results/cnn_daily_mail_result.txt"
file = open(result_path, "r")
for line in file:
    data_points.append(line)

In [37]:
data_points[0]

'article: summarize German to German: Bei noch sechs ausstehenden Spielen - fünf für QPR und sieben für Leicester - trennen die letzten fünf Teams in der Premier League nur vier Punkte. Nach Lage der Dinge würden die drei aufgestiegenen Clubs, Leicester, Burnley und QPR, sofort in die Meisterschaft zurückkehren, aber Hull und Sunderland stehen am Rande des Abgrunds. Auch Aston Villa und West Brom sind noch nicht ganz aus dem Rennen, obwohl unsere Reporter zuversichtlich sind, dass die Absteiger aus den fünf Mannschaften kommen werden, die derzeit am unteren Ende der Tabelle stehen. Charlie Austin hofft Medicare in Sicherheit zu bringen, aber werden seine Ziele ausreichen, um sie zu retten? Kann sich einer der letzten drei in Sicherheit bringen? Oder werden die Klubs über ihnen teuflisch schwierigen Fixierlisten erliegen? angs Reporter urteilen über den engsten Abstiegskampf seit Jahren... Lee Clayton. Leicester, QPR und Sunderland. Kann ich eine IF hinzufügen? Wenn Danny Ings wieder re

In [38]:
class SummaryData():
    
    def __init__(self):
        self.language_tag = ''
        self.real_data = ''
        self.pred_data = ''        

In [39]:
count = 0
summary_data = SummaryData()
summary_data_list = []
for point in data_points:
    count += 1
    
    if count == 1:
        summary_data.language_tag = " ".join(point.split(" ")[2:5])
    elif count == 3:
        summary_data.pred_data = ": ".join(point.split(":")[1:])
    elif count == 5:
        summary_data.real_data = ": ".join(point.split(":")[1:])
        summary_data_list.append(summary_data)
        summary_data = SummaryData()
        count = 0
summary_data_list[0].__dict__

{'language_tag': 'German to German:',
 'real_data': ' Auch Hull City und Sunderland sind in Abstiegsgefahr. Aston Villa und West Brom sind näher an der Sicherheit, könnten aber eingeholt werden.\n',
 'pred_data': ' Leicester, QPR und Sunderland stehen am Rande des Abstiegskampfes. Hull, Newcastle und Aston Villa stehen an der Spitze der Tabelle. Charlie Austin hofft, Medicare in Sicherheit zu bringen, aber werden seine Ziele ausreichen, um sie zu retten?\n'}

In [40]:
from ast import literal_eval

results_en_trans = []
results_en_path = "../results/en_en_results"
file = open(results_en_path, "r")
for i, line in enumerate(file):
    results_en_trans.append(literal_eval(line))
    
results_ger_trans = []
results_ger_path = "../results/en_en_results"
file = open(results_ger_path, "r")
for i, line in enumerate(file):
    results_ger_trans.append(literal_eval(line))

In [41]:
rouge_scores_dict = dict()
rouge_scores_dict['English to English:'] = RougeScore()
# rouge_scores_dict['en_to_en_trans'] = RougeScore()

rouge_scores_dict['English to German:'] = RougeScore()
rouge_scores_dict['German to English:'] = RougeScore()
rouge_scores_dict['German to German:'] = RougeScore()
# rouge_scores_dict['ger_to_ger_trans'] = RougeScore()

rouge_scores_dict

{'English to English:': <__main__.RougeScore at 0x7fcc54c90198>,
 'English to German:': <__main__.RougeScore at 0x7fcae87c5da0>,
 'German to English:': <__main__.RougeScore at 0x7fcae87c5b00>,
 'German to German:': <__main__.RougeScore at 0x7fcae87c5a20>}

In [42]:
for summary_data in summary_data_list:
    rouge_scores_dict[summary_data.language_tag](summary_data.real_data, summary_data.pred_data)
    

In [43]:
results_ger_trans[0]

{'id': 24,
 'item_id': 1,
 'language_tag': 'en_to_en',
 'real_data': 'Beatrice seen watching race on terrace with the Gulf states Crown Prince . Marks 13th holiday since November last year, and fourth in a month . Princess quit her job at Sony Pictures in New York before Christmas . Despite that she is described as working full-time on her fathers website .\n',
 'pred_data': 'Princess Beatrice spotted at Bahrain Grand Prix with long-term boyfriend Dave Clark . Onlooker said 26-year-old was walking behind the Crown Prince of Bahrain . Princes regime accused of violently repressing pro-democracy protests . Sir Jackie Stewart and comedian Rory Bremner watched race from the tower .\n',
 'translated': 'Prinzessin Beatrice beim Großen Preis von Bahrain mit ihrem langjährigen Freund Dave Clark.Beobachter sagten, der 26-Jährige sei hinter dem Kronprinzen von Bahrain hergelaufen.Prinzen-Regime beschuldigt, prodemokratische Proteste gewaltsam zu unterdrücken.Sir Jackie Stewart und Komiker Rory B

In [44]:
for key, rouge_score_item in rouge_scores_dict.items():
    print(key, rouge_score_item.result())
    print()

rouge1 = 39.12, 95% confidence [36.03, 42.40]
rouge2 = 19.13, 95% confidence [15.74, 22.88]
rougeLsum = 36.54, 95% confidence [33.46, 39.58]
English to English: {'rouge1': 39.12285489363039, 'rouge2': 19.12811348631634, 'rougeLsum': 36.54423892349275}

rouge1 = 32.83, 95% confidence [29.78, 36.22]
rouge2 = 13.05, 95% confidence [10.45, 16.27]
rougeLsum = 23.02, 95% confidence [20.10, 26.32]
English to German: {'rouge1': 32.83025864315528, 'rouge2': 13.05114994563904, 'rougeLsum': 23.017552524051553}

rouge1 = 35.60, 95% confidence [32.66, 38.40]
rouge2 = 14.47, 95% confidence [12.27, 16.67]
rougeLsum = 33.62, 95% confidence [30.67, 36.26]
German to English: {'rouge1': 35.603094673092706, 'rouge2': 14.47399016291358, 'rougeLsum': 33.61529228993784}

rouge1 = 33.97, 95% confidence [31.44, 36.88]
rouge2 = 15.65, 95% confidence [12.68, 18.63]
rougeLsum = 25.12, 95% confidence [22.52, 28.13]
German to German: {'rouge1': 33.97223830544936, 'rouge2': 15.646719696637163, 'rougeLsum': 25.120670

# Special Example

In [29]:
input_text = "In a world where we have to read and understand a lot of documents automatic text summarization has an obvious demand. To have the option to get a brief summary of a text in your language can be very useful. But would it not be even more useful, if you could have the option to get a summary of a text in a language, you do not understand, in your wished language? Sometimes we don’t want an exact translation, sometimes we just want to know a brief overview of a text in a language we don’t understand. That is the case where Cross-Lingual summarization would be prefered over normal translation. Cross-Lingual summarization can give you a short overview of a text in a language you do not understand."

In [30]:
input_text_test = "The dramatic growth of data on the internet leads to the need to automatically process and understand the data. A big part of the data is text data in many languages. This overwhelming amount of information causes a demand for automatic text summarization and other Natural Language Processing (NLP) taks.\n The Field of NLP had some high points in the last couple of years, when the field got revolutionized by Neural Language Models. With publications like Attention is all you need \cite{vaswani2017attention} or GPT3\cite{brown2020language} the limits of the field are pushed even further. Because of large data corpuses, scraped from the internet, and advanced models which contain up to 175 Billion parameters like the GPT3 it is possible to generate text, answer questions, summarize text, translate or many other things.\n In this master thesis we will take a deeper look into summarization. There are extractive and abstractive techniques to summarization. The extractive summary technique tries to find subsets of sentences, which representante the original text well and uses them to summarize the original text \cite{allahyari2017text}. The abstractive technique uses advanced language models to generate a new text, which should be much shorter than the original one, contain all the key information and preserve the overall meaning.\n In this master thesis we will use the abstractive technique, because it is closer to a human-like interpretation. It combines the ability to understand what the context of a given text is and the ability to generate fluent and grammatically correct text to that given context.\n The summarization in one language is an interesting topic but cross lingual approach is even more interesting. The goal of Cross Lingual Summarization is to summarize a text from one language into another language. It combines the ability to summarize and the ability to translate. Where state of the art models perform well on normal summarization, it will be interesting to see how well they perform doing Cross-Lingual Summarization."

In [31]:
x = tokenizer.encode_plus("summarize: en_to_ger " + input_text, max_length=512, return_tensors="tf", padding='max_length', truncation=True)
input_ids = tf.reshape(x['input_ids'], (1,-1))
attention_mask = tf.reshape(x['attention_mask'], (1,-1))

In [32]:
print(input_ids.shape, attention_mask.shape)

(1, 512) (1, 512)


In [33]:
summaries = model.generate( 
    input_ids=input_ids, 
    attention_mask=attention_mask
)

In [34]:
summaries

<tf.Tensor: shape=(1, 74), dtype=int32, numpy=
array([[    0,    86,   645,  3779,     6,    16,    74,   558,  2584,
            3, 20127,    15,   110,    35,    64, 19163,  3766,     6,
          229, 15820,    15,  5027,     7,    63,    29, 19712,   266,
        30410,    15,  5222,  6367,     5,  4098,  6199,     3,    15,
            7,   311, 30001,    49,     6,  1301,   292,    67,  7251,
         8219,    29,     6,   266, 11068, 14449,   266,     7,  5027,
           15,     7,    16,   645, 16933,   170,  8837,     6,    67,
          292,   311, 19163,     6,    16,  1197,    52, 24054,    29,
        16933,    58]], dtype=int32)>

In [35]:
tokenizer.decode(summaries[0])

'<pad> In einer Welt, in der wir viele Dokumente lesen und verstehen müssen, ist automatische Textsynthese eine offensichtliche Forderung. Aber wäre es nicht nützlicher, wenn Sie die Möglichkeit hätten, eine Zusammenfassung eines Textes in einer Sprache zu bekommen, die Sie nicht verstehen, in Ihrer gewünschten Sprache?'