# Cross Lingual summarization CNN Daily Mail Results
We will try out the trained t5 network from the tpu

In [1]:
import tensorflow as tf
import pandas as pd
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
import time
from rouge_score import rouge_scorer
from rouge_score import scoring

In [2]:
if not tf.config.list_physical_devices('GPU'):
    print("Change runtime to \"GPU runtime\" for faster computations")

In [3]:
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

## Params

In [4]:
BATCH_SIZE = 8

SHUFFEL_SIZE = 1024

learning_rate = 3e-5

model_size = "t5-base"

MAX_ARTICLE_LEN = 512

MAX_HIGHLIGHT_LEN = 150

## Model

In [5]:
tokenizer = T5Tokenizer.from_pretrained(model_size)
model = TFT5ForConditionalGeneration.from_pretrained(model_size)

task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))
    
pad_token_id = tokenizer.pad_token_id

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [6]:
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

model.summary()

Model: "tf_t5for_conditional_generation"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (TFSharedEmbeddings)  multiple                  24674304  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  84954240  
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  113275008 
Total params: 222,903,552
Trainable params: 222,903,552
Non-trainable params: 0
_________________________________________________________________


In [7]:
ckpt_file = "../models/checkpoint_cross_lingual.ckpt"
model.load_weights(ckpt_file)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f34b56c4320>

## Dataset
We will load the translated CNN Daily Mail dataset from the tfrecords files

In [8]:
en_de_prefix = tf.reshape(tokenizer.encode("summarize: en_to_ger ", return_tensors="tf"), (-1,))
de_en_prefix = tf.reshape(tokenizer.encode("summarize: ger_to_en ", return_tensors="tf"), (-1,))
en_en_prefix = tf.reshape(tokenizer.encode("summarize: en_to_en ", return_tensors="tf"), (-1,))
de_de_prefix = tf.reshape(tokenizer.encode("summarize: ger_to_ger ", return_tensors="tf"), (-1,))

In [9]:
prefix_length = de_de_prefix.shape[0]
prefix_length

9

In [10]:
import numpy as np
MAX_ARTICLE_LEN = 512
MAX_HIGHLIGHT_LEN = 150
GLOBAL_BATCH_SIZE = 8

def get_tfrecord_dataset(file_name):
    features = {
        'ger_x': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-8], tf.int64),
        'ger_x_mask': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-8], tf.int64),
        'ger_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'ger_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),

        'en_x': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-8], tf.int64),
        'en_x_mask': tf.io.FixedLenFeature([MAX_ARTICLE_LEN-8], tf.int64),
        'en_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'en_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
    }

    dataset = tf.data.TFRecordDataset(f"../data/{file_name}.tfrecord")

    # Taken from the TensorFlow models repository: https://github.com/tensorflow/models/blob/befbe0f9fe02d6bc1efb1c462689d069dae23af1/official/nlp/bert/input_pipeline.py#L24
    def decode_record(record, features):
        """Decodes a record to a TensorFlow example."""
        example = tf.io.parse_single_example(record, features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, tf.int32)
            example[name] = t
        return example


    def select_data_from_record(record):
        return [
            tf.strided_slice(tf.concat([de_de_prefix, record['ger_x']], axis=0), [0], [-1]), tf.concat([tf.ones(8, dtype=tf.int32), record['ger_x_mask']], axis=0), record['ger_y'], record['ger_y_ids'],
            tf.strided_slice(tf.concat([en_de_prefix, record['en_x']], axis=0), [0], [-1]), tf.concat([tf.ones(8, dtype=tf.int32), record['en_x_mask']], axis=0), record['ger_y'], record['ger_y_ids'],
            tf.strided_slice(tf.concat([de_en_prefix, record['ger_x']], axis=0), [0], [-1]), tf.concat([tf.ones(8, dtype=tf.int32), record['ger_x_mask']], axis=0), record['en_y'], record['en_y_ids'],
            tf.strided_slice(tf.concat([en_en_prefix, record['en_x']], axis=0), [0], [-1]), tf.concat([tf.ones(8, dtype=tf.int32), record['en_x_mask']], axis=0), record['en_y'], record['en_y_ids'],
        ]
    
    dataset = dataset.map(lambda record: decode_record(record, features))
    dataset = dataset.map(select_data_from_record)
    dataset = dataset.shuffle(100)
    return dataset.batch(GLOBAL_BATCH_SIZE)

test_ds = get_tfrecord_dataset("corss_lingual_test_cnn_daily_mail")

In [11]:
def get_summaries(ds):
    for i in range(1,5):
        yield ds[(i-1)*4], ds[i*4-3], ds[i*4-2], ds[i*4-1]


for ds in test_ds.take(1):
    for i in get_summaries(ds):
        print(i[0].shape, i[1].shape, i[2].shape, i[3].shape)

(8, 512) (8, 512) (8, 150) (8, 150)
(8, 512) (8, 512) (8, 150) (8, 150)
(8, 512) (8, 512) (8, 150) (8, 150)
(8, 512) (8, 512) (8, 150) (8, 150)


## Evaluation
### Define Rouge Score

In [22]:
class RougeScore:
    '''
    mostly from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/evaluation/metrics.py 
    '''
    
    def __init__(self, score_keys=None)-> None:
        super().__init__()
        if score_keys is None:  
            self.score_keys = ["rouge1", "rouge2", "rougeLsum"]
        
        self.scorer = rouge_scorer.RougeScorer(self.score_keys)
        self.aggregator = scoring.BootstrapAggregator()
        
        
    @staticmethod
    def prepare_summary(summary):
            # Make sure the summary is not bytes-type
            # Add newlines between sentences so that rougeLsum is computed correctly.
            summary = summary.replace(" . ", " .\n")
            return summary
    
    def __call__(self, target, prediction):
        """Computes rouge score.''
        Args:
        targets: string
        predictions: string
        """

        target = self.prepare_summary(target)
        prediction = self.prepare_summary(prediction)
        
        self.aggregator.add_scores(self.scorer.score(target=target, prediction=prediction))

        return 
    
    def reset_states(self):
        self.rouge_list = []

    def result(self):
        result = self.aggregator.aggregate()
        
        for key in self.score_keys:
            score_text = "%s = %.2f, 95%% confidence [%.2f, %.2f]"%(
                key,
                result[key].mid.fmeasure*100,
                result[key].low.fmeasure*100,
                result[key].high.fmeasure*100
            )
            print(score_text)
        
        return {key: result[key].mid.fmeasure*100 for key in self.score_keys}

### Compute Summaries

In [13]:
predictions = []
start_time = time.time()

for i, ds_item in enumerate(test_ds): 
    for (input_ids, input_mask, y, y_ids) in get_summaries(ds_item):
        summaries = model.generate(
            input_ids=input_ids, 
            attention_mask=input_mask, 
            early_stopping=True, 
            max_length=150
        )

        articles = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in input_ids]

        pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
        real = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in y]
    
        for pred_sent, real_sent, article_sent in zip(pred, real, articles):
            predictions.append(str("article: " + article_sent + "\n\npred sentence: " + pred_sent + "\n\nreal sentence: " + real_sent))
    
    if (i % 10) == 0:
        elapsed = (time.time() - start_time) / 10
        print(i,": time genreate batch:", elapsed)
        start_time = time.time()
    if i > 100:
        # otherwise it will take ages
        break


# rouge_score.result()

0 : time genreate batch: 13.171508717536927
10 : time genreate batch: 151.7758148908615
20 : time genreate batch: 149.52904710769653
30 : time genreate batch: 148.9606782436371
40 : time genreate batch: 162.86957681179047
50 : time genreate batch: 161.67399640083312
60 : time genreate batch: 154.9133793592453
70 : time genreate batch: 150.52307457923888
80 : time genreate batch: 157.7699389219284
90 : time genreate batch: 160.59141280651093
100 : time genreate batch: 161.19146971702577


### Lets have a look at some of these predicted summaries

In [14]:
import numpy as np
len_predictions = len(predictions)

def get_random_prediction():
    return predictions[np.random.randint(len_predictions)]

In [15]:
print(get_random_prediction())

article: summarize: en_to_ger Lazio replaced their fierce city rivals Roma as the leading challengers in the battle to chase down Serie A leaders Juventus by hammering Empoli 4-0 at the Stadio Olimpico. The Biancocelesti took a fourth-minute lead through captain Stefano Mauri with Miroslav Klose adding a second just after the half-hour mark. Antonio Candreva and Felipe Anderson plundered further goals to ensure an eighth straight league win for the Coppa Italia finalists, who now lead Roma by a single point. Miroslav Klose celebrates as he grabs Lazios second goal of the game in the win against Empoli . Lazio midfielder Stefano Mauri celebrates with teammate Antonio Candreva after scoring . The Giallorossi are now 13 points behind Juve after struggling to draw 1-1 at Torino on Sunday. Rudi Garcia's men needed an Alessandro Florenzi penalty to take the lead on 57 minutes, but Maxi Lopez promptly equalised to send Roma towards their ninth draw from 13 games. Napoli's Belgium forward Drie

In [16]:
print(get_random_prediction())

article: summarize: ger_to_ger Chelsea trifft im Juli in einem Freundschaftsspiel auf Paris Saint-Germain, das französische Team, das Jose Mourinho in dieser Saison aus der Champions League warf. Die Blues, die in der Runde der letzten 16 nach einem 2: 2-Unentschieden an der Stamford Bridge auf Auswärtstore gedrängt wurden, spielen am 25. Juli in North Carolina gegen PSG. Es ist eines von drei Spielen, die Mourinhos Mannschaft auf dem Weg zur wahrscheinlichen Titelverteidigung in der Premier League bestreiten wird. John Terry führt die Feierlichkeiten an, als Chelsea dem Premier-League-Titel mit einem 0: 0-Unentschieden bei Arsenal näherkommt. Eden Hazard, der Fußballer des Jahres, wird für Chelsea auflaufen, wenn sie im Sommer in die USA reisen. New York Red Bulls - 22. Juli - New Jersey. Paris Saint-Germain - 25. Juli - Charlotte, North Carolina. Barcelona - 28. Juli - Washington D.C. Fiorentina - 5. August - Stamford Bridge: Chelsea, bei nur noch vier ausstehenden Spielen zehn Punkt

In [17]:
print(get_random_prediction())

article: summarize: en_to_ger Kell Brook could have had the blockbuster fight he craves against Amir Khan instead of treading water against Frankie Gavin, if only his promoter had minded his words. Now that Battle of Britain – potentially at Wembley Stadium – may never take place. So says Khans father Shah as his son gets ready to finally confirm the over-criticised Chris Algieri as his opponent in New York on the same night when Brook will be defending his world welterweight title against Gavin at London's O2 Arena on May 20 . Promoter Eddie Hearn (centre) stands behind Kell Brook (left) and Frankie Gavin at the O2 Arena . Amir Khan celebrates his victory over Devon Alexander at the MGM Grand in Las Vegas last year . Shah Khan, who also supervises affairs, says: 'Kell could have had his chance against Amir right now if Eddie Hearn had toned it down. But instead of being patient he's been so disrespectful. 'If he carries on like that the Brook fight will not take place. He keeps shouti

In [18]:
print(get_random_prediction())

article: summarize: en_to_ger After Tony Blair’s speech in his former constituency of Sedgefield last week, the former PM made an appearance at a low-key private fundraising dinner for 15 Labour target seats. But despite the fact that Blair has a record of three general election victories, only one Shadow Cabinet minister, Chuka Umunna, could be bothered to attend. During a dinner held in an Indian banquet hall in Morden, Blair (pictured last week in Newton Aycliffe, County Durham) name-checked Ed Miliband only once, devoting his speech instead to his own achievements . The dashing Shadow Business Secretary is being mentored by Lord Mandelson, the former Blairite Cabinet minister, who wants to see him as the next Labour leader. The programme for the dinner, held in an Indian banquet hall in less-than-glamorous Morden High Street, South London, was littered with errors and even misspelt Umunna’s name. A clearly uncomfortable Blair name-checked Ed Miliband only once, devoting his speech 

In [19]:
print(get_random_prediction())

article: summarize: ger_to_en Muster, die sowohl im sehr großen als auch im sehr kleinen Maßstab auftreten, sind in der Natur selten. Doch Forscher haben ein derartiges Muster an zwei scheinbar unzusammenhängenden Orten gefunden - Zellen in der menschlichen Haut und die geheimnisvollen Feenkreise in Namibia. Während die Verteilung der Wüstenelfenkreise zufällig aussehen mag, als ob die Landschaft mit Sommersprossen markiert wäre, entspricht das Muster dem Verteilungsmuster der Hautzellen. Forscher haben ein ähnliches Muster an zwei scheinbar voneinander unabhängigen Orten gefunden - Hautzellen und geheimnisvolle Feenkreise in der namibischen Wüste. Die Abbildung auf der linken Seite zeigt die Verteilung der Hautzellen, und die auf der rechten Seite zeigt Feenkreise, die ebenfalls in Polygonen angeordnet sind. "Es ist eine völlig erstaunliche, seltsame Übereinstimmung", sagte Professor Robert Sinclair, Leiter der Abteilung für Mathematische Biologie am Okinawa Institute of Science and T

In [20]:
for i in range(10):
    print(get_random_prediction())

article: summarize: ger_to_en Inter Mailand ist bereit, Yaya Toure einen Fünfjahresvertrag anzubieten, um ihn im Sommer von Manchester City wegzulocken. Trotz der Tatsache, dass Toure im nächsten Monat 32 Jahre alt wird, ist Roberto Mancini so verzweifelt, den starken Mittelfeldspieler zu verpflichten, dass er bereit ist, dem Mittelfeldspieler der Elfenbeinküste einen Vertrag zu geben, der ihn über seinen 37. Geburtstag hinaus spielen lässt. Mancini, der Toure 2010 für Manchester City unter Vertrag nahm, arbeitet hinter den Kulissen hart daran, Toure in der kommenden Saison in die Serie A zu holen. Manchester Citys Mittelfeldakteur Yaya Toure bleibt ein Top-Kandidat für den Serie-A-Giganten Inter Mailand. Toure, im Bild mit Vincent Kompany, erhält im Sommer ein neues Angebot von Manchester City. Der ehemalige Manchester-City-Manager Roberto Mancini möchte Toure unbedingt ins San Siro holen. Der Italiener glaubt, dass Toure die Fähigkeit besitzt, Spiele in der Serie A bis weit in die Mi

## Save results to text file

In [21]:
result_path = "../results/t5_cross_lingual_result.txt"
open(result_path, "w")
for pred in predictions:
    with open(result_path, "a") as file:
        file.write(pred + "\n")

## Load save File

In [23]:
data_points = []
result_path = "../results/t5_cross_lingual_result.txt"
file = open(result_path, "r")
for line in file:
    data_points.append(line)

In [24]:
data_points[4]

'real sentence: Beatrice sah Rennen auf der Terrasse mit dem Kronprinzen des Golfstaates. Mark 13. Urlaub seit November letzten Jahres, und vierte in einem Monat. Prinzessin kündigte ihren Job bei Sony Pictures in New York vor Weihnachten. Trotzdem wird sie als Vollzeit-Arbeiterin auf der Website ihres Vaters beschrieben.\n'

In [25]:
class SummaryData():
    
    def __init__(self):
        self.language_tag = ''
        self.real_data = ''
        self.pred_data = ''        

In [26]:
count = 0
summary_data = SummaryData()
summary_data_list = []
for point in data_points:
    count += 1
    
    if count == 1:
        summary_data.language_tag = point.split(" ")[2]
    elif count == 3:
        summary_data.pred_data = ": ".join(point.split(":")[1:])
    elif count == 5:
        summary_data.real_data = ": ".join(point.split(":")[1:])
        summary_data_list.append(summary_data)
        summary_data = SummaryData()
        count = 0
summary_data_list[0].__dict__

{'language_tag': 'ger_to_ger',
 'real_data': ' Beatrice sah Rennen auf der Terrasse mit dem Kronprinzen des Golfstaates. Mark 13. Urlaub seit November letzten Jahres, und vierte in einem Monat. Prinzessin kündigte ihren Job bei Sony Pictures in New York vor Weihnachten. Trotzdem wird sie als Vollzeit-Arbeiterin auf der Website ihres Vaters beschrieben.\n',
 'pred_data': ' Prinzessin Beatrice wurde beim Großen Preis von Bahrain gesehen. Die 26-Jährige war mit ihrem langjährigen Freund Dave Clark im Golfstaat gesichtet. Beatrice befand sich in der Startaufstellung hinter dem Kronprinzen von Bahrain. Es wird vermutet, dass auch Formel-1-Legende Sir Jackie Stewart und Komiker Rory Bremner im Turm waren.\n'}

In [38]:
from ast import literal_eval

results_en_trans = []
results_en_path = "../results/en_en_results"
file = open(results_en_path, "r")
for i, line in enumerate(file):
    results_en_trans.append(literal_eval(line))
    
results_ger_trans = []
results_ger_path = "../results/en_en_results"
file = open(results_ger_path, "r")
for i, line in enumerate(file):
    results_ger_trans.append(literal_eval(line))

In [39]:
rouge_scores_dict = dict()
rouge_scores_dict['en_to_en'] = RougeScore()
rouge_scores_dict['en_to_en_trans'] = RougeScore()

rouge_scores_dict['en_to_ger'] = RougeScore()
rouge_scores_dict['ger_to_en'] = RougeScore()
rouge_scores_dict['ger_to_ger'] = RougeScore()
rouge_scores_dict['ger_to_ger_trans'] = RougeScore()

rouge_scores_dict

{'en_to_en': <__main__.RougeScore at 0x7fca21ce73c8>,
 'en_to_en_trans': <__main__.RougeScore at 0x7fca22360b00>,
 'en_to_ger': <__main__.RougeScore at 0x7fca2160eba8>,
 'ger_to_en': <__main__.RougeScore at 0x7fca2160efd0>,
 'ger_to_ger': <__main__.RougeScore at 0x7fca2160ef60>,
 'ger_to_ger_trans': <__main__.RougeScore at 0x7fca21621080>}

In [40]:
for summary_data in summary_data_list:
    rouge_scores_dict[summary_data.language_tag](summary_data.real_data, summary_data.pred_data)
    

In [41]:
results_ger_trans[0]

{'id': 24,
 'item_id': 1,
 'language_tag': 'en_to_en',
 'real_data': 'Beatrice seen watching race on terrace with the Gulf states Crown Prince . Marks 13th holiday since November last year, and fourth in a month . Princess quit her job at Sony Pictures in New York before Christmas . Despite that she is described as working full-time on her fathers website .\n',
 'pred_data': 'Princess Beatrice spotted at Bahrain Grand Prix with long-term boyfriend Dave Clark . Onlooker said 26-year-old was walking behind the Crown Prince of Bahrain . Princes regime accused of violently repressing pro-democracy protests . Sir Jackie Stewart and comedian Rory Bremner watched race from the tower .\n',
 'translated': 'Prinzessin Beatrice beim Großen Preis von Bahrain mit ihrem langjährigen Freund Dave Clark.Beobachter sagten, der 26-Jährige sei hinter dem Kronprinzen von Bahrain hergelaufen.Prinzen-Regime beschuldigt, prodemokratische Proteste gewaltsam zu unterdrücken.Sir Jackie Stewart und Komiker Rory B

In [42]:
for item in results_en_trans:
    rouge_scores_dict['en_to_en_trans'](item['translated_real'], item['translated'])

In [43]:
for item in results_ger_trans:
    rouge_scores_dict['ger_to_ger_trans'](item['translated_real'], item['translated'])

In [44]:
for key, rouge_score_item in rouge_scores_dict.items():
    print(key, rouge_score_item.result())
    print()

rouge1 = 41.29, 95% confidence [40.42, 42.07]
rouge2 = 19.44, 95% confidence [18.52, 20.30]
rougeLsum = 38.37, 95% confidence [37.53, 39.24]
en_to_en {'rouge1': 41.28645093377612, 'rouge2': 19.437642045211796, 'rougeLsum': 38.36625534486865}

rouge1 = 33.22, 95% confidence [32.48, 33.98]
rouge2 = 14.29, 95% confidence [13.49, 15.03]
rougeLsum = 23.23, 95% confidence [22.55, 23.90]
en_to_en_trans {'rouge1': 33.22236762439636, 'rouge2': 14.289592087685248, 'rougeLsum': 23.233025232530125}

rouge1 = 32.92, 95% confidence [32.19, 33.60]
rouge2 = 12.93, 95% confidence [12.32, 13.56]
rougeLsum = 22.69, 95% confidence [22.03, 23.32]
en_to_ger {'rouge1': 32.91827353738891, 'rouge2': 12.92944828942739, 'rougeLsum': 22.690616445971596}

rouge1 = 36.06, 95% confidence [35.36, 36.78]
rouge2 = 13.08, 95% confidence [12.48, 13.62]
rougeLsum = 33.49, 95% confidence [32.84, 34.18]
ger_to_en {'rouge1': 36.05542655889138, 'rouge2': 13.076510809934835, 'rougeLsum': 33.491016005290255}

rouge1 = 33.86, 95

# Special Example

In [29]:
input_text = "In a world where we have to read and understand a lot of documents automatic text summarization has an obvious demand. To have the option to get a brief summary of a text in your language can be very useful. But would it not be even more useful, if you could have the option to get a summary of a text in a language, you do not understand, in your wished language? Sometimes we don’t want an exact translation, sometimes we just want to know a brief overview of a text in a language we don’t understand. That is the case where Cross-Lingual summarization would be prefered over normal translation. Cross-Lingual summarization can give you a short overview of a text in a language you do not understand."

In [30]:
input_text_test = "The dramatic growth of data on the internet leads to the need to automatically process and understand the data. A big part of the data is text data in many languages. This overwhelming amount of information causes a demand for automatic text summarization and other Natural Language Processing (NLP) taks.\n The Field of NLP had some high points in the last couple of years, when the field got revolutionized by Neural Language Models. With publications like Attention is all you need \cite{vaswani2017attention} or GPT3\cite{brown2020language} the limits of the field are pushed even further. Because of large data corpuses, scraped from the internet, and advanced models which contain up to 175 Billion parameters like the GPT3 it is possible to generate text, answer questions, summarize text, translate or many other things.\n In this master thesis we will take a deeper look into summarization. There are extractive and abstractive techniques to summarization. The extractive summary technique tries to find subsets of sentences, which representante the original text well and uses them to summarize the original text \cite{allahyari2017text}. The abstractive technique uses advanced language models to generate a new text, which should be much shorter than the original one, contain all the key information and preserve the overall meaning.\n In this master thesis we will use the abstractive technique, because it is closer to a human-like interpretation. It combines the ability to understand what the context of a given text is and the ability to generate fluent and grammatically correct text to that given context.\n The summarization in one language is an interesting topic but cross lingual approach is even more interesting. The goal of Cross Lingual Summarization is to summarize a text from one language into another language. It combines the ability to summarize and the ability to translate. Where state of the art models perform well on normal summarization, it will be interesting to see how well they perform doing Cross-Lingual Summarization."

In [31]:
x = tokenizer.encode_plus("summarize: en_to_ger " + input_text, max_length=512, return_tensors="tf", padding='max_length', truncation=True)
input_ids = tf.reshape(x['input_ids'], (1,-1))
attention_mask = tf.reshape(x['attention_mask'], (1,-1))

In [32]:
print(input_ids.shape, attention_mask.shape)

(1, 512) (1, 512)


In [33]:
summaries = model.generate( 
    input_ids=input_ids, 
    attention_mask=attention_mask
)

In [34]:
summaries

<tf.Tensor: shape=(1, 74), dtype=int32, numpy=
array([[    0,    86,   645,  3779,     6,    16,    74,   558,  2584,
            3, 20127,    15,   110,    35,    64, 19163,  3766,     6,
          229, 15820,    15,  5027,     7,    63,    29, 19712,   266,
        30410,    15,  5222,  6367,     5,  4098,  6199,     3,    15,
            7,   311, 30001,    49,     6,  1301,   292,    67,  7251,
         8219,    29,     6,   266, 11068, 14449,   266,     7,  5027,
           15,     7,    16,   645, 16933,   170,  8837,     6,    67,
          292,   311, 19163,     6,    16,  1197,    52, 24054,    29,
        16933,    58]], dtype=int32)>

In [35]:
tokenizer.decode(summaries[0])

'<pad> In einer Welt, in der wir viele Dokumente lesen und verstehen müssen, ist automatische Textsynthese eine offensichtliche Forderung. Aber wäre es nicht nützlicher, wenn Sie die Möglichkeit hätten, eine Zusammenfassung eines Textes in einer Sprache zu bekommen, die Sie nicht verstehen, in Ihrer gewünschten Sprache?'