# Text Summarization with T5 (Hugging Face Transformers, PyTorch)

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from pathlib import Path
import torch
import re
import time

In [2]:
# --- Run configuration ---
# Prefer the first CUDA GPU when PyTorch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Batch size used by the translation data loader.
BATCH_SIZE = 16

# NOTE(review): not referenced in the visible cells; the spelling is kept
# as-is because other cells may depend on this name.
SHUFFEL_SIZE = 1024

# Learning rate passed to the optimizer.
learning_rate = 3e-5

In [3]:
# Show which device was selected in the configuration cell.
print(device)

cuda:0


## Define Model

In [30]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint to load ("t5-small" is the lighter alternative tried earlier).
model_size = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(model_size)
model = T5ForConditionalGeneration.from_pretrained(model_size)
model = model.to(device)

# Copy the "translation_en_to_de" entry of the checkpoint's
# task_specific_params (if there is one) into the top-level model config.
task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(
        task_specific_params.get("translation_en_to_de", {})
    )

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=0.0001,
)

In [49]:
# Inspect the translation max_length and the active prefix.
# task_specific_params can be None (the model cell guards for that), so fall
# back to an empty dict here instead of raising AttributeError.
(task_specific_params or {}).get("translation_en_to_de", {}).get("max_length"), model.config.prefix

(300, 'translate English to German: ')

In [50]:
# Display the full model configuration.
model.config

T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "early_stopping": true,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "max_length": 300,
  "model_type": "t5",
  "n_positions": 512,
  "num_beams": 4,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "prefix": "translate English to German: ",
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      

## Define PyTorch Dataset

In [5]:
def read_files(name, data_dir="data"):
    """Load the parallel article/highlight lines of one dataset split.

    Reads the files ``{data_dir}/{name}/article`` and
    ``{data_dir}/{name}/highlights`` and returns their lines with trailing
    whitespace (including the newline) removed.

    Args:
        name: Name of the split directory, e.g. ``"train"``.
        data_dir: Directory containing the split directories. Defaults to
            ``"data"`` relative to the working directory, which is the
            location that was previously hard-coded.

    Returns:
        A tuple ``(articles, highlights)`` of two equally long lists of str.

    Raises:
        AssertionError: If the two files contain a different number of lines.
        OSError: If either file cannot be opened.
    """
    split_dir = Path(data_dir) / str(name)

    # Context managers close both handles even when reading fails, so no
    # file descriptors are leaked.
    with open(split_dir / "article") as article_file, \
            open(split_dir / "highlights") as highlights_file:
        articles = [line.rstrip() for line in article_file]
        highlights = [line.rstrip() for line in highlights_file]

    assert len(articles) == len(highlights), (
        "article/highlights line counts differ: %d vs %d"
        % (len(articles), len(highlights))
    )
    return articles, highlights

In [103]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes input sentences on access.

    Uses the notebook-level ``tokenizer`` and ``model`` objects: each text is
    normalised with :meth:`transfrom`, prefixed with ``model.config.prefix``
    and encoded with padding up to ``max_length``.

    Args:
        articles: Indexable, sized collection of input strings.
        max_length: Value passed to the tokenizer as ``max_length``
            (keyword-only). Defaults to 300, the value that was previously
            hard-coded.
    """

    def __init__(self, articles, *, max_length=300):
        self.x = articles
        self.max_length = max_length

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` for item ``index`` as flattened 1-D tensors."""
        text = model.config.prefix + self.transfrom(self.x[index])
        encoded = tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            return_tensors="pt",
            pad_to_max_length=True,
        )
        # Flatten to 1-D so that the DataLoader can stack items into a batch.
        return encoded['input_ids'].view(-1), encoded['attention_mask'].view(-1)

    @staticmethod
    def transfrom(x):
        """Lower-case ``x`` and drop one enclosing pair of single quotes per line.

        The pattern is greedy: on each line, the first and the last single
        quote are removed and everything between them is kept unchanged.
        """
        x = x.lower()
        x = re.sub("'(.*)'", r"\1", x)
        return x

    def __len__(self):
        """Number of input texts."""
        return len(self.x)

In [104]:
from segtok.segmenter import split_single
def split_in_sentences(text):
    """Split ``text`` into sentences by delegating to segtok's ``split_single``."""
    sentences = split_single(text)
    return sentences

In [114]:
def get_dataset(name, article_index=0):
    """Build a sentence-level ``MyDataset`` from one article of a split.

    Args:
        name: Split name forwarded to ``read_files`` (e.g. ``"train"``).
        article_index: Position of the article whose sentences are used.
            Defaults to 0, the article that was previously hard-coded.

    Returns:
        A ``MyDataset`` over the sentences of the selected article.

    Raises:
        IndexError: If the split has no articles or ``article_index`` is out
            of range.
    """
    # Only the articles are needed here; the highlights are ignored.
    articles, _ = read_files(name)
    if not articles:
        raise IndexError("no articles found for split %r" % (name,))

    # Materialise the sentences so the dataset can index them and take len().
    sentences = list(split_in_sentences(articles[article_index]))
    return MyDataset(sentences)

In [106]:
# Only the "train" split is built for this demo.
train_ds = get_dataset(name="train")
# The other splits are currently disabled:
# test_ds = get_dataset("test")
# val_ds = get_dataset("val")

7 By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October.


In [107]:
# Batches of 5 items for the demo below.
train_loader = torch.utils.data.DataLoader(dataset=train_ds, batch_size=5)
# Loaders for the other splits are currently disabled:
# val_loader = torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE)
# test_loader = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE)

In [110]:
# Pull the first batch (input ids and attention mask) from the loader and
# display the input ids.
x, x_mask = next(iter(train_loader))

x

tensor([[13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0],
        [13959,  1566,    12,  ...,     0,     0,     0]])

In [111]:
# Move the batch tensors to the selected device.
x, x_mask = x.to(device), x_mask.to(device)

In [112]:
# Run generation on the batch; no generation arguments are passed explicitly.
translations = model.generate(input_ids=x, attention_mask=x_mask)

In [113]:
# Decode the generated ids and the model inputs back to text. The decoded
# inputs still carry the task prefix.
pred = []
for generated_ids in translations:
    pred.append(tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

real = []
for input_ids in x:
    real.append(tokenizer.decode(input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

# Pair each generated sentence with the input it was produced from.
predictions = [
    "German: " + german + "\n\nEnglish: " + english
    for german, english in zip(pred, real)
]

# Print every pair followed by a separator.
for entry in predictions:
    print(entry, "\n------------\n", sep="\n")

German: by . associated press . published: . 14:11 est, 25 october 2013 . | . updated: . 15:36 est, 25 october 2013 . the bishop of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a virus in late september and early october.

English: translate English to German: by . associated press . published: . 14:11 est, 25 october 2013 . | . updated: . 15:36 est, 25 october 2013 . the bishop of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a virus in late september and early october.

------------

German: Das staatliche Gesundheitsministerium hat ein Gutachten über die Exposition für alle veröffentlicht, die fünf Kirchen besuchten und Kommunion nahmen.

English: translate English to German: the state health department has issued an advisory of exposure for anyone who attended five churches and took

## Load German/English Dataset

In [54]:
def open_translation_file(path="../data/translate_en_de/"):
    """Read the newstest2014 English/German files into two lists of lines.

    Args:
        path: Directory containing ``newstest2014.en`` and
            ``newstest2014.de``. Defaults to ``"../data/translate_en_de/"``,
            the location that was previously hard-coded.

    Returns:
        A tuple ``(en_list, de_list)`` with the lines of the English and the
        German file in file order. Line endings are kept, exactly as yielded
        by iterating over the files.

    Raises:
        OSError: If either file cannot be opened.
    """
    directory = Path(path)

    # Context managers close both handles even when reading fails, so no
    # file descriptors are leaked.
    with open(directory / "newstest2014.en", "r") as en_file, \
            open(directory / "newstest2014.de", "r") as de_file:
        en_list = list(en_file)
        de_list = list(de_file)
    return en_list, de_list

# Load the English and German lines from the default data directory.
en, de = open_translation_file()

In [58]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes (source, target) sentence pairs on access.

    Uses the notebook-level ``tokenizer`` and ``model`` objects. Source texts
    are normalised with :meth:`transfrom`, prefixed with
    ``model.config.prefix`` and encoded with padding up to ``max_length``;
    target texts are normalised and encoded the same way, without the prefix.

    This class reuses the name of the sentence-only dataset defined earlier
    in the notebook and therefore shadows it. ``de`` is optional so that code
    written for the earlier one-argument form keeps working against this
    definition.

    Args:
        en: Indexable, sized collection of source strings.
        de: Optional collection of target strings, aligned with ``en``.
            When omitted, items contain no target tensor.
        max_length: Value passed to the tokenizer as ``max_length``
            (keyword-only). Defaults to 300, the value that was previously
            hard-coded.

    Raises:
        ValueError: If ``de`` is given and its length differs from ``en``.
    """

    def __init__(self, en, de=None, *, max_length=300):
        # A parallel corpus must be aligned; fail early rather than with an
        # IndexError in the middle of iteration.
        if de is not None and len(en) != len(de):
            raise ValueError(
                "source and target have different lengths: %d vs %d"
                % (len(en), len(de))
            )
        self.x = en
        self.y = de
        self.max_length = max_length

    def __getitem__(self, index):
        """Return flattened 1-D tensors for item ``index``.

        Returns ``(input_ids, attention_mask, target_ids)`` when targets were
        supplied, otherwise ``(input_ids, attention_mask)``.
        """
        source = tokenizer.encode_plus(
            model.config.prefix + self.transfrom(self.x[index]),
            max_length=self.max_length,
            return_tensors="pt",
            pad_to_max_length=True,
        )
        # Flatten to 1-D so that the DataLoader can stack items into a batch.
        input_ids = source['input_ids'].view(-1)
        attention_mask = source['attention_mask'].view(-1)
        if self.y is None:
            return input_ids, attention_mask

        target = tokenizer.encode(
            self.transfrom(self.y[index]),
            max_length=self.max_length,
            return_tensors="pt",
            pad_to_max_length=True,
        )
        return input_ids, attention_mask, target.view(-1)

    @staticmethod
    def transfrom(x):
        """Lower-case ``x`` and drop one enclosing pair of single quotes per line.

        The pattern is greedy: on each line, the first and the last single
        quote are removed and everything between them is kept unchanged.
        """
        x = x.lower()
        x = re.sub("'(.*)'", r"\1", x)
        return x

    def __len__(self):
        """Number of source texts."""
        return len(self.x)

In [59]:
# Wrap the parallel lines in a dataset and batch them with BATCH_SIZE.
translation_dataset = MyDataset(en, de)
translation_loader = torch.utils.data.DataLoader(
    dataset=translation_dataset,
    batch_size=BATCH_SIZE,
)

In [60]:
# Take the first batch; the third element (encoded targets) is discarded.
# NOTE(review): assigning to `_` at notebook level shadows IPython's
# last-output variable.
x, x_mask, _ = next(iter(translation_loader))

In [67]:
# Check the shape of the batched input ids.
x.shape

torch.Size([16, 300])

In [61]:
# Move the batch tensors to the selected device.
x, x_mask = x.to(device), x_mask.to(device)

In [62]:
# Run generation on the batch; no generation arguments are passed explicitly.
translations = model.generate(input_ids=x, attention_mask=x_mask)

In [64]:
# Decode the generated ids and the model inputs back to text. The decoded
# inputs still carry the task prefix.
pred = []
for generated_ids in translations:
    pred.append(tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

real = []
for input_ids in x:
    real.append(tokenizer.decode(input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

# Pair each generated sentence with the input it was produced from.
predictions = [
    "German: " + german + "\n\nEnglish: " + english
    for german, english in zip(pred, real)
]

# Print every pair followed by a separator.
for entry in predictions:
    print(entry, "\n------------\n", sep="\n")

German: orlando bloom und miranda kerr lieben sich immer noch.

English: translate English to German: orlando bloom and miranda kerr still love each other

------------

German: Schauspieler orlando bloom und Model miranda kerr wollen getrennte Wege gehen .

English: translate English to German: actors orlando bloom and model miranda kerr want to go their separate ways .

------------

German: kerr ist ein US-amerikanischer Schauspieler .

English: translate English to German: however , in an interview , bloom has said that he and kerr still love each other .

------------

German: miranda kerr und orlando bloom sind Eltern von zwei ##at##-##at## Jahr ##at##-##at## alten flynn .

English: translate English to German: miranda kerr and orlando bloom are parents to two ##at##-##at## year ##at##-##at## old flynn .

------------

German: orlando bloom kündigte seine Trennung von seiner Frau, Supermodel miranda kerr an.

English: translate English to German: actor orlando bloom announced his