In [1]:
import os
import pickle

import numpy as np

import torch

import tensorflow_datasets as tfds
import tensorflow as tf

import sentencepiece as spm

from flair.data import Sentence
from flair.embeddings import BertEmbeddings, DocumentPoolEmbeddings
from segtok.segmenter import split_single

In [None]:
# Analyse Data

In [2]:
# params
MAX_LEN_ARTICLE = 750

MAX_LEN_HIGHLIGHTS = 150

# Embedding Model

In [3]:
albert = BertEmbeddings(bert_model_or_path="albert-base-v2")

albert_embedding = DocumentPoolEmbeddings([albert])

In [4]:
sent = Sentence("Berlin and Munich are nice cities .")
albert_embedding.embed(sent)

embedd_result = sent.get_embedding()
print(embedd_result.shape)
print(embedd_result)

torch.Size([3072])
tensor([-0.6863, -0.5820,  1.0685,  ...,  0.7118,  0.6721,  0.5402],
       device='cuda:0', grad_fn=<CatBackward>)


# Dataset

In [5]:
cnn_dailymail = tfds.load(name="cnn_dailymail")

INFO:absl:No config specified, defaulting to first: cnn_dailymail/plain_text
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset cnn_dailymail (/home/yannik/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0)
INFO:absl:Constructing tf.data.Dataset for split None, from /home/yannik/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0


In [6]:
train_tds = cnn_dailymail['train']
test_tds = cnn_dailymail['test']
val_tds = cnn_dailymail['validation']

## Sentence Piece
- https://github.com/google/sentencepiece
- use a Vocab size like Albert and BERT (30000) 

In [7]:
# train Sentence Piece with train.tsv

spm_model_name = "models/spm_train.model"
spm_train_file_name = "data/train.tsv"

if not os.path.exists(spm_model_name):
    spm.SentencePieceTrainer.Train(
            '--input=' + os.path.join(spm_train_file_name) +
            ' --model_prefix='+ os.path.join(spm_model_name) +
            ' --vocab_size=30000')

In [8]:
sp_model = spm.SentencePieceProcessor()
sp_model.Load(spm_model_name)
vocab = {sp_model.IdToPiece(i): i for i in range(sp_model.GetPieceSize())}
vocab_list = list(vocab.keys())
print("vocab len:", len(vocab), "\nTop 15: vocabs:", vocab_list[:15])

vocab len: 30000 
Top 15: vocabs: ['<unk>', '<s>', '</s>', '▁the', ',', '.', '▁to', '▁a', 's', '▁of', '▁and', '▁in', '▁.', "'", '▁was']


In [46]:
def get_real_text_from_ids(id_array):
    text = ""
    for sentenece_array in id_array:
        for token in sentenece_array:
            word = vocab_list[token]
            text += word.replace("▁", " ")
    return text

def get_sentence_pieces(sentences):
    sentences = split_single(sentences)
    encodedtext = []
    split_sentences = []
    for sentence in sentences:
        # cut of to long sentences
        if len(sentence) > MAX_LEN_ARTICLE:
            sentence = sentence[:MAX_LEN_ARTICLE]
            
        if len(sentence) > 0:
            encodedtext.append(sp_model.encode_as_ids(sentence))
            split_sentences.append(sentence)
    return encodedtext, split_sentences

test = "hallo, i'm leaving"
tokens = sp_model.encode_as_ids(test)
print(tokens)
get_real_text_from_ids([tokens])

[1429, 292, 4, 47, 13, 108, 851]


" hallo, i'm leaving"

## Prepare Dataset
- for faster training, we will clean the data, compute the Albert-Base Embedding of all articles and save it to Files, so that we don't have to do it while training

In [10]:
def normalize_text(text):
    """Lowercase and remove quotes from a TensorFlow string."""
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text,"'(.*)'", r"\1")
    return text


def map_func(features):
    article_text = normalize_text(features["article"])
    highlights_text = normalize_text(features['highlights'])
    
    return article_text.numpy().decode('UTF-8'), highlights_text.numpy().decode('UTF-8')
        

In [28]:
def get_embedding_of_article(sentences):
    list_embedding = []
    for i, sentence in enumerate(sentences):
        
        sent = Sentence(sentence)

        albert_embedding.embed(sent)
        x = sent.get_embedding()
        x = x.to('cpu').detach().numpy()
        list_embedding.append(x)
    return list_embedding

In [11]:
def embedd_ds(ds):
    new_ds = []
    for i, item in enumerate(ds):
        article, higlights = map_func(item)
        articles_ids, sentences_articles = get_sentence_pieces(article) 
        higlights_ids, _ = get_sentence_pieces(higlights) 
        x = get_embedding_of_article(sentences_articles)
        new_ds.append({"article_embed": x, "articles": articles_ids, "highlights": higlights_ids})
    return new_ds

In [13]:
def save_file(data, filename):
    with open(filename, "wb") as f:
        pickle.dump(data, f)

        
def load_file(filename):  
    with open(filename, "rb") as f:
        return pickle.load(f)

In [10]:
test_ds = embedd_ds(test_tds)
save_file(test_ds, "data/test.pkl")

val_ds = embedd_ds(val_tds)
save_file(val_ds, "data/val.pkl")

NameError: name 'embedd_ds' is not defined

In [10]:
# train_ds = embedd_ds(train_tds)
# save_file(train_ds, "train.pkl")

In [15]:
train_raw = load_file("data/val.pkl")
test_raw = load_file("data/test.pkl")
print(len(train_raw), train_raw[0])

13368 {'article_embed': [array([ 0.00387947, -0.6366544 ,  0.21970668, ...,  0.30173776,
        0.43474218,  0.42230588], dtype=float32), array([-0.23565385, -0.659557  ,  0.55651265, ...,  0.33840343,
        0.25438353,  0.26119417], dtype=float32), array([-0.31036785, -0.07266833,  0.42479414, ...,  0.11688844,
        0.4397149 ,  0.7277298 ], dtype=float32), array([ 0.09201267, -0.44499242,  0.3602318 , ...,  0.15546696,
        0.27280316,  0.11705698], dtype=float32), array([-0.2923365 , -0.36915168,  0.52738214, ...,  0.0649092 ,
        0.25962955,  0.14565222], dtype=float32), array([ 0.0709545 , -0.17259236,  0.8320757 , ..., -0.06098118,
        0.24208799,  0.09772751], dtype=float32), array([-0.11589033,  0.35133335,  0.81095463, ...,  0.03234402,
        0.34122986,  0.13773124], dtype=float32), array([ 0.0746538 , -0.13619456,  0.7168364 , ...,  0.0407102 ,
        0.2888261 ,  0.3364408 ], dtype=float32), array([-0.14699861, -0.35566977,  0.88771176, ...,  0.03221212,

In [17]:
x = train_raw[0]['articles']
get_real_text_from_ids(x)

" sally forrest, an actress-dancer who graced the silver screen throughout the 40s and '50s in mgm musicals and films such as the 1956 noir while the city sleeps died on march 15 at her home in beverly hills, california. forrest, whose birth name was katherine feeney, was 86 and had long battled cancer. her publicist, judith goffin, announced the news thursday. scroll down for video . actress: sally forrest was in the 1951 ida lupino-directed film 'hard, fast and beautiful' (left) and the 1956 fritz lang movie 'while the city sleeps' a san diego native, forrest became a protege of hollywood trailblazer ida lupino, who cast her in starring roles in films including the critical and commercial success not wanted, never fear and hard, fast and beautiful. some of forrests other film credits included bannerline, son of sinbad, and excuse my dust, according to her imdb page. the page also indicates forrest was in multiple climax<unk> and rawhide television episodes. forrest appeared as hersel

### Get Max Count Sentences

In [20]:
def get_shapes_of_ds(ds):
    list_shape_1 = []
    list_shape_2 = []
    list_shape_3 = []
    
    for item in ds:
        x = [np.array(item['article_embed']), np.array(item['articles']), np.array(item['highlights'])]
        try:
            list_shape_1.append(x[0].shape[0])
            list_shape_2.append(x[1].shape[0])
            list_shape_3.append(x[2].shape[0])
        except ValueError as err:
            print(err)
    return np.array(list_shape_1), np.array(list_shape_2), np.array(list_shape_3)

x,y,z = get_shapes_of_ds(train_raw)

In [32]:
print("item, mean, median, min, max")
print("article:",np.mean(x), np.median(x), np.min(x), np.max(x))
print("article_ids:",np.mean(y), np.median(y), np.min(y), np.max(y))
print("highlights:",np.mean(z), np.median(z), np.min(z), np.max(z))

item, mean, median, min, max
article: 25.692848593656493 23.0 2 126
article_ids: 25.692848593656493 23.0 2 126
highlights: 4.151929982046679 4.0 1 79


In [None]:
def get_shapes_of_ds(ds):
    list_shape_1 = []
    list_shape_2 = []
    list_shape_3 = []
    
    for item in ds:
        x = [np.array(item['article_embed']), np.array(item['articles']), np.array(item['highlights'])]
        try:
            list_shape_1.append(x[0].shape[0])
            list_shape_2.append(x[1].shape[0])
            list_shape_3.append(x[2].shape[0])
        except ValueError as err:
            print(err)
    return np.array(list_shape_1), np.array(list_shape_2), np.array(list_shape_3)

x,y,z = get_shapes_of_ds(train_raw)

### Get Max Sentence length

In [30]:
def convert_to_np(ds):
    list_ds = []
    for item in ds:
        x = [np.array(item['article_embed']), np.array(item['articles']), np.array(item['highlights'])]
        try:
            print(x[1].shape)
            print(x[2].shape)
            print(x[1])
            list_ds.append(np.array(x))
        except ValueError as err:
            print(err)
#             print(x[0].shape)
#             print(x[1].shape)
#             print(x[2].shape)
#         break
    return np.array(list_ds)

train = convert_to_np(train_raw)
print("train shape", train.shape)
print(train[0][0].shape, train[0][1].shape, train[0][2].shape)

(13, 3072)
(13,)
(3,)
[list([8071, 11843, 4, 40, 1827, 16, 19386, 365, 44, 3872, 115, 3, 2352, 1962, 1613, 3, 898, 8, 10, 27, 3376, 8, 11, 18414, 5105, 8, 10, 2813, 227, 29, 3, 18166, 96, 2759, 121, 3, 149, 1823, 8, 248, 17, 482, 413, 26, 35, 104, 11, 9201, 4050, 4, 686, 5])
 list([11843, 4, 887, 1128, 499, 14, 8631, 2943, 2962, 4, 14, 6932, 10, 43, 238, 6034, 670, 5])
 list([35, 10065, 4, 17775, 29453, 4, 690, 3, 265, 349, 5])
 list([370, 106, 15, 151, 12, 1827, 25, 8071, 11843, 14, 11, 3, 21587, 47, 1668, 1864, 532, 4421, 16, 12476, 102, 605, 27, 7814, 4, 1451, 10, 1886, 13, 70, 581, 63, 10, 3, 18166, 28567, 6574, 1388, 27, 4546, 3, 149, 1823, 8, 13, 7, 858, 2068, 2684, 4, 11843, 584, 7, 1565, 53, 20025, 9, 2104, 3981, 8202, 6762, 47, 1668, 1864, 532, 4421, 4, 44, 2108, 35, 11, 4939, 5202, 11, 2813, 214, 3, 2166, 10, 2265, 1277, 48, 474, 4, 286, 1584, 10, 612, 4, 1451, 10, 1886, 5])
 list([122, 9, 11843, 8, 113, 605, 9265, 1048, 7751, 2207, 4, 304, 9, 5913, 7721, 4, 10, 7249, 125, 46

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



(40, 3072)
(40,)
(4,)
[list([11552, 562, 2891, 108, 70, 4155, 735, 7, 1018, 10021, 26, 9595, 4813, 4, 24, 3, 259, 16, 18480, 102, 1189, 1176, 1451, 16, 9465, 1052, 187, 361, 6, 505, 23984, 3, 4100, 444, 5438, 11, 3, 2192, 347, 12, 3, 255, 1189, 1176, 1451, 16, 9465, 1052, 187, 48, 10999, 4535, 11, 1428, 11643, 133, 1002, 120, 11193, 444, 26, 3962, 11, 3818, 3512, 8, 10, 9418, 2346, 859, 4, 7, 65, 654, 30, 120, 5])
 list([11, 3939, 7, 18134, 20036, 2192, 7, 5357, 9525, 16, 3974, 347, 255, 9, 8269, 223, 19, 7074, 21, 206, 4535, 2338, 10, 113, 327, 886, 5])
 list([3, 1052, 271, 82, 1611, 11, 255, 1189, 1176, 10, 7636, 3, 1115, 73, 7992, 9, 28245, 1451, 16, 9465, 3962, 5])
 list([336, 4, 3, 420, 4, 586, 6, 34, 3, 86, 1481, 9, 103, 1032, 28, 7, 641, 149, 6, 1873, 264, 327, 4, 187, 48, 1052, 65, 1451, 444, 3962, 11, 3818, 3512, 8, 5])
 list([52, 132, 22, 746, 15, 11193, 444, 6, 779, 438, 103, 183, 82, 3, 908, 16, 14171, 11643, 5])
 list([11552, 562, 2891, 108, 4, 7, 1018, 10021, 26, 9595, 48

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



(18, 3072)
(18,)
(4,)
[list([9370, 18, 9627, 6, 463, 3, 86, 2170, 6, 295, 7, 94, 16, 3973, 62, 24992, 750, 45, 17, 1053, 45, 8, 1951, 910, 58, 30, 46, 5397, 28, 113, 9487, 5])
 list([1382, 426, 24, 925, 45, 8, 1398, 2030, 2170, 54, 5111, 2239, 196, 22377, 393, 16, 8674, 112, 7, 2384, 4876, 750, 10, 7486, 11, 542, 4, 954, 10, 4169, 5])
 list([7, 9370, 3147, 85, 1759, 978, 19, 40, 2360, 54, 163, 251, 2879, 5])
 list([22377, 393, 16, 8674, 112, 7, 2384, 4876, 750, 30, 463, 7, 5379, 9, 25693, 264, 1449, 10, 1053, 8, 2806, 12, 3, 3147, 24, 25, 27, 13844, 3180, 563, 48, 947, 2155, 73, 6308, 11, 25838, 73, 3804, 5])
 list([336, 4, 47, 77, 2602, 64, 37, 1456, 7, 782, 1182, 11, 22377, 393, 2879, 4, 405, 240, 482, 126])
 list([22377, 393, 16, 8674, 112, 7, 2384, 4876, 750, 30, 4611, 46, 198, 150, 22, 1243, 160, 92, 358, 4, 10, 30, 463, 7, 5379, 9, 6348, 197, 1449, 10, 1053, 45, 8, 2806, 5])
 list([22, 14, 1220, 26, 7, 701, 9, 24696, 425, 10, 2393, 410, 11, 482, 2721, 185, 642, 96, 1920, 49, 1091

In [64]:
test = convert_to_np(test_raw)

could not broadcast input array from shape (3,3072) into shape (3)
could not broadcast input array from shape (4,3072) into shape (4)
could not broadcast input array from shape (7,3072) into shape (7)
could not broadcast input array from shape (6,3072) into shape (6)
could not broadcast input array from shape (5,3072) into shape (5)
could not broadcast input array from shape (4,3072) into shape (4)
could not broadcast input array from shape (4,3072) into shape (4)
could not broadcast input array from shape (3,3072) into shape (3)
could not broadcast input array from shape (4,3072) into shape (4)
could not broadcast input array from shape (4,3072) into shape (4)
could not broadcast input array from shape (3,3072) into shape (3)
could not broadcast input array from shape (6,3072) into shape (6)
could not broadcast input array from shape (5,3072) into shape (5)
could not broadcast input array from shape (4,3072) into shape (4)
could not broadcast input array from shape (4,3072) into shape

In [74]:
print(np.hstack(train[:,1][0]))

[ 8071 11843     4    40  1827    16 19386   365    44  3872   115     3
  2352  1962  1613     3   898     8    10    27  3376     8    11 18414
  5105     8    10  2813   227    29     3 18166    96  2759   121     3
   149  1823     8   248    17   482   413    26    35   104    11  9201
  4050     4   686     5 11843     4   887  1128   499    14  8631  2943
  2962     4    14  6932    10    43   238  6034   670     5    35 10065
     4 17775 29453     4   690     3   265   349     5   370   106    15
   151    12  1827    25  8071 11843    14    11     3 21587    47  1668
  1864   532  4421    16 12476   102   605    27  7814     4  1451    10
  1886    13    70   581    63    10     3 18166 28567  6574  1388    27
  4546     3   149  1823     8    13     7   858  2068  2684     4 11843
   584     7  1565    53 20025     9  2104  3981  8202  6762    47  1668
  1864   532  4421     4    44  2108    35    11  4939  5202    11  2813
   214     3  2166    10  2265  1277    48   474   

In [66]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, ds, transform=None):
        self.x_embed = torch.from_numpy(ds[:,0])
        self.x_text = torch.from_numpy(ds[:,1])
        self.y = torch.from_numpy(ds[:,2])
        self.transform = transform

    def __getitem__(self, index):
        x_embed = self.x_embed[index]
        x_text = self.x_text[index]
        y = self.y[index]
        

        if self.transform:
            x_embed = self.transform(x_embed)
            x_text = self.transform(x_text)
            
        return x_embed, x_text, y

    def __len__(self):
        return len(self.x_embed)

In [67]:
train = MyDataset(train)
test = MyDataset(test)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.

In [None]:
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [33]:
print(train[:,0])

[array([[ 0.00387947, -0.6366544 ,  0.21970668, ...,  0.30173776,
         0.43474218,  0.42230588],
       [-0.23565385, -0.659557  ,  0.55651265, ...,  0.33840343,
         0.25438353,  0.26119417],
       [-0.31036785, -0.07266833,  0.42479414, ...,  0.11688844,
         0.4397149 ,  0.7277298 ],
       ...,
       [ 0.21558289, -0.78932345,  0.49542183, ..., -0.11635831,
         0.3304542 ,  0.5526388 ],
       [-0.13048227, -0.4027023 ,  0.57894194, ...,  0.03963578,
         0.40740883,  0.912753  ],
       [-0.06654014, -0.3630797 ,  0.8600646 , ...,  0.18303427,
         0.1893257 ,  0.3632384 ]], dtype=float32)]


In [34]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, ds, transform=None):
        self.x_embed = torch.from_numpy(ds[:,0])
        self.x_text = torch.from_numpy(ds[:,1])
        self.y = torch.from_numpy(ds[:,2])
        self.transform = transform

    def __getitem__(self, index):
        x_embed = self.x_embed[index]
        x_text = self.x_text[index]
        y = self.y[index]
        

        if self.transform:
            x_embed = self.transform(x_embed)
            x_text = self.transform(x_text)
            
        return x_embed, x_text, y

    def __len__(self):
        return len(self.x_embed)

In [35]:
train = MyDataset(train)
test = MyDataset(test)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.

In [None]:
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)