In [38]:
import pickle

import numpy as np

import torch

import tensorflow_datasets as tfds
import tensorflow as tf

import sentencepiece as spm

from flair.data import Sentence
from flair.embeddings import BertEmbeddings, DocumentPoolEmbeddings
from segtok.segmenter import split_single

# Embedding Model

In [2]:
albert = BertEmbeddings(bert_model_or_path="albert-base-v2")

albert_embedding = DocumentPoolEmbeddings([albert])

In [3]:
sent = Sentence("Berlin and Munich are nice cities .")
albert_embedding.embed(sent)

embedd_result = sent.get_embedding()
print(embedd_result.shape)
print(embedd_result)

torch.Size([3072])
tensor([-0.6863, -0.5820,  1.0685,  ...,  0.7118,  0.6721,  0.5402],
       device='cuda:0', grad_fn=<CatBackward>)


# Dataset

In [4]:
cnn_dailymail = tfds.load(name="cnn_dailymail")

INFO:absl:No config specified, defaulting to first: cnn_dailymail/plain_text
INFO:absl:Overwrite dataset info from restored data version.
INFO:absl:Reusing dataset cnn_dailymail (/home/yannik/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0)
INFO:absl:Constructing tf.data.Dataset for split None, from /home/yannik/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0


In [7]:
train_tds = cnn_dailymail['train']
test_tds = cnn_dailymail['test']
val_tds = cnn_dailymail['validation']

## Prepare Dataset
- for faster training, we will clean the data, compute the Albert-Base Embedding of all articles and save it to Files, so that we don't have to do it while training

In [8]:
def normalize_text(text):
    """Lowercase and remove quotes from a TensorFlow string."""
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text,"'(.*)'", r"\1")
    return text


def map_func(features):
    article_text = normalize_text(features["article"])
    highlights_text = normalize_text(features['highlights'])
    
    return article_text.numpy().decode('UTF-8'), highlights_text.numpy().decode('UTF-8')
        

In [9]:
def get_embedding_of_article(article, i):
    list_embedding = []
    
    sentences = split_single(article)
    for j, sentence in enumerate(sentences):
        # cuts to long sentences of 
        if len(sentence) > 750:
            sentence = sentence[:750]
            
        if len(sentence) > 1:
            sent = Sentence(sentence)

            albert_embedding.embed(sent)
            x = sent.get_embedding()
            x = x.to('cpu').detach().numpy()
            list_embedding.append(x)
    return list_embedding

In [10]:
def embedd_ds(ds):
    new_ds = []
    for i, item in enumerate(ds):
        article, higlights = map_func(item)
        x = get_embedding_of_article(article, i)
        new_ds.append({"article": x, "article_text": article, "highlights": higlights})
    return new_ds

In [8]:
def save_file(data, filename):
    with open(filename, "wb") as f:
        pickle.dump(data, f)

        
def load_file(filename):  
    with open(filename, "rb") as f:
        return pickle.load(f)

In [9]:
test_ds = embedd_ds(test_tds)
save_file(test_ds, "test.pkl")

val_ds = embedd_ds(val_tds)
save_file(val_ds, "val.pkl")

NameError: name 'embedd_ds' is not defined

In [10]:
# train_ds = embedd_ds(train_tds)
# save_file(train_ds, "train.pkl")

In [20]:
train_raw = load_file("val.pkl")
test_raw = load_file("test.pkl")
print(len(train), train[0])

13368 [list([array([ 0.00387947, -0.6366544 ,  0.21970668, ...,  0.30173776,
        0.43474218,  0.42230588], dtype=float32), array([-0.23565385, -0.659557  ,  0.55651265, ...,  0.33840343,
        0.25438353,  0.26119417], dtype=float32), array([-0.31036785, -0.07266833,  0.42479414, ...,  0.11688844,
        0.4397149 ,  0.7277298 ], dtype=float32), array([ 0.09201267, -0.44499242,  0.3602318 , ...,  0.15546696,
        0.27280316,  0.11705698], dtype=float32), array([-0.2923365 , -0.36915168,  0.52738214, ...,  0.0649092 ,
        0.25962955,  0.14565222], dtype=float32), array([ 0.0709545 , -0.17259236,  0.8320757 , ..., -0.06098118,
        0.24208799,  0.09772751], dtype=float32), array([-0.11589033,  0.35133335,  0.81095463, ...,  0.03234402,
        0.34122986,  0.13773124], dtype=float32), array([ 0.0746538 , -0.13619456,  0.7168364 , ...,  0.0407102 ,
        0.2888261 ,  0.3364408 ], dtype=float32), array([-0.14699861, -0.35566977,  0.88771176, ...,  0.03221212,
        0.0

In [43]:
def convert_to_np(ds):
    list_ds = []
    for item in ds:
        x = [np.array(item['article']), item['article_text'], item['highlights']]
        print(x)
        list_ds.append(np.array(x))
        sp_model = spm.SentencePieceProcessor()
        vocab = {sp_model.IdToPiece(i): i for i
                    in range(sp_model.GetPieceSize())}
        print(vocab)
        break
    return np.array(list_ds)

train = convert_to_np(train_raw)
print(train.shape)

[array([[ 0.00387947, -0.6366544 ,  0.21970668, ...,  0.30173776,
         0.43474218,  0.42230588],
       [-0.23565385, -0.659557  ,  0.55651265, ...,  0.33840343,
         0.25438353,  0.26119417],
       [-0.31036785, -0.07266833,  0.42479414, ...,  0.11688844,
         0.4397149 ,  0.7277298 ],
       ...,
       [ 0.21558289, -0.78932345,  0.49542183, ..., -0.11635831,
         0.3304542 ,  0.5526388 ],
       [-0.13048227, -0.4027023 ,  0.57894194, ...,  0.03963578,
         0.40740883,  0.912753  ],
       [-0.06654014, -0.3630797 ,  0.8600646 , ...,  0.18303427,
         0.1893257 ,  0.3632384 ]], dtype=float32), "sally forrest, an actress-dancer who graced the silver screen throughout the 40s and '50s in mgm musicals and films such as the 1956 noir while the city sleeps died on march 15 at her home in beverly hills, california. forrest, whose birth name was katherine feeney, was 86 and had long battled cancer. her publicist, judith goffin, announced the news thursday. scroll 

In [None]:
test = convert_to_np(test_raw)

In [None]:
print(train[:,0])

In [None]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, ds, transform=None):
        self.x_embed = torch.from_numpy(ds[:,0])
        self.x_text = torch.from_numpy(ds[:,1])
        self.y = torch.from_numpy(ds[:,2])
        self.transform = transform

    def __getitem__(self, index):
        x_embed = self.x_embed[index]
        x_text = self.x_text[index]
        y = self.y[index]
        

        if self.transform:
            x_embed = self.transform(x_embed)
            x_text = self.transform(x_text)
            
        return x_embed, x_text, y

    def __len__(self):
        return len(self.x_embed)

In [None]:
train = MyDataset(train)
test = MyDataset(test)

In [None]:
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
!pip3 install sentencepiece


In [33]:
print(train[:,0])

[array([[ 0.00387947, -0.6366544 ,  0.21970668, ...,  0.30173776,
         0.43474218,  0.42230588],
       [-0.23565385, -0.659557  ,  0.55651265, ...,  0.33840343,
         0.25438353,  0.26119417],
       [-0.31036785, -0.07266833,  0.42479414, ...,  0.11688844,
         0.4397149 ,  0.7277298 ],
       ...,
       [ 0.21558289, -0.78932345,  0.49542183, ..., -0.11635831,
         0.3304542 ,  0.5526388 ],
       [-0.13048227, -0.4027023 ,  0.57894194, ...,  0.03963578,
         0.40740883,  0.912753  ],
       [-0.06654014, -0.3630797 ,  0.8600646 , ...,  0.18303427,
         0.1893257 ,  0.3632384 ]], dtype=float32)]


In [34]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, ds, transform=None):
        self.x_embed = torch.from_numpy(ds[:,0])
        self.x_text = torch.from_numpy(ds[:,1])
        self.y = torch.from_numpy(ds[:,2])
        self.transform = transform

    def __getitem__(self, index):
        x_embed = self.x_embed[index]
        x_text = self.x_text[index]
        y = self.y[index]
        

        if self.transform:
            x_embed = self.transform(x_embed)
            x_text = self.transform(x_text)
            
        return x_embed, x_text, y

    def __len__(self):
        return len(self.x_embed)

In [35]:
train = MyDataset(train)
test = MyDataset(test)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.

In [None]:
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [37]:
!pip3 install sentencepiece


