In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from albert.my_sentence_piecer import MySentencePiecer
from albert.albert_pre import AlbertPre
from albert.tf_to_csv import TfToCsv

In [5]:
# Report the active CUDA device as (index, name). Guarded so the cell does not
# raise on CPU-only machines, in line with the CPU fallback used for `device`
# in the params cell; in that case the value is None and nothing is displayed.
cuda_info = (
    (torch.cuda.current_device(), torch.cuda.get_device_name())
    if torch.cuda.is_available()
    else None
)
cuda_info

(0, 'GeForce RTX 2080 Ti')

# Params

In [6]:
# Sequence-size limits. MAX_WORD_N is the length that highlight id sequences
# are truncated / zero-padded to in MyDataset. The other two limits are not
# used in the cells visible here (presumably a sentence-count cap and a
# per-sentence word cap -- TODO confirm).
MAX_SENT_N = 30
MAX_WORD_N = 150
MAX_WORD_SENT_N = 300

# Mini-batch size.
# NOTE(review): the data-loader cell further down reassigns BATCHSIZE to 10,
# so that later value is the one the loaders actually use.
BATCHSIZE = 20

# Use the first GPU when CUDA is present, otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Data Processing

## Sentence Piecer

In [2]:
# Build the project's sentence-piece wrapper with a 10k-entry vocabulary.
# NOTE(review): force_update=True presumably rebuilds the vocabulary/model on
# every run instead of reusing a cached one -- confirm in MySentencePiecer.
sentence_piecer = MySentencePiecer(
    vocab_size=10000,
    force_update=True,
)

In [4]:
# Peek at the first 20 vocabulary entries.
print(sentence_piecer.vocab_list[:20])

# Round-trip a sample string: text -> ids -> text.
test = "hallo, i'm leaving. this is another sentences."
tokens = sentence_piecer.get_ids_from_vocab(test)

# One id list line followed by one decoded-text line.
print(tokens, sentence_piecer.get_real_text_from_ids(tokens), sep="\n")

['<unk>', '<s>', '</s>', '▁the', ',', '.', '▁to', '▁a', 's', '▁of', '▁and', '▁in', '▁.', "'", '▁was', '▁for', '-', '▁on', '▁is', '▁that']
[1429, 292, 4, 47, 13, 108, 851, 5, 52, 18, 195, 5435, 5, 2]
 hallo, i'm leaving. this is another sentences.</s>


In [10]:
# Project preprocessing helper; its load_np_files() is what load_torch_dataset
# uses further down to fetch the arrays for a named split.
albert_pre = AlbertPre()


## Dataset

In [11]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(x, y_n, y)`` triples.

    * ``x``   -- one article converted to a float tensor (length left as is),
    * ``y_n`` -- the matching entry of ``n_highlights`` as a long tensor,
    * ``y``   -- the highlight ids as a long tensor, truncated / zero-padded
      to exactly ``MAX_WORD_N`` entries.

    Args:
        article: Sequence of per-sample article arrays/lists.
        n_highlights: Per-sample values, converted to one long tensor.
        highlights: Sequence of per-sample 1-D id arrays/lists.
        transform: Accepted for interface compatibility; currently unused.
    """

    def __init__(self, article, n_highlights, highlights, transform=None):
        # Articles keep their own length; only the highlights are padded.
        self.x = self.to_tensor_list(article, dtype=torch.float)

        self.y_n = torch.tensor(n_highlights, dtype=torch.long)
        self.y = self.to_tensor_list(highlights, dtype=torch.long, pad=MAX_WORD_N)

    def __getitem__(self, index):
        """Return the ``(x, y_n, y)`` triple stored at ``index``."""
        return self.x[index], self.y_n[index], self.y[index]

    @staticmethod
    def to_tensor_list(x, dtype, pad=None):
        """Convert every element of ``x`` to a tensor of type ``dtype``.

        Args:
            x: Iterable of sequences (lists or arrays).
            dtype: Torch dtype of the resulting tensors.
            pad: When ``None`` the elements are converted unchanged.
                Otherwise each (1-D) element is cut to at most ``pad``
                entries and right-padded with zeros to exactly ``pad``.

        Returns:
            list[torch.Tensor]: One tensor per element of ``x``.
        """
        if pad is None:
            return [torch.tensor(x_i, dtype=dtype) for x_i in x]

        tensor_list = []
        for x_i in x:
            # Truncate to `pad` itself (not a global constant) so the filler
            # size below can never become negative, and measure the length on
            # the converted tensor so plain lists work as well as arrays.
            clipped = torch.tensor(x_i[:pad], dtype=dtype)
            filler = torch.zeros(pad - clipped.shape[0], dtype=dtype)
            tensor_list.append(torch.cat((clipped, filler)))
        return tensor_list

    def __len__(self):
        """Number of samples (one per article)."""
        return len(self.x)

In [12]:
def load_torch_dataset(name):
    """Load the arrays for split ``name`` and wrap them in a ``MyDataset``.

    The three values returned by ``albert_pre.load_np_files(name)`` are passed
    positionally to ``MyDataset`` as (article, n_highlights, highlights).
    """
    article, n_highlights, highlights = albert_pre.load_np_files(name)
    return MyDataset(article, n_highlights, highlights)

# Materialise the datasets used by the loaders below.
# NOTE(review): train_ds is built from the "val" split, not a "train" split --
# confirm this is intentional (e.g. a smaller set for quick experiments).
test_ds = load_torch_dataset(name="test")
train_ds = load_torch_dataset(name="val")

FileNotFoundError: [Errno 2] No such file or directory: '../data/test/n_articles.npy'

In [None]:
# NOTE(review): this reassigns the BATCHSIZE defined in the params cell (20);
# the value set here is the one both loaders use.
BATCHSIZE = 10

# Batch both datasets; no DataLoader option other than the batch size is set.
train_loader = torch.utils.data.DataLoader(dataset=train_ds, batch_size=BATCHSIZE)
test_loader = torch.utils.data.DataLoader(dataset=test_ds, batch_size=BATCHSIZE)

# My Model