In [2]:
import torch
import gzip
import sentencepiece as spm
from collections import namedtuple

from torch.utils.data import Dataset, DataLoader

In [11]:
class CNN(torch.nn.Module):

    def __init__(self, voc_size, in_channels, out_channels, kernel_size=3, stride=1):
        super(CNN, self).__init__()

        self.embeds = torch.nn.Embedding(voc_size, in_channels) 

        self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size, stride)
        self.fonc = torch.nn.ReLU()
        self.linear = torch.nn.Linear(out_channels, 2)


    def forward(self,seq):
        x = self.embeds(seq)
        x = x.permute((0,2,1))

        x = self.fonc(self.conv(x))

        x = torch.max(x, dim=2)[0].squeeze()

        return self.linear(x)

In [22]:
Batch = namedtuple("Batch", ["text", "labels"])

class TextDataset(torch.utils.data.Dataset):
    
    def __init__(self, text: torch.LongTensor, sizes: torch.LongTensor, labels: torch.LongTensor):
        self.text = text
        self.sizes = sizes
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index: int):
        return self.text[self.sizes[index]:self.sizes[index+1]], self.labels[index].item()

    @staticmethod
    def collate(batch):
        data = [item[0] for item in batch]
        labels = [item[1] for item in batch]
        return Batch(torch.nn.utils.rnn.pad_sequence(data, batch_first=True), torch.LongTensor(labels))

In [23]:
with gzip.open('train-1000.pth') as fp:
    train_ds = torch.load(fp)

In [24]:
train = DataLoader(train_ds, batch_size=100, collate_fn=TextDataset.collate)

In [25]:
cnn = CNN(1000,50,10)

In [32]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)

In [34]:
running_loss = 0.0
for i, (inputs, labels) in enumerate(train):
    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = cnn(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # print statistics
    running_loss += loss.item()
    if i % 200 == 199:    # print every 2000 mini-batches
        print('[%d, %5d] loss: %f' %
              (1 + 1, i + 1, running_loss / 2000))
        running_loss = 0.0

[2,   200] loss: 0.000011
[2,   400] loss: 0.000009
[2,   600] loss: 0.000009
[2,   800] loss: 0.000007
[2,  1000] loss: 0.000007
[2,  1200] loss: 0.000005
[2,  1400] loss: 0.000005
[2,  1600] loss: 0.000005
[2,  1800] loss: 0.000004
[2,  2000] loss: 0.000004
[2,  2200] loss: 0.000008


KeyboardInterrupt: 

In [19]:
train

<torch.utils.data.dataloader.DataLoader at 0x7f2fc0267e90>

In [218]:
t = next(iter(train))

In [224]:
embeds = torch.nn.Embedding(1000, 50) 
t[0]

tensor([[ 56, 453,   8,  ...,   0,   0,   0],
        [ 27,  70,   3,  ...,   0,   0,   0],
        [  4, 406,  91,  ...,   0,   0,   0],
        ...,
        [657,   8,   4,  ...,   0,   0,   0],
        [863, 166,  14,  ...,   0,   0,   0],
        [ 56, 287, 186,  ...,   0,   0,   0]])

In [226]:
batch = t[0]
embatch = embeds(batch)

In [236]:
ex = torch.tensor([[4, 161], [20, 95]])

In [238]:
ex
emex = embeds(ex)

In [239]:
emex

tensor([[[ 5.5194e-02, -3.0971e-02, -2.5020e-02,  7.2886e-02, -6.4929e-01,
           2.4296e-01, -9.8907e-01, -4.4139e-01,  2.1070e+00,  1.4871e+00,
          -3.5173e-01,  1.2579e+00, -8.9647e-01,  3.8556e-01, -6.8871e-01,
           1.8117e+00,  5.5743e-01,  1.3918e+00, -1.6131e-01,  1.3127e-01,
          -4.2685e-01, -4.8727e-01,  1.3638e+00,  2.5317e+00, -5.1366e-01,
           1.6517e+00, -1.0289e+00,  1.1382e+00,  4.8713e-01,  3.4250e-01,
          -5.8573e-01,  1.8584e+00, -1.2124e+00, -4.2926e-01,  2.9952e-01,
           7.5592e-01,  1.5582e-01,  3.7930e-02, -7.5514e-01, -6.1178e-01,
           1.1400e+00,  1.4572e+00,  1.6356e+00, -1.0104e-01, -8.2862e-01,
           1.6281e+00,  5.4878e-01,  7.8644e-02,  1.7108e-01,  1.0995e+00],
         [ 7.7085e-01,  4.8239e-01,  1.7930e-01, -8.0880e-02,  1.3383e+00,
           6.9776e-01,  1.2570e+00,  9.3019e-01, -4.9104e-01,  1.2738e-01,
           2.2883e-01, -2.5553e-01, -1.2491e+00,  5.6594e-01, -2.5229e-01,
           7.3076e-01, -

In [243]:
emex.permute([0,2,1])[0]

tensor([[ 0.0552,  0.7709],
        [-0.0310,  0.4824],
        [-0.0250,  0.1793],
        [ 0.0729, -0.0809],
        [-0.6493,  1.3383],
        [ 0.2430,  0.6978],
        [-0.9891,  1.2570],
        [-0.4414,  0.9302],
        [ 2.1070, -0.4910],
        [ 1.4871,  0.1274],
        [-0.3517,  0.2288],
        [ 1.2579, -0.2555],
        [-0.8965, -1.2491],
        [ 0.3856,  0.5659],
        [-0.6887, -0.2523],
        [ 1.8117,  0.7308],
        [ 0.5574, -0.3173],
        [ 1.3918, -2.4146],
        [-0.1613, -0.5281],
        [ 0.1313, -1.0966],
        [-0.4268, -0.3939],
        [-0.4873, -0.8564],
        [ 1.3638, -1.2011],
        [ 2.5317, -0.1060],
        [-0.5137,  2.2496],
        [ 1.6517, -0.1300],
        [-1.0289, -0.8259],
        [ 1.1382, -1.8955],
        [ 0.4871, -0.5330],
        [ 0.3425, -3.1336],
        [-0.5857,  0.8933],
        [ 1.8584, -0.9831],
        [-1.2124,  0.8005],
        [-0.4293, -1.4190],
        [ 0.2995, -0.9463],
        [ 0.7559,  0

In [241]:
emex.shape

torch.Size([2, 2, 50])

In [178]:
next(it)

Batch(text=tensor([[ 17, 456,  63,  ...,   0,   0,   0],
        [  4,  26, 395,  ...,   0,   0,   0],
        [174,  28, 289,  ...,   0,   0,   0],
        ...,
        [ 27,  25, 829,  ...,   0,   0,   0],
        [ 21,  36, 481,  ...,   0,   0,   0],
        [  4, 900,  43,  ...,   0,   0,   0]]), labels=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]))

In [165]:
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(f"wp1000.model")

True

In [235]:
print(tokenizer.encode_as_pieces("I am you are"))
print(tokenizer.encode_as_ids("I am you are"))
print(tokenizer.decode_ids([4, 115]))

exemple = train_ds[1447652][0].tolist()
print(tokenizer.decode_ids(exemple))

['▁I', '▁am', '▁you', '▁are']
[4, 161, 20, 95]
I love
Thriving Ivory s song quot;Twilight quot; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; gt; the other Twilight


In [187]:
train_ds[1447652]

(tensor([620, 125,  91,   9,   4,  91,  74,  13,  10, 562, 101,  44, 120,  53,
         106, 231, 101,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,
          16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44,
         138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,
          44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,
          16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44,
         138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,
          44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,
          16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44,
         138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,  44, 138,  16,
          44, 138,  16,  44, 138,  16,  44,   7, 571,  88,  53, 106, 231]), 1)

In [213]:
len(train_ds)

1600000

In [179]:
t = [len(x[0]) for x in train_ds]

In [185]:
t.index(153)

1447652

In [52]:
print(tokenizer.decode_ids([0]))


 ⁇ 


In [54]:
#TextDataset.collate(153)



TypeError: 'int' object is not iterable

In [46]:

class CNN(torch.nn.Module):
    
    def __init__(self):
        self(CNN, self).__init__()
        
        self.conv1(,)
        
        
        
    def forward(self):
        pass

CNN

Conv 1D
max pool

Conv 1D

(taille de fenetre diff)