In [1]:
import torch as th
import torch.nn as nn
import random
import os
import sys
import nltk
from collections import Counter
import re
import numpy as np

In [86]:
#nltk.download('punkt')

In [32]:
class Model(nn.Module):
    def __init__(self, nb_cells, hidden_size, vocab_size, embeddings_dim): 
        super(Model, self).__init__()
        self.gru = nn.GRU(embeddings_dim, hidden_size, nb_cells)
        self.embeddings = nn.Embedding(vocab_size, embeddings_dim)
        self.hidden_size = hidden_size
        self.nb_cells = nb_cells
        
    def forward(self, x, hidden):
        embeds = self.embeddings(x)
        gru_out, hidden = self.gru(embeds.view(1,1,embeds.size()[0]), hidden)
        gru_out = gru_out.contiguous().view(-1, self.hidden_size)
        
        return gru_out, hidden
        
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.nb_cells, batch_size, self.hidden_size).zero_()
        return hidden

In [105]:
## rap data
directory = "rap/Jul"
all = ""
for album in os.listdir(directory):
    dir_album = "{}/{}".format(directory, album)
    for son in os.listdir(dir_album):
        adr = "{}/{}".format(dir_album,son)
        with open(adr, 'r') as f:
            data = f.read()
            decoded_data = data.decode('utf8')
            all+=decoded_data
            
all = all.replace("\x92", "'")
all = all.replace("\x9c", "oe")
all = all.replace("\r", "")


AttributeError: 'str' object has no attribute 'decode'

In [3]:
##american dad data
directory = "scripts_american_dad"
all = ""
for saison in os.listdir(directory):
    dir_saison = "{}/{}".format(directory, saison)
    for ep in os.listdir(dir_saison):
        adr = "{}/{}".format(dir_saison,ep)
        with open(adr, 'r') as f:
            data = f.read()
            all+=data
            

In [4]:
sentences = all.split("\n")
print(sentences[0])
print("nombre de phrases : {}".format(len(sentences)))
sentences = [nltk.word_tokenize(s.lower()) for s in sentences]
sentences = [s for s in sentences if len(s)>0]


Shut up, Steve.
nombre de phrases : 119508


In [5]:
token = nltk.word_tokenize(all.lower())
words = Counter(token)
words = sorted(words, key=words.get, reverse=True)

In [8]:
# print("longueur avant filtrage : %s" %len(words))
# word_freq = 1
# words = {k:v for k,v in words.items() if v>word_freq}
# print("longueur après filtrage : %s" %len(words))

longueur avant filtrage : 29933
longueur après filtrage : 15990


In [6]:
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

In [7]:
def sentence_to_idx(sentence):
    return [word2idx[w] if w in word2idx else word2idx['unk'] for w in sentence]

In [33]:
###Hyper paramètres
vocab_size = len(word2idx)
embedding_dim = 50
hidden_dim = 512
nb_cells = 2
batch_size = 1

model = Model(nb_cells,hidden_dim, vocab_size, embedding_dim)

lr=0.005
epochs = range(10)
loss_fn = nn.CrossEntropyLoss()
optimizer = th.optim.SGD(model.parameters(), lr=lr)

In [64]:
ids = list(range(len(sentences)))
###Apprentissage
for epoch in epochs:
    total=th.zeros([])
    tx=0
    random.shuffle(ids)
    h = model.init_hidden(1)
    cpt=0
    for i in ids:
        h = h.data
        s = []
        idx = sentence_to_idx(sentences[i])
        in_txt = idx[:-1]
        out_txt = idx[1:]
        tensor = th.LongTensor(out_txt)
        y = model.embeddings(tensor)
        print(y.size())
        for w in in_txt:
            x = th.tensor(w)
            p, h = model.forward(x, h)
            print(p.size(), h.size())
            s.append(p)
        print(len(s))
        print(s[0].size())
        pred = th.LongTensor(s)
        print("pred :",s.size())  
        print("y :", y.size())
        err = loss_fn(pred, y)
        optimizer.zero_grad()
        err.backward()        
        optimizer.step()
        total+=err

    print("epoch : ", epoch, ", loss: ", total)
    print('\n')

torch.Size([5, 50])
torch.Size([1, 512]) torch.Size([2, 1, 512])
torch.Size([1, 512]) torch.Size([2, 1, 512])
torch.Size([1, 512]) torch.Size([2, 1, 512])
torch.Size([1, 512]) torch.Size([2, 1, 512])
torch.Size([1, 512]) torch.Size([2, 1, 512])
5
torch.Size([1, 512])


ValueError: only one element tensors can be converted to Python scalars

In [35]:
ids = list(range(len(sentences)))
###Apprentissage
for epoch in epochs:
    total=th.zeros([])
    tx=0
    random.shuffle(ids)
    h = model.init_hidden()
    print(h.size())
    cpt=0
    for i in ids:
        h = h.data
        idx = sentence_to_idx(sentences[i])
        in_txt = idx
        out_txt = np.zeros_like(in_txt)
        out_txt[:-1] = in_txt[1:]
        out_txt[-1] = in_txt[0]
        in_txt = np.reshape(in_txt, (batch_size, -1))
        out_txt = np.reshape(out_txt, (batch_size, -1))
        
        x = th.LongTensor(in_txt)
        y = th.LongTensor(out_txt)
        p, h = model.forward(x, h)

        err = loss_fn(p, y)
        optimizer.zero_grad()
        err.backward()        
        optimizer.step()
        total+=err

    print("epoch : ", epoch, ", loss: ", total)
    print('\n')

torch.Size([2, 4, 512])
torch.Size([1, 4, 50])


ValueError: Expected input batch_size (4) to match target batch_size (1).

In [19]:
### input size 10 hidden size 20 nb cell 2
rnn = nn.GRU(10, 20, 2)
input = th.randn(1, 3, 10)
h0 = th.randn(2, 3, 20)
print("h0 : ", h0.size())
print("input : ",input.size())
output, hn = rnn(input, h0)

h0 :  torch.Size([2, 3, 20])
input :  torch.Size([1, 3, 10])


In [47]:
input = th.randn(50)
print(input[:])

tensor([ 0.3331, -0.7394, -1.1379, -0.6926,  0.2663,  0.6980, -1.3158, -1.8853,
        -0.3098,  0.7575,  2.1238,  0.5130,  0.3034,  1.7428,  0.0222, -0.2406,
        -1.0988, -0.4314, -0.7167, -0.4764, -0.0560, -0.3233,  0.9115, -0.2394,
        -1.3160,  1.2722, -0.1060,  0.3115,  0.7415, -0.7334, -1.1028, -0.4981,
        -0.1093, -1.2869, -1.0937,  0.6607, -0.7285, -0.4215,  0.2379, -0.1592,
         0.2644, -0.1123,  0.3458,  1.0233,  1.1965,  1.5102,  0.7478,  0.7269,
        -1.1283,  0.4978])


In [24]:
input.size()

torch.Size([50])

In [27]:
input.view(1,1,input.size()[0])

tensor([[[-0.7128,  0.7848, -0.3253,  1.9292, -2.1563, -0.2519,  0.0155,
           1.3889, -1.1436, -0.0023, -0.7813,  1.1168, -0.4223, -1.1302,
          -0.5185, -1.2412, -1.2712,  1.5612, -0.1843,  0.3255,  0.0489,
           0.5452, -0.3858,  1.4748, -1.2805, -1.7520,  0.5905, -1.8502,
           0.6860, -0.9579, -0.5538,  0.5373,  0.2402, -1.0111, -1.3893,
           0.7044,  0.4097,  1.6349, -0.2031,  0.4407, -1.0977, -1.1073,
           2.0980, -0.3602, -0.9485,  0.0915,  0.4984,  0.3607, -0.5489,
          -0.9003]]])

In [31]:
input.resize()

RuntimeError: requested resize to -1 (-1 elements in total), but the given tensor has a size of 50 (50 elements). autograd's resize can only change the shape of a given tensor, while preserving the number of elements. 