<a href="https://colab.research.google.com/github/vicentcamison/idal_ia3/blob/main/3%20Aprendizaje%20profundo%20(II)/Sesion%206/8_Text_generation_jokes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Text generation in PyTorch using LSTMs

This example tries to use LSTMs in seq2seq scheme to predict next words in a joke, given a seed string.

## Main imports

In [5]:
import torch
from torch import nn, optim
import numpy as np
from torch.utils.data import DataLoader, Dataset

#imports de Juan
import pandas as pd
import collections

## Device selection

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Data

We will use Reddit clean jokes dataset to train the network. Let us create a function to create a Dataset (for help on custom PyTorch Datasets check [this link](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#dataset-class))

In [8]:
#El único argumento a pasar a Dataset al instanciarla será la longitud de secuencia: sequence_length
class Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        sequence_length
    ):
        self.sequence_length = sequence_length
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    #Carga el dataset y crea una lista con todas las palabras que contiene. Deberemos leer la columna
    #‘Joke’ del archivo como un data frame, concatenar las diferentes filas (chistes) pasadas a string mediante espacios ‘ ’
    #y finalmente aplicar un .split(‘ ‘) a la cadena para tokenizarla.
    def load_words(self):
        train_df = pd.read_csv('https://raw.githubusercontent.com/vicentcamison/idal_ia3/main/3%20Aprendizaje%20profundo%20(II)/Sesion%206/reddit-cleanjokes.csv')
        text = ' '.join(train_df.Joke)
        return text.split(' ')

    #contar las palabras únicas y ordenarlas en orden descendente de frecuencia.
    def get_uniq_words(self):
        word_counts = collections.Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )



In [9]:
sequence_length=4
dataset=Dataset(sequence_length=sequence_length)
print(len(dataset))
print(dataset[0])
print(dataset[1])
print(len(dataset.get_uniq_words()))

23910
(tensor([  2,   8,   0, 248]), tensor([  8,   0, 248,  20]))
(tensor([  8,   0, 248,  20]), tensor([  0, 248,  20,   4]))
6925


# Model

Define a model based on LSTM with default input_size=128, hidden_size=128, num_layers=3

In [10]:
class Model(nn.Module):
    def __init__(self, dataset):
        super(Model, self).__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 1

        n_vocab = len(dataset.get_uniq_words()) #número de palabras diferentes del dataset
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
            bidirectional=False
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)                          

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size),   # for bilstm: num_layers * 2 
                torch.zeros(self.num_layers, sequence_length, self.lstm_size))   # for bilstm: num_layers * 2

model=Model(dataset).to(device)
print(model)        


  "num_layers={}".format(dropout, num_layers))


Model(
  (embedding): Embedding(6925, 128)
  (lstm): LSTM(128, 128, dropout=0.2)
  (fc): Linear(in_features=128, out_features=6925, bias=True)
)


## Training

In [13]:
batch_size = 256
max_epochs = 20

model.train()

dataloader = DataLoader(dataset=Dataset(sequence_length=sequence_length), batch_size=256)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lr=0.001, params=model.parameters())

for epoch in range(max_epochs):
  state_h, state_c = model.init_state(sequence_length=sequence_length)

  for batch, (x, y) in enumerate(dataloader):
    x, y = x.to(device), y.to(device)
    optimizer.zero_grad()
    y_pred, _ = model(x, (state_h.to(device), state_c.to(device)))
    loss = criterion(y_pred.transpose(1, 2), y)
    loss.backward()
    optimizer.step()

    print({ 'epoch': epoch+1, 'batch': batch+1, 'loss': loss.item() })

{'epoch': 1, 'batch': 1, 'loss': 8.837660789489746}
{'epoch': 1, 'batch': 2, 'loss': 8.836078643798828}
{'epoch': 1, 'batch': 3, 'loss': 8.825660705566406}
{'epoch': 1, 'batch': 4, 'loss': 8.82593822479248}
{'epoch': 1, 'batch': 5, 'loss': 8.820954322814941}
{'epoch': 1, 'batch': 6, 'loss': 8.807294845581055}
{'epoch': 1, 'batch': 7, 'loss': 8.82420825958252}
{'epoch': 1, 'batch': 8, 'loss': 8.80030632019043}
{'epoch': 1, 'batch': 9, 'loss': 8.789654731750488}
{'epoch': 1, 'batch': 10, 'loss': 8.788488388061523}
{'epoch': 1, 'batch': 11, 'loss': 8.787675857543945}
{'epoch': 1, 'batch': 12, 'loss': 8.756314277648926}
{'epoch': 1, 'batch': 13, 'loss': 8.780217170715332}
{'epoch': 1, 'batch': 14, 'loss': 8.78335952758789}
{'epoch': 1, 'batch': 15, 'loss': 8.722908973693848}
{'epoch': 1, 'batch': 16, 'loss': 8.73049545288086}
{'epoch': 1, 'batch': 17, 'loss': 8.696094512939453}
{'epoch': 1, 'batch': 18, 'loss': 8.64804458618164}
{'epoch': 1, 'batch': 19, 'loss': 8.670186042785645}
{'epoch'

## Predict

In [14]:
def sample_prediction(preds, temperature=1.0):
  preds = np.asarray(preds).astype('float64')
  preds = np.log(preds) / temperature
  exp_preds = np.exp(preds)
  preds = exp_preds / np.sum(exp_preds)
  probas = np.random.multinomial(1, preds, 1)
  return np.argmax(probas)

In [15]:
next_words=100
model.eval()

text='Knock knock. Whos there?'
words = text.split(' ')
state_h, state_c = model.init_state(len(words))

for i in range(0, next_words):
  x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]])
  y_pred, (state_h, state_c) = model(x.to(device), (state_h.to(device), state_c.to(device)))

  last_word_logits = y_pred[0][-1]
  p = torch.nn.functional.softmax(last_word_logits, dim=0).cpu().detach().numpy()
  #word_index = np.random.choice(len(last_word_logits), p=p)
  word_index = sample_prediction(p, temperature=0.3)
  words.append(dataset.index_to_word[word_index])

print(words)

['Knock', 'knock.', 'Whos', 'there?', 'Interrupting', 'cow.', 'Interrup........', 'MOOOOOOOOOOOOOOOO!!!!', '[Works', 'best', 'IRL](/spoiler)', 'The', 'other', 'day,', 'I', 'was', 'a', 'little', 'crabby!', 'Did', 'you', 'hear', 'about', 'the', 'other', 'bride.', 'Why', 'did', 'the', 'chicken', 'cross', 'the', 'road?', 'To', 'get', 'to', 'the', 'other?', 'Hey,', "it's", 'not', 'a', 'joke', 'about', 'pizza?', 'Never', 'mind,', "it's", 'not', 'going', 'to', 'be', 'a', 'joke', 'about', 'pizza?', 'Never', 'mind,', "it's", 'not', 'a', 'joke', 'about', 'sodium.', 'But', 'it', 'turned', 'out', 'of', 'a', 'pyramid', 'scheme.', 'I', 'just', 'made', 'up', 'and', 'a', 'joke', 'about', 'pizza?', 'Never', 'mind,', "it's", 'a', 'lot', 'of', 'hops', 'What', 'did', 'the', 'fish', 'say', 'when', 'it', 'hit', 'the', 'wall?', 'Dam', '16', 'sodium', 'atoms', 'walk', 'into']
