### Práctica 8. Modelos de lenguaje neuronales

#### Actividades

- Construir un modelo del lenguaje neuronal a partir de un corpus en español
    - Corpus: El Quijote. URL: https://www.gutenberg.org/ebooks/2000
        - NOTA: Considera los recursos de cómputo. Recuerda que en la práctica utilizamos ~50k oraciones
    - Modelo de trigramas con n = 3
    - Incluye información sobre el setup de entrenamiento:
        - Dimensión de embeddings
        - Dimensión de capa oculta
        - Cantidad de oraciones para entrenamiento
        - Batch size y context size
    - Incluye la liga de drive de tu modelo

- Imprima en pantalla tres ejemplos de generación de texto
    - Proponga mejoras en las estrategias de generación de texto vistas en la práctica
    - Describa en qué consiste la estrategia propuesta
    - Compare la estrategia de la práctica y su propuesta

- Visualizar en 2D los vectores de las palabras más comunes (excluir STOP WORDS)

In [None]:
"""
Install the required packages
"""

!pip install nltk numpy torch

In [None]:
import re
import numpy as np
import nltk

from nltk.corpus import reuters
from nltk import ngrams
from collections import Counter, defaultdict

# Download the NLTK data packages this notebook requests: the Reuters corpus
# (read further down through `reuters.sents()`) and the "punkt" tokenizer data
# (presumably needed by the corpus reader to split sentences — TODO confirm).
nltk.download("reuters")
nltk.download("punkt")


def preprocess_corpus(corpus: list[list[str]]) -> list[list[str]]:
    """Lowercase every token and wrap each sentence in boundary markers.

    Args:
        corpus: Tokenized sentences; each sentence is an iterable of word
            strings.

    Returns:
        A new list with one token list per input sentence, laid out as
        ``"<BOS>"``, the lowercased words in their original order, and
        ``"<EOS>"``. The input sentences are not modified.
    """
    # Build each sentence in one go rather than append + insert(0, ...),
    # which shifts the whole list for every sentence.
    return [["<BOS>", *(word.lower() for word in sent), "<EOS>"] for sent in corpus]

def get_words_freqs(corpus: list[list[str]]) -> dict[str, int]:
    """Count how many times each token occurs in the corpus.

    Args:
        corpus: Tokenized sentences (an iterable of iterables of tokens).

    Returns:
        A plain ``dict`` mapping each token to its number of occurrences,
        with keys in first-seen order.
    """
    # Counter does the tallying; converting back to dict keeps the original
    # behaviour of raising KeyError on unknown tokens.
    return dict(Counter(word for sentence in corpus for word in sentence))

UNK_LABEL = "<UNK>"


def get_words_indexes(words_freqs: dict) -> tuple[dict, dict]:
    """Build the vocabulary lookup tables, folding once-seen words into ``<UNK>``.

    Words whose frequency is exactly 1 are all represented by the single
    ``UNK_LABEL`` entry; every other word keeps its own entry. Indexes are
    contiguous (``0 .. len(vocabulary) - 1``) and follow the iteration order
    of ``words_freqs``, with ``UNK_LABEL`` placed where the first once-seen
    word appears.

    Args:
        words_freqs: Mapping of word -> number of occurrences.

    Returns:
        ``(word_to_index, index_to_word)``: two dicts that are inverses of
        each other. ``UNK_LABEL`` is always present; when no word has
        frequency 1 it is appended at the end so unknown-word lookups still
        have an index to fall back on.
    """
    # A dict is used as an insertion-ordered set; the values are irrelevant.
    vocabulary = {}
    for word, freq in words_freqs.items():
        vocabulary[UNK_LABEL if freq == 1 else word] = None
    # Guarantee the unknown-word entry even for corpora without rare words.
    vocabulary.setdefault(UNK_LABEL, None)

    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
    index_to_word = {idx: word for word, idx in word_to_index.items()}
    return word_to_index, index_to_word

# Reuters sentences, lowercased and wrapped in <BOS>/<EOS> markers.
# NOTE(review): the assignment text at the top of this notebook asks for a
# Spanish corpus (El Quijote); this cell loads the NLTK Reuters corpus.
corpus = preprocess_corpus(reuters.sents())
# Token -> number of occurrences over the whole preprocessed corpus.
words_freqs = get_words_freqs(corpus)

# Sanity checks: number of sentences, count of "the", number of distinct tokens.
print(len(corpus))
print(words_freqs["the"])
print(len(words_freqs))

In [None]:
# Print a sample of tokens that occur exactly once. `count` starts at 0 and the
# guard is `count <= 10`, so up to 11 tokens are printed; the loop still walks
# the whole dictionary after that.
count = 0
for word, freq in words_freqs.items():
    if freq == 1 and count <= 10:
        print(word, freq)
        count += 1

# Vocabulary lookup tables: word -> index and index -> word.
words_indexes, index_to_word = get_words_indexes(words_freqs)
# Bare inspection expressions (notebook style): they assign nothing and only
# matter for what the notebook chooses to display.
words_indexes["the"]
index_to_word[16]
len(words_indexes)
len(index_to_word)


def get_word_id(words_indexes: dict, word: str) -> int:
    """Return the vocabulary index of ``word``, falling back to the unknown-word index.

    The ``UNK_LABEL`` entry is only looked up when ``word`` is missing, so
    in-vocabulary words resolve even if the mapping has no unknown-word entry.

    Args:
        words_indexes: Mapping of word -> index.
        word: Token to look up.

    Returns:
        ``words_indexes[word]`` when present, otherwise
        ``words_indexes[UNK_LABEL]``.

    Raises:
        KeyError: If ``word`` is absent and ``words_indexes`` has no
            ``UNK_LABEL`` entry either.
    """
    if word in words_indexes:
        return words_indexes[word]
    return words_indexes[UNK_LABEL]


def get_train_test_data(
    corpus: list[list[str]], words_indexes: dict, n: int
) -> tuple[list, list]:
    """Turn every n-gram of the corpus into a (context ids, target id) pair.

    Args:
        corpus: Tokenized sentences.
        words_indexes: Mapping of word -> index, queried through
            ``get_word_id``.
        n: Order of the n-grams. The first ``n - 1`` words of each n-gram form
            the context and the last word is the target. Must be at least 2.

    Returns:
        ``(x_train, y_train)`` where ``x_train[i]`` is a list of ``n - 1``
        context word ids and ``y_train[i]`` is a one-element list holding the
        id of the word that follows that context.

    Raises:
        ValueError: If ``n`` is smaller than 2 (no room for both a context and
            a target).
    """
    if n < 2:
        raise ValueError(
            f"n must be at least 2 to have a context and a target, got {n}"
        )

    x_train = []
    y_train = []
    for sent in corpus:
        # Star-unpacking honours whatever `n` was requested instead of
        # assuming exactly three words per n-gram.
        for *context, target in ngrams(sent, n):
            x_train.append([get_word_id(words_indexes, w) for w in context])
            y_train.append([get_word_id(words_indexes, target)])
    return x_train, y_train

In [None]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import time

from torch import nn
from torch.utils.data import DataLoader

EMBEDDING_DIM = 200  # size of each word embedding vector
CONTEXT_SIZE = 2  # number of context words fed to the model
BATCH_SIZE = 256  # rows per DataLoader batch
H = 100  # width of the hidden layer
torch.manual_seed(19)  # fix the torch RNG seed before the model is created
V = len(words_indexes)  # vocabulary size

# Context id pairs and target ids for every trigram of the corpus.
x_train, y_train = get_train_test_data(corpus, words_indexes, n=3)

# One row per trigram: columns 0-1 hold the context ids, column 2 the target id.
train_set = np.concatenate((x_train, y_train), axis=1)
# No `shuffle` argument is passed, so the loader's default ordering applies.
train_loader = DataLoader(train_set, batch_size = BATCH_SIZE)

In [None]:
class TrigramModel(nn.Module):
    """Feed-forward next-word model over a fixed-size word context.

    The context word ids are embedded, the embeddings are concatenated,
    passed through one tanh hidden layer of width ``h`` and projected to
    ``vocab_size`` scores, which are returned as log-probabilities.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, h):
        super().__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, h)
        self.linear2 = nn.Linear(h, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary, one row per context."""
        # The embedding layer yields one vector per input index; view() lays
        # the vectors of each context side by side as a single row.
        flat_width = self.context_size * self.embedding_dim
        stacked = self.embeddings(inputs).view(-1, flat_width)
        hidden = self.linear1(stacked).tanh()
        logits = self.linear2(hidden)
        return logits.log_softmax(dim=1)


# Negative log-likelihood loss; it pairs with the log-probabilities that
# TrigramModel.forward returns.
loss_function = nn.NLLLoss()

# Alternative loss: nn.CrossEntropyLoss(). It expects raw, unnormalised scores,
# so the model would have to return the `linear2` output without log_softmax.
# nn.CrossEntropyLoss()


model = TrigramModel(V, EMBEDDING_DIM, CONTEXT_SIZE, H)

optimizer = optim.Adam(model.parameters(), lr = 2e-3)

# Another optimizer option:
# optimizer = optim.SGD(model.parameters(), lr=0.1)


In [None]:
# Train for EPOCHS passes over train_loader and save the weights after each epoch.
EPOCHS = 1
for epoch in range(EPOCHS):
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch))
    for it, data_tensor in enumerate(train_loader):
        # Columns 0-1 hold the two context word ids, column 2 the target id.
        context_tensor = data_tensor[:, 0:2]
        target_tensor = data_tensor[:, 2]

        model.zero_grad()  # reset the gradients left over from the previous step
        # FORWARD:
        log_probs = model(context_tensor)

        loss = loss_function(log_probs, target_tensor)

        # BACKWARD:
        loss.backward()
        optimizer.step()

        if it % 500 == 0:
            print(
                "Training Iteration {} of epoch {} complete. Loss: {}; Time taken (s): {}".format(
                    it, epoch, loss.item(), (time.time() - st)
                )
            )
            # Restart the timer so each report only covers the iterations
            # since the previous report.
            st = time.time()
            # log_probs is batch_size x len(vocab)

    # Save this epoch's weights under a relative path.
    # NOTE(review): the loading cell reads from drive/MyDrive/LM_neuronal/,
    # not from this location — confirm the file is copied there.
    model_path = "model_{}.dat".format(epoch)
    torch.save(model.state_dict(), model_path)
    print(f"Model saved for epoch={epoch} at {model_path}")

# Bare expression (notebook style): lets the notebook display the module.
model

In [None]:
def get_model(path: str) -> TrigramModel:
    """Rebuild a ``TrigramModel`` and load the weights stored at ``path``.

    The architecture is recreated from the module-level hyperparameters
    (``V``, ``EMBEDDING_DIM``, ``CONTEXT_SIZE``, ``H``), so they must match
    the values used when the checkpoint was written.

    Args:
        path: Location of a state dict saved with ``torch.save``.

    Returns:
        The model with the loaded weights, on the CPU and in evaluation mode.
    """
    model_loaded = TrigramModel(V, EMBEDDING_DIM, CONTEXT_SIZE, H)
    # map_location="cpu" lets checkpoints written from a CUDA model load on
    # machines without a GPU; the rebuilt model lives on the CPU anyway.
    state_dict = torch.load(path, map_location="cpu")
    model_loaded.load_state_dict(state_dict)
    model_loaded.eval()
    return model_loaded


PATH = "drive/MyDrive/LM_neuronal/model_0.dat"

model = get_model(PATH)
W1 = "<BOS>"
W2 = "my"

IDX1 = get_word_id(words_indexes, W1)
IDX2 = get_word_id(words_indexes, W2)

# Log-probabilities log p(W3 | W1, W2) for every candidate W3 in the vocabulary.
probs = model(torch.tensor([[IDX1, IDX2]])).detach().tolist()

len(probs[0])

# Vocabulary index -> log-probability.
model_probs = dict(enumerate(probs[0]))

# (log-probability, index) pairs, most likely first.
model_probs_sorted = sorted(
    ((prob, idx) for idx, prob in model_probs.items()), reverse=True
)

# Print the best candidates; the `> 100` cutoff lets 101 rows through.
topcandidates = 0
for prob, idx in model_probs_sorted:
    # Retrieve the word associated with that idx
    word = index_to_word[idx]
    print(idx, word, prob)

    topcandidates += 1

    if topcandidates > 100:
        break

# Most likely next word. Each entry is (log-probability, index), so the
# vocabulary index is field 1; field 0 is the score and is not a valid key.
index_to_word.get(model_probs_sorted[0][1])


def get_likely_words(
    model: TrigramModel,
    context: str,
    words_indexes: dict,
    index_to_word: dict,
    top_count: int = 10,
) -> list[tuple]:
    """Return the ``top_count`` most likely next words for a context.

    Args:
        model: Model called with a ``1 x 2`` tensor of word ids; it must
            return one row of log-probabilities over the vocabulary.
        context: Whitespace-separated words. Only the last two are used,
            since they are the words that immediately precede the prediction.
        words_indexes: Mapping of word -> index (unknown words are resolved by
            ``get_word_id``).
        index_to_word: Mapping of index -> word.
        top_count: Maximum number of candidates to return.

    Returns:
        ``(log_probability, word)`` tuples sorted from most to least likely.

    Raises:
        ValueError: If ``context`` holds fewer than two words.
    """
    words = context.split()
    if len(words) < 2:
        raise ValueError(
            f"context must contain at least two words, got {len(words)}: {context!r}"
        )

    context_ids = [get_word_id(words_indexes, w) for w in words[-2:]]
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        log_probs = model(torch.tensor([context_ids]))[0].tolist()

    scored = ((log_prob, index_to_word[idx]) for idx, log_prob in enumerate(log_probs))
    return sorted(scored, reverse=True)[:top_count]


# Example: ask for the three most likely continuations of a two-word context.
sentence = "this is"
get_likely_words(model, sentence, words_indexes, index_to_word, 3)

from random import randint


def get_next_word(words: list[tuple[float, str]]) -> str:
    """Pick one candidate uniformly at random and return its word.

    ``words`` holds ``(score, word)`` pairs; the scores are ignored, so every
    candidate is equally likely to be chosen.
    """
    last_position = len(words) - 1
    chosen = words[randint(0, last_position)]
    return chosen[1]


# Example: sample one next word for `sentence` from the default candidate list
# (get_likely_words defaults to 10 candidates).
get_next_word(get_likely_words(model, sentence, words_indexes, index_to_word))

In [None]:
MAX_TOKENS = 30
TOP_COUNT = 10


def generate_text(
    model: TrigramModel,
    history: str,
    words_indexes: dict,
    index_to_word: dict,
    tokens_count: int = 0,
) -> None:
    """Print generated words until ``<EOS>`` or the token budget is reached.

    Each step samples one word among the ``TOP_COUNT`` most likely
    continuations of ``history``, prints it followed by a space, and slides
    the two-word history forward.

    Args:
        model: Trained model handed to ``get_likely_words``.
        history: Two whitespace-separated words used as the starting context.
        words_indexes: Mapping of word -> index.
        index_to_word: Mapping of index -> word.
        tokens_count: Number of tokens already produced; generation stops once
            this reaches ``MAX_TOKENS``.

    Returns:
        None. The generated words are written to standard output.
    """
    # A loop replaces the original self-recursion; `>=` also stops when the
    # caller passes a count that is already past the budget.
    while True:
        next_word = get_next_word(
            get_likely_words(
                model, history, words_indexes, index_to_word, top_count=TOP_COUNT
            )
        )
        print(next_word, end=" ")
        tokens_count += 1
        if tokens_count >= MAX_TOKENS or next_word == "<EOS>":
            return
        # Slide the window: keep the second word and append the new one.
        history = history.split()[1] + " " + next_word


# Generate text starting from the sentence-start marker followed by "the".
sentence = "<BOS> the"
print(sentence, end=" ")
generate_text(model, sentence, words_indexes, index_to_word)


# Show the learned embedding of a word typed by the user; words missing from
# the vocabulary are resolved by get_word_id.
word = input(">> ")
words_tensor = torch.LongTensor([get_word_id(words_indexes, word)])
word_embed = model.embeddings(words_tensor)
print(f"embbeding (dim={len(word_embed[0])}) vec for word={word}")
# Bare expression (notebook style): lets the notebook display the tensor.
word_embed

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Embedding table with one row per vocabulary index. It is extracted once
# instead of re-embedding every vocabulary word inside the loops.
embedding_matrix = model.embeddings.weight.detach().cpu().numpy()

words = ["the", "gold", "car", "nation"]
for word in words:
    base_idx = get_word_id(words_indexes, word)
    # One vectorised call: similarity of the base vector against every row.
    sims = cosine_similarity(
        embedding_matrix[base_idx : base_idx + 1], embedding_matrix
    )[0]

    # Every vocabulary word except the query word itself.
    word_sims = {
        other_word: sims[other_idx]
        for other_word, other_idx in words_indexes.items()
        if other_word != word
    }

    print("\nBASE WORD =", word)
    nearest = sorted(word_sims.items(), key=lambda item: item[1], reverse=True)[:10]
    # A separate loop name keeps the outer `word` variable intact.
    for neighbor, sim in nearest:
        print(f"{neighbor}: {sim}")

```python
# GPU-aware variant of the training cell: the model and every batch are moved
# to `device` (CUDA when available, otherwise the CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_function = nn.NLLLoss()
# A new, untrained model is created here and rebinds the name `model`.
model = TrigramModel(V, EMBEDDING_DIM, CONTEXT_SIZE, H).to(device)
optimizer = optim.Adam(model.parameters(), lr = 2e-3)

EPOCHS = 3
for epoch in range(EPOCHS):
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch))
    for it, data_tensor in enumerate(train_loader):
        # Columns 0-1 hold the context word ids, column 2 the target id.
        context_tensor = data_tensor[:,0:2].to(device)
        target_tensor = data_tensor[:,2].to(device)

        # Reset the gradients left over from the previous step.
        model.zero_grad()

        log_probs = model(context_tensor)

        loss = loss_function(log_probs, target_tensor)

        loss.backward()
        optimizer.step()

        if it % 500 == 0:
            print("Training Iteration {} of epoch {} complete. Loss: {}; Time taken (s): {}".format(it, epoch, loss.item(), (time.time()-st)))
            # Restart the timer so each report only covers the iterations
            # since the previous report.
            st = time.time()

    # Save this epoch's weights under the drive/MyDrive/LM_neuronal/ folder.
    model_path = 'drive/MyDrive/LM_neuronal/model_gpu_{}.dat'.format(epoch)
    torch.save(model.state_dict(), model_path)
    print(f"Model saved for epoch={epoch} at {model_path}")
```