## Práctica 6
### Modelos del lenguaje

**Fecha de entrega**  
21 de abril de 2024

- Crear un par de modelos del lenguaje usando un corpus en español|
    - Corpus: El Quijote
    - URL: https://www.gutenberg.org/ebooks/2000
    - Modelo de n-gramas con n = [2, 3]
    - Hold out con test = 30% y train = 70%
- Evaluar los modelos y reportar la perplejidad de cada modelo
    - Comparar los resultados entre los diferentes modelos del lenguaje (bigramas, trigramas)
    - ¿Cual fue el modelo mejor evaluado? ¿Porqué?

In [2]:
"""
Install dependencies
"""

%pip install nltk matplotlib numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
"""
Download spanish language corpus

El Quijote
https://www.gutenberg.org/ebooks/2000
"""

import numpy as np
import requests
import string

# url = "https://www.gutenberg.org/ebooks/2000.txt.utf-8"
# response = requests.get(url)

# with open('quijote.txt', 'wb') as file:
#     file.write(response.content)

In [4]:
"""
Preprocessing
"""

with open('quijote.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

# perform preprocessing

# trim lines
text = [line.strip() for line in text if line.strip()]

# make lowercase
text = [line.lower() for line in text]

# remove punctuation
text = [''.join([c for c in line if c not in string.punctuation]) for line in text]

# remove special characters and numbers
text = [''.join([c for c in line if c.isalpha() or c == ' ']) for line in text]

# add <BOS> and <EOS> tokens
text = ['<BOS> ' + line + ' <EOS>' for line in text]

# split into words
text = [line.split() for line in text]


print('example sentence:')
print(text[np.random.randint(len(text))])

example sentence:
['<BOS>', 'febrero', 'de', 'mil', 'y', 'seiscientos', 'y', 'quince', '<EOS>']


In [5]:
"""
Split training and test data
"""

from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(text, test_size=0.2)

# print out a random sample of the training data and the test data
print('Training data:')
print(train_data[np.random.randint(len(train_data))])
print('Test data:')
print(test_data[np.random.randint(len(test_data))])

Training data:
['<BOS>', 'no', 'se', 'anduviera', 'hocicando', 'con', 'alguno', 'de', 'los', 'que', 'están', 'en', 'la', 'rueda', 'a', 'vuelta', '<EOS>']
Test data:
['<BOS>', 'triste', 'armonía', 'especialmente', 'don', 'quijote', 'que', 'no', 'cabía', 'en', 'su', 'asiento', 'de', '<EOS>']


In [6]:
import nltk

from nltk import ngrams
from collections import Counter, defaultdict

list(ngrams(text[np.random.randint(len(text))], 3))

[('<BOS>', 'espejos', 'que'),
 ('espejos', 'que', 'a'),
 ('que', 'a', 'sus'),
 ('a', 'sus', 'pies'),
 ('sus', 'pies', 'tiene'),
 ('pies', 'tiene', 'porque'),
 ('tiene', 'porque', 'sin'),
 ('porque', 'sin', 'duda'),
 ('sin', 'duda', 'alguna'),
 ('duda', 'alguna', 'es'),
 ('alguna', 'es', 'el'),
 ('es', 'el', 'atrevido'),
 ('el', 'atrevido', 'y'),
 ('atrevido', 'y', 'mal'),
 ('y', 'mal', '<EOS>')]

In [7]:
"""
Train tri-gram model using nltk
"""

# a trigram model is a dictionary of dictionaries
# by default the inner dictionary is a defaultdict with a default value of 0
# i.e. if a key is not found in the dictionary, it will return 0
# this is useful for counting the number of times a word appears after a bigram
trigram_model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in train_data:
    for w1, w2, w3 in ngrams(sentence, 3):
        trigram_model[(w1, w2)][w3] += 1

trigram_model["<BOS>", "the"]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {})

In [8]:
import itertools

for entry in itertools.islice(trigram_model.items(), 4):
    print(entry)

(('<BOS>', 'renombre'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000155E10FCF40>, {'de': 1, 'famoso': 1}))
(('renombre', 'de'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000155E10FCEA0>, {'loco': 1, '<EOS>': 1, 'magno': 1, 'valiente': 1}))
(('de', 'loco'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000155E10FD260>, {'que': 2, 'y': 1, 'a': 1}))
(('loco', 'que'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x00000155E10FD3A0>, {'puesto': 1, '<EOS>': 2, 'me': 1, 'de': 1, 'a': 2, 'estaba': 1, 'con': 1, 'por': 1, 'tiraba': 1, 'dio': 1, 'ya': 1}))


In [9]:
VOCABULARY = set([word.lower() for sent in train_data for word in sent])
VOCABULARY_SIZE = len(VOCABULARY) + 2

In [10]:
def calculate_model_probabilities(model: defaultdict) -> defaultdict:
    result = defaultdict(lambda: defaultdict(lambda: 0))
    for prefix in model:
        # Todas las veces que vemos la key seguido de cualquier cosa
        total = float(sum(model[prefix].values()))
        for next_word in model[prefix]:
            # Laplace smothing
            # result[prefix][next_word] = (model[prefix][next_word] + 1) / (total + VOCABULARY_SIZE)
            # Without smothing
            result[prefix][next_word] = model[prefix][next_word] / total
    return result

In [11]:
trigram_probs = calculate_model_probabilities(trigram_model)

In [13]:
sorted(dict(trigram_probs["que", "es"]).items(), key=lambda x: -1 * x[1])

[('el', 0.07662835249042145),
 ('lo', 0.05747126436781609),
 ('la', 0.05363984674329502),
 ('un', 0.05363984674329502),
 ('de', 0.04980842911877394),
 ('<EOS>', 0.04597701149425287),
 ('una', 0.0421455938697318),
 ('más', 0.034482758620689655),
 ('menester', 0.034482758620689655),
 ('tan', 0.022988505747126436),
 ('muy', 0.019157088122605363),
 ('caballero', 0.01532567049808429),
 ('a', 0.011494252873563218),
 ('como', 0.011494252873563218),
 ('tal', 0.011494252873563218),
 ('ahora', 0.011494252873563218),
 ('mi', 0.011494252873563218),
 ('posible', 0.011494252873563218),
 ('nuestro', 0.011494252873563218),
 ('razón', 0.011494252873563218),
 ('opinión', 0.007662835249042145),
 ('verdad', 0.007662835249042145),
 ('su', 0.007662835249042145),
 ('tiempo', 0.007662835249042145),
 ('uno', 0.007662835249042145),
 ('todo', 0.007662835249042145),
 ('mejor', 0.007662835249042145),
 ('suyo', 0.007662835249042145),
 ('en', 0.007662835249042145),
 ('gente', 0.007662835249042145),
 ('grande', 0.007

In [16]:
def get_likely_words(
    model_probs: defaultdict, context: str, top_count: int = 10
) -> list[tuple]:
    """Dado un contexto obtiene las palabras más probables

    Params
    ------
    model_probs: defaultdict
        Probabilidades del modelo
    context: str
        Contexto con el cual calcular las palabras más probables siguientes
    top_count: int
        Cantidad de palabras más probables. Default 10
    """
    history = tuple(context.split())
    return sorted(dict(model_probs[history]).items(), key=lambda prob: -1 * prob[1])[
        :top_count
    ]


get_likely_words(trigram_probs, "<BOS> el", top_count=3)

[('cual', 0.05198776758409786),
 ('duque', 0.03058103975535168),
 ('que', 0.03058103975535168)]

In [18]:
from random import randint


def get_next_word(words: list) -> str:
    # Strategy here
    return words[0][0]


def get_next_word(words: list) -> str:
    return words[randint(0, len(words) - 1)][0]

get_next_word(get_likely_words(trigram_probs, "<BOS> el", 50))

'barbero'

In [20]:
MAX_TOKENS = 30

def generate_text(model: defaultdict, history: str, tokens_count: int) -> None:
    next_word = get_next_word(get_likely_words(model, history, top_count=30))
    print(next_word, end=" ")
    tokens_count += 1
    if tokens_count == MAX_TOKENS or next_word == "<EOS>":
        return
    generate_text(model, history.split()[1] + " " + next_word, tokens_count)

sentence = "<BOS> el"
print(sentence, end=" ")
generate_text(trigram_probs, sentence, 0)

<BOS> el tiempo a mi linaje noble mis padres la cual lamentable historia <EOS> 

In [21]:
def calculate_sent_prob(model: defaultdict, sentence: str, n: int) -> float:
    n_grams = ngrams(sentence, n)
    p = 0.0
    for gram in n_grams:
        if n == 3:
            key = (gram[0], gram[1])
            value = gram[2]
        elif n == 2:
            key = gram[0]
            value = gram[1]
        try:
            p += np.log(model[key][value])
        except:
            p += 0.0
    return p

In [22]:
sentence = train_data[0]
print(" ".join(sentence))
calculate_sent_prob(trigram_probs, train_data[10], n=3)

<BOS> renombre de loco que puesto que lo he sido no querría confirmar esta <EOS>


-23.791575488443925

In [23]:
"""
Calculate perplexity
"""

# perplexity is a measure of how well a probability model predicts a sample
# we will use test_data to calculate perplexity for a model trained with train_data

perplexities = []
for sentence in test_data:
    log_prob = calculate_sent_prob(trigram_probs, sentence, 3)
    perplexity = -(log_prob / len(sentence) - 1)
    perplexities.append(perplexity)

total_perplexity = sum(perplexities) / len(perplexities)

  p += np.log(model[key][value])
