## Práctica 6
### Modelos del lenguaje

**Fecha de entrega**  
21 de abril de 2024

- Crear un par de modelos del lenguaje usando un corpus en español|
    - Corpus: El Quijote
    - URL: https://www.gutenberg.org/ebooks/2000
    - Modelo de n-gramas con n = [2, 3]
    - Hold out con test = 30% y train = 70%
- Evaluar los modelos y reportar la perplejidad de cada modelo
    - Comparar los resultados entre los diferentes modelos del lenguaje (bigramas, trigramas)
    - ¿Cual fue el modelo mejor evaluado? ¿Porqué?

In [2]:
"""
Install dependencies
"""

%pip install nltk matplotlib numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
"""
Download spanish language corpus

El Quijote
https://www.gutenberg.org/ebooks/2000
"""

import numpy as np
import requests
import string

# url = "https://www.gutenberg.org/ebooks/2000.txt.utf-8"
# response = requests.get(url)

# with open('quijote.txt', 'wb') as file:
#     file.write(response.content)

In [2]:
"""
Preprocessing
"""

with open('quijote.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

# perform preprocessing

# trim lines
text = [line.strip() for line in text if line.strip()]

# make lowercase
text = [line.lower() for line in text]

# remove punctuation
text = [''.join([c for c in line if c not in string.punctuation]) for line in text]

# remove special characters and numbers
text = [''.join([c for c in line if c.isalpha() or c == ' ']) for line in text]

# add <BOS> and <EOS> tokens
text = ['<BOS> ' + line + ' <EOS>' for line in text]

# split into words
text = [line.split() for line in text]


print('example sentence:')
print(text[np.random.randint(len(text))])

example sentence:
['<BOS>', 'y', 'describir', 'punto', 'por', 'punto', 'y', 'parte', 'por', 'parte', 'la', 'hermosura', 'de', 'la', 'sin', 'par', '<EOS>']


In [3]:
"""
Split training and test data
"""

from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(text, test_size=0.2)

# print out a random sample of the training data and the test data
print('Training data:')
print(train_data[np.random.randint(len(train_data))])
print('Test data:')
print(test_data[np.random.randint(len(test_data))])

Training data:
['<BOS>', 'porque', 'eran', 'seis', 'medias', 'tinajas', 'que', 'cada', 'una', 'cabía', 'un', 'rastro', 'de', 'carne', 'así', '<EOS>']
Test data:
['<BOS>', 'donde', 'se', 'prosigue', 'la', 'noticia', 'que', 'tuvo', 'don', 'quijote', '<EOS>']


In [4]:
import nltk

from nltk import ngrams
from collections import Counter, defaultdict

list(ngrams(text[np.random.randint(len(text))], 3))

[('<BOS>', 'den', 'entre'),
 ('den', 'entre', 'dos'),
 ('entre', 'dos', 'platos'),
 ('dos', 'platos', 'a'),
 ('platos', 'a', 'buen'),
 ('a', 'buen', 'seguro'),
 ('buen', 'seguro', 'que'),
 ('seguro', 'que', 'el'),
 ('que', 'el', 'caballo'),
 ('el', 'caballo', 'no'),
 ('caballo', 'no', 'la'),
 ('no', 'la', 'arrostre'),
 ('la', 'arrostre', '<EOS>')]

In [5]:
"""
Train tri-gram model using nltk
"""

# a trigram model is a dictionary of dictionaries
# by default the inner dictionary is a defaultdict with a default value of 0
# i.e. if a key is not found in the dictionary, it will return 0
# this is useful for counting the number of times a word appears after a bigram
trigram_model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in train_data:
    for w1, w2, w3 in ngrams(sentence, 3):
        trigram_model[(w1, w2)][w3] += 1

trigram_model["<BOS>", "the"]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {})

In [6]:
import itertools

for entry in itertools.islice(trigram_model.items(), 4):
    print(entry)

(('<BOS>', 'buen'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001B962BE5620>, {'hombre': 3, 'término': 1, 'seguro': 2, 'deseo': 2, 'talante': 1, 'rostro': 1, 'suceso': 1, 'árbol': 1, 'número': 1, 'espacio': 1, 'pecho': 1, 'ingenio': 1, 'caballero': 1, 'lenguaje': 1, 'entendimiento': 1}))
(('buen', 'hombre'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001B962BE58A0>, {'andad': 1, 'que': 4, '<EOS>': 5, 'id': 1, 'respondió': 1, 'cómo': 1, 'es': 2, 'porque': 1, 'albarda': 1, 'este': 1, 'dice': 1, 'me': 1, 'ese': 1, 'quería': 1, 'deteneos': 1}))
(('hombre', 'andad'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001B962BE5940>, {'con': 1}))
(('andad', 'con'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001B962BE59E0>, {'dios': 6}))


In [7]:
VOCABULARY = set([word.lower() for sent in train_data for word in sent])
VOCABULARY_SIZE = len(VOCABULARY) + 2

In [8]:
def calculate_model_probabilities(model: defaultdict) -> defaultdict:
    result = defaultdict(lambda: defaultdict(lambda: 0))
    for prefix in model:
        # Todas las veces que vemos la key seguido de cualquier cosa
        total = float(sum(model[prefix].values()))
        for next_word in model[prefix]:
            # Laplace smothing
            # result[prefix][next_word] = (model[prefix][next_word] + 1) / (total + VOCABULARY_SIZE)
            # Without smothing
            result[prefix][next_word] = model[prefix][next_word] / total
    return result

In [9]:
trigram_probs = calculate_model_probabilities(trigram_model)

In [10]:
sorted(dict(trigram_probs["que", "es"]).items(), key=lambda x: -1 * x[1])

[('el', 0.06666666666666667),
 ('lo', 0.06296296296296296),
 ('la', 0.05925925925925926),
 ('de', 0.04814814814814815),
 ('una', 0.040740740740740744),
 ('un', 0.040740740740740744),
 ('<EOS>', 0.040740740740740744),
 ('más', 0.025925925925925925),
 ('tan', 0.025925925925925925),
 ('menester', 0.025925925925925925),
 ('caballero', 0.018518518518518517),
 ('verdad', 0.018518518518518517),
 ('posible', 0.014814814814814815),
 ('muy', 0.014814814814814815),
 ('mi', 0.014814814814814815),
 ('nuestro', 0.011111111111111112),
 ('tal', 0.011111111111111112),
 ('gente', 0.011111111111111112),
 ('como', 0.011111111111111112),
 ('ahora', 0.011111111111111112),
 ('razón', 0.007407407407407408),
 ('a', 0.007407407407407408),
 ('poco', 0.007407407407407408),
 ('grande', 0.007407407407407408),
 ('gran', 0.007407407407407408),
 ('tanto', 0.007407407407407408),
 ('discreto', 0.007407407407407408),
 ('su', 0.007407407407407408),
 ('oficio', 0.007407407407407408),
 ('todo', 0.007407407407407408),
 ('opi

In [11]:
def get_likely_words(
    model_probs: defaultdict, context: str, top_count: int = 10
) -> list[tuple]:
    """Dado un contexto obtiene las palabras más probables

    Params
    ------
    model_probs: defaultdict
        Probabilidades del modelo
    context: str
        Contexto con el cual calcular las palabras más probables siguientes
    top_count: int
        Cantidad de palabras más probables. Default 10
    """
    history = tuple(context.split())
    return sorted(dict(model_probs[history]).items(), key=lambda prob: -1 * prob[1])[
        :top_count
    ]


get_likely_words(trigram_probs, "<BOS> el", top_count=3)

[('cual', 0.05555555555555555),
 ('que', 0.040123456790123455),
 ('ventero', 0.027777777777777776)]

In [12]:
from random import randint


def get_next_word(words: list) -> str:
    # Strategy here
    return words[0][0]


def get_next_word(words: list) -> str:
    return words[randint(0, len(words) - 1)][0]

get_next_word(get_likely_words(trigram_probs, "<BOS> el", 50))

'mono'

In [13]:
MAX_TOKENS = 30

def generate_text(model: defaultdict, history: str, tokens_count: int) -> None:
    next_word = get_next_word(get_likely_words(model, history, top_count=30))
    print(next_word, end=" ")
    tokens_count += 1
    if tokens_count == MAX_TOKENS or next_word == "<EOS>":
        return
    generate_text(model, history.split()[1] + " " + next_word, tokens_count)

sentence = "<BOS> el"
print(sentence, end=" ")
generate_text(trigram_probs, sentence, 0)

<BOS> el más humilde y subieron a ser venturosas <EOS> 

In [33]:
def calculate_sent_prob(model: defaultdict, sentence: str, n: int) -> float:
    n_grams = ngrams(sentence, n)
    p = 0.0
    for gram in n_grams:
        if n == 3:
            key = (gram[0], gram[1])
            value = gram[2]
        elif n == 2:
            key = gram[0]
            value = gram[1]
        try:
            if model[key][value] == 0:
                # Laplace smoothing
                p += np.log(1 / VOCABULARY_SIZE)
                continue
            log_prob = np.log(model[key][value])
            # skip inf values
            if log_prob == float("-inf"):
                continue
            p += log_prob
        except:
            p += 0.0
    return p

In [34]:
sentence = train_data[0]
print(" ".join(sentence))
calculate_sent_prob(trigram_probs, train_data[10], n=3)

<BOS> buen hombre andad con dios a vuestro lugar con vuestro dinero y de aquí <EOS>


-22.21850428505548

In [36]:
"""
Calculate perplexity
"""

# perplexity is a measure of how well a probability model predicts a sample
# we will use test_data to calculate perplexity for a model trained with train_data

perplexities = []
for sentence in test_data:
    log_prob = calculate_sent_prob(trigram_probs, sentence, n=3)
    if (log_prob == float('inf')):
        print(sentence)
        break
    if(len(sentence) == 1):
        print(sentence)
        break
    perplexity = -(log_prob / len(sentence) - 1)
    perplexities.append(perplexity)

test_data[0]

total_perplexity = sum(perplexities) / len(perplexities)
total_perplexity

7.457602378519944