## Práctica 6
### Modelos del lenguaje

**Fecha de entrega**  
21 de abril de 2024

- Crear un par de modelos del lenguaje usando un corpus en español|
    - Corpus: El Quijote
    - URL: https://www.gutenberg.org/ebooks/2000
    - Modelo de n-gramas con n = [2, 3]
    - Hold out con test = 30% y train = 70%
- Evaluar los modelos y reportar la perplejidad de cada modelo
    - Comparar los resultados entre los diferentes modelos del lenguaje (bigramas, trigramas)
    - ¿Cual fue el modelo mejor evaluado? ¿Porqué?

In [2]:
"""
Install dependencies
"""

%pip install nltk matplotlib numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
"""
Download spanish language corpus

El Quijote
https://www.gutenberg.org/ebooks/2000
"""

import requests

url = "https://www.gutenberg.org/ebooks/2000.txt.utf-8"
response = requests.get(url)

with open('quijote.txt', 'wb') as file:
    file.write(response.content)

In [6]:
"""
Preprocessing
"""

import string


with open('quijote.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

# perform preprocessing

# trim lines
text = [line.strip() for line in text if line.strip()]

# make lowercase
text = [line.lower() for line in text]

# remove punctuation
text = [''.join([c for c in line if c not in string.punctuation]) for line in text]

# remove special characters and numbers
text = [''.join([c for c in line if c.isalpha() or c == ' ']) for line in text]

# add <BOS> and <EOS> tokens
text = ['<BOS> ' + line + ' <EOS>' for line in text]

# split into words
text = [line.split() for line in text]


print('example sentence:')
print(text[np.random.randint(len(text))])

example sentence:
['<BOS>', 'había', 'hecho', 'muestra', 'de', 'más', 'de', 'diez', 'pares', 'de', 'vestidos', 'y', 'de', 'más', 'de', 'veinte', '<EOS>']


In [9]:
"""
Split training and test data
"""

import numpy as np

from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(text, test_size=0.2)

# print out a random sample of the training data and the test data
print('Training data:')
print(train_data[np.random.randint(len(train_data))])
print('Test data:')
print(test_data[np.random.randint(len(test_data))])

Training data:
['<BOS>', 'hacer', 'un', 'rimero', 'dellos', 'y', 'pegarles', 'fuego', 'y', 'si', 'no', 'llevarlos', 'al', 'corral', 'y', '<EOS>']
Test data:
['<BOS>', 'así', 'es', 'verdad', 'replicó', 'don', 'quijote', 'porque', 'no', 'fuera', 'acertado', 'que', 'los', '<EOS>']


In [None]:
# alternative preprocessing using nltk

# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer, SnowballStemmer
# from nltk.tokenize import word_tokenize

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

# stemmer = SnowballStemmer("english")
# lemmatizer = WordNetLemmatizer()

# def preprocess(text):
#     result = []
#     for token in word_tokenize(text):
#         if len(token) > 2 and token not in stopwords.words('english'):
#             token = stemmer.stem(lemmatizer.lemmatize(token, pos='v'))  # Lemmatize and stem token
#             result.append(token)
#     return result

# with open('quijote.txt', 'r', encoding='utf-8') as f:
#     text = f.read()

# processed_text = preprocess(text.lower())

In [10]:
import nltk

from nltk import ngrams
from collections import Counter, defaultdict

list(ngrams(text[0], 3))


[('<BOS>', 'el', 'ingenioso'),
 ('el', 'ingenioso', 'hidalgo'),
 ('ingenioso', 'hidalgo', 'don'),
 ('hidalgo', 'don', 'quijote'),
 ('don', 'quijote', 'de'),
 ('quijote', 'de', 'la'),
 ('de', 'la', 'mancha'),
 ('la', 'mancha', '<EOS>')]

In [None]:
"""
Train bi-gram model using nltk
"""

In [14]:
"""
Train tri-gram model using nltk
"""

# a trigram model is a dictionary of dictionaries
# by default the inner dictionary is a defaultdict with a default value of 0
# i.e. if a key is not found in the dictionary, it will return 0
# this is useful for counting the number of times a word appears after a bigram
trigram_model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in train_data:
    for w1, w2, w3 in ngrams(sentence, 3):
        trigram_model[(w1, w2)][w3] += 1

trigram_model["<BOS>", "the"]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {})

In [17]:
import itertools

for entry in itertools.islice(trigram_model.items(), 4):
    print(entry)

(('<BOS>', 'que'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x000002A717C37E20>, {'trata': 18, 'estás': 1, 'es': 18, 'será': 1, 'no': 62, 'me': 40, 'para': 4, 'los': 21, 'era': 10, 'ni': 3, 'se': 35, 'había': 9, 'temes': 1, 'a': 21, 'este': 3, 'la': 18, 'el': 24, 'en': 45, 'fue': 6, 'vais': 1, 'alborozado': 1, 'viene': 3, 'anduvistes': 1, 'yo': 23, 'si': 16, 'entre': 5, 'por': 11, 'cuenta': 1, 'lo': 13, 'volaban': 1, 'creo': 1, 'esta': 3, 'él': 16, 'damas': 1, 'solapar': 1, 'ana': 1, 'don': 4, 'repica': 1, 'como': 3, 'echalle': 1, 'son': 5, 'las': 10, 'del': 3, 'tengo': 8, 'os': 8, 'tenía': 3, 'luego': 1, 'resta': 1, 'iba': 2, 'volvió': 1, 'sus': 2, 'poco': 1, 'le': 25, 'nunca': 1, 'haya': 2, 'habiendo': 1, 'engañan': 1, 'queden': 1, 'al': 7, 'cuando': 8, 'con': 9, 'sancho': 7, 'ellos': 6, 'estáis': 1, 'tenga': 1, 'acertara': 1, 'iban': 2, 'aspiran': 1, 'aún': 4, 'tiene': 2, 'duermo': 1, 'venía': 2, 'puntualmente': 1, 'cual': 1, 'les': 4, 'según': 1, 'tenéis': 1, 'ese': 1, '

In [19]:
VOCABULARY = set([word.lower() for sent in train_data for word in sent])
VOCABULARY_SIZE = len(VOCABULARY) + 2

In [20]:
def calculate_model_probabilities(model: defaultdict) -> defaultdict:
    result = defaultdict(lambda: defaultdict(lambda: 0))
    for prefix in model:
        # Todas las veces que vemos la key seguido de cualquier cosa
        total = float(sum(model[prefix].values()))
        for next_word in model[prefix]:
            # Laplace smothing
            # result[prefix][next_word] = (model[prefix][next_word] + 1) / (total + VOCABULARY_SIZE)
            # Without smothing
            result[prefix][next_word] = model[prefix][next_word] / total
    return result

In [21]:
trigram_probs = calculate_model_probabilities(trigram_model)

In [22]:
sorted(dict(trigram_probs["que", "es"]).items(), key=lambda x: -1 * x[1])

[('el', 0.07751937984496124),
 ('de', 0.05426356589147287),
 ('la', 0.05426356589147287),
 ('un', 0.050387596899224806),
 ('lo', 0.046511627906976744),
 ('<EOS>', 0.03875968992248062),
 ('una', 0.03488372093023256),
 ('menester', 0.031007751937984496),
 ('más', 0.031007751937984496),
 ('muy', 0.01937984496124031),
 ('tan', 0.01937984496124031),
 ('mi', 0.015503875968992248),
 ('caballero', 0.015503875968992248),
 ('posible', 0.015503875968992248),
 ('tal', 0.015503875968992248),
 ('como', 0.011627906976744186),
 ('verdad', 0.011627906976744186),
 ('gran', 0.011627906976744186),
 ('suyo', 0.011627906976744186),
 ('razón', 0.011627906976744186),
 ('a', 0.007751937984496124),
 ('uno', 0.007751937984496124),
 ('tarde', 0.007751937984496124),
 ('tiempo', 0.007751937984496124),
 ('vuestra', 0.007751937984496124),
 ('oficio', 0.007751937984496124),
 ('discreto', 0.007751937984496124),
 ('su', 0.007751937984496124),
 ('opinión', 0.007751937984496124),
 ('por', 0.007751937984496124),
 ('gente',

In [24]:
def get_likely_words(
    model_probs: defaultdict, context: str, top_count: int = 10
) -> list[tuple]:
    """Dado un contexto obtiene las palabras más probables

    Params
    ------
    model_probs: defaultdict
        Probabilidades del modelo
    context: str
        Contexto con el cual calcular las palabras más probables siguientes
    top_count: int
        Cantidad de palabras más probables. Default 10
    """
    history = tuple(context.split())
    return sorted(dict(model_probs[history]).items(), key=lambda prob: -1 * prob[1])[
        :top_count
    ]


get_likely_words(trigram_probs, "<BOS> el", top_count=3)

[('cual', 0.04375), ('que', 0.034375), ('duque', 0.03125)]

In [None]:
bigram_model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in train_data:
    for w1, w2 in ngrams(sentence, 2):
        bigram_model[w1][w2] += 1

bigram_probs = calculate_model_probabilities(bigram_model)


In [None]:
"""
Evaluate model
"""

In [None]:
"""
Calculate perplexity
"""

# perplexity is a measure of how well a probability model predicts a sample
# we will use test_data to calculate perplexity for a model trained with train_data

perplexities = []
for sentence in test_data:
    log_prob = calculate_sentence_probability(bigram_probs, sentence, 2)
    perplexity = -(log_prob / len(sentence) - 1)
    perplexities.append(perplexity)

total_perplexity = sum(perplexities) / len(perplexities)