## Práctica 6
### Modelos del lenguaje

**Fecha de entrega**  
21 de abril de 2024

- Crear un par de modelos del lenguaje usando un corpus en español|
    - Corpus: El Quijote
    - URL: https://www.gutenberg.org/ebooks/2000
    - Modelo de n-gramas con n = [2, 3]
    - Hold out con test = 30% y train = 70%
- Evaluar los modelos y reportar la perplejidad de cada modelo
    - Comparar los resultados entre los diferentes modelos del lenguaje (bigramas, trigramas)
    - ¿Cual fue el modelo mejor evaluado? ¿Porqué?

In [1]:
"""
Install dependencies
"""

%pip install nltk matplotlib numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
"""
Download spanish language corpus

El Quijote
https://www.gutenberg.org/ebooks/2000
"""

import requests

url = "https://www.gutenberg.org/ebooks/2000.txt.utf-8"
response = requests.get(url)

with open('quijote.txt', 'wb') as file:
    file.write(response.content)

In [9]:
"""
Preprocessing
"""

import numpy as np
import string


with open('quijote.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

# perform preprocessing

# trim lines
text = [line.strip() for line in text if line.strip()]

# make lowercase
text = [line.lower() for line in text]

# remove punctuation
text = [''.join([c for c in line if c not in string.punctuation]) for line in text]

# remove special characters and numbers
text = [''.join([c for c in line if c.isalpha() or c == ' ']) for line in text]

# add <BOS> and <EOS> tokens
text = ['<BOS> ' + line + ' <EOS>' for line in text]

# split into words
text = [line.split() for line in text]


print('example sentence:')
print(text[np.random.randint(len(text))])

example sentence:
['<BOS>', 'no', 'sino', 'lléguense', 'a', 'hacer', 'burla', 'del', 'mostrenco', 'que', 'así', 'lo', 'sufriré', 'como', '<EOS>']


In [7]:
"""
Split training and test data
"""

from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(text, test_size=0.2)

# print out a random sample of the training data and the test data
print('Training data:')
print(train_data[np.random.randint(len(train_data))])
print('Test data:')
print(test_data[np.random.randint(len(test_data))])

Training data:
['<BOS>', 'éste', 'es', 'el', 'caballero', 'platir', 'dijo', 'el', 'barbero', '<EOS>']
Test data:
['<BOS>', 'volver', 'por', 'ella', 'enderezándole', 'el', 'tuerto', 'que', 'le', 'tienen', 'fecho', 'y', 'agora', 'ha', '<EOS>']


In [13]:
import nltk

from nltk import ngrams
from collections import Counter, defaultdict

list(ngrams(text[np.random.randint(len(text))], 3))


[('<BOS>', 'así', 'puedo'),
 ('así', 'puedo', 'yo'),
 ('puedo', 'yo', 'sin'),
 ('yo', 'sin', 'escrúpulo'),
 ('sin', 'escrúpulo', 'de'),
 ('escrúpulo', 'de', 'conciencia'),
 ('de', 'conciencia', 'hacer'),
 ('conciencia', 'hacer', 'conde'),
 ('hacer', 'conde', 'a'),
 ('conde', 'a', 'sancho'),
 ('a', 'sancho', 'panza'),
 ('sancho', 'panza', 'que'),
 ('panza', 'que', '<EOS>')]

In [17]:
"""
Train bi-gram model using nltk
"""

bigram_model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in train_data:
    for w1, w2 in nltk.ngrams(sentence, 2):
        bigram_model[w1][w2] += 1

bigram_model['<BOS>']

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'honesta': 5,
             'uno': 10,
             'luego': 46,
             'vender': 2,
             'todos': 49,
             'en': 403,
             'parecer': 18,
             'milagros': 4,
             'casa': 16,
             'mentís': 3,
             'quijote': 137,
             'tápenme': 1,
             'hago': 4,
             'de': 695,
             'sucesos': 7,
             'vio': 3,
             'cuatro': 20,
             'regidor': 2,
             'estaba': 37,
             'hizo': 18,
             'debes': 3,
             'visión': 3,
             'ya': 75,
             'e': 2,
             'la': 336,
             'libertad': 7,
             'caballero': 73,
             'su': 89,
             'cuando': 80,
             'él': 52,
             'a': 367,
             'una': 62,
             'volverán': 1,
             'este': 44,
             'recebirla': 1,
             'pues': 140,
             

In [19]:
import itertools

for entry in itertools.islice(bigram_model.items(), 4):
    print(entry)

('<BOS>', defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001D3051F7E20>, {'honesta': 5, 'uno': 10, 'luego': 46, 'vender': 2, 'todos': 49, 'en': 403, 'parecer': 18, 'milagros': 4, 'casa': 16, 'mentís': 3, 'quijote': 137, 'tápenme': 1, 'hago': 4, 'de': 695, 'sucesos': 7, 'vio': 3, 'cuatro': 20, 'regidor': 2, 'estaba': 37, 'hizo': 18, 'debes': 3, 'visión': 3, 'ya': 75, 'e': 2, 'la': 336, 'libertad': 7, 'caballero': 73, 'su': 89, 'cuando': 80, 'él': 52, 'a': 367, 'una': 62, 'volverán': 1, 'este': 44, 'recebirla': 1, 'pues': 140, 'cencerruna': 1, 'hay': 25, 'vendrían': 1, 'atravesaba': 1, 'día': 14, 'por': 267, 'pensará': 2, 'pero': 82, 'oh': 46, 'andante': 17, 'dios': 49, 'donaire': 3, 'principales': 6, 'y': 788, 'osuna': 1, 'sosegado': 3, 'que': 1020, 'expiriencia': 1, 'estiren': 1, 'don': 103, 'hijo': 8, 'así': 130, 'eso': 70, 'habían': 15, 'fe': 6, 'suba': 3, 'suma': 2, 'se': 145, 'manera': 25, 'mirad': 16, 'barroso': 1, 'pensamientos': 22, 'el': 326, 'lamentable': 1, 'siguien

In [20]:
VOCABULARY = set([word.lower() for sent in train_data for word in sent])
VOCABULARY_SIZE = len(VOCABULARY) + 2

In [21]:
def calculate_model_probabilities(model: defaultdict) -> defaultdict:
    result = defaultdict(lambda: defaultdict(lambda: 0))
    for prefix in model:
        # Todas las veces que vemos la key seguido de cualquier cosa
        total = float(sum(model[prefix].values()))
        for next_word in model[prefix]:
            # Laplace smothing
            # result[prefix][next_word] = (model[prefix][next_word] + 1) / (total + VOCABULARY_SIZE)
            # Without smothing
            result[prefix][next_word] = model[prefix][next_word] / total
    return result

In [28]:
bigram_probs = calculate_model_probabilities(bigram_model)

In [29]:
sorted(dict(bigram_probs["que"]).items(), key=lambda x: -1 * x[1])

[('<EOS>', 0.061950080804453224),
 ('no', 0.05955587478302508),
 ('se', 0.04118034356856407),
 ('le', 0.0316035194828515),
 ('en', 0.03046627162267313),
 ('el', 0.029149458310887652),
 ('me', 0.025737714730352548),
 ('yo', 0.025498294128209732),
 ('la', 0.023882205063745734),
 ('a', 0.020650026934817742),
 ('es', 0.015442628838211528),
 ('los', 0.01532291853714012),
 ('de', 0.015083497934997307),
 ('si', 0.014844077332854493),
 ('por', 0.014604656730711678),
 ('él', 0.011731609504997905),
 ('con', 0.010893637397498055),
 ('lo', 0.010295085892141018),
 ('las', 0.009337403483569762),
 ('ya', 0.00897827258035554),
 ('era', 0.008439576225534207),
 ('te', 0.007541748967498653),
 ('había', 0.007362183515891543),
 ('nos', 0.006763632010534507),
 ('vuestra', 0.0058658047524989525),
 ('su', 0.005626384150356138),
 ('fue', 0.005626384150356138),
 ('ha', 0.0053271083976776205),
 ('os', 0.004967977494463398),
 ('más', 0.004967977494463398),
 ('está', 0.004967977494463398),
 ('así', 0.0046088465912

In [34]:
def get_likely_words(
    model_probs: defaultdict, context: str, top_count: int = 10
) -> list[tuple]:
    """Dado un contexto obtiene las palabras más probables

    Params
    ------
    model_probs: defaultdict
        Probabilidades del modelo
    context: str
        Contexto con el cual calcular las palabras más probables siguientes
    top_count: int
        Cantidad de palabras más probables. Default 10
    """
    history = tuple(context.split())
    return sorted(dict(model_probs[history]).items(), key=lambda prob: -1 * prob[1])[
        :top_count
    ]


get_likely_words(bigram_probs, "que", top_count=5)

[]

In [None]:
bigram_model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in train_data:
    for w1, w2 in ngrams(sentence, 2):
        bigram_model[w1][w2] += 1

bigram_probs = calculate_model_probabilities(bigram_model)


In [None]:
"""
Evaluate model
"""

In [None]:
"""
Calculate perplexity
"""

# perplexity is a measure of how well a probability model predicts a sample
# we will use test_data to calculate perplexity for a model trained with train_data

perplexities = []
for sentence in test_data:
    log_prob = calculate_sentence_probability(bigram_probs, sentence, 2)
    perplexity = -(log_prob / len(sentence) - 1)
    perplexities.append(perplexity)

total_perplexity = sum(perplexities) / len(perplexities)