## Práctica 6
### Modelos del lenguaje

**Fecha de entrega**  
21 de abril de 2024

- Crear un par de modelos del lenguaje usando un corpus en español|
    - Corpus: El Quijote
    - URL: https://www.gutenberg.org/ebooks/2000
    - Modelo de n-gramas con n = [2, 3]
    - Hold out con test = 30% y train = 70%
- Evaluar los modelos y reportar la perplejidad de cada modelo
    - Comparar los resultados entre los diferentes modelos del lenguaje (bigramas, trigramas)
    - ¿Cual fue el modelo mejor evaluado? ¿Porqué?

In [1]:
"""
Install dependencies
"""

%pip install nltk matplotlib numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
"""
Download spanish language corpus

El Quijote
https://www.gutenberg.org/ebooks/2000
"""

import requests

# url = "https://www.gutenberg.org/ebooks/2000.txt.utf-8"
# response = requests.get(url)

# with open('quijote.txt', 'wb') as file:
#     file.write(response.content)

In [2]:
"""
Preprocessing
"""

import numpy as np
import string


with open('quijote.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

# perform preprocessing

# trim lines
text = [line.strip() for line in text if line.strip()]

# make lowercase
text = [line.lower() for line in text]

# remove punctuation
text = [''.join([c for c in line if c not in string.punctuation]) for line in text]

# remove special characters and numbers
text = [''.join([c for c in line if c.isalpha() or c == ' ']) for line in text]

# add <BOS> and <EOS> tokens
text = ['<BOS> ' + line + ' <EOS>' for line in text]

# split into words
text = [line.split() for line in text]


print('example sentence:')
print(text[np.random.randint(len(text))])

example sentence:
['<BOS>', 'particular', 'no', 'hay', 'con', 'quien', 'tratar', 'su', 'rescate', 'aunque', 'le', 'tengan', 'en', 'estos', '<EOS>']


In [3]:
"""
Split training and test data
"""

from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(text, test_size=0.2)

# print out a random sample of the training data and the test data
print('Training data:')
print(train_data[np.random.randint(len(train_data))])
print('Test data:')
print(test_data[np.random.randint(len(test_data))])

Training data:
['<BOS>', 'pintado', 'si', 'careciera', 'del', 'gusto', 'de', 'tan', 'sabrosa', 'leyenda', 'así', 'que', 'para', '<EOS>']
Test data:
['<BOS>', 'mas', 'serlo', 'has', 'mío', 'si', 'al', 'soberbio', 'moro', '<EOS>']


In [4]:
import nltk

from nltk import ngrams
from collections import Counter, defaultdict

list(ngrams(text[np.random.randint(len(text))], 3))


[('<BOS>', 'hallóle', 'paseándose'),
 ('hallóle', 'paseándose', 'por'),
 ('paseándose', 'por', 'el'),
 ('por', 'el', 'patio'),
 ('el', 'patio', 'de'),
 ('patio', 'de', 'su'),
 ('de', 'su', 'casa'),
 ('su', 'casa', 'y'),
 ('casa', 'y', 'viéndole'),
 ('y', 'viéndole', 'se'),
 ('viéndole', 'se', 'dejó'),
 ('se', 'dejó', 'caer'),
 ('dejó', 'caer', 'ante'),
 ('caer', 'ante', '<EOS>')]

In [5]:
"""
Train bi-gram model using nltk
"""

bigram_model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in train_data:
    for w1, w2 in nltk.ngrams(sentence, 2):
        bigram_model[w1][w2] += 1

bigram_model['<BOS>']

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'de': 686,
             'reconcilian': 1,
             'así': 142,
             'por': 246,
             'también': 34,
             'salud': 3,
             'dios': 42,
             'tanto': 26,
             'hermoso': 5,
             'donde': 88,
             'he': 16,
             'vuestro': 22,
             'oigamos': 1,
             'corchetes': 1,
             'siempre': 23,
             'si': 121,
             'tales': 15,
             'estremadura': 2,
             'ser': 39,
             'el': 339,
             'soga': 1,
             'que': 999,
             'dejemos': 2,
             'no': 500,
             'estimo': 1,
             'en': 422,
             'libros': 11,
             'sancho': 110,
             'lunar': 1,
             'pues': 156,
             'tocadores': 2,
             'escuro': 1,
             'las': 176,
             'acudió': 6,
             'y': 783,
             'tomases': 1,


In [6]:
import itertools

for entry in itertools.islice(bigram_model.items(), 4):
    print(entry)

('<BOS>', defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001A9EEDE58A0>, {'de': 686, 'reconcilian': 1, 'así': 142, 'por': 246, 'también': 34, 'salud': 3, 'dios': 42, 'tanto': 26, 'hermoso': 5, 'donde': 88, 'he': 16, 'vuestro': 22, 'oigamos': 1, 'corchetes': 1, 'siempre': 23, 'si': 121, 'tales': 15, 'estremadura': 2, 'ser': 39, 'el': 339, 'soga': 1, 'que': 999, 'dejemos': 2, 'no': 500, 'estimo': 1, 'en': 422, 'libros': 11, 'sancho': 110, 'lunar': 1, 'pues': 156, 'tocadores': 2, 'escuro': 1, 'las': 176, 'acudió': 6, 'y': 783, 'tomases': 1, 'dejaré': 1, 'soledad': 2, 'dije': 3, 'roque': 4, 'tener': 23, 'calabaza': 1, 'pusieron': 3, 'necesario': 2, 'aventuras': 16, 'ceñían': 1, 'fiambre': 2, 'don': 108, 'sobre': 38, 'memoria': 6, 'venid': 6, 'combatientes': 1, 'merced': 62, 'catadura': 1, 'mal': 24, 'es': 91, 'yace': 3, 'cámara': 1, 'estraños': 2, 'me': 84, 'digáis': 2, 'pero': 77, 'pidiéndole': 2, 'notad': 1, 'mayordomo': 5, 'sátiras': 1, 'vieron': 3, 'quijote': 132, 'oyó': 6, '

In [7]:
VOCABULARY = set([word.lower() for sent in train_data for word in sent])
VOCABULARY_SIZE = len(VOCABULARY) + 2

In [8]:
def calculate_model_probabilities(model: defaultdict) -> defaultdict:
    result = defaultdict(lambda: defaultdict(lambda: 0))
    for prefix in model:
        # Todas las veces que vemos la key seguido de cualquier cosa
        total = float(sum(model[prefix].values()))
        for next_word in model[prefix]:
            # Laplace smothing
            # result[prefix][next_word] = (model[prefix][next_word] + 1) / (total + VOCABULARY_SIZE)
            # Without smothing
            result[prefix][next_word] = model[prefix][next_word] / total
    return result

In [9]:
bigram_probs = calculate_model_probabilities(bigram_model)

In [10]:
sorted(dict(bigram_probs["que"]).items(), key=lambda x: -1 * x[1])

[('<EOS>', 0.06082915937481142),
 ('no', 0.059199806891557544),
 ('se', 0.04175970068191419),
 ('le', 0.03204393217065959),
 ('en', 0.030293886910868383),
 ('el', 0.028724880815883168),
 ('me', 0.026914489167823307),
 ('yo', 0.026069639732062035),
 ('la', 0.02365578420131555),
 ('a', 0.019371190634240542),
 ('es', 0.01569006094985215),
 ('los', 0.01569006094985215),
 ('de', 0.015327982620240179),
 ('si', 0.015207289843702855),
 ('por', 0.014905557902359545),
 ('él', 0.01170719932412045),
 ('con', 0.011284774606239816),
 ('las', 0.009896807676060588),
 ('lo', 0.009836461287791925),
 ('ya', 0.009776114899523264),
 ('era', 0.009172651016836642),
 ('había', 0.007543298533582765),
 ('te', 0.0073622593687767785),
 ('nos', 0.0059139460503288875),
 ('su', 0.005853599662060225),
 ('ha', 0.005612214108985577),
 ('fue', 0.005491521332448253),
 ('más', 0.005189789391104942),
 ('os', 0.00512944300283628),
 ('vuestra', 0.00512944300283628),
 ('está', 0.00512944300283628),
 ('así', 0.0047673646732243

In [12]:
def get_likely_words(
    model_probs: defaultdict, context: str, top_count: int = 10
) -> list[tuple]:
    """Dado un contexto obtiene las palabras más probables

    Params
    ------
    model_probs: defaultdict
        Probabilidades del modelo
    context: str
        Contexto con el cual calcular las palabras más probables siguientes
    top_count: int
        Cantidad de palabras más probables. Default 10
    """
    history = tuple(context.split())
    return sorted(dict(model_probs[history]).items(), key=lambda prob: -1 * prob[1])[
        :top_count
    ]


get_likely_words(bigram_probs, "el", top_count=5)

[]

In [13]:
bigram_model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in train_data:
    for w1, w2 in ngrams(sentence, 2):
        bigram_model[w1][w2] += 1

bigram_probs = calculate_model_probabilities(bigram_model)


In [None]:
"""
Evaluate model
"""

In [15]:
"""
Calculate perplexity
"""


def calculate_sent_prob(model: defaultdict, sentence: str, n: int) -> float:
    n_grams = ngrams(sentence, n)
    p = 0.0
    for gram in n_grams:
        if n == 3:
            key = (gram[0], gram[1])
            value = gram[2]
        elif n == 2:
            key = gram[0]
            value = gram[1]
        try:
            if model[key][value] == 0:
                # Laplace smoothing
                p += np.log(1 / VOCABULARY_SIZE)
                continue
            log_prob = np.log(model[key][value])
            # skip inf values
            if log_prob == float("-inf"):
                continue
            p += log_prob
        except:
            p += 0.0
    return p


# perplexity is a measure of how well a probability model predicts a sample
# we will use test_data to calculate perplexity for a model trained with train_data

perplexities = []
for sentence in test_data:
    log_prob = calculate_sent_prob(bigram_probs, sentence, 2)
    perplexity = -(log_prob / len(sentence) - 1)
    perplexities.append(perplexity)

total_perplexity = sum(perplexities) / len(perplexities)
total_perplexity

6.447242647232957