In [1]:
%pip install nltk matplotlib numpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk

# Fetch the NLTK data packages used by this notebook: the Reuters corpus
# (read later through nltk.corpus.reuters) and the "punkt" tokenizer models
# (presumably needed by the corpus reader to split sentences - TODO confirm).
# Packages that are already installed are reported as up to date.
nltk.download("reuters")
nltk.download("punkt")


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Numerical computing
import numpy as np

# Corpus
from nltk.corpus import reuters

# Helper for building n-grams
from nltk import ngrams

# Containers used to store counts and probabilities
from collections import Counter, defaultdict

# Number of sentences in the Reuters corpus
len(reuters.sents())



54716

In [4]:
import re


def preprocess(sent: list[str]) -> list[str]:
    """Normalize a tokenized sentence for n-gram counting.

    Every token is lowercased and the sentence is wrapped in the boundary
    markers ``<BOS>`` (first position) and ``<EOS>`` (last position).
    The input sequence itself is not modified.

    Params
    ------
    sent: list[str]
        Tokens of a single sentence.

    Returns
    -------
    list[str]
        A new list: ``<BOS>``, the lowercased tokens, then ``<EOS>``.
    """
    lowered_tokens = (token.lower() for token in sent)
    return ["<BOS>", *lowered_tokens, "<EOS>"]


# Show one raw corpus sentence. The preprocess() call that follows is not the
# last expression of the cell, so its result is computed but never displayed.
example_sentence = reuters.sents()[11]
print(example_sentence)
preprocess(example_sentence)
# Trigrams of the first corpus sentence, taken from the raw (un-preprocessed) tokens
first_sentence = reuters.sents()[0]
list(ngrams(first_sentence, 3))


['The', 'surplus', 'helped', 'swell', 'Taiwan', "'", 's', 'foreign', 'exchange', 'reserves', 'to', '53', 'billion', 'dlrs', ',', 'among', 'the', 'world', "'", 's', 'largest', '.']


[('ASIAN', 'EXPORTERS', 'FEAR'),
 ('EXPORTERS', 'FEAR', 'DAMAGE'),
 ('FEAR', 'DAMAGE', 'FROM'),
 ('DAMAGE', 'FROM', 'U'),
 ('FROM', 'U', '.'),
 ('U', '.', 'S'),
 ('.', 'S', '.-'),
 ('S', '.-', 'JAPAN'),
 ('.-', 'JAPAN', 'RIFT'),
 ('JAPAN', 'RIFT', 'Mounting'),
 ('RIFT', 'Mounting', 'trade'),
 ('Mounting', 'trade', 'friction'),
 ('trade', 'friction', 'between'),
 ('friction', 'between', 'the'),
 ('between', 'the', 'U'),
 ('the', 'U', '.'),
 ('U', '.', 'S'),
 ('.', 'S', '.'),
 ('S', '.', 'And'),
 ('.', 'And', 'Japan'),
 ('And', 'Japan', 'has'),
 ('Japan', 'has', 'raised'),
 ('has', 'raised', 'fears'),
 ('raised', 'fears', 'among'),
 ('fears', 'among', 'many'),
 ('among', 'many', 'of'),
 ('many', 'of', 'Asia'),
 ('of', 'Asia', "'"),
 ('Asia', "'", 's'),
 ("'", 's', 'exporting'),
 ('s', 'exporting', 'nations'),
 ('exporting', 'nations', 'that'),
 ('nations', 'that', 'the'),
 ('that', 'the', 'row'),
 ('the', 'row', 'could'),
 ('row', 'could', 'inflict'),
 ('could', 'inflict', 'far'),
 ('inf

In [5]:
# Trigram counts: trigram_model[(w1, w2)][w3] is the number of times w3
# follows the two-token history (w1, w2) in the corpus.
trigram_model = defaultdict(lambda: defaultdict(lambda: 0))
# Lowercased word types seen in the corpus (sentence markers not included)
VOCABULARY = set()
# The counting loop below unpacks 3-tuples, so N has to stay 3
N = 3
for sentence in reuters.sents():
    # Collect the vocabulary in the same pass so the corpus is read only once
    VOCABULARY.update(word.lower() for word in sentence)
    # Count every normalized trigram under its two-word history
    for w1, w2, w3 in ngrams(preprocess(sentence), N):
        trigram_model[(w1, w2)][w3] += 1

# Peek at the first four entries of the model
for i, entry in enumerate(trigram_model.items()):
    print(entry)
    if i == 3:
        break

# +2 for the <BOS> and <EOS> tokens
VOCABULARY_SIZE = len(VOCABULARY) + 2


def calculate_model_probabilities(
    model: defaultdict, alpha: float = 0.0, vocabulary_size: int = 0
) -> defaultdict:
    """Turn n-gram counts into conditional probabilities.

    For every history ``prefix`` and every continuation ``next_word`` stored
    under it, the result holds::

        (count(prefix, next_word) + alpha) / (count(prefix) + alpha * vocabulary_size)

    With the default ``alpha=0`` this is the plain relative frequency
    (no smoothing). ``alpha=1`` gives Laplace (add-one) smoothing.

    Params
    ------
    model: defaultdict
        Mapping ``prefix -> {next_word: count}``.
    alpha: float
        Additive smoothing constant. Default 0 (no smoothing).
    vocabulary_size: int
        Number of distinct tokens the model can emit. Only used, and then
        required to be positive, when ``alpha > 0``.

    Returns
    -------
    defaultdict
        Mapping ``prefix -> {next_word: probability}`` with the same nested
        defaultdict shape as the input. Only continuations present in
        ``model`` get an entry.

    Raises
    ------
    ValueError
        If ``alpha`` is negative, or ``alpha > 0`` without a positive
        ``vocabulary_size``.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")
    if alpha > 0 and vocabulary_size <= 0:
        raise ValueError("vocabulary_size must be positive when alpha > 0")

    result = defaultdict(lambda: defaultdict(lambda: 0))
    for prefix, next_word_counts in model.items():
        # Every time the prefix was seen followed by anything
        total = float(sum(next_word_counts.values()))
        denominator = total + alpha * vocabulary_size
        if denominator == 0:
            # Merely reading a missing word from the nested defaultdict stores
            # a 0 count, so a prefix can exist with no real observations.
            # Skip it instead of dividing by zero.
            continue
        for next_word, count in next_word_counts.items():
            result[prefix][next_word] = (count + alpha) / denominator
    return result


trigram_probs = calculate_model_probabilities(trigram_model)
# Continuations of the history ("this", "is"), most probable first
sorted(trigram_probs["this", "is"].items(), key=lambda pair: pair[1], reverse=True)


(('<BOS>', 'asian'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001D8934E1620>, {'exporters': 1, 'cocoa': 1, 'dollar': 2}))
(('asian', 'exporters'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001D8B3AD8860>, {'fear': 1}))
(('exporters', 'fear'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001D8B3AD8900>, {'damage': 1, 'china': 1}))
(('fear', 'damage'), defaultdict(<function <lambda>.<locals>.<lambda> at 0x000001D8B3AD87C0>, {'from': 1}))


[('the', 0.2328767123287671),
 ('a', 0.21232876712328766),
 ('not', 0.0684931506849315),
 ('about', 0.03424657534246575),
 ('because', 0.03424657534246575),
 ('why', 0.02054794520547945),
 ('an', 0.02054794520547945),
 ('going', 0.02054794520547945),
 ('expected', 0.0136986301369863),
 ('just', 0.0136986301369863),
 ('hardly', 0.0136986301369863),
 ('done', 0.0136986301369863),
 ('in', 0.0136986301369863),
 ('when', 0.0136986301369863),
 ('22', 0.00684931506849315),
 ('believed', 0.00684931506849315),
 ('partly', 0.00684931506849315),
 ('strictly', 0.00684931506849315),
 ('yen', 0.00684931506849315),
 ('amore', 0.00684931506849315),
 ('up', 0.00684931506849315),
 ('well', 0.00684931506849315),
 ('most', 0.00684931506849315),
 ('definitely', 0.00684931506849315),
 ('clearly', 0.00684931506849315),
 ('approved', 0.00684931506849315),
 ('making', 0.00684931506849315),
 ('equivalent', 0.00684931506849315),
 ('based', 0.00684931506849315),
 ('reflecting', 0.00684931506849315),
 ('really', 0

In [6]:
def get_likely_words(
    model_probs: defaultdict, context: str, top_count: int = 10
) -> list[tuple]:
    """Return the most probable next words for a given context.

    The context is split on whitespace and the resulting tuple of tokens is
    used as-is as the lookup key, so it must match the keys of
    ``model_probs`` exactly (same number of tokens, same casing).

    Params
    ------
    model_probs: defaultdict
        Model probabilities, ``history -> {next_word: probability}``.
    context: str
        Whitespace-separated history tokens, e.g. ``"<BOS> the"``.
    top_count: int
        Number of most probable words to return. Default 10

    Returns
    -------
    list[tuple]
        ``(word, probability)`` pairs sorted by decreasing probability.
        Empty when the context is not in the model.
    """
    history = tuple(context.split())
    # .get() rather than indexing: indexing a defaultdict with an unseen
    # history would insert an empty entry, so lookups would mutate the model.
    candidates = model_probs.get(history, {})
    ranked = sorted(candidates.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_count]


# Three most probable words following a sentence-initial "the"
get_likely_words(trigram_probs, "<BOS> the", top_count=3)

[('company', 0.13028764805414553),
 ('bank', 0.024591088550479413),
 ('u', 0.01500282007896221)]