In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch 
import torch.nn.functional as F

### 1. Import Data

In [None]:
data_file = "../data/data.txt"
# Read the corpus line by line, removing leading/trailing whitespace
# (including the newline) from every line as it is read.
with open(data_file, 'r') as f:
    data = [raw_line.strip() for raw_line in f]

In [None]:
# --- cleanup

# Fold everything to lowercase so differently-cased spellings share one token.
data = list(map(str.lower, data))

In [None]:
# Preview the first 20 entries of `data`.
data[:20]

#### 1.1. Data Exploration

In [None]:
# Vocabulary: every distinct space-separated token in the corpus.
#
# The tokens are sorted so that the word <-> index mapping is the same on every
# kernel restart (iterating a bare `set` of strings gives an arbitrary order).
# The empty string is always included and, because it sorts before any other
# string, always receives index 0. The rest of the notebook relies on that:
# index 0 pads the initial context and marks the end of a generated text.
corpus = sorted(set(' '.join(data).split(' ')) | {''})
itoc = dict(enumerate(corpus))
ctoi = {c: i for i, c in itoc.items()}
num_words = len(corpus)
assert itoc[0] == '', "index 0 must be the empty padding/end token"
print(num_words)  # index 0 is ''

### 2. Co-occurrence (bigrams) model

In [None]:
def generarate_dataset(data: "list[str]", context_size: int = 2,
                       word_to_idx: "dict[str, int] | None" = None):
    """
    Build (context, next word) training pairs from lines of text.

    Every line is split on single spaces. For each word, the indices of the
    `context_size` previous words form one row of X and the word's own index
    is the matching entry of y. The context starts as all zeros and is carried
    over from one line to the next (it is not reset at line boundaries).

    Params
    ------
    data: list of str
        Lines of text.
    context_size: int
        Number of preceding word indices in each context row; must be >= 1.
    word_to_idx: dict, optional
        Mapping from word to integer index. When omitted, the notebook-level
        `ctoi` mapping is used.

    Returns
    -------
    X: int64 tensor of shape (num_samples, context_size)
    y: int64 tensor of shape (num_samples,)

    Raises
    ------
    ValueError: if context_size is smaller than 1
    KeyError: if a word is missing from the mapping
    """
    if context_size < 1:
        raise ValueError("context_size must be >= 1")
    if word_to_idx is None:
        word_to_idx = ctoi  # fall back to the notebook-level vocabulary

    X, y = [], []
    context = [0] * context_size
    for line in data:
        for word in line.split(' '):
            idx = word_to_idx[word]
            X.append(context)
            y.append(idx)
            # Slide the window; this builds a new list, so rows already
            # stored in X are not modified.
            context = context[1:] + [idx]

    # Explicit dtype and shape keep the result well-formed for empty input,
    # where torch.tensor([]) alone would give a 1-D float tensor.
    X = torch.tensor(X, dtype=torch.long).reshape(len(y), context_size)
    y = torch.tensor(y, dtype=torch.long)
    return X, y

# Number of preceding words used as the context for each prediction.
context_size = 2
X, y = generarate_dataset(data, context_size=context_size)


In [None]:
# Distinct context rows of X.
# `set(X)` cannot be used for this: tensors hash by object identity, so every
# row would be kept as a separate "context" and `num_context` would equal the
# number of samples. `torch.unique(..., dim=0)` de-duplicates by value.
contexts = torch.unique(X, dim=0)
num_context = len(contexts)
print(num_context)
# Contexts are keyed by the raw bytes of their index row, which makes them
# usable as dictionary keys: itob maps row index -> key, btoi maps key -> row index.
itob = {i: b.numpy().tobytes() for i, b in enumerate(contexts)}
btoi = {key: i for i, key in itob.items()}

In [None]:
# get context count and proba distribution matrix
def get_proba_distribution(X, y, model_smoothing: bool = True):
    """
    Count how often each word follows each context and normalise the counts
    into one probability distribution per context.

    Uses the notebook-level `btoi` (context bytes -> row index),
    `num_context` and `num_words`.

    Params
    ------
    X: tensor
        One context per row, as produced by the dataset builder.
    y: tensor
        Index of the word that follows the matching row of X.
    model_smoothing: bool
        if True, every count starts at 1 (add-one smoothing), so no
        context/word pair has probability zero; this flattens the
        distributions. If False, counts start at 0.

    Returns
    -------
    C: Counts, shape (num_context, num_words)
    P: Probability, same shape; every row sums to 1. A row with no counts at
       all (only possible without smoothing) is set to the uniform
       distribution instead of NaN.
    """
    # init counts
    C = torch.ones((num_context, num_words)) if model_smoothing else torch.zeros((num_context, num_words))

    # counting each context-word
    for ix, iy in zip(X, y):
        idx_context = btoi[ix.numpy().tobytes()]
        C[idx_context, iy] += 1

    # compute probability for each context
    row_totals = C.sum(1, keepdim=True)
    empty_rows = row_totals.squeeze(1) == 0
    # Non-empty rows have a total of at least 1, so the clamp only changes the
    # divisor of empty rows, avoiding 0 / 0 = NaN there.
    P = C / row_totals.clamp_min(1)
    P[empty_rows] = 1.0 / max(num_words, 1)
    return C, P

In [None]:
# Counts and probabilities with smoothing switched on.
C1, P1 = get_proba_distribution(X, y, model_smoothing=True)
# C2, P2 = get_proba_distribution(X, y, False) # division by 0?

### 3. Evaluate Model

### 4. Sample from Model

In [None]:
def generate_sonnet(P: torch.Tensor, context_size: int = 2, max_words: int = 1000):
    """
    Sample a text word by word from the per-context probability table `P`.

    Generation starts from an all-zero context and slides the context window
    after every sampled word. It stops when index 0 is sampled (the same index
    that pads the start context) or after `max_words` words. The sampled words
    are then grouped into lines, starting a new line after every word that
    ends with a comma.

    Uses the notebook-level `btoi` (context bytes -> row of P) and `itoc`
    (word index -> word).

    Params
    ------
    P: tensor
        One row of next-word probabilities per known context.
    context_size: int
        Number of previous word indices that form a context.
    max_words: int
        Upper bound on the number of sampled words, so generation always ends.

    Returns
    -------
    list of str: the generated lines; the stop token itself is not included.

    Raises
    ------
    RuntimeError: from torch.multinomial if a used row of P is not a valid
        set of sampling weights (e.g. contains NaN or sums to zero).
    """
    context = [0] * context_size
    res = []
    for _ in range(max_words):
        # sample from multinomial distribution given context
        hashed_context = torch.tensor(context, dtype=torch.long).numpy().tobytes()
        idx_context = btoi.get(hashed_context)
        if idx_context is None:
            # Context never seen in the data: fall back to uniform weights
            # over the vocabulary instead of failing with a KeyError.
            weights = torch.ones(P.shape[1])
        else:
            weights = P[idx_context]
        idx_next_word = torch.multinomial(weights, num_samples=1).item()

        if idx_next_word == 0:
            break

        res.append(itoc[idx_next_word])
        # Slide the window so the next draw is conditioned on the words just
        # generated rather than on the start context every time.
        context = (context + [idx_next_word])[1:]

    # format into prose and paragraphs => every ','
    out = []
    tmp = []
    for word in res:
        tmp.append(word)
        if word.endswith(','):
            out.append(' '.join(tmp))
            tmp = []
    # Keep the words that come after the last comma.
    if tmp:
        out.append(' '.join(tmp))
    return out


In [None]:
# Sample one text from the smoothed model.
t = generate_sonnet(P1, context_size=context_size)

In [None]:
# Display the value returned by generate_sonnet.
t

### 5. Explaining why this model sucks

What to look for:
- likelihood
- cross-entropy
- entropy

#### 5.1. Entropy

This model has high entropy: within each context, the counts are pretty much the same across all words. This means that each distribution is nearly flat (close to uniform) rather than peaked on a few likely words, so the model is not confident in its predictions and its samples have high variance.

In [None]:
# Entropy in bits of each row of P1 (one row per context),
# followed by the total over all rows.
entropy = -(P1 * P1.log2()).sum(dim=1)
print(entropy.sum())

#### 5.2. Likelihood (TODO)

#### 5.3. Cross-entropy (TODO)

In [None]:
# TODO: placeholder — the loop body does nothing yet, so `y_pred` stays empty.
y_pred = []
for ix, iy in zip(X, y):
    # multinomial
    pass

In [None]:
# NOTE(review): scratch call. Both arguments are 1-D float tensors of the same
# shape, so the target is presumably treated as per-class weights/probabilities
# rather than a class index; [5.0, 5.0] does not sum to 1 — confirm this is the
# intended experiment before relying on the value.
F.cross_entropy(torch.tensor([1.0, 1.0]), torch.tensor([5.0, 5.0]))