In [19]:
import numpy as np
import matplotlib.pyplot as plt
import torch 
import torch.nn.functional as F

### 1. Import Data

In [3]:
data_file = "../data/data.txt"
with open(data_file, 'r') as f:
    data = f.readlines()
data = [line.strip() for line in data]

In [4]:
# --- cleanup

# lowercase 
data = [line.lower() for line in data]

In [5]:
data[:20]

['from fairest creatures we desire increase,',
 "that thereby beauty's rose might never die,",
 'but as the riper should by time decease,',
 'his tender heir might bear his memory:',
 'but thou contracted to thine own bright eyes,',
 "feed'st thy light's flame with self-substantial fuel,",
 'making a famine where abundance lies,',
 'thy self thy foe, to thy sweet self too cruel:',
 "thou that art now the world's fresh ornament,",
 'and only herald to the gaudy spring,',
 'within thine own bud buriest thy content,',
 "and, tender churl, mak'st waste in niggarding:",
 'pity the world, or else this glutton be,',
 "to eat the world's due, by the grave and thee.",
 '',
 'when forty winters shall besiege thy brow,',
 "and dig deep trenches in thy beauty's field,",
 "thy youth's proud livery so gazed on now,",
 "will be a totter'd weed of small worth held:",
 'then being asked, where all thy beauty lies,']

#### 1.1. Data Exploration

In [6]:
# corpus len
corpus = set(' '.join(data).split(' '))
itoc = {i:c for i, c in enumerate(corpus)}
ctoi = {c:i for i, c in enumerate(corpus)}
num_words = len(corpus)
print(num_words) # first idx is ''

21396


### 2. Co-occurence (bigrams) model

In [7]:
def generarate_dataset(data: [str], context_size: int = 2):
    X, y = [], []
    context = [0] * context_size
    for line in data:
        words = line.split(' ')
        for word in words:
            idx = ctoi[word]
            X.append(context)
            y.append(idx)
            context = context[1:] + [idx]
    X = torch.tensor(X)
    y = torch.tensor(y)
    return X, y

context_size = 2
X, y = generarate_dataset(data, context_size)


In [8]:
contexts = list(set(X))
num_context = len(contexts)
print(num_context)
itob = {i: b.numpy().tobytes() for i, b in enumerate(contexts)}
btoi = {b.numpy().tobytes(): i for i, b in enumerate(contexts)}

161980


In [9]:
# get context count and proba distribution matrix
def get_proba_distribution(X, y, model_smoothing: bool = True):
    """
    Params
    ------
    model_smoothing: bool
        if True, then set counts to 1. Useful to avoid zero division, but may create entropy

    Returns
    -------
    C: Counts
    P: Probability
    """
    # init counts
    C = torch.ones((num_context, num_words)) if model_smoothing else torch.zeros((num_context, num_words))

    # counting each context-word
    for ix, iy in zip(X, y):
        idx_context = btoi[ix.numpy().tobytes()]
        C[idx_context, iy] += 1

    # compute probability for each context
    P = C / C.sum(1, keepdim=True)
    return C, P

In [10]:
C1, P1 = get_proba_distribution(X, y, True)
# C2, P2 = get_proba_distribution(X, y, False) # division by 0?

### 3. Evaluate Model

### 4. Sample from Model

In [11]:
def generate_sonnet(P: torch.tensor, context_size: int = 2):
    context = [0] * context_size
    res = []
    while True:
        # sample from multinomial distribution given context
        hashed_context = torch.tensor(context).numpy().tobytes()
        idx_context = btoi[hashed_context]
        idx_next_word = torch.multinomial(P[idx_context], num_samples=1).item()
        next_word = itoc[idx_next_word]
        res.append(next_word)

        if idx_next_word == 0:
            break

    # format into prose and paragraphs => every ','
    out = []
    tmp = []
    for word in res:
        tmp.append(word)
        if word.endswith(','):
            out.append(' '.join(tmp))
            tmp = []
    return out


In [12]:
t = generate_sonnet(P1, context_size)

In [13]:
t

['overlook thine: harlot sheaves,',
 'blue underbearing immortal. her--as ungently? mortal-staring bona. emulation. flout,',
 "ravenspurgh names; waded soundly'? babes praises,",
 'tuft blood,',
 'hateful humility; deaf,',
 'yours? despising,',
 "hardness: task temple: 'i.' sea-sick fixed court brow,",
 'decease: poisoned,',
 'tenderly bride,',
 'nothing,',
 'wary galen pen; dares,',
 'warmth,',
 "ancus o' suitor? unknown; smoke? venom'd afterwards putting upon: pursues,",
 'thence? deliver person; opposition tailor,',
 'unvenerable canker,',
 'shift forsake spirit. confine leese eternity,',
 'streaks rage; wherever mercy. stories. liberal bull,',
 'nobler protest,',
 'norfolk maid,',
 'kinsman,',
 'sturdy curs,',
 'entire mirthful possible winds. need. lion pugging victories to? defective jack,',
 "worry needy wash'd thump chief cousins days. bound. storm. cheek? speech! blessed,",
 'convented. rather whither. body conditions,',
 'neptune,',
 'thy violence,-- concealed,',
 'sir! greek

### 5. Explaining why this model sucks

What to look for:
- likelihood
- cross-entropy
- entropy

#### 5.1. Entropy

This model has high entropy: the amount of counts is pretty much the same accross each context. This means that the distribution is sparse and the model is less confident in its prediction due to high variance

In [18]:
entropy = -torch.sum(P1 * torch.log2(P1), dim=1)
print(entropy.sum())

tensor(2330086.)


#### 5.2. Likelihood (TODO)

#### 5.3. Cross-entropy (TODO)

In [None]:
y_pred = []
for ix, iy in zip(X, y):
    # multinomial
    pass

In [25]:
F.cross_entropy(torch.tensor([1.0, 1.0]), torch.tensor([5.0, 5.0]))

tensor(6.9315)