In [1]:
from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from utils import Ngram

# Typicality

The typical set is defined as the set `S` of sentences in `\Sigma*` such that
        
        H(\Sigma*)-\epsilon <= - 1/|S| log P(s_1, ..., s_{|S|}) <= H(\Sigma*) + \epsilon

where we will assume for simplicity that sentences are independent of each other, so that the following identity is a good enough approximation

        - log P(s_1, ..., s_{|S|}) = - \sum_{s \in S} log P(s)
        
Due to the chain rule, 

        P(s) = P(w_1, ..., w_m) = \prod_i P(w_i | w_1, ..., w_{i-1})
        
where we can approximate each term by

        P(w_i| w_1, ..., w_{i-1}) = P(w_i|w_{i-n}, ..., w_{i-1})
        
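which is also known as an `n`-gram model (strictly speaking, conditioning on the previous `n` words as written above yields an `(n+1)`-gram model; with the usual convention the context is the previous `n-1` words).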


Putting everything back together, we want to compute for each sentence `s = (w_1, ..., w_m)`

        - \sum_{i} log P(w_i | w_{i-n}, ..., w_{i-1})
        
and for a set of sentences S

        - 1/|S| \sum_{j=1}^{|S|} \sum_{i=1}^{|s_j|} log P(w_{j,i} | w_{j,i-n}, ..., w_{j,i-1})

These two quantities are what we call the 'typicality' of a sentence and a set of sentences, respectively.



### Ways to Improve

 - back-off smoothing: greedily start with the maximal `n` and 'back off' to a lower `n` whenever no counts are available for the current `n`; this is essentially the same as interpolating the probabilities of longer sequences with those of their shorter subsequences

In [2]:
# Real corpus (currently disabled): every non-null Title and Description
# string from the objects table.
# obj_tbl = pd.read_csv("../../data/tables/Objects.csv.gz")
# str_cols = "Title", "Description"
# docs = [s for col in str_cols for s in obj_tbl[col].dropna()]

# Toy corpus used instead, small enough to check the n-gram counts by hand.
docs = [
    "hello world",
    "hasta la vista",
    "goede avond",
    "hasta la proxima",
]

In [11]:
# Fit the n-gram model on the corpus with n-gram range (1, 2).
NG = Ngram(documents=docs, ngram_range=(1, 2))

# Reverse lookup (index -> n-gram string), obtained by inverting the
# n-gram -> index mapping returned by `NG.vocab()`.
ind_to_voc = dict((index, term) for term, index in NG.vocab().items())

(1, 2)-grams: Term Document Matrix constructed...
(1, 2)-grams: Init done


In [4]:
# Inspect the fitted model: the dense term-document matrix together with the
# n-gram -> index mapping returned by `vocab(with_inds=True)`.
NG.term_doc_matrix.toarray(), NG.vocab(with_inds=True)

(array([[0, 0, 1, 0, 0],
        [0, 1, 0, 0, 1],
        [1, 0, 0, 0, 0],
        [0, 1, 0, 1, 0]]),
 {'hello world': 2,
  'hasta la': 1,
  'la vista': 4,
  'goede avond': 0,
  'la proxima': 3})

In [None]:
    def get_Ns(ng=None):
        """Return the summed term-document counts for every n-gram order.

        For each ``n`` in the model's inclusive ``ngram_range`` the columns of
        the term-document matrix that belong to the ``n``-grams are selected
        and all of their entries are added up.

        Parameters
        ----------
        ng : optional
            Object exposing ``ngram_range``, ``vocab(n, with_inds=True)`` and
            ``term_doc_matrix``. When omitted, the notebook-level ``NG`` is
            used, which is what this function always read before the
            parameter existed.

        Returns
        -------
        list
            One total per order, lowest order first.
        """
        if ng is None:
            # Backward-compatible default: fall back to the notebook global.
            ng = NG

        lowest, highest = ng.ngram_range
        totals = []
        for n in range(lowest, highest + 1):
            # The values of the vocab mapping are used as column indices into
            # the term-document matrix.
            cols = list(ng.vocab(n, with_inds=True).values())
            totals.append(ng.term_doc_matrix[:, cols].sum())
        return totals

In [None]:
# WRONG: need conditional probabilities (these are joint?)
def sent_prob(sent_ind, ng, log=True, index_to_term=None):
    """Combine the per-term probabilities of one sentence into a single score.

    The terms of the sentence are the columns with a positive entry in row
    ``sent_ind`` of ``ng.term_doc_matrix``; each distinct term contributes
    exactly once, however often it occurs in the sentence.

    Parameters
    ----------
    sent_ind : int
        Row of the term-document matrix to score.
    ng
        Model exposing ``term_doc_matrix`` and ``prob(term, log=...)``.
    log : bool, default True
        Forwarded to ``ng.prob``. When true the per-term values are summed,
        otherwise they are multiplied.
    index_to_term : mapping, optional
        Column index -> term lookup. Defaults to the notebook-level
        ``ind_to_voc``.

    Returns
    -------
    The sum (``log=True``) or product (``log=False``) of the per-term values.

    Notes
    -----
    As the comment above the function says, these are not the conditional
    probabilities that the chain-rule definition of typicality calls for.
    """
    if index_to_term is None:
        index_to_term = ind_to_voc

    _, term_inds = (ng.term_doc_matrix[sent_ind] > 0).nonzero()

    # Use the model that was passed in; previously the global `NG` was queried
    # here and the `ng` argument was ignored for the probabilities.
    term_scores = [ng.prob(index_to_term[t], log=log) for t in term_inds]

    return np.sum(term_scores) if log else np.prod(term_scores)

In [None]:
# NOTE: this cell defines `sent_prob` a second time; whichever of the two
# definitions is executed last is the one later cells call.
def sent_prob(sent_ind, ng, log=True, index_to_term=None):
    """Score one sentence by combining the probabilities of its terms.

    Row ``sent_ind`` of ``ng.term_doc_matrix`` is scanned for positive
    entries; every such column is looked up as a term and scored with
    ``ng.prob``. A term is counted once regardless of its count in the row.

    Parameters
    ----------
    sent_ind : int
        Row of the term-document matrix to score.
    ng
        Model exposing ``term_doc_matrix`` and ``prob(term, log=...)``.
    log : bool, default True
        Forwarded to ``ng.prob``; selects summation (true) or
        multiplication (false) of the per-term values.
    index_to_term : mapping, optional
        Column index -> term lookup. Defaults to the notebook-level
        ``ind_to_voc``.

    Returns
    -------
    The sum (``log=True``) or product (``log=False``) of the per-term values.
    """
    lookup = ind_to_voc if index_to_term is None else index_to_term

    _, term_inds = (ng.term_doc_matrix[sent_ind] > 0).nonzero()

    # Query the model passed as `ng`, not the global `NG`.
    values = [ng.prob(lookup[t], log=log) for t in term_inds]

    if log:
        return np.sum(values)
    return np.prod(values)

In [None]:
# Fixed seed so the sampled sentences, and the histogram built from them,
# are the same on every Restart & Run All.
SEED = 42
N_SAMPLES = 1000

rng = np.random.default_rng(SEED)

# Draw sentence (row) indices uniformly, with replacement.
inds = rng.choice(len(docs), size=N_SAMPLES)

# Score every sampled sentence with `sent_prob` (log=True by default).
ps = [sent_prob(i, NG) for i in tqdm(inds)]

In [None]:
# Histogram of the sampled sentence scores, drawn through the explicit Axes
# handle instead of the pyplot state machine.
ax = sns.histplot(ps)
ax.set_xlim((-1000, 5))

In [14]:
# Spot check on the toy corpus. "la" is followed by "vista" in one of the two
# documents that contain it, which matches the 0.5 shown below if
# `cond_prob(context, word)` returns P(word | context) -- argument order is
# assumed here, TODO confirm against `utils.Ngram`.
NG.cond_prob("la", "vista")

0.5