In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm

from ngrams import Ngram

In [2]:
class TypicalityEngine:
    """Score words and whole objects by their distance from the model's entropy.

    An ``Ngram`` model is fitted on the given texts and its conditional
    entropy ``H`` (trigram entropy minus bigram entropy, in bits) is cached.
    A score is ``abs(H - surprisal)``, where surprisal is the negated value
    returned by ``model.cond_prob(..., log=True)``; a score close to 0 means
    the item is about as surprising as the model expects on average.

    NOTE(review): ``H`` is computed with ``log2``. The base used by
    ``cond_prob(log=True)`` is not visible here -- confirm it is also base 2,
    otherwise the two quantities are on different scales.
    """

    def __init__(self, texts, **model_params):
        """Fit the n-gram model and compute its conditional entropy.

        Args:
            texts: documents handed to ``Ngram`` as ``documents``.
            **model_params: keyword overrides for the ``Ngram`` defaults used
                here (``ns=3``, ``documents=texts``, ``precompute_freqs=True``).
        """
        default_params = dict(ns=3, documents=texts, precompute_freqs=True)
        default_params.update(model_params)
        self.model = Ngram(**default_params)

        # NOTE(review): cond_entropy() hard-codes orders 2 and 3, so overriding
        # ``ns`` presumably requires the model to still expose those orders.
        self.H = self.cond_entropy()

    @staticmethod
    def is_boundary_gram(gram):
        """Return True if the space-joined n-gram contains ``<s>`` or ``</s>``."""
        return "<s>" in gram or "</s>" in gram

    def process_object(self, row):
        """Yield per-n-gram scores for one object, then the object's own score.

        Args:
            row: iterable of texts belonging to one object.

        Yields:
            ``((*context, word), score)`` for every n-gram that does not touch
            a ``<s>``/``</s>`` marker, with
            ``score = abs(H - (-log_prob))``. The final item yielded is a bare
            number: ``abs(H - mean_surprisal)`` where the mean runs over *all*
            n-grams of the object, boundary n-grams included.

        Raises:
            ZeroDivisionError: if ``row`` produces no n-grams at all.
        """
        log_prob_sum = 0.
        n_grams = 0
        for text in row:
            for *context, word in self.model.iter_ngrams(text, as_tuples=True):
                log_prob = self.model.cond_prob(word, *context, log=True)
                log_prob_sum += log_prob
                n_grams += 1

                # Boundary n-grams count towards the object score but are not
                # reported individually.
                if not self.is_boundary_gram(" ".join((*context, word))):
                    yield (*context, word), abs(self.H - (-log_prob))

        yield abs(self.H - (-log_prob_sum / n_grams))

    @staticmethod
    def entropy(probs):
        """Return the Shannon entropy, in bits, of the given probabilities.

        Entries equal to 0 contribute nothing (the usual ``0 * log 0 = 0``
        convention); multiplying them out directly would give
        ``0 * -inf = nan`` and poison the whole sum. Negative entries are not
        filtered and still produce ``nan``.
        """
        arr = np.asarray(probs, dtype=float)
        nonzero = arr[arr != 0]
        return -np.sum(nonzero * np.log2(nonzero))

    def cond_entropy(self):
        """Return ``H(trigrams) - H(bigrams)`` under the fitted model.

        Both entropies are taken over ``model.vocab(order)``, using
        ``model.prob`` of each space-separated n-gram.
        """
        H_context = self.entropy([self.model.prob(*gram.split(" "))
                                  for gram in tqdm(self.model.vocab(2))])
        H_joint = self.entropy([self.model.prob(*gram.split(" "))
                                for gram in tqdm(self.model.vocab(3))])
        return H_joint - H_context

In [3]:
# 1. Read the object table; Provenance and RelatedWorks are forced to the
#    pandas "string" dtype.
df = pd.read_csv(
    "../NMvW_data/v0.csv.gz",
    dtype={"Provenance": "string", "RelatedWorks": "string"},
)

# TODO: save & load DF s.t. these lines are not necessary here
df["ObjectID"] = df["ObjectID"].astype("int")
df = df.set_index("ObjectID").replace(np.nan, "")

# 2. Texts for the n-gram model: every title first, then every description.
texts = [*df["Title"], *df["Description"]]

In [4]:
# Fits the n-gram model on all texts and computes the conditional entropy H
# (both happen inside TypicalityEngine.__init__).
typ_E = TypicalityEngine(texts=texts)

(2, 3)-grams: Padding documents...: 100%|██████████| 1159854/1159854 [00:01<00:00, 805944.12it/s]


(2, 3)-grams: Term Document Matrix constructed...
(2, 3)-grams: Term frequencies precomputed...
(2, 3)-grams: Init done


100%|██████████| 2781546/2781546 [00:12<00:00, 222735.72it/s]
100%|██████████| 6203628/6203628 [00:28<00:00, 217717.42it/s]


In [None]:
# Score the first m objects: process_object yields the per-n-gram
# (gram, score) pairs first and the object-level score last.
m = 10000
w_typs, obj_typs = [], []

for _, record in tqdm(df[["Title", "Description"]][:m].iterrows(), total=m):
    scores = list(typ_E.process_object(record))
    obj_typs.append(scores[-1])
    w_typs.extend(scores[:-1])

In [None]:
# Distribution of the object-level scores abs(H - mean surprisal); a score
# near 0 means the object is about as surprising as the model's entropy.
fig, ax = plt.subplots()
ax.hist(obj_typs, bins=50)
ax.set(title="Object-level typicality scores",
       xlabel="|H - mean surprisal|",
       ylabel="number of objects");

In [None]:
# dict() keeps one score per distinct n-gram (later occurrences overwrite
# earlier ones); the values are materialised as a list because a dict view is
# not an array-like that every Matplotlib version accepts.
gram_scores = list(dict(w_typs).values())

fig, ax = plt.subplots()
ax.hist(gram_scores, bins=50)
ax.set(title="N-gram-level typicality scores",
       xlabel="|H - surprisal|",
       ylabel="number of distinct n-grams");

In [None]:
# The 100 objects whose score is smallest, shown as (title, score) pairs.
titles = df[:m]["Title"]
sorted(zip(titles, obj_typs), key=lambda pair: pair[1])[:100]

In [None]:
# Does the object score simply track text length? Compare the combined
# character count of Title + Description against the object-level score.
subset = df[:m]
lens = [len(title) + len(description)
        for title, description in zip(subset["Title"], subset["Description"])]

fig, ax = plt.subplots()
ax.plot(lens, obj_typs, ".")
ax.set(xlabel="characters in Title + Description",
       ylabel="|H - mean surprisal|")

# pearsonr / spearmanr are imported in the notebook's import cell.
pearsonr(lens, obj_typs), spearmanr(lens, obj_typs)

In [None]:
# The 100 distinct n-grams with the smallest score, as (gram, score) pairs.
gram_typs = dict(w_typs)
sorted(gram_typs.items(), key=lambda item: item[1])[:100]