# Utility functions for featurizing passages

## Retrieve n-gram frequency

Google n-gram queries take a while to load. The idea is to create an n-gram 'retriever' class that saves past queries.

In [6]:
import json
# from google_ngram_downloader import readline_google_store
import google_ngram
import urllib.request

In [17]:
class NgramFetcher:
    """Look up Google n-gram frequencies and remember the results in a JSON file.

    The cache (``self.store``) is a plain dict. Two reserved keys hold the query
    settings (``'__years'`` and ``'__case_insensitive'``); every other key is a
    phrase whose whitespace-separated words are joined with ``'*'``, mapped to
    the mean frequency that was fetched for it.
    """

    def __init__(self, jsonpath, years=[1940, 2000], case_insensitive=True):
        """Load the cache at ``jsonpath``, or start an empty one if it is unusable.

        Args:
            jsonpath: Path of the JSON cache file; also the default save target.
            years: Year range handed to ``google_ngram.Gngram``. Only used when
                no cache could be loaded; a loaded cache supplies its own.
            case_insensitive: Flag handed to ``google_ngram.Gngram``. Only used
                when no cache could be loaded.
        """
        self.path = jsonpath
        try:
            with open(self.path) as f:
                loaded = json.load(f)
            loaded_years = loaded['__years']
            loaded_case = loaded['__case_insensitive']
        except (OSError, ValueError, KeyError, TypeError):
            # OSError: missing/unreadable file. ValueError: invalid JSON or bad
            # encoding. KeyError/TypeError: valid JSON that is not a cache dict.
            print('Failed to open path', jsonpath)
            print('Creating new json instead.')
            # Copy so instances never share (or mutate) the default list object.
            self.years = list(years)
            self.case_insensitive = case_insensitive
            self.store = {
                '__years': self.years,
                '__case_insensitive': self.case_insensitive}
        else:
            self.store = loaded
            self.years = loaded_years
            self.case_insensitive = loaded_case
            print('Finished loading', jsonpath)

    def fetch(self, words):
        """Return the mean n-gram frequency for ``words``, using the cache first.

        Args:
            words: Phrase as a single string, e.g. ``'children played with'``.

        Returns:
            The cached value if this phrase was fetched before; otherwise the
            freshly fetched mean as a float, or ``0`` when the lookup raised
            ``HTTPError``. Fresh results (including the ``0``) are cached.
        """
        # '_START_ the children' -> '_START_*the*children'
        words_key = '*'.join(words.split())
        if words_key in self.store:
            # Past query: skip the slow network round trip.
            return self.store[words_key]
        try:
            g = google_ngram.Gngram([words], years=self.years, case_insensitive=self.case_insensitive)
            # NOTE(review): assumes df_parents.mean() reduces to a single value
            # that float() accepts -- confirm against the google_ngram package.
            mean = float(g.df_parents.mean())
        except urllib.request.HTTPError:
            print('Ngram not found for:', words)
            mean = 0
        self.store[words_key] = mean
        return mean

    def save(self, fp=None):
        """Write the cache as JSON to ``fp``, or to the constructor path if ``fp`` is falsy."""
        path = fp or self.path
        with open(path, 'w') as f:
            json.dump(self.store, f)
            print('File saved to', path)

In [11]:
# Load the saved n-gram frequencies; NgramFetcher starts a fresh store if this file cannot be read.
fetcher = NgramFetcher('data/staging/ngram_freqs.json')

Finished loading data/staging/ngram_freqs.json


In [12]:
# Nonsense phrase: exercises the "not found" branch of fetch() (see the message printed below).
fetcher.fetch('the childrenadfja sdjf ask were')

ngram not found for: the childrenadfja sdjf ask were


In [14]:
# No path argument, so the store is written back to the path the fetcher was created with.
fetcher.save()

File saved to data/staging/ngram_freqs.json


In [30]:
# Inspect the in-memory store: the two '__'-prefixed settings plus one '*'-joined key per fetched phrase.
fetcher.store

{'__years': [1940, 2000],
 '__case_insensitive': True,
 'the*children*were': 2.4748987450456315}

In [16]:
# Sample passage, pre-tokenized: one inner list per sentence, each wrapped in
# '_START_' / '_END_' boundary markers, with punctuation as separate tokens.
sentences = [
    ['_START_', 'Sam', 'and', 'Jo', 'went', 'for', 'a', 'hike', '.', '_END_'],
    ['_START_', 'They', 'took', 'a', 'path', 'through', 'the', 'woods', '.', '_END_'],
    ['_START_', 'Suddenly', ',', 'Sam', 'heard', 'a', 'noise', 'coming', 'from',
     'the', 'tree', 'above', 'their', 'heads', '.', '_END_'],
    ['_START_', 'Jo', 'climbed', 'up', 'to', 'see', 'what', 'the', 'noise', 'was',
     'and', 'found', 'two', 'baby', 'squirrels', '.', '_END_'],
    ['_START_', 'The', 'babies', 'were', 'alone', ',', 'but', 'their', 'mother',
     'must', 'be', 'somewhere', 'near', '.', '_END_'],
    ['_START_', 'The', 'children', 'watched', 'and', 'waited', '.', '_END_'],
    ['_START_', 'Sure', 'enough', ',', 'the', 'mother', 'soon', 'returned', 'with',
     'a', 'mouthful', 'of', 'nuts', '.', '_END_'],
    ['_START_', 'The', 'noises', 'stopped', 'as', 'the', 'baby', 'squirrels',
     'began', 'to', 'eat', '.', '_END_'],
    ['_START_', 'Sam', 'and', 'Jo', 'smiled', ',', 'knowing', 'the', 'squirrels',
     'were', 'safe', 'with', 'their', 'mother', '.', '_END_'],
]