In [1]:
import pandas as pd
import numpy as np
from squad_df import v2
from collections import Counter, defaultdict
from blingfire import text_to_words
from tqdm import tqdm_notebook

def build(df):
    """Index the distinct passages of a question/context frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``question`` and ``context``; any other
        columns are ignored.

    Returns
    -------
    dataset : pandas.DataFrame
        One row per input row with columns ``question``, ``ctxid`` (integer
        id of the row's passage) and ``relevant`` (the passage text itself).
    ptoi : dict
        Passage text -> integer id.
    itop : dict
        Integer id -> passage text (inverse of ``ptoi``).
    passages : list
        Distinct passages; ``passages[i]`` is the passage with id ``i``.

    Raises
    ------
    KeyError
        If ``question`` or ``context`` is missing from ``df``.
    """
    questions = df['question'].tolist()
    contexts = df['context'].tolist()

    # De-duplicate in first-seen order. Going through set() made the ids
    # depend on string hashing, which is randomised per interpreter session,
    # so the same data produced different ctxid values on every kernel restart.
    passages = list(dict.fromkeys(contexts))
    ptoi = {passage: index for index, passage in enumerate(passages)}
    itop = dict(enumerate(passages))

    # Build the frame column-wise instead of walking it with iterrows().
    dataset = pd.DataFrame(
        {
            'question': questions,
            'ctxid': [ptoi[context] for context in contexts],
            'relevant': contexts,
        },
        columns=['question', 'ctxid', 'relevant'],
    )
    return dataset, ptoi, itop, passages

In [2]:
# Materialise every record yielded by `v2` into a frame and keep only the
# rows whose `is_train` flag is set.
df = pd.DataFrame(list(v2))
df = df.loc[df['is_train']]
# Alphabet used for the n-gram vocabulary: each character that occurs more
# than 1000 times in the lower-cased concatenation of all training contexts.
all_letters = [
    char
    for char, freq in Counter(''.join(df['context']).lower()).items()
    if freq > 1000
]

In [3]:
class WordIdf:
    """Bag-of-character-n-grams document vectoriser with document-frequency weighting.

    The vocabulary is every string of length 1..``ngrams`` over a fixed
    alphabet, so its size is ``sum(len(letters) ** k for k in 1..ngrams)``.
    All vectors are dense ``numpy`` arrays of that length, which gets large
    quickly for big alphabets.

    A word is represented by the counts of its contiguous character n-grams
    that are in the vocabulary. A document vector is the sum over its distinct
    words of ``word_vector * term_count / (1 + document_frequency)``.

    NOTE(review): grams are matched case-sensitively. The notebook-level
    alphabet is built from lower-cased text while tokens keep their original
    case, so capitalised grams are presumably dropped -- confirm intent.

    Attributes set by ``__init__``: ``ngrams``, ``grams``, ``gram_length``,
    ``gram_to_index``. ``fit_transform`` additionally sets ``idf`` (word ->
    number of documents containing it).
    """

    def __init__(self, ngrams=3, letters=None):
        """Build the n-gram vocabulary.

        Parameters
        ----------
        ngrams : int
            Longest n-gram length to index.
        letters : iterable of str, optional
            Alphabet to build the vocabulary from. When omitted, the
            module-level ``all_letters`` is used (the original behaviour).
        """
        self.ngrams = ngrams
        if letters is None:
            # Backward-compatible default: the alphabet computed by the notebook.
            letters = all_letters
        # Materialise (the generator below re-iterates it) and drop repeats so
        # that gram_length always equals len(gram_to_index).
        letters = list(dict.fromkeys(letters))
        self.grams = list(self._make_grams(letters, self.ngrams))
        self.gram_length = len(self.grams)
        self.gram_to_index = {gram: index for index, gram in enumerate(self.grams)}
        # Per-instance cache: word -> (gram indices, gram counts). It used to
        # be a mutable default argument shared by every instance, so instances
        # with different vocabularies handed each other wrong-sized vectors.
        self._cache = {}

    def _make_grams(self, letters, n):
        """Yield every string of length 1..n over ``letters`` (vocabulary order)."""
        for pre in letters:
            yield pre
            if n > 1:
                for post in self._make_grams(letters, n - 1):
                    yield pre + post

    def _word_grams(self, word):
        """Yield the contiguous substrings of ``word`` of length 1..``ngrams``."""
        for size in range(1, self.ngrams + 1):
            for start in range(len(word) - size + 1):
                yield word[start:start + size]

    def _gram_counts(self, word, cache=None):
        """Return ``(indices, counts)`` of the vocabulary grams found in ``word``."""
        store = self._cache if cache is None else cache
        if word not in store:
            hits = Counter()
            for gram in self._word_grams(word):
                index = self.gram_to_index.get(gram)
                if index is not None:
                    hits[index] += 1
            # Stored sparsely: a dense vector per distinct word does not scale
            # with the vocabulary size.
            store[word] = (
                np.fromiter(hits.keys(), dtype=np.intp, count=len(hits)),
                np.fromiter(hits.values(), dtype=float, count=len(hits)),
            )
        return store[word]

    def _tokenize(self, doc):
        """Split ``doc`` into word tokens."""
        # text_to_words returns ONE space-delimited string; wrapping it in
        # list() (as before) produced single characters instead of words.
        return text_to_words(doc).split()

    def __getitem__(self, word, cache=None):
        """Return a fresh dense n-gram count vector for ``word``.

        ``cache`` may be a dict to use instead of the instance cache; it is
        kept only for signature compatibility.
        """
        indices, counts = self._gram_counts(word, cache)
        vec = np.zeros(self.gram_length)
        vec[indices] = counts
        return vec

    def fit_transform(self, docs):
        """Count document frequencies over ``docs`` and vectorise each document.

        Parameters
        ----------
        docs : iterable of str

        Returns
        -------
        list of numpy.ndarray
            One vector of length ``gram_length`` per document, in input order.
            A document without tokens yields an all-zero vector (previously
            ``None``).
        """
        self.idf = defaultdict(int)
        # First pass: tokenise once and count in how many documents each word occurs.
        per_doc_counts = []
        for doc in docs:
            counts = Counter(self._tokenize(doc))
            per_doc_counts.append(counts)
            for word in counts:
                self.idf[word] += 1
        # Second pass: accumulate the weighted word vectors of every document.
        docvecs = []
        for counts in tqdm_notebook(per_doc_counts):
            vec = np.zeros(self.gram_length)
            for word, count in counts.items():
                indices, gram_counts = self._gram_counts(word)
                # indices are unique per word, so in-place fancy-index add is safe.
                vec[indices] += (gram_counts * count) / (1 + self.idf[word])
            docvecs.append(vec)
        return docvecs

In [4]:
# Build the vectoriser with its default n-gram order (ngrams=3); the
# vocabulary is generated from the notebook-level `all_letters` alphabet.
vec = WordIdf()

In [5]:
# Distinct passages in first-seen order. list(set(...)) ordered them by string
# hash, which is randomised per kernel session, so the row order of the
# resulting vectors was not reproducible across restarts.
docs = list(dict.fromkeys(df.context))

In [6]:
# Fit document frequencies on `docs` and compute one vector per passage.
# The recorded run below was interrupted (KeyboardInterrupt) before finishing.
x = vec.fit_transform(docs)

HBox(children=(IntProgress(value=0, max=19029), HTML(value='')))

KeyboardInterrupt: 