In [64]:
from utility import EnglishTextProcessor
import numpy as np

In [3]:
# Shared text pre-processor; it is called on every abstract before the
# abstract is embedded (see the batch loop at the end of the notebook).
etp = EnglishTextProcessor()

In [4]:
from gensim.models import FastText

In [15]:
import json
import os
import pickle

In [52]:
# SECURITY NOTE: unpickling runs arbitrary code stored in the file. Only load
# a 'vec.model' that was produced by a trusted run of this project.
with open('vec.model', 'rb') as vec_file:  # context manager closes the handle
    vec = pickle.load(vec_file)
fasttext = FastText.load('fasttext.model')

# NOTE(review): the two names below read inverted relative to their contents.
# They are kept unchanged because other cells refer to them.
#   word_to_idx : list, position -> feature name (indexed by column number)
#   idx_to_word : dict, feature name -> position
word_to_idx = list(vec.get_feature_names())

idx_to_word = {word: position for position, word in enumerate(word_to_idx)}

In [10]:
# Directory holding the JSON batch files that the iterator helpers below read.
abstract_folder = 'abstract_dictionaries'

In [30]:
def iter_id_and_abstracts(abstract_folder):
    """Yield ``(id, abstract)`` pairs from every JSON batch file in a folder.

    Files are visited in ascending order of the integer found between the last
    underscore and the following dot of the file name (for example
    ``abstracts_12.json`` sorts as 12).

    Args:
        abstract_folder: Path of the directory that holds the JSON files. Each
            file is expected to contain one JSON object mapping an id to its
            abstract.

    Yields:
        ``(id, abstract)`` tuples, file by file, in the key order of each
        loaded object.

    Directory entries whose name carries no integer suffix (for example
    ``.ipynb_checkpoints``) are skipped instead of aborting the iteration.
    Files whose content cannot be parsed are skipped as well.
    """
    def batch_number(file_name):
        # 'abstracts_12.json' -> 12; raises ValueError when there is no number.
        return int(file_name.split('_')[-1].split('.')[0])

    json_file_names = []
    for file_name in os.listdir(abstract_folder):
        try:
            batch_number(file_name)
        except ValueError:
            # Not a numbered batch file; ignore it rather than crash the sort.
            continue
        json_file_names.append(file_name)
    json_file_names.sort(key=batch_number)

    for file_name in json_file_names:
        file_path = os.path.join(abstract_folder, file_name)
        # Only the read/parse step is guarded, so the handler cannot swallow
        # an unrelated ValueError. json.JSONDecodeError is a ValueError.
        try:
            with open(file_path) as id_to_abstract_json_file:
                id_to_abstract_dict = json.load(id_to_abstract_json_file)
        except ValueError:
            continue

        for abstract_id, abstract in id_to_abstract_dict.items():
            yield abstract_id, abstract

In [34]:
def iter_batches(abstract_folder):
    """Yield the parsed content of every JSON batch file in a folder.

    Files are visited in ascending order of the integer found between the last
    underscore and the following dot of the file name (for example
    ``abstracts_12.json`` sorts as 12).

    Args:
        abstract_folder: Path of the directory that holds the JSON files.

    Yields:
        The object returned by ``json.load`` for each file, one per file. The
        batch loop in this notebook treats it as a dict mapping an id to its
        abstract.

    Directory entries whose name carries no integer suffix (for example
    ``.ipynb_checkpoints``) are skipped instead of aborting the iteration.
    Files whose content cannot be parsed are skipped as well.
    """
    def batch_number(file_name):
        # 'abstracts_12.json' -> 12; raises ValueError when there is no number.
        return int(file_name.split('_')[-1].split('.')[0])

    json_file_names = []
    for file_name in os.listdir(abstract_folder):
        try:
            batch_number(file_name)
        except ValueError:
            # Not a numbered batch file; ignore it rather than crash the sort.
            continue
        json_file_names.append(file_name)
    json_file_names.sort(key=batch_number)

    for file_name in json_file_names:
        file_path = os.path.join(abstract_folder, file_name)
        # Only the read/parse step is guarded, so the handler cannot swallow
        # an unrelated ValueError. json.JSONDecodeError is a ValueError.
        try:
            with open(file_path) as id_to_abstract_json_file:
                id_to_abstract_dict = json.load(id_to_abstract_json_file)
        except ValueError:
            continue

        yield id_to_abstract_dict

In [72]:
import scipy

def compute_abstract_embedding(abstract):
    """Return the mean FastText vector of the terms found in an abstract.

    The abstract is run through the global vectorizer ``vec``. Each column
    with a stored entry in the resulting row is mapped back to its term, and
    the vectors of the terms present in ``fasttext.wv.vocab`` are averaged
    with equal weight. The TF-IDF values themselves are not used as weights;
    the transform only selects which distinct terms take part.

    Args:
        abstract: The already pre-processed abstract, in whatever form
            ``vec.transform`` accepts as an element of its input list.

    Returns:
        list: The mean vector as a list of floats whose length is the model's
        ``vector_size``. When no term is in the FastText vocabulary, a list of
        integer zeros of that same length is returned.
    """
    # COO form exposes the column index of every stored entry directly.
    tfidf_row = scipy.sparse.coo_matrix(vec.transform([abstract]))

    # Take the dimension from the model instead of hard-coding it, so the
    # function keeps working if the model is retrained with another size.
    embedding_size = fasttext.wv.vector_size

    word_count = 0
    sum_embedding = np.zeros(embedding_size)
    for word_index in tfidf_row.col:
        # Despite its name, word_to_idx is the list mapping column -> term.
        word = word_to_idx[word_index]
        if word in fasttext.wv.vocab:
            word_count += 1
            sum_embedding += fasttext.wv[word]

    if word_count == 0:
        return [0] * embedding_size

    return (sum_embedding / word_count).tolist()

def dump_json(d, output_file_path):
    """Serialize ``d`` as JSON into the file at ``output_file_path``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(output_file_path, 'w') as json_sink:
        json.dump(d, fp=json_sink)

In [74]:
# Embed every abstract of a batch and write one JSON file per batch, named
# after the batch's position in the iteration order.
#
# NOTE(review): the trailing `break` stops after the first batch, so only
# 'id_to_embedding_dict_0.json' is written. It is kept to preserve the current
# behaviour; remove it for a full run if it is a trial-run leftover.
id_to_embedding = None  # stays None when the folder yields no batch at all
for i, batch in enumerate(iter_batches(abstract_folder)):
    # The comprehension keeps its loop names local, so the builtin `id` is no
    # longer rebound in the notebook's global namespace.
    id_to_embedding = {
        abstract_id: compute_abstract_embedding(etp(abstract))
        for abstract_id, abstract in batch.items()
    }

    dump_json(id_to_embedding, 'id_to_embedding_dict_{}.json'.format(i))
    break