In [1]:
from utility import EnglishTextProcessor
import numpy as np
from tqdm import tqdm

In [2]:
# Shared EnglishTextProcessor instance; it is called on each raw abstract
# before that abstract's embedding is computed.
etp = EnglishTextProcessor()

In [3]:
from gensim.models import FastText, Word2Vec

In [4]:
import json
import os
import pickle

In [6]:
!mkdir word2vec_200
!mv word2vec_200.model* word2vec_200

vec = pickle.load(open('../support/models/vec.model', 'rb'))
fasttext = Word2Vec.load('word2vec_200/word2vec_200.model')
dimension = 200
word_to_idx = list(vec.get_feature_names())

idx_to_word = {}
for i, word in enumerate(word_to_idx):
    idx_to_word[word] = i

In [8]:
abstract_folder = '../support/abstract_dictionaries'
!mkdir word2vec_200_abstract_dictionaries

In [9]:
def iter_id_and_abstracts(abstract_folder):
    """Yield ``(id, abstract)`` pairs from every JSON batch file in a folder.

    Files are visited in ascending order of the integer found between the
    last ``_`` and the next ``.`` of the file name (``..._12.json`` -> 12),
    so batch 10 comes after batch 2 rather than after batch 1.

    Args:
        abstract_folder: Directory of batch files. Every entry must be named
            so that the sort key above parses as an integer; any other entry
            makes the sort raise ``ValueError``.

    Yields:
        ``(key, value)`` for each key of each file's top-level JSON object.

    Warns:
        RuntimeWarning: when a file cannot be parsed as JSON. The file is
            skipped (as before), but no longer silently, because a dropped
            batch means missing data downstream.
    """
    import warnings

    def batch_number(file_name):
        # 'name_12.json' -> 12
        return int(file_name.split('_')[-1].split('.')[0])

    for file_name in sorted(os.listdir(abstract_folder), key=batch_number):
        file_path = os.path.join(abstract_folder, file_name)
        # Only the load is guarded, so the handler cannot mask anything else.
        try:
            with open(file_path) as id_to_abstract_json_file:
                id_to_abstract_dict = json.load(id_to_abstract_json_file)
        except ValueError as error:
            warnings.warn(
                'Skipping {}: could not be parsed as JSON ({})'.format(file_path, error),
                RuntimeWarning,
            )
            continue

        for abstract_id in id_to_abstract_dict:
            yield abstract_id, id_to_abstract_dict[abstract_id]

In [10]:
def iter_batches(abstract_folder):
    """Yield the parsed content of every JSON batch file in a folder.

    Files are visited in ascending order of the integer found between the
    last ``_`` and the next ``.`` of the file name (``..._12.json`` -> 12),
    so batch 10 comes after batch 2 rather than after batch 1.

    Args:
        abstract_folder: Directory of batch files. Every entry must be named
            so that the sort key above parses as an integer; any other entry
            makes the sort raise ``ValueError``.

    Yields:
        The decoded JSON value of each file, one per file.

    Warns:
        RuntimeWarning: when a file cannot be parsed as JSON. The file is
            skipped (as before), but no longer silently: callers that number
            their outputs by position would otherwise drift out of step with
            the input file numbers without any hint.
    """
    import warnings

    def batch_number(file_name):
        # 'name_12.json' -> 12
        return int(file_name.split('_')[-1].split('.')[0])

    for file_name in sorted(os.listdir(abstract_folder), key=batch_number):
        file_path = os.path.join(abstract_folder, file_name)
        # Only the load is guarded, so the handler cannot mask anything else.
        try:
            with open(file_path) as id_to_abstract_json_file:
                id_to_abstract_dict = json.load(id_to_abstract_json_file)
        except ValueError as error:
            warnings.warn(
                'Skipping {}: could not be parsed as JSON ({})'.format(file_path, error),
                RuntimeWarning,
            )
            continue

        yield id_to_abstract_dict

In [11]:
# Import the subpackage explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` is loaded on older SciPy releases.
import scipy.sparse

def compute_abstract_embedding(abstract):
    """Build one fixed-length embedding for a processed abstract.

    The abstract is run through the module-level vectorizer ``vec``. For each
    non-zero term whose word is in the embedding model's vocabulary, the word
    vector is scaled by the term's weight and added to a running sum. The sum
    is then divided by the number of such terms (not by the sum of weights).

    Args:
        abstract: Text accepted by ``vec.transform``; presumably the output
            of the text processor - TODO confirm.

    Returns:
        A list of ``dimension`` floats. All zeros when no term of the
        abstract is in the embedding vocabulary.
    """
    tfidf_vec = scipy.sparse.coo_matrix(vec.transform([abstract]))

    # gensim 4 replaced KeyedVectors.vocab with key_to_index; fall back to
    # .vocab so gensim 3 models still work. Looked up once, outside the loop.
    vocabulary = getattr(fasttext.wv, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = fasttext.wv.vocab

    word_count = 0
    sum_embedding = np.zeros(dimension)
    # Only one document is transformed, so the row index carries no
    # information; iterate over (column, weight) only.
    for word_index, word_tfidf in zip(tfidf_vec.col, tfidf_vec.data):
        # `word_to_idx` is a list indexed by feature position, despite its name.
        word = word_to_idx[word_index]
        if word in vocabulary:
            word_count += 1
            sum_embedding += word_tfidf * fasttext.wv[word]

    if word_count == 0:
        # Floats, to match the element type of the normal return path.
        return [0.0] * dimension

    return (sum_embedding / word_count).tolist()

def dump_json(d, output_file_path):
    """Write ``d`` as JSON to ``output_file_path``, overwriting any existing file."""
    with open(output_file_path, mode='w') as sink:
        json.dump(d, fp=sink)

In [None]:
# Embed every abstract one input batch at a time and write one JSON file of
# {abstract id: embedding} per batch.
# NOTE: `i` counts the batches that iter_batches actually yields. Because
# iter_batches skips files it cannot parse, the output numbering can differ
# from the input file numbers after a skipped file.
id_to_embedding = None
for i, batch in tqdm(enumerate(iter_batches(abstract_folder))):
    id_to_embedding = {}
    # Named `abstract_id` rather than `id`: a module-level loop variable
    # called `id` would replace the builtin id() for the rest of the session.
    for abstract_id in batch:
        processed_abstract = etp(batch[abstract_id])
        id_to_embedding[abstract_id] = compute_abstract_embedding(processed_abstract)

    dump_json(id_to_embedding, 'word2vec_200_abstract_dictionaries/id_to_embedding_dict_{}.json'.format(i))

27it [36:23, 84.75s/it]

In [None]:
# NOTE(review): no output was saved for this run. Presumably this evaluates
# the embeddings written above - confirm what evaluate_fos_200.py reads.
!python evaluate_fos_200.py

In [14]:
# NOTE(review): this notebook sets dimension = 200 but this cell runs
# evaluate_fos_150.py - confirm that this is the intended script. The printed
# numbers are unlabelled here; their meaning is defined by the script.
!python evaluate_fos_150.py

0.0036
0.4
0.0036
0.8571428571428571
0.0034
0.53125
0.0044
0.55
0.005
0.7352941176470589
0.001
0.22727272727272727
0.0008
0.15384615384615385
0.006
0.6976744186046512
0.003
0.3191489361702128
0.002
0.4166666666666667
0.0036
0.6206896551724138
0.0038
0.35185185185185186
0.0034
0.8095238095238095
0.0016
0.3076923076923077
0.0032
0.8
0.007
0.7608695652173914
0.0052
0.9285714285714286
0.0026
0.65
0.0042
0.84
0.0018
0.34615384615384615
0.0034
0.5666666666666667
0.0012
0.2727272727272727
0.0022
0.55
0.0052
0.7878787878787878
0.0026
0.5909090909090909
0.002
0.37037037037037035
0.0068
0.68
0.0038
0.4634146341463415
0.0026
0.325
0.0052
0.48148148148148145
0.0012
0.2608695652173913
0.0054
0.7714285714285715
0.0006
0.13636363636363635
0.0028
0.6666666666666666
0.002
0.2564102564102564
0.0038
0.76
0.0046
0.696969696969697
0.0046
0.6764705882352942
0.003
0.75
0.0
0.0
0.001
0.23809523809523808
0.0018
0.42857142857142855
0.0014
0.3181818181818182
0.0034
0.4857142857142857
0.0058
0.6041666666666666
0.

In [15]:
# NOTE(review): same command as the other evaluate_fos_150.py cells but with
# different output, so the script or its inputs presumably changed between
# runs - record what changed.
!python evaluate_fos_150.py

0.012
0.25
0.013
0.38235294117647056
0.006
0.2222222222222222
0.01
0.45454545454545453
0.009
0.375
0.019
0.4418604651162791
0.005
0.22727272727272727
0.016
0.64
0.001
0.047619047619047616
0.021
0.75
0.007
0.175
0.014
0.7
0.009
0.20930232558139536
0.004
0.15384615384615385
0.021
0.6
0.009
0.391304347826087
0.012
0.5714285714285714
0.0
0.0
0.008
0.1702127659574468
0.004
0.14814814814814814
0.01
0.2222222222222222
0.008
0.34782608695652173
0.005
0.06329113924050633
0.006
0.2727272727272727
0.024
0.7058823529411765
0.006
0.2608695652173913
0.004
0.16
0.007
0.1794871794871795
0.005
0.15151515151515152
0.005
0.22727272727272727
0.001
0.038461538461538464
0.016
0.36363636363636365
0.003
0.1111111111111111
0.016
0.5517241379310345
0.008
0.38095238095238093
0.004
0.16666666666666666
0.014
0.6666666666666666
0.002
0.08695652173913043
0.002
0.08695652173913043
0.009
0.42857142857142855
0.018
0.5454545454545454
0.017
0.425
0.005
0.15625
0.016
0.47058823529411764
0.009
0.375
0.005
0.192307692307692

In [16]:
# NOTE(review): same command as the other evaluate_fos_150.py cells but with
# different output, so the script or its inputs presumably changed between
# runs - record what changed.
!python evaluate_fos_150.py

0.0
0.0
0.02
0.09090909090909091
0.01
0.03225806451612903
0.07
0.3333333333333333
0.04
0.16666666666666666
0.01
0.030303030303030304
0.0
0.0
0.04
0.2
0.04
0.11764705882352941
0.1
0.4
0.03
0.13043478260869565
0.01
0.047619047619047616
0.0
0.0
0.07
0.2
0.0
0.0
0.0
0.0
0.08
0.38095238095238093
0.02
0.1
0.05
0.11363636363636363
0.06
0.2608695652173913
0.07
0.175
0.01
0.024390243902439025
0.0
0.0
0.04
0.08333333333333333
0.07
0.35
0.05
0.15625
0.04
0.0851063829787234
0.01
0.043478260869565216
0.03
0.125
0.11
0.2037037037037037
0.01
0.030303030303030304
0.03
0.14285714285714285
0.08
0.2
0.0
0.0
0.03
0.14285714285714285
0.04
0.2
0.03
0.13043478260869565
0.02
0.02531645569620253
0.02
0.07407407407407407
0.06
0.13636363636363635
0.09
0.15517241379310345
0.09
0.2727272727272727
0.06
0.24
0.01
0.045454545454545456
0.0
0.0
0.01
0.037037037037037035
0.0
0.0
0.03
0.14285714285714285
0.0
0.0
0.09
0.32142857142857145
0.02
0.09090909090909091
0.05
0.10869565217391304
0.03
0.075
0.07
0.16279069767441862

In [17]:
# NOTE(review): same command as the other evaluate_fos_150.py cells but with
# different output, so the script or its inputs presumably changed between
# runs - record what changed.
!python evaluate_fos_150.py

0.04
0.08695652173913043
0.0
0.0
0.0
0.0
0.02
0.043478260869565216
0.06
0.08571428571428572
0.14
0.25925925925925924
0.0
0.0
0.08
0.16666666666666666
0.08
0.11764705882352941
0.08
0.11764705882352941
0.08
0.2
0.08
0.18181818181818182
0.02
0.043478260869565216
0.0
0.0
0.1
0.21739130434782608
0.16
0.13793103448275862
0.04
0.08
0.0
0.0
0.02
0.05
0.02
0.029411764705882353
0.0
0.0
0.04
0.08695652173913043
0.08
0.11428571428571428
0.14
0.25
0.0
0.0
0.06
0.07692307692307693
0.08
0.13793103448275862
0.12
0.17142857142857143
0.0
0.0
0.02
0.041666666666666664
0.02
0.034482758620689655
0.08
0.18181818181818182
0.04
0.09523809523809523
0.02
0.038461538461538464
0.02
0.024390243902439025
0.04
0.1
0.0
0.0
0.08
0.14814814814814814
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.04
0.08333333333333333
0.04
0.043478260869565216
0.02
0.043478260869565216
0.02
0.047619047619047616
0.06
0.09375
0.0
0.0
0.0
0.0
0.0
0.0
0.02
0.045454545454545456
0.0
0.0
0.04
0.08
0.02
0.030303030303030304
0.0
0.0
0.16
0.24242424242424243