In [10]:
from utility import EnglishTextProcessor
import numpy as np
from tqdm import tqdm
import json

In [3]:
from gensim.models import FastText, Word2Vec

In [12]:
# `etp` is applied to each raw abstract further down; `author` is the loaded
# JSON mapping whose keys are used as the query paper ids (its values are not
# read in the cells visible here).
# NOTE: `load_json` is defined in the `In [11]` cell shown below; run it first.
etp = EnglishTextProcessor()
author = load_json('author_filtered_citations.json')

In [11]:
def load_json(filename):
    """Read the JSON document stored at `filename` and return the parsed object.

    The file is opened in text mode with the platform default encoding and is
    closed before returning.
    """
    with open(filename) as handle:
        return json.load(handle)

In [4]:
import json
import os
import pickle

In [5]:
# Load the pickled vectorizer. The file is opened in a `with` block so the
# handle is closed deterministically instead of being leaked.
# NOTE(security): unpickling executes arbitrary code; only load trusted files.
with open('../support/models/vec.model', 'rb') as vec_file:
    vec = pickle.load(vec_file)

# NOTE: despite the variable name, this is loaded through `Word2Vec.load`.
fasttext = Word2Vec.load('word2vec_150/word2vec_150.model')
dimension = 150  # length of the embedding vectors built in compute_abstract_embedding

# NOTE: the two names below are swapped relative to their contents:
# `word_to_idx` is the list mapping feature index -> word, and `idx_to_word`
# is the dict mapping word -> feature index. The names are kept unchanged
# because other cells reference them.
word_to_idx = list(vec.get_feature_names())

idx_to_word = {}
for i, word in enumerate(word_to_idx):
    idx_to_word[word] = i

In [6]:
# Folder holding the JSON batches of abstracts read by the iterators below.
abstract_folder = '../support/abstract_dictionaries'

# Create the output folder if needed. Unlike `!mkdir`, this does not fail when
# the directory already exists, so the cell can be re-run safely.
os.makedirs('word2vec_150_abstract_dictionaries', exist_ok=True)

mkdir: cannot create directory ‘word2vec_150_abstract_dictionaries’: File exists


In [7]:
def iter_id_and_abstracts(abstract_folder):
    """Yield `(key, value)` pairs from every JSON file in `abstract_folder`.

    Files are visited in ascending order of the integer found between the last
    underscore and the first following dot of the file name. A file whose
    processing raises `ValueError` (which includes JSON decode errors) is
    skipped silently.
    """
    def batch_number(name):
        # e.g. 'abstracts_12.json' -> 12
        return int(name.split('_')[-1].split('.')[0])

    for name in sorted(os.listdir(abstract_folder), key=batch_number):
        path = os.path.join(abstract_folder, name)
        try:
            with open(path) as handle:
                batch = json.load(handle)

            for key in batch:
                yield key, batch[key]
        except ValueError:
            # Unparsable file: move on to the next one.
            continue

In [8]:
def iter_batches(abstract_folder):
    """Yield the parsed content of every JSON file in `abstract_folder`.

    Files are visited in ascending order of the integer found between the last
    underscore and the first following dot of the file name. A file whose
    processing raises `ValueError` (which includes JSON decode errors) is
    skipped silently.
    """
    def batch_number(name):
        # e.g. 'abstracts_12.json' -> 12
        return int(name.split('_')[-1].split('.')[0])

    for name in sorted(os.listdir(abstract_folder), key=batch_number):
        path = os.path.join(abstract_folder, name)
        try:
            with open(path) as handle:
                batch = json.load(handle)

            yield batch
        except ValueError:
            # Unparsable file: move on to the next one.
            continue

In [9]:
import scipy

def compute_abstract_embedding(abstract):
    """Return a TF-IDF-weighted average word embedding for `abstract`.

    The abstract is vectorised with the global `vec`; every non-zero feature
    whose word is present in the embedding model's vocabulary contributes
    `tfidf * embedding`. The sum is divided by the number of contributing
    words (not by the sum of the weights).

    Args:
        abstract: text to embed, passed to `vec.transform` as a single document.

    Returns:
        A list of `dimension` floats; all zeros when no word of the abstract
        is in the embedding vocabulary.
    """
    # Imported explicitly: a bare `import scipy` does not guarantee that the
    # `scipy.sparse` submodule has been loaded on every SciPy version.
    from scipy import sparse

    tfidf_vec = sparse.coo_matrix(vec.transform([abstract]))

    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use
    # `vocab` (which raises AttributeError on gensim >= 4).
    keyed_vectors = fasttext.wv
    vocabulary = getattr(keyed_vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab

    word_count = 0
    sum_embedding = np.zeros(dimension)
    for word_index, word_tfidf in zip(tfidf_vec.col, tfidf_vec.data):
        # NOTE: `word_to_idx` is the index -> word list despite its name.
        word = word_to_idx[word_index]
        if word in vocabulary:
            word_count += 1
            sum_embedding += word_tfidf * keyed_vectors[word]

    if word_count == 0:
        # Floats, so both return paths produce the same element type.
        return [0.0] * dimension

    return (sum_embedding / word_count).tolist()

def dump_json(d, output_file_path):
    """Serialise `d` as JSON into `output_file_path`, overwriting any existing file."""
    with open(output_file_path, 'w') as sink:
        json.dump(d, fp=sink)

In [14]:
# All paper ids involved in the evaluation: every query id plus every id in
# its reference list. A single variadic union avoids re-copying the growing
# set once per query.
# NOTE: `test_recs` is built in the `In [13]` cell shown below; run it first.
relevant_ids = set(test_recs).union(*test_recs.values())

In [13]:
from aminer.recall.query_es import get_abstract_by_pids, get_references_by_pid

# Reference list of every query paper, keyed by paper id.
test_recs = {paper_id: get_references_by_pid(paper_id) for paper_id in author}

In [17]:
# Collect the raw abstract of every relevant paper by scanning all batches.
id_to_abstract = {}
for batch in tqdm(iter_batches(abstract_folder)):
    id_to_abstract.update(
        (paper_id, abstract)
        for paper_id, abstract in batch.items()
        if paper_id in relevant_ids
    )

411it [00:24, 16.71it/s]


In [20]:
# Run the text processor over every collected abstract.
id_to_processed_abstracts = {
    paper_id: etp(raw_abstract)
    for paper_id, raw_abstract in tqdm(id_to_abstract.items())
}

100%|██████████| 2909/2909 [00:19<00:00, 148.93it/s]


In [27]:
# Score every candidate by the number of distinct whitespace-separated tokens
# it shares with the query abstract.
# Each abstract is tokenised once up front; the original re-split the query
# and the candidate abstract for every (query, candidate) pair.
token_sets = {
    paper_id: set(id_to_processed_abstracts[paper_id].split())
    for paper_id in relevant_ids
}

# NOTE(review): the query paper is itself a member of `relevant_ids`, so it is
# scored as its own candidate. Confirm this is intended.
id_ranked_candidates = {}
for id in tqdm(author):
    query_tokens = token_sets[id]
    id_ranked_candidates[id] = {
        cid: len(query_tokens & token_sets[cid]) for cid in relevant_ids
    }

100%|██████████| 96/96 [00:06<00:00, 15.77it/s]


In [33]:
# Mean recall@k over all query papers: the fraction of a paper's references
# found among its k highest-scoring candidates, averaged over `author`.
o = dict.fromkeys((5, 10, 20, 100, 200, 500), 0)
for id in author:
    candidate_scores = id_ranked_candidates[id]
    top_candidates = sorted(candidate_scores, key=candidate_scores.get, reverse=True)
    for k in o:
        hits = set(top_candidates[:k]).intersection(test_recs[id])
        o[k] += len(hits) / len(test_recs[id])

for k in o:
    o[k] = o[k] / len(author)

o

{5: 0.10004882069107601,
 10: 0.1841244780565238,
 20: 0.28427979002187265,
 100: 0.5294365110339784,
 200: 0.636117073167133,
 500: 0.7639334480815777}

In [43]:
def get_word_scores(abstract):
    """Map each word of `abstract` known to the vectorizer to its TF-IDF weight.

    Args:
        abstract: text passed to the global `vec.transform` as one document.

    Returns:
        dict of word -> TF-IDF value for every non-zero feature of the document.
    """
    # Imported explicitly: a bare `import scipy` does not guarantee that the
    # `scipy.sparse` submodule has been loaded on every SciPy version.
    from scipy import sparse

    tfidf_vec = sparse.coo_matrix(vec.transform([abstract]))
    # NOTE: `word_to_idx` is the index -> word list despite its name.
    return {
        word_to_idx[word_index]: word_tfidf
        for word_index, word_tfidf in zip(tfidf_vec.col, tfidf_vec.data)
    }

# Per-paper word -> TF-IDF weight dictionaries, computed from the processed
# abstract of every relevant paper.
relevant_word_scores = {id:get_word_scores(id_to_processed_abstracts[id]) for id in relevant_ids}

In [48]:
# Score each candidate by summing the candidate-side TF-IDF weight of every
# token occurrence in its processed abstract that also has a TF-IDF weight in
# the query abstract. Repeated tokens are counted once per occurrence.
id_word_ranked_candidates = {}
for id in tqdm(author):
    word_scores = relevant_word_scores[id]
    scored = {}
    for cid in relevant_ids:
        cid_word_scores = relevant_word_scores[cid]
        scored[cid] = sum(
            cid_word_scores[word]
            for word in id_to_processed_abstracts[cid].split()
            if word in word_scores
        )
    id_word_ranked_candidates[id] = scored

100%|██████████| 96/96 [00:05<00:00, 18.84it/s]


In [45]:
# Mean recall@k of the TF-IDF word-overlap ranking, averaged over `author`.
# NOTE(review): this code is repeated verbatim in two later cells with
# different saved outputs; presumably the scoring cell was changed and re-run
# in between. Confirm which scoring variant each output belongs to.
o = dict.fromkeys((5, 10, 20, 100, 200, 500), 0)
for id in author:
    candidate_scores = id_word_ranked_candidates[id]
    top_candidates = sorted(candidate_scores, key=candidate_scores.get, reverse=True)
    for k in o:
        hits = set(top_candidates[:k]).intersection(test_recs[id])
        o[k] += len(hits) / len(test_recs[id])

for k in o:
    o[k] = o[k] / len(author)

o

{5: 0.10430489749156723,
 10: 0.20319923072790705,
 20: 0.3353315310665032,
 100: 0.6437471794526587,
 200: 0.744752744417582,
 500: 0.8529969670368599}

In [47]:
# Mean recall@k of the TF-IDF word-overlap ranking, averaged over `author`.
# NOTE(review): same code as the neighbouring evaluation cells; the saved
# output differs, presumably because `id_word_ranked_candidates` was rebuilt
# with a different scoring formula before this run. Confirm.
o = dict.fromkeys((5, 10, 20, 100, 200, 500), 0)
for id in author:
    candidate_scores = id_word_ranked_candidates[id]
    top_candidates = sorted(candidate_scores, key=candidate_scores.get, reverse=True)
    for k in o:
        hits = set(top_candidates[:k]).intersection(test_recs[id])
        o[k] += len(hits) / len(test_recs[id])

for k in o:
    o[k] = o[k] / len(author)

o

{5: 0.08491765998206108,
 10: 0.16507621052658253,
 20: 0.2922620044607936,
 100: 0.6195310976773454,
 200: 0.7350883294516697,
 500: 0.8674783492212949}

In [49]:
# Mean recall@k of the TF-IDF word-overlap ranking, averaged over `author`.
# NOTE(review): same code as the two preceding evaluation cells; the saved
# output differs, presumably because `id_word_ranked_candidates` was rebuilt
# with a different scoring formula before this run. Confirm.
o = dict.fromkeys((5, 10, 20, 100, 200, 500), 0)
for id in author:
    candidate_scores = id_word_ranked_candidates[id]
    top_candidates = sorted(candidate_scores, key=candidate_scores.get, reverse=True)
    for k in o:
        hits = set(top_candidates[:k]).intersection(test_recs[id])
        o[k] += len(hits) / len(test_recs[id])

for k in o:
    o[k] = o[k] / len(author)

o

{5: 0.06481638419128384,
 10: 0.13039235099731436,
 20: 0.22343254824642367,
 100: 0.5277164798782078,
 200: 0.6719524684163808,
 500: 0.8190717875210481}

In [50]:
# Mapping from query paper id to a collection of candidate paper ids
# (`both[id]` is iterated as the candidate list in the cells below).
both = load_json('fos_author_filtered.json')

In [51]:
# Every id appearing in `both`, either as a key or inside one of its
# candidate collections. `update` grows the set in place; the original
# rebuilt the whole (multi-million element) set with `union` for every key.
both_relevant_ids = set()
for id in both:
    both_relevant_ids.add(id)
    both_relevant_ids.update(both[id])

In [52]:
# Number of distinct ids (keys of `both` plus all their candidates).
len(both_relevant_ids)

2243938

In [58]:
# Collect the raw abstract of every id in `both_relevant_ids` by scanning all
# batches.
both_id_to_abstract = {}
for batch in tqdm(iter_batches(abstract_folder)):
    both_id_to_abstract.update(
        (paper_id, abstract)
        for paper_id, abstract in batch.items()
        if paper_id in both_relevant_ids
    )

411it [00:14, 28.98it/s]


In [1]:
# Show system memory usage in MiB.
!free -m

             total       used       free     shared    buffers     cached
Mem:         31144      24029       7115          0         51      23057
-/+ buffers/cache:        920      30224
Swap:            0          0          0


In [60]:
# Lower-case every stored abstract in place. Only values of existing keys are
# replaced, so the dict can be modified while it is being iterated.
for paper_id, abstract in tqdm(both_id_to_abstract.items()):
    both_id_to_abstract[paper_id] = abstract.lower()


  0%|          | 0/2243938 [00:00<?, ?it/s][A
  2%|▏         | 36692/2243938 [00:00<00:06, 366916.85it/s][A
  3%|▎         | 77663/2243938 [00:00<00:05, 378785.41it/s][A
  5%|▌         | 119400/2243938 [00:00<00:05, 389588.63it/s][A
  7%|▋         | 161359/2243938 [00:00<00:05, 398128.52it/s][A
  9%|▉         | 203691/2243938 [00:00<00:05, 405365.26it/s][A
 11%|█         | 245859/2243938 [00:00<00:04, 410124.83it/s][A
 13%|█▎        | 288337/2243938 [00:00<00:04, 414412.18it/s][A
 15%|█▍        | 330833/2243938 [00:00<00:04, 417519.20it/s][A
 17%|█▋        | 373312/2243938 [00:00<00:04, 419669.54it/s][A
 19%|█▊        | 415827/2243938 [00:01<00:04, 421297.41it/s][A
 20%|██        | 458716/2243938 [00:01<00:04, 423544.84it/s][A
 22%|██▏       | 501225/2243938 [00:01<00:04, 424005.81it/s][A
 24%|██▍       | 543945/2243938 [00:01<00:04, 424958.56it/s][A
 26%|██▌       | 586847/2243938 [00:01<00:03, 426168.11it/s][A
 28%|██▊       | 629794/2243938 [00:01<00:03, 427147.07it/

In [63]:
# Score each candidate listed in `both[id]` by summing the query-side TF-IDF
# weight of every token occurrence in the candidate's lower-cased abstract
# that has a weight in the query abstract.
# NOTE(review): unlike the earlier scoring cell, this uses the query's weights
# and the merely lower-cased (not `etp`-processed) candidate text. Confirm
# this difference is intended.
both_id_word_ranked_candidates = {}
for id in tqdm(author):
    word_scores = relevant_word_scores[id]
    scored = {}
    for cid in both[id]:
        scored[cid] = sum(
            word_scores[word]
            for word in both_id_to_abstract[cid].split()
            if word in word_scores
        )
    both_id_word_ranked_candidates[id] = scored


  0%|          | 0/96 [00:00<?, ?it/s][A
  1%|          | 1/96 [00:00<00:28,  3.39it/s][A
  2%|▏         | 2/96 [00:00<00:24,  3.84it/s][A
  3%|▎         | 3/96 [00:01<00:38,  2.40it/s][A
  4%|▍         | 4/96 [00:01<00:33,  2.75it/s][A
  5%|▌         | 5/96 [00:01<00:32,  2.78it/s][A
  6%|▋         | 6/96 [00:02<00:30,  2.92it/s][A
  8%|▊         | 8/96 [00:02<00:27,  3.21it/s][A
  9%|▉         | 9/96 [00:04<01:04,  1.35it/s][A
 10%|█         | 10/96 [00:04<00:59,  1.45it/s][A
 11%|█▏        | 11/96 [00:07<01:44,  1.23s/it][A
 12%|█▎        | 12/96 [00:07<01:16,  1.10it/s][A
 14%|█▎        | 13/96 [00:10<02:06,  1.52s/it][A
 15%|█▍        | 14/96 [00:11<01:49,  1.33s/it][A
 16%|█▌        | 15/96 [00:12<01:30,  1.11s/it][A
 17%|█▋        | 16/96 [00:12<01:09,  1.15it/s][A
 18%|█▊        | 17/96 [00:12<00:55,  1.43it/s][A
 19%|█▉        | 18/96 [00:12<00:45,  1.73it/s][A
 20%|█▉        | 19/96 [00:15<01:19,  1.03s/it][A
 21%|██        | 20/96 [00:16<01:36,  1.27s/it]

In [74]:
# Mean recall@k of the `both` candidate ranking. Ids of `both` without a
# ranking are skipped; the sum is divided by `len(author)`.
o = dict.fromkeys((5, 10, 20, 100, 200, 500, 20000, 1000000), 0)
for id in tqdm(both):
    if id not in both_id_word_ranked_candidates:
        continue
    candidate_scores = both_id_word_ranked_candidates[id]
    top_candidates = sorted(candidate_scores, key=candidate_scores.get, reverse=True)
    for k in o:
        hits = set(top_candidates[:k]).intersection(test_recs[id])
        o[k] += len(hits) / len(test_recs[id])

for k in o:
    o[k] = o[k] / len(author)

o


  0%|          | 0/101 [00:00<?, ?it/s][A
  9%|▉         | 9/101 [00:00<00:01, 65.47it/s][A
 13%|█▎        | 13/101 [00:00<00:02, 36.76it/s][A
 17%|█▋        | 17/101 [00:00<00:02, 35.97it/s][A
 21%|██        | 21/101 [00:00<00:02, 27.91it/s][A
 27%|██▋       | 27/101 [00:00<00:02, 31.96it/s][A
 30%|██▉       | 30/101 [00:00<00:02, 29.09it/s][A
 38%|███▊      | 38/101 [00:01<00:01, 33.55it/s][A
 42%|████▏     | 42/101 [00:01<00:01, 30.38it/s][A
 47%|████▋     | 47/101 [00:01<00:01, 29.05it/s][A
 50%|█████     | 51/101 [00:01<00:01, 27.04it/s][A
 53%|█████▎    | 54/101 [00:01<00:01, 23.50it/s][A
 57%|█████▋    | 58/101 [00:01<00:01, 26.81it/s][A
 65%|██████▌   | 66/101 [00:01<00:01, 33.27it/s][A
 70%|███████   | 71/101 [00:02<00:00, 36.56it/s][A
 75%|███████▌  | 76/101 [00:02<00:00, 26.22it/s][A
 80%|████████  | 81/101 [00:02<00:00, 30.11it/s][A
 88%|████████▊ | 89/101 [00:02<00:00, 36.94it/s][A
 94%|█████████▍| 95/101 [00:02<00:00, 38.06it/s][A
100%|██████████| 101/

{5: 0.00995764948887693,
 10: 0.018193875250690678,
 20: 0.0349544179698604,
 100: 0.1124268393616563,
 200: 0.16848297502077134,
 500: 0.2533215779445238,
 20000: 0.6507391763317253,
 1000000: 0.6860917114441111}

In [66]:
# Number of scored candidates for the first query id in the ranking dict.
len(both_id_word_ranked_candidates[list(both_id_word_ranked_candidates.keys())[0]])

12355

In [69]:
# How many keys of `both` also have a ranking.
len(set(both.keys()).intersection(both_id_word_ranked_candidates.keys()))

96

In [70]:
# Print the keys of `both` that received no ranking.
unranked_ids = [paper_id for paper_id in both if paper_id not in both_id_word_ranked_candidates]
for paper_id in unranked_ids:
    print(paper_id)

1551676982
2163666621
2004111797
1992078202
1575030031


In [75]:
# Keep, for every ranked id of `both`, its 20000 highest-scoring candidates
# in descending score order.
c = {}
for id in tqdm(both):
    if id not in both_id_word_ranked_candidates:
        continue
    candidate_scores = both_id_word_ranked_candidates[id]
    top_candidates = sorted(candidate_scores, key=candidate_scores.get, reverse=True)
    c[id] = top_candidates[:20000]


  0%|          | 0/101 [00:00<?, ?it/s][A
  9%|▉         | 9/101 [00:00<00:01, 71.99it/s][A
 13%|█▎        | 13/101 [00:00<00:02, 40.77it/s][A
 17%|█▋        | 17/101 [00:00<00:02, 39.93it/s][A
 21%|██        | 21/101 [00:00<00:02, 30.91it/s][A
 27%|██▋       | 27/101 [00:00<00:02, 35.41it/s][A
 31%|███       | 31/101 [00:00<00:01, 35.06it/s][A
 38%|███▊      | 38/101 [00:00<00:01, 38.84it/s][A
 42%|████▏     | 42/101 [00:01<00:01, 34.69it/s][A
 47%|████▋     | 47/101 [00:01<00:01, 33.03it/s][A
 50%|█████     | 51/101 [00:01<00:01, 30.60it/s][A
 54%|█████▍    | 55/101 [00:01<00:01, 26.31it/s][A
 60%|██████    | 61/101 [00:01<00:01, 31.57it/s][A
 68%|██████▊   | 69/101 [00:01<00:00, 38.56it/s][A
 74%|███████▍  | 75/101 [00:02<00:00, 30.22it/s][A
 80%|████████  | 81/101 [00:02<00:00, 35.20it/s][A
 89%|████████▉ | 90/101 [00:02<00:00, 42.09it/s][A
100%|██████████| 101/101 [00:02<00:00, 36.79it/s][A


In [77]:
# Persist the per-query top-candidate lists built in `c`.
dump_json(c, 'idf_filter.json')

In [90]:
from aminer.precision.metrics import recall, precision

# Mean recall@500 of the stored recommendations in `c`.
ids = list(both_id_word_ranked_candidates.keys())
recommendations = c
rec = 0
for pid, recs in tqdm(recommendations.items()):
    pred_reference_list = recs[:500]
    # Use the references already cached in `test_recs`. The original issued a
    # fresh `get_references_by_pid` query on every iteration and then ignored
    # its result in favour of `test_recs[pid]`.
    true_reference_list = test_recs[pid]
    example_rec = len(set(pred_reference_list).intersection(true_reference_list)) / len(true_reference_list)
    rec += example_rec

print('Overall Recall: ', rec / len(recommendations))


  0%|          | 0/96 [00:00<?, ?it/s][A
  5%|▌         | 5/96 [00:00<00:01, 45.67it/s][A
 10%|█         | 10/96 [00:00<00:01, 45.88it/s][A
 16%|█▌        | 15/96 [00:00<00:01, 46.37it/s][A
 21%|██        | 20/96 [00:00<00:01, 45.64it/s][A
 26%|██▌       | 25/96 [00:00<00:01, 46.45it/s][A
 31%|███▏      | 30/96 [00:00<00:01, 45.76it/s][A
 36%|███▋      | 35/96 [00:00<00:01, 45.89it/s][A
 42%|████▏     | 40/96 [00:00<00:01, 46.74it/s][A
 47%|████▋     | 45/96 [00:00<00:01, 46.90it/s][A
 52%|█████▏    | 50/96 [00:01<00:00, 47.64it/s][A
 57%|█████▋    | 55/96 [00:01<00:00, 48.06it/s][A
 62%|██████▎   | 60/96 [00:01<00:00, 41.02it/s][A
 69%|██████▉   | 66/96 [00:01<00:00, 42.90it/s][A
 75%|███████▌  | 72/96 [00:01<00:00, 44.32it/s][A
 81%|████████▏ | 78/96 [00:01<00:00, 45.53it/s][A
 86%|████████▋ | 83/96 [00:01<00:00, 46.69it/s][A
 93%|█████████▎| 89/96 [00:01<00:00, 48.19it/s][A
100%|██████████| 96/96 [00:02<00:00, 46.39it/s][A

Overall Recall:  0.2533215779445238





In [4]:
# Run the external evaluation script (its source is not part of this notebook).
!python ev.py

0 Finding recommendations for:  1836026262
100%|█████████████████████████████████████████| 411/411 [03:32<00:00,  1.94it/s]
1 Finding recommendations for:  1988433392
  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|█████████████████████████████████████████| 411/411 [03:31<00:00,  1.94it/s]
2 Finding recommendations for:  1526540304
100%|█████████████████████████████████████████| 411/411 [03:32<00:00,  1.93it/s]
3 Finding recommendations for:  1562794883
100%|█████████████████████████████████████████| 411/411 [03:32<00:00,  1.93it/s]
4 Finding recommendations for:  1836578358
100%|█████████████████████████████████████████| 411/411 [03:34<00:00,  1.92it/s]
5 Finding recommendations for:  1773709311
100%|█████████████████████████████████████████| 411/411 [03:32<00:00,  1.93it/s]
6 Finding recommendations for:  1557199071
100%|█████████████████████████████████████████| 411/411 [03:34<00:00,  1.91it/s]
7 Finding recommendations for:  2149532911
100%|█████████████████████████████████████████| 411/

100%|█████████████████████████████████████████| 411/411 [03:41<00:00,  1.86it/s]
64 Finding recommendations for:  1575030031
100%|█████████████████████████████████████████| 411/411 [06:18<00:00,  1.09it/s]
65 Finding recommendations for:  1523661875
100%|█████████████████████████████████████████| 411/411 [03:41<00:00,  1.86it/s]
66 Finding recommendations for:  2163666621
100%|█████████████████████████████████████████| 411/411 [06:20<00:00,  1.08it/s]
67 Finding recommendations for:  1551676982
100%|█████████████████████████████████████████| 411/411 [06:19<00:00,  1.08it/s]
68 Finding recommendations for:  2009266105
100%|█████████████████████████████████████████| 411/411 [03:40<00:00,  1.86it/s]
69 Finding recommendations for:  2152197803
100%|█████████████████████████████████████████| 411/411 [03:40<00:00,  1.87it/s]
70 Finding recommendations for:  2105582967
100%|█████████████████████████████████████████| 411/411 [03:41<00:00,  1.86it/s]
71 Finding recommendations for:  1838714828


In [5]:
# Run the external evaluation script again.
# NOTE(review): the saved output differs in kind from the previous run
# (numbers instead of progress logs); presumably ev.py was edited in between.
!python ev.py

0.1
0.19230769230769232
0.04
0.1
0.06
0.15
0.06
0.15
0.0
0.0
0.14
0.12962962962962962
0.08
0.16666666666666666
0.04
0.0425531914893617
0.06
0.09375
0.08
0.11764705882352941
0.02
0.041666666666666664
0.02
0.05
0.02
0.047619047619047616
0.0
0.0
0.08
0.1
0.1
0.11627906976744186
0.0
0.0
0.06
0.15
0.04
0.1
0.0
0.0
0.02
0.03125
0.0
0.0
0.08
0.2
0.0
0.0
0.0
0.0
0.04
0.08
0.0
0.0
0.08
0.14814814814814814
0.08
0.11428571428571428
0.04
0.09523809523809523
0.0
0.0
0.02
0.02564102564102564
0.0
0.0
0.0
0.0
0.0
0.0
0.04
0.08695652173913043
0.02
0.03225806451612903
0.04
0.09523809523809523
0.02
0.043478260869565216
0.08
0.11428571428571428
0.14
0.3333333333333333
0.14
0.28
0.04
0.08695652173913043
0.14
0.25
0.02
0.034482758620689655
0.02
0.023255813953488372
0.04
0.045454545454545456
0.02
0.038461538461538464
0.02
0.029411764705882353
0.06
0.08571428571428572
0.0
0.0
0.0
0.0
0.06
0.07692307692307693
0.0
0.0
0.06
0.075
0.14
0.175
0.04
0.041666666666666664
0.02
0.030303030303030304
0.08
0.0909090909090