In [25]:
import pandas as pd
from ast import literal_eval
from collections import Counter
import random

In [2]:
# Load the two input CSV files into DataFrames
big_data, stat_data = (
    pd.read_csv(csv_path)
    for csv_path in ("paper_info_1990.csv", "stat_paper.csv")
)

In [3]:
# Read in the stat-50 paper table and collect the set of its paper ids.
stat_50_data = pd.read_csv("stat_paper_50.csv")
# The set is built from stat_50_data (not stat_data) so that it really holds
# the stat-50 ids its name promises; get_ok() uses it for membership tests.
all_stat_50_paper_set = set(stat_50_data.loc[:, 'id'])

In [5]:
# Collect the id column of each table as a set for membership tests
all_paper_set = set(big_data['id'])
all_stat_paper_set = set(stat_data['id'])

In [6]:
# Index the table by paper id so a row can be fetched directly by its id,
# then turn each stringified reference list into a real Python list.
all_data = big_data.set_index('id')
all_data.loc[:, 'refereneces'] = all_data.loc[:, 'refereneces'].apply(literal_eval)

In [8]:
# Preview the id-indexed table; 'refereneces' now holds parsed lists
all_data.head()

Unnamed: 0_level_0,year,refereneces,issn,subjects
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
53e99784b7602d9701f3e13e,2011,"[53e99b4ab7602d97023ea8fd, 53e997f8b7602d9701f...",15734919,"[1308, 1312, 1307]"
53e99784b7602d9701f3e4f2,2002,"[53e9a611b7602d9702f42159, 53e9b316b7602d9703d...",10761551,"[2716, 1311, 1313, 1312]"
53e9978db7602d9701f4f415,2005,"[53e9aa41b7602d97033af1b3, 53e9b1b0b7602d9703c...",12138118,[1300]
53e99792b7602d9701f56a86,2004,"[53e9abbeb7602d970356d932, 53e9ad6cb7602d97037...",195545,[2738]
53e99792b7602d9701f5af0e,1993,"[53e99d04b7602d97025b8c26, 53e9a945b7602d97032...",9368051,"[2746, 2732]"


In [7]:
# Preview the table as loaded from CSV (default integer index, 'id' as a column)
big_data.head()

Unnamed: 0,id,year,refereneces,issn,subjects
0,53e99784b7602d9701f3e13e,2011,"['53e99b4ab7602d97023ea8fd', '53e997f8b7602d97...",15734919,"[1308, 1312, 1307]"
1,53e99784b7602d9701f3e4f2,2002,"['53e9a611b7602d9702f42159', '53e9b316b7602d97...",10761551,"[2716, 1311, 1313, 1312]"
2,53e9978db7602d9701f4f415,2005,"['53e9aa41b7602d97033af1b3', '53e9b1b0b7602d97...",12138118,[1300]
3,53e99792b7602d9701f56a86,2004,"['53e9abbeb7602d970356d932', '53e9ad6cb7602d97...",195545,[2738]
4,53e99792b7602d9701f5af0e,1993,"['53e99d04b7602d97025b8c26', '53e9a945b7602d97...",9368051,"[2746, 2732]"


In [7]:
# Preview the first 10 rows of the stat paper table
stat_data.head(10)

Unnamed: 0,id,issn,citescore,rank5,rank10
0,53e99792b7602d9701f5b3b6,2776715,1.99,0,1
1,53e99796b7602d9701f5e172,225193,1.93,0,1
2,53e997a2b7602d9701f73f10,9434062,0.98,2,4
3,53e997a6b7602d9701f78af9,13674811,7.84,0,0
4,53e997a6b7602d9701f7a6d3,18770541,0.88,2,4
5,53e997aeb7602d9701f8da92,18638279,1.02,2,4
6,53e997b2b7602d9701f912d5,225193,1.93,0,1
7,53e997ccb7602d9701fbf03d,3032647,1.61,1,2
8,53e997ccb7602d9701fbf070,273171,2.75,0,0
9,53e997ccb7602d9701fbff6e,13674811,7.84,0,0


In [39]:
def get_ok(this_id):
    """Split the references of ``this_id`` into two lists of known paper ids.

    The reference list is read from ``all_data`` (row ``this_id``, column
    ``'refereneces'``). Each reference is classified as follows:

    * in ``all_stat_50_paper_set``           -> second returned list
    * otherwise, if it is in ``all_paper_set`` -> first returned list
    * in neither set                          -> dropped

    Returns:
        tuple: ``(stay_list, next_ref_list)``; both lists keep the order
        (and any repeats) of the original reference list.
    """
    cited = all_data.loc[this_id, 'refereneces']
    in_stat_50 = [paper for paper in cited if paper in all_stat_50_paper_set]
    # Membership in the stat-50 set takes priority, so exclude those here.
    known_elsewhere = [
        paper for paper in cited
        if paper not in all_stat_50_paper_set and paper in all_paper_set
    ]
    return known_elsewhere, in_stat_50

In [22]:
def find_best(this_id, num_neighbor):
    """Pick ``num_neighbor`` neighbor ids for the paper ``this_id``.

    Candidates come from ``get_ok(this_id)``, which returns a fallback list
    (``stay_list``) and a preferred list (``next_ref_list``) of references.

    * Fewer preferred references than needed: top up from the fallback list
      (a random sample when the fallback list has more than enough entries).
    * Exactly enough: the preferred references are used as they are.
    * More than needed: keep the ``num_neighbor`` preferred references whose
      own preferred lists (per ``get_ok``) are the longest.

    Any slots still empty afterwards are filled with ``'_PAD_'``.

    Args:
        this_id: id of the paper whose references are examined.
        num_neighbor: number of neighbors to return (non-negative int).

    Returns:
        list: ``num_neighbor`` entries, each a paper id or ``'_PAD_'``.
    """
    stay_list, next_ref_list = get_ok(this_id)
    # Work on a copy so the list returned by get_ok() is never mutated.
    next_level = list(next_ref_list)
    still_need = num_neighbor - len(next_level)
    if still_need > 0:
        # Not enough preferred references: top up from the fallback list.
        if len(stay_list) <= still_need:
            next_level += stay_list
        else:
            next_level += random.sample(stay_list, still_need)
    elif still_need < 0:
        # Too many preferred references: rank each by how many preferred
        # references it has itself and keep the top num_neighbor.
        best_next = Counter()
        for candidate in next_level:
            _, candidate_refs = get_ok(candidate)
            best_next[candidate] += len(candidate_refs)
        next_level = [paper for paper, _ in best_next.most_common(num_neighbor)]
    # Pad to the requested length (not a fixed 5). This also covers the
    # ranking branch, which can come up short when references repeat.
    next_level += ['_PAD_'] * (num_neighbor - len(next_level))
    return next_level

In [27]:
def find_neighbors(start_id, num_neighbor):
    """Build a two-level neighbor structure for ``start_id``.

    The first level is ``find_best(start_id, num_neighbor)``. For every real
    id in it, the second level is ``find_best`` of that id; a ``'_PAD_'``
    first-level entry gets a second level made entirely of ``'_PAD_'``.

    Args:
        start_id: id of the paper to start from.
        num_neighbor: number of neighbors requested at each level.

    Returns:
        list: one ``(neighbor, second_level_tuple)`` pair per first-level
        entry, in first-level order.
    """
    # Size the padding tuple from num_neighbor so it always matches the
    # length of the real second-level tuples.
    pad_level = ('_PAD_',) * num_neighbor
    retval = []
    for neighbor in find_best(start_id, num_neighbor):
        if neighbor != '_PAD_':
            retval.append((neighbor, tuple(find_best(neighbor, num_neighbor))))
        else:
            retval.append((neighbor, pad_level))
    return retval

In [32]:
# Spot-check on a single paper id: build its two-level neighbor list
# (5 neighbors per level) and display it.
retval = find_neighbors('53e99792b7602d9701f5b3b6', 5)
retval

[('53e9a3a4b7602d9702cb0e6f', ('_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_')),
 ('53e9b500b7602d970402edcd', ('_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_')),
 ('53e99da4b7602d970265f67a', ('_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_')),
 ('53e9a2c0b7602d9702bc81c2', ('_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_')),
 ('53e99d1bb7602d97025d106f', ('_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_'))]

In [40]:
# Work on an independent copy: plain assignment would only bind a second name
# to the same DataFrame, so adding columns to stat_50_data_new would silently
# modify stat_50_data as well.
stat_50_data_new = stat_50_data.copy()

In [41]:
# Build the two-level neighbor structure for every stat-50 paper.
# Seed the RNG first: find_best() can fall back to random.sample(), so without
# a fixed seed the collected neighbors would differ from run to run.
random.seed(0)
all_neighbors = []
# Iterate over the id values themselves instead of looking each row up with
# .loc[i, 'id'], which only works while the index labels are exactly 0..n-1.
for i, id_used in enumerate(stat_50_data_new['id']):
    if i % 1000 == 0:
        print(i)  # coarse progress indicator
    result = find_neighbors(id_used, 5)
    all_neighbors.append(result)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000


In [42]:
# Attach the collected neighbor structures as a new column; all_neighbors was
# filled one entry per row, in row order.
stat_50_data_new['neighbors'] = all_neighbors

In [43]:
# Preview the table with the new 'neighbors' column
stat_50_data_new.head()

Unnamed: 0,id,issn,citescore,rank5,rank10,neighbors
0,53e99792b7602d9701f5b3b6,2776715,1.99,0,1,"[(53e9a3a4b7602d9702cb0e6f, (_PAD_, _PAD_, _PA..."
1,53e99796b7602d9701f5e172,225193,1.93,0,1,"[(_PAD_, (_PAD_, _PAD_, _PAD_, _PAD_, _PAD_)),..."
2,53e997a6b7602d9701f78af9,13674811,7.84,0,0,"[(53e9a3d4b7602d9702ce78cb, (53e99b2cb7602d970..."
3,53e997aeb7602d9701f8da92,18638279,1.02,2,4,"[(53e9a48eb7602d9702dab5d7, (_PAD_, _PAD_, _PA..."
4,53e997b2b7602d9701f912d5,225193,1.93,0,1,"[(53e9b0f4b7602d9703b70750, (53e9a914b7602d970..."


In [44]:
# Save the augmented table. to_csv() writes the index by default, so it comes
# back as an 'Unnamed: 0' column when the file is re-read with pd.read_csv().
# NOTE(review): the 'neighbors' lists are stored as text in the CSV and will
# presumably need literal_eval on reload, like 'refereneces' — confirm.
stat_50_data_new.to_csv("stat_paper_50_neighbors.csv")