In [1]:
import ast
import json
import os

import numpy as np
import pandas as pd

In [47]:
def get_paper2issn(use_all=False, data_dir='../data'):
    """Build a paper-id -> ISSN lookup from a paper CSV and save it as JSON.

    Parameters
    ----------
    use_all : bool, default False
        When False, reads ``stat_paper.csv`` and writes ``paper2issn.json``.
        When True, reads ``paper_info_1990.csv`` and writes
        ``paper2issn_all.json``.
    data_dir : str, default '../data'
        Directory that contains the input CSV and receives the JSON output.

    Returns
    -------
    None
        The mapping is only written to disk.

    Notes
    -----
    The CSV must provide ``id`` and ``issn`` columns. When an id occurs more
    than once, the ISSN of its last row is kept.
    """
    if use_all:
        read_name, write_name = 'paper_info_1990.csv', 'paper2issn_all.json'
    else:
        read_name, write_name = 'stat_paper.csv', 'paper2issn.json'
    read_path = os.path.join(data_dir, read_name)
    write_path = os.path.join(data_dir, write_name)

    stat_paper = pd.read_csv(read_path)
    paper2issn = dict(zip(stat_paper['id'].tolist(), stat_paper['issn'].tolist()))

    # Use a context manager so the output file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(write_path, 'w') as f:
        json.dump(paper2issn, f)

In [45]:
# Write both lookups: paper2issn.json (from stat_paper.csv) and
# paper2issn_all.json (from paper_info_1990.csv). Later cells load both files,
# so a fresh Restart & Run All needs the use_all=True variant as well.
get_paper2issn()
get_paper2issn(use_all=True)

In [13]:
def create_idx(column, save_path):
    """Number the distinct values of ``column`` from 1 and save the mapping.

    Values are numbered in order of first appearance. The value -> index
    mapping is written to ``save_path`` as JSON and a confirmation line is
    printed. Nothing is returned.
    """
    # dict.fromkeys keeps one entry per distinct value, in first-seen order.
    first_seen = dict.fromkeys(column.tolist())
    mapping = {value: position for position, value in enumerate(first_seen, start=1)}
    with open(save_path, 'w') as out_file:
        json.dump(mapping, out_file)
    print('Data saved at', save_path)

In [14]:
# Index the papers and the journals (ISSNs) of stat_paper_50.csv. create_idx
# numbers distinct values from 1 in order of first appearance and writes each
# mapping to the given JSON file.
stat_paper_50 = pd.read_csv('../data/stat_paper_50.csv')
create_idx(stat_paper_50['id'], '../data/paper2idx.json')
create_idx(stat_paper_50['issn'], '../data/journal2idx.json')

Data saved at ../data/paper2idx.json
Data saved at ../data/journal2idx.json


In [46]:
# Index every ISSN found in paper_info_1990.csv. The resulting
# journal2idx_all.json is passed as journal2idx_path when the reference
# datasets are built further down.
all_paper = pd.read_csv('../data/paper_info_1990.csv')
create_idx(all_paper['issn'], '../data/journal2idx_all.json')

Data saved at ../data/journal2idx_all.json


In [8]:
def get_paper2abstract():
    """Return a dict mapping each paper id to its abstract.

    Reads the ``id`` and ``abstract`` columns of
    ``../data/paper_abstract.csv``. If an id appears on several rows, the
    abstract from the last of those rows is the one returned.
    """
    table = pd.read_csv('../data/paper_abstract.csv')
    keys = table['id'].tolist()
    texts = table['abstract'].tolist()
    return dict(zip(keys, texts))

In [10]:
# Keep the id -> abstract lookup in memory; it is passed directly to
# create_abstract_dataset rather than being saved to disk.
paper2abstract = get_paper2abstract()

In [4]:
def get_paper2rank5():
    """Save a paper-id -> ``rank5`` lookup as JSON.

    Reads the ``id`` and ``rank5`` columns of ``../data/stat_paper_50.csv``,
    writes the mapping to ``../data/paper2rank5.json`` and prints a
    confirmation line. For a repeated id the last row wins. Returns nothing.
    """
    table = pd.read_csv('../data/stat_paper_50.csv')
    lookup = dict(zip(table['id'].tolist(), table['rank5'].tolist()))
    with open('../data/paper2rank5.json', 'w') as handle:
        json.dump(lookup, handle)
    print('Paper2rank5 saved at ../data/paper2rank5.json')

In [5]:
# Write ../data/paper2rank5.json, which the dataset-building functions below load.
get_paper2rank5()

Paper2rank5 saved at ../data/paper2rank5.json


In [6]:
def create_abstract_dataset(paper2idx_path, journal2idx_path, paper2issn_path, paper2abstract,
                            paper2rank5_path='../data/paper2rank5.json',
                            save_path='../data/dataset_stat_50.json'):
    """Assemble one record per indexed paper (abstract + labels) and save it.

    Parameters
    ----------
    paper2idx_path : str
        JSON file whose keys are the paper ids to include (values are unused).
    journal2idx_path : str
        JSON file mapping ISSN -> journal index, used for ``journal_id``.
    paper2issn_path : str
        JSON file mapping paper id -> ISSN.
    paper2abstract : dict
        In-memory mapping paper id -> abstract.
    paper2rank5_path : str, default '../data/paper2rank5.json'
        JSON file mapping paper id -> ``rank5`` label.
    save_path : str, default '../data/dataset_stat_50.json'
        Where the resulting list of records is written as JSON.

    Returns
    -------
    list of dict
        Records with keys ``paper_id``, ``abstract``, ``journal_id`` and
        ``rank5``, in the key order of the paper2idx file.

    Raises
    ------
    KeyError
        If a paper id is missing from any lookup, or its ISSN is missing from
        the journal index.
    """
    def _load_json(path):
        # Close each input file as soon as it has been parsed.
        with open(path, 'r') as f:
            return json.load(f)

    paper2idx = _load_json(paper2idx_path)
    journal2idx = _load_json(journal2idx_path)
    paper2issn = _load_json(paper2issn_path)
    paper2rank5 = _load_json(paper2rank5_path)

    dataset = [
        {'paper_id': paper_id,
         'abstract': paper2abstract[paper_id],
         'journal_id': journal2idx[paper2issn[paper_id]],
         'rank5': paper2rank5[paper_id]}
        for paper_id in paper2idx
    ]
    with open(save_path, 'w') as f:
        json.dump(dataset, f)
    return dataset

In [None]:
# Build the abstract dataset from the stat_paper_50 indices. Besides returning
# the records, create_abstract_dataset also writes them to
# ../data/dataset_stat_50.json.
dataset = create_abstract_dataset('../data/paper2idx.json', 
                         '../data/journal2idx.json', 
                         '../data/paper2issn.json',
                         paper2abstract)

In [17]:
# Draw one shuffle of the dataset row positions and persist it, so every
# train/val/test split in this notebook is cut from the same ordering.
# The generator is seeded so that re-running this cell reproduces the same
# permutation instead of silently replacing the saved one with a new shuffle.
permuted_idx = np.random.RandomState(42).permutation(len(dataset))
print(permuted_idx)
with open('../data/permuted_idx.json', 'w') as f:
    json.dump(permuted_idx.tolist(), f)

[ 7953 12835 14996 ...  4182   957  6830]


In [2]:
def train_val_test_split(dataset, 
                         save_path,
                         feature_col_list, 
                         train_ratio=0.8, 
                         val_ratio=0.1, 
                         test_ratio=0.1,
                         permuted_idx_path='../data/permuted_idx.json'):
    """Shuffle ``dataset`` with a saved permutation, split it, and save the parts.

    The records are reordered by the index list stored in
    ``permuted_idx_path``. The last ``int(n * test_ratio)`` records form the
    test set, the ``val`` records come right before them, and everything
    earlier is the training set.

    Parameters
    ----------
    dataset : sequence of dict
        Records; each must contain every name in ``feature_col_list`` plus
        ``journal_id`` and ``rank5``.
    save_path : str
        Target passed to ``np.save`` (NumPy appends ``.npy`` when missing).
    feature_col_list : list of str
        Record keys to export as features.
    train_ratio : float, default 0.8
        Not used in the computation; the training set is whatever remains
        after the validation and test sets are taken. Kept for compatibility.
    val_ratio, test_ratio : float, default 0.1
        Fractions of the dataset used for validation and test.
    permuted_idx_path : str, default '../data/permuted_idx.json'
        JSON file holding the permutation of row positions.

    Returns
    -------
    None
        A dict with keys ``X_{phase}_{feature}``, ``y_{phase}_journal`` and
        ``y_{phase}_rank`` (phase in train/val/test) is printed by key and
        saved with ``np.save``. Because the payload is a dict, it has to be
        read back with ``np.load(..., allow_pickle=True).item()``.

    Raises
    ------
    ValueError
        If the saved permutation does not have one entry per record.
    """
    with open(permuted_idx_path, 'r') as f:
        permuted_idx = json.load(f)

    total_n = len(dataset)
    if len(permuted_idx) != total_n:
        raise ValueError(
            'permutation in {} has {} entries but dataset has {} records'.format(
                permuted_idx_path, len(permuted_idx), total_n))
    shuffled = [dataset[i] for i in permuted_idx]

    n_test = int(total_n * test_ratio)
    n_val_test = int(total_n * (test_ratio + val_ratio))
    # Use explicit boundaries from the front. Negative-index slices such as
    # data[-0:] or data[:-0] return everything / nothing when a hold-out size
    # is zero, which would put the whole dataset into the wrong split.
    train_end = max(total_n - n_val_test, 0)
    val_end = max(total_n - n_test, 0)
    train = shuffled[:train_end]
    val = shuffled[train_end:val_end]
    test = shuffled[val_end:]

    splitted_data = {}
    for phase, subset in zip(['train', 'val', 'test'], [train, val, test]):
        for feature_col in feature_col_list:
            name = 'X_{}_{}'.format(phase, feature_col)
            splitted_data[name] = [data[feature_col] for data in subset]
        splitted_data['y_{}_journal'.format(phase)] = [data['journal_id'] for data in subset]
        splitted_data['y_{}_rank'.format(phase)] = [data['rank5'] for data in subset]

    print(list(splitted_data.keys()))
    np.save(save_path, splitted_data)

In [23]:
# Split the abstract dataset into train/val/test using the saved permutation;
# np.save writes the result next to the given path with a .npy extension.
train_val_test_split(dataset, '../data/dataset_abstract_stat_50', feature_col_list=['abstract'])

['X_train_abstract', 'y_train_journal', 'y_train_rank', 'X_val_abstract', 'y_val_journal', 'y_val_rank', 'X_test_abstract', 'y_test_journal', 'y_test_rank']


In [24]:
def get_paper2chain(path='../data/stat_paper_50_ref_chain.csv'):
    """Return a dict mapping each paper id to its parsed reference chains.

    Parameters
    ----------
    path : str, default '../data/stat_paper_50_ref_chain.csv'
        CSV with an ``id`` column and a ``ref_chain`` column whose cells hold
        the text form of a nested list.

    Returns
    -------
    dict
        paper id -> parsed ``ref_chain`` value. For a repeated id the last
        row wins.

    Notes
    -----
    Cells are parsed with ``ast.literal_eval``, which reads Python literals
    directly and therefore copes with apostrophes inside quoted values.
    If that fails, the previous strategy (swap single for double quotes and
    parse as JSON) is used as a fallback.
    """
    reference = pd.read_csv(path)
    paper2reference = {}
    for paper_id, raw_chain in zip(reference['id'].tolist(), reference['ref_chain'].tolist()):
        try:
            chain = ast.literal_eval(raw_chain)
        except (ValueError, SyntaxError):
            # Not a Python literal (e.g. contains JSON-only tokens): fall back
            # to the legacy quote-swapping JSON parse.
            chain = json.loads(raw_chain.replace("'", "\""))
        paper2reference[paper_id] = chain
    return paper2reference

In [25]:
# Parse the reference chains once and keep them in memory; this dict is the
# paper2reference argument of create_reference_chain_dataset.
paper2chain = get_paper2chain()

In [3]:
def get_paper2neighbor(path='../data/stat_paper_50_neighbors.csv'):
    """Return a dict mapping each paper id to its two levels of neighbors.

    Parameters
    ----------
    path : str, default '../data/stat_paper_50_neighbors.csv'
        CSV with an ``id`` column and a ``neighbors`` column. Each cell holds
        the text form of a sequence of ``(first, seconds)`` pairs, where
        ``seconds`` is itself a sequence.

    Returns
    -------
    dict
        paper id -> ``(level1, level2)``. ``level1`` lists the first element
        of every pair; ``level2`` concatenates the second elements of all
        pairs, in order. For a repeated id the last row wins.

    Notes
    -----
    Cells are parsed with ``ast.literal_eval`` so tuples (including
    one-element tuples such as ``('x',)``) and parentheses or apostrophes
    inside quoted values are read correctly. If that fails, the previous
    strategy (rewrite quotes and parentheses, then parse as JSON) is used as
    a fallback.
    """
    df = pd.read_csv(path)
    paper2nb = {}
    for paper_id, raw in zip(df['id'].tolist(), df['neighbors'].tolist()):
        try:
            nbs = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            # Legacy path: turn the text into JSON by swapping quote and
            # bracket characters.
            legacy = raw.replace("'", "\"").replace("(", "[").replace(")", "]")
            nbs = json.loads(legacy)
        level1 = [item[0] for item in nbs]
        level2 = [nb for item in nbs for nb in item[1]]
        paper2nb[paper_id] = (level1, level2)
    return paper2nb
    
# Parse the neighbor lists once; this dict is the paper2reference argument of
# create_reference_neighbor_dataset.
paper2neighbor = get_paper2neighbor()

In [42]:
def create_reference_chain_dataset(save_path,
                                   paper2idx_path,
                                   journal2idx_path,
                                   paper2issn_path,
                                   paper2reference,
                                   label2idx_path='../data/journal2idx.json',
                                   paper2rank5_path='../data/paper2rank5.json'):
    """Encode each paper's reference chains as journal indices and save them.

    Parameters
    ----------
    save_path : str
        Where the resulting list of records is written as JSON.
    paper2idx_path : str
        JSON file whose keys are the paper ids to include (values are unused).
    journal2idx_path : str
        JSON file mapping ISSN -> index, used to encode the references.
    paper2issn_path : str
        JSON file mapping paper id -> ISSN (for the papers and their references).
    paper2reference : dict
        paper id -> list of chains; each chain is a sequence of paper ids.
    label2idx_path : str, default '../data/journal2idx.json'
        JSON file mapping ISSN -> index, used for the ``journal_id`` label.
    paper2rank5_path : str, default '../data/paper2rank5.json'
        JSON file mapping paper id -> ``rank5`` label.

    Returns
    -------
    list of dict
        Records with keys ``paper_id``, ``ref_chain``, ``journal_id`` and
        ``rank5``, in the key order of the paper2idx file.

    Raises
    ------
    KeyError
        If a paper id is missing from a lookup, or an ISSN (of the paper or of
        a kept reference) is missing from the corresponding index.

    Notes
    -----
    The first element of every chain is skipped (presumably the paper itself;
    confirm against the code that generates the chains). References that have
    no entry in the paper -> ISSN lookup are dropped.
    """
    def _load_json(path):
        # Close each input file as soon as it has been parsed.
        with open(path, 'r') as f:
            return json.load(f)

    paper2idx = _load_json(paper2idx_path)
    journal2idx = _load_json(journal2idx_path)
    label2idx = _load_json(label2idx_path)
    paper2issn = _load_json(paper2issn_path)
    paper2rank5 = _load_json(paper2rank5_path)

    dataset = []
    for paper_id in paper2idx:
        ref_chain = [
            journal2idx[paper2issn[item]]
            for sublist in paper2reference[paper_id]
            for item in sublist[1:]
            if item in paper2issn
        ]
        dataset.append({'paper_id': paper_id,
                        'ref_chain': ref_chain,
                        'journal_id': label2idx[paper2issn[paper_id]],
                        'rank5': paper2rank5[paper_id]})

    with open(save_path, 'w') as f:
        json.dump(dataset, f)
    return dataset


In [45]:
# Build the reference-chain dataset. References are encoded with the
# "all journals" index, so this needs journal2idx_all.json and
# paper2issn_all.json on disk and `paper2chain` in the kernel.
# NOTE(review): the NameError recorded below suggests this cell was last run
# in a kernel where the `paper2chain = get_paper2chain()` cell had not been
# executed; run the cells in order.
dataset = create_reference_chain_dataset('../data/dataset_stat_50_ref.json',
                        '../data/paper2idx.json', 
                         '../data/journal2idx_all.json', 
                         '../data/paper2issn_all.json',
                         paper2chain)

NameError: name 'paper2chain' is not defined

In [44]:
# Split the reference-chain dataset with the same saved permutation as the other datasets.
train_val_test_split(dataset, '../data/dataset_ref_chain_stat_50', feature_col_list=['ref_chain'])

['X_train_ref_chain', 'y_train_journal', 'y_train_rank', 'X_val_ref_chain', 'y_val_journal', 'y_val_rank', 'X_test_ref_chain', 'y_test_journal', 'y_test_rank']


In [8]:
def create_reference_neighbor_dataset(save_path,
                                      paper2idx_path,
                                      journal2idx_path,
                                      paper2issn_path,
                                      paper2reference,
                                      label2idx_path='../data/journal2idx.json',
                                      paper2rank5_path='../data/paper2rank5.json'):
    """Encode each paper's level-1/level-2 neighbors as journal indices and save them.

    Parameters
    ----------
    save_path : str
        Where the resulting list of records is written as JSON.
    paper2idx_path : str
        JSON file whose keys are the paper ids to include (values are unused).
    journal2idx_path : str
        JSON file mapping ISSN -> index, used to encode the neighbors.
    paper2issn_path : str
        JSON file mapping paper id -> ISSN (for the papers and their neighbors).
    paper2reference : dict
        paper id -> ``(level1, level2)``, two sequences of neighbor paper ids
        in which the string ``'_PAD_'`` marks a padding slot.
    label2idx_path : str, default '../data/journal2idx.json'
        JSON file mapping ISSN -> index, used for the ``journal_id`` label.
    paper2rank5_path : str, default '../data/paper2rank5.json'
        JSON file mapping paper id -> ``rank5`` label.

    Returns
    -------
    list of dict
        Records with keys ``paper_id``, ``ref_level1``, ``ref_level2``,
        ``journal_id`` and ``rank5``, in the key order of the paper2idx file.

    Raises
    ------
    KeyError
        If a paper id or a non-padding neighbor id is missing from the
        paper -> ISSN lookup, or a paper's own ISSN or rank is missing.

    Notes
    -----
    ``'_PAD_'`` entries are encoded as 0. A neighbor whose ISSN is not in the
    journal index is also encoded as 0, and that ISSN is printed.
    """
    def _load_json(path):
        # Close each input file as soon as it has been parsed.
        with open(path, 'r') as f:
            return json.load(f)

    paper2idx = _load_json(paper2idx_path)
    journal2idx = _load_json(journal2idx_path)
    label2idx = _load_json(label2idx_path)
    paper2issn = _load_json(paper2issn_path)
    paper2rank5 = _load_json(paper2rank5_path)

    def _encode(items):
        # Shared encoder for both neighbor levels. Only the "ISSN not in the
        # journal index" case is tolerated; nothing else is swallowed.
        encoded = []
        for item in items:
            if item == '_PAD_':
                encoded.append(0)
                continue
            issn = paper2issn[item]
            if issn in journal2idx:
                encoded.append(journal2idx[issn])
            else:
                print(issn)
                encoded.append(0)
        return encoded

    dataset = []
    for paper_id in paper2idx:
        level1, level2 = paper2reference[paper_id][0], paper2reference[paper_id][1]
        issn1 = _encode(level1)
        issn2 = _encode(level2)
        dataset.append({'paper_id': paper_id,
                        'ref_level1': issn1,
                        'ref_level2': issn2,
                        'journal_id': label2idx[paper2issn[paper_id]],
                        'rank5': paper2rank5[paper_id]})

    with open(save_path, 'w') as f:
        json.dump(dataset, f)
    return dataset

In [9]:
# Build the neighbor dataset (level-1 and level-2 neighbors encoded with the
# "all journals" index).
# NOTE(review): this writes to '../data/dataset_stat_50_ref.json', the same
# path the reference-chain dataset cell writes to, so whichever cell runs last
# overwrites the other's file. It also rebinds `dataset`. Confirm both are intended.
dataset = create_reference_neighbor_dataset('../data/dataset_stat_50_ref.json',
                        '../data/paper2idx.json', 
                         '../data/journal2idx_all.json', 
                         '../data/paper2issn_all.json',
                         paper2neighbor)


In [10]:
# Split the neighbor dataset; both neighbor levels are exported as separate feature columns.
train_val_test_split(dataset, '../data/dataset_ref_nb_stat_50', feature_col_list=['ref_level1', 'ref_level2'])

['X_train_ref_level1', 'X_train_ref_level2', 'y_train_journal', 'y_train_rank', 'X_val_ref_level1', 'X_val_ref_level2', 'y_val_journal', 'y_val_rank', 'X_test_ref_level1', 'X_test_ref_level2', 'y_test_journal', 'y_test_rank']
