In [1]:
import ast
import json
import os

import numpy as np
import pandas as pd

In [47]:
def get_paper2issn(use_all=False):
    """Write a JSON file mapping each paper 'id' to its 'issn'.

    Parameters
    ----------
    use_all : bool, default False
        When False, read '../data/stat_paper.csv' and write
        '../data/paper2issn.json'. When True, read
        '../data/paper_info_1990.csv' and write
        '../data/paper2issn_all.json'.

    Returns
    -------
    None
        The mapping is only written to disk.

    Notes
    -----
    If an id occurs more than once in the CSV, the last row wins.
    """
    if use_all:
        read_path = '../data/paper_info_1990.csv'
        write_path = '../data/paper2issn_all.json'
    else:
        read_path = '../data/stat_paper.csv'
        write_path = '../data/paper2issn.json'

    stat_paper = pd.read_csv(read_path)
    paper2issn = dict(zip(stat_paper['id'].tolist(), stat_paper['issn'].tolist()))

    # Use a context manager so the file is flushed and closed deterministically;
    # a bare open() left the handle dangling until garbage collection.
    with open(write_path, 'w') as f:
        json.dump(paper2issn, f)

In [45]:
# Build both id -> ISSN lookups. The `use_all=True` variant writes
# '../data/paper2issn_all.json', which the reference-dataset cell later reads;
# without this call the notebook does not survive Restart & Run All.
get_paper2issn()
get_paper2issn(use_all=True)

In [13]:
def create_idx(column, save_path):
    """Assign 1-based indices to the distinct values of `column` and save them.

    Values are numbered in order of first appearance. The resulting
    value -> index dict is written to `save_path` as JSON and a confirmation
    line is printed.

    Parameters
    ----------
    column : object exposing ``.tolist()``
        The values to index.
    save_path : str
        Destination path of the JSON file.
    """
    value2idx = {}
    for value in column.tolist():
        # setdefault only inserts unseen values, so the next free index is
        # always the current size plus one.
        value2idx.setdefault(value, len(value2idx) + 1)

    with open(save_path, 'w') as out_file:
        json.dump(value2idx, out_file)

    print('Data saved at', save_path)

In [14]:
# Load the stat_paper_50 table and write 1-based index mappings for its
# 'id' column (paper2idx.json) and its 'issn' column (journal2idx.json).
stat_paper_50 = pd.read_csv('../data/stat_paper_50.csv')
create_idx(stat_paper_50['id'], '../data/paper2idx.json')
create_idx(stat_paper_50['issn'], '../data/journal2idx.json')

Data saved at ../data/paper2idx.json
Data saved at ../data/journal2idx.json


In [46]:
# Index every ISSN found in paper_info_1990.csv; the resulting
# journal2idx_all.json is passed to create_reference_dataset further down.
all_paper = pd.read_csv('../data/paper_info_1990.csv')
create_idx(all_paper['issn'], '../data/journal2idx_all.json')

Data saved at ../data/journal2idx_all.json


In [4]:
def get_paper2abstract():
    """Return a dict mapping paper 'id' to 'abstract'.

    Both columns are read from '../data/paper_abstract.csv'. If an id occurs
    more than once, the abstract of the last matching row is kept.
    """
    table = pd.read_csv('../data/paper_abstract.csv')
    keys = table['id'].tolist()
    texts = table['abstract'].tolist()
    return dict(zip(keys, texts))

In [5]:
def create_abstract_dataset(paper2idx_path, journal2idx_path, paper2issn_path, paper2abstract,
                            save_path='../data/dataset_stat_50.json'):
    """Build, save and return the abstract dataset.

    For every paper id in the paper2idx JSON file, one record
    ``{'paper_id', 'abstract', 'journal_id'}`` is produced, where
    ``journal_id`` is ``journal2idx[paper2issn[paper_id]]``.

    Parameters
    ----------
    paper2idx_path : str
        JSON file whose keys are the paper ids to include.
    journal2idx_path : str
        JSON file mapping ISSN -> journal index.
    paper2issn_path : str
        JSON file mapping paper id -> ISSN.
    paper2abstract : dict
        Mapping paper id -> abstract. Ids loaded from JSON are strings, so
        this dict must be keyed by the same string ids.
    save_path : str, optional
        Where the dataset is written as JSON. Defaults to the path that was
        previously hard-coded.

    Returns
    -------
    list of dict
        The records, in the key order of the paper2idx file.

    Raises
    ------
    KeyError
        If a paper id is missing from `paper2abstract` / paper2issn, or its
        ISSN is missing from journal2idx.
    """
    # Context managers close every handle; the bare open() calls leaked them
    # and did not guarantee the written file was flushed.
    with open(paper2idx_path, 'r') as f:
        paper2idx = json.load(f)
    with open(journal2idx_path, 'r') as f:
        journal2idx = json.load(f)
    with open(paper2issn_path, 'r') as f:
        paper2issn = json.load(f)

    dataset = [
        {'paper_id': paper_id,
         'abstract': paper2abstract[paper_id],
         'journal_id': journal2idx[paper2issn[paper_id]]}
        for paper_id in paper2idx
    ]

    with open(save_path, 'w') as f:
        json.dump(dataset, f)
    return dataset

In [24]:
# `paper2abstract` was not assigned by any cell, so this call raised a
# NameError on a fresh kernel; build the lookup here before using it.
paper2abstract = get_paper2abstract()
dataset = create_abstract_dataset('../data/paper2idx.json', 
                         '../data/journal2idx.json', 
                         '../data/paper2issn.json',
                         paper2abstract)

In [8]:
def train_val_test_split(dataset, 
                         save_path,
                         feature_col, 
                         train_ratio=0.8, 
                         val_ratio=0.1, 
                         test_ratio=0.1,
                         seed=None):
    """Shuffle `dataset`, split it into train/val/test and save the result.

    The last ``int(n * test_ratio)`` shuffled records form the test split, the
    records before them (up to ``int(n * (test_ratio + val_ratio))`` from the
    end) form the validation split, and everything else is training data.

    Parameters
    ----------
    dataset : sequence of dict
        Records containing `feature_col` and 'journal_id'.
    save_path : str
        Passed to ``np.save``; numpy appends '.npy' when the extension is
        missing. The file holds a pickled dict, so it must be read back with
        ``np.load(path, allow_pickle=True).item()``.
    feature_col : str
        Key of the feature stored in the ``X_*`` lists.
    train_ratio : float
        Kept for interface compatibility; it is not used, the training split
        always receives whatever val/test leave over.
    val_ratio, test_ratio : float
        Fractions of the data used for validation and test.
    seed : int, optional
        Seed for a reproducible shuffle. When None the global numpy random
        state is used, as before.

    Raises
    ------
    ValueError
        If a ratio is negative or ``val_ratio + test_ratio`` exceeds 1.
    """
    if val_ratio < 0 or test_ratio < 0 or val_ratio + test_ratio > 1 + 1e-9:
        raise ValueError('val_ratio and test_ratio must be non-negative '
                         'and sum to at most 1')

    total_n = len(dataset)
    rng = np.random if seed is None else np.random.RandomState(seed)
    permuted_idx = rng.permutation(total_n)
    shuffled = [dataset[i] for i in permuted_idx]

    n_holdout = min(int(total_n * (test_ratio + val_ratio)), total_n)
    n_test = min(int(total_n * test_ratio), n_holdout)

    # Slice with non-negative bounds. The previous negative-index slicing broke
    # when a count was 0: `data[-0:]` is the whole array and `data[:-0]` is
    # empty, so small datasets ended up entirely in the test split.
    train_end = total_n - n_holdout
    val_end = total_n - n_test
    train = shuffled[:train_end]
    val = shuffled[train_end:val_end]
    test = shuffled[val_end:]

    def _columns(records):
        # Separate one split into its feature list and label list.
        return ([record[feature_col] for record in records],
                [record['journal_id'] for record in records])

    X_train, y_train = _columns(train)
    X_val, y_val = _columns(val)
    X_test, y_test = _columns(test)

    splitted_data = {'X_train': X_train, 'y_train': y_train,
                    'X_val': X_val, 'y_val': y_val,
                    'X_test': X_test, 'y_test': y_test}
    np.save(save_path, splitted_data)

In [32]:
# Split the abstract dataset with the default ratios; the feature of each
# record is its 'abstract'. np.save writes the result next to the given path.
train_val_test_split(dataset, '../data/dataset_abstract_stat_50', feature_col='abstract')

In [21]:
def _parse_ref_chain(raw):
    """Parse one 'ref_chain' cell into Python objects."""
    try:
        # The cells look like Python list literals; literal_eval also copes
        # with apostrophes inside values, which the quote swap below cannot.
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Fall back to the original strategy for JSON-flavoured content
        # (e.g. null/true) that is not a valid Python literal.
        return json.loads(raw.replace("'", "\""))


def get_paper2reference():
    """Return a dict mapping paper 'id' to its parsed 'ref_chain'.

    Both columns are read from '../data/stat_paper_50_ref_chain.csv'. Each
    'ref_chain' cell is a string that is parsed into nested Python objects.

    Raises
    ------
    AttributeError
        If a 'ref_chain' cell is not a string (e.g. a missing value).
    json.JSONDecodeError
        If a cell can be parsed neither as a Python literal nor as JSON.
    """
    reference = pd.read_csv('../data/stat_paper_50_ref_chain.csv')
    paper_ids = reference['id'].tolist()
    paper_reference = reference['ref_chain'].tolist()
    return {paper_id: _parse_ref_chain(ref)
            for paper_id, ref in zip(paper_ids, paper_reference)}

In [22]:
# Parsed reference chains keyed by paper id; consumed by
# create_reference_dataset below.
paper2reference = get_paper2reference()

In [53]:
def create_reference_dataset(save_path,
                            paper2idx_path, 
                             journal2idx_path, 
                             paper2issn_path, 
                             paper2reference,
                             label2idx_path='../data/journal2idx.json'):
    """Build, save and return the reference-chain dataset.

    For every paper id in the paper2idx file, each sublist of
    ``paper2reference[paper_id]`` is scanned. Its first element is skipped
    (presumably the citing paper itself -- confirm against the CSV) and every
    remaining item that has a known ISSN is translated to a journal index.

    Parameters
    ----------
    save_path : str
        Where the dataset is written as JSON.
    paper2idx_path : str
        JSON file whose keys are the paper ids to include.
    journal2idx_path : str
        JSON file mapping ISSN -> index, used for the referenced papers.
    paper2issn_path : str
        JSON file mapping paper id -> ISSN.
    paper2reference : dict
        Mapping paper id -> list of lists of referenced paper ids.
    label2idx_path : str, optional
        JSON file mapping ISSN -> index, used for the label ('journal_id') of
        each paper. Defaults to the path that was previously hard-coded.

    Returns
    -------
    list of dict
        Records ``{'paper_id', 'ref_chain', 'journal_id'}``.

    Raises
    ------
    KeyError
        If a paper id or ISSN is missing from one of the mappings.
    """
    # Context managers close every handle; the bare open() calls leaked them
    # and did not guarantee the written file was flushed.
    with open(paper2idx_path, 'r') as f:
        paper2idx = json.load(f)
    with open(journal2idx_path, 'r') as f:
        journal2idx = json.load(f)
    with open(label2idx_path, 'r') as f:
        label2idx = json.load(f)
    with open(paper2issn_path, 'r') as f:
        paper2issn = json.load(f)

    dataset = []
    for paper_id in paper2idx:
        ref_chain = [
            journal2idx[paper2issn[item]]
            for sublist in paper2reference[paper_id]
            for item in sublist[1:]  # the first entry of each sublist is skipped
            if item in paper2issn
        ]
        dataset.append({'paper_id': paper_id, 
                        'ref_chain': ref_chain, 
                        'journal_id': label2idx[paper2issn[paper_id]]})

    with open(save_path, 'w') as f:
        json.dump(dataset, f)
    return dataset


In [55]:
# Build the reference-chain dataset. Referenced papers are resolved with the
# "_all" lookups (journal2idx_all.json / paper2issn_all.json).
# paper2issn_all.json is produced by get_paper2issn(use_all=True); make sure
# that has been run before this cell.
# NOTE: this rebinds `dataset`, replacing the abstract dataset built earlier.
dataset = create_reference_dataset('../data/dataset_stat_50_ref.json',
                        '../data/paper2idx.json', 
                         '../data/journal2idx_all.json', 
                         '../data/paper2issn_all.json',
                         paper2reference)

In [56]:
# Split the reference dataset with the default ratios; the feature of each
# record is its 'ref_chain' list of journal indices.
train_val_test_split(dataset, '../data/dataset_ref_stat_50', feature_col='ref_chain')