In [6]:
import numpy as np
import pandas as pd
import os
import json

In [2]:
# Load the per-paper statistics table; the preview below shows the columns
# id, issn, citescore, rank5 and rank10.
stat_paper_50 = pd.read_csv('../data/stat_paper_50.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns.
print(stat_paper_50.head())

                         id      issn  citescore  rank5  rank10
0  53e99792b7602d9701f5b3b6  02776715       1.99      0       1
1  53e99796b7602d9701f5e172  00225193       1.93      0       1
2  53e997a6b7602d9701f78af9  13674811       7.84      0       0
3  53e997aeb7602d9701f8da92  18638279       1.02      2       4
4  53e997b2b7602d9701f912d5  00225193       1.93      0       1


In [18]:
# Build the paper-id -> ISSN lookup and persist it as JSON.
# `.tolist()` is used so the values written are the list elements pandas returns
# rather than the Series objects themselves.
paper_id, issn = stat_paper_50['id'].tolist(), stat_paper_50['issn'].tolist()
paper2issn = dict(zip(paper_id, issn))
# Use a context manager so the file is flushed and closed before any later cell
# reads it back (a bare `open(...)` passed to json.dump is never closed explicitly).
with open('../data/paper2issn.json', 'w') as f:
    json.dump(paper2issn, f)

In [7]:
def create_idx(column, save_path):
    """Number the distinct values of ``column`` and dump the mapping as JSON.

    Values are numbered starting at 1 in order of first appearance; repeated
    values keep the number given to their first occurrence.

    Args:
        column: object exposing ``.tolist()`` (a DataFrame column in this notebook).
        save_path: path of the JSON file to write.

    Returns:
        None. The mapping is only written to ``save_path``.
    """
    # dict.fromkeys keeps first-seen order while dropping repeats.
    distinct_values = dict.fromkeys(column.tolist())
    mapping = {value: position
               for position, value in enumerate(distinct_values, start=1)}
    with open(save_path, 'w') as out_file:
        json.dump(mapping, out_file)
    print('Data saved at', save_path)
    

In [8]:
# Give every distinct paper id a 1-based integer index and write the mapping to JSON.
create_idx(stat_paper_50['id'], '../data/paper2idx.json')

Data saved at ../data/paper2idx.json


In [9]:
# Give every distinct ISSN (journal) a 1-based integer index and write the mapping to JSON.
create_idx(stat_paper_50['issn'], '../data/journal2idx.json')

Data saved at ../data/journal2idx.json


In [11]:
# Load the table that holds the abstract text for each paper.
abstract = pd.read_csv('../data/paper_abstract.csv')

In [15]:
# Keep only the two columns used to build the id -> abstract lookup.
abstract = abstract[['id', 'abstract']]

In [17]:
# Map each paper id to its abstract text.
# Built with dict(zip(...)) instead of a `for id, ab in ...` loop: a loop variable
# named `id` at notebook scope would shadow the builtin `id()` in every later cell.
paper_ids, paper_abstract = abstract['id'].tolist(), abstract['abstract'].tolist()
paper2abstract = dict(zip(paper_ids, paper_abstract))

In [24]:
def create_dataset(paper2idx_path, journal2idx_path, paper2issn_path, paper2abstract,
                   save_path='../data/dataset_stat_50.json'):
    """Assemble one record per paper and write the list to a JSON file.

    Each record is ``{'paper_id', 'abstract', 'journal_id'}`` where
    ``journal_id`` is looked up as ``journal2idx[paper2issn[paper_id]]``.

    Args:
        paper2idx_path: JSON file whose keys are the paper ids to include
            (the values are not used here).
        journal2idx_path: JSON file mapping ISSN -> journal index.
        paper2issn_path: JSON file mapping paper id -> ISSN.
        paper2abstract: in-memory mapping paper id -> abstract.
        save_path: where the resulting list is written as JSON. Defaults to
            the path this notebook has always used.

    Returns:
        The list of record dicts, in the key order of the paper2idx file.

    Raises:
        KeyError: if a paper id is missing from ``paper2abstract`` or the
            paper2issn file, or its ISSN is missing from the journal2idx file.
    """
    def _load_json(path):
        # Context manager guarantees the handle is closed after reading.
        with open(path, 'r') as f:
            return json.load(f)

    paper2idx = _load_json(paper2idx_path)
    journal2idx = _load_json(journal2idx_path)
    paper2issn = _load_json(paper2issn_path)

    dataset = [
        {'paper_id': paper_id,
         'abstract': paper2abstract[paper_id],
         'journal_id': journal2idx[paper2issn[paper_id]]}
        for paper_id in paper2idx
    ]
    # Close (and therefore flush) the output file before returning.
    with open(save_path, 'w') as f:
        json.dump(dataset, f)
    return dataset
    
# Build the dataset from the three JSON lookups written by earlier cells plus the
# in-memory paper id -> abstract mapping.
dataset = create_dataset('../data/paper2idx.json', 
                         '../data/journal2idx.json', 
                         '../data/paper2issn.json',
                         paper2abstract)

In [32]:
def train_val_test_split(dataset, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1,
                         save_path='../data/dataset_abstract_stat_50', seed=None):
    """Shuffle ``dataset`` and split it into train / validation / test parts.

    Split sizes are ``int(n * test_ratio)`` for test,
    ``int(n * (test_ratio + val_ratio))`` minus the test size for validation,
    and everything that remains for train.

    Args:
        dataset: sequence of dicts, each with ``'abstract'`` and ``'journal_id'`` keys.
        train_ratio: accepted for interface compatibility; it is not used in the
            size computation because train always receives the remainder.
        val_ratio: fraction of samples for validation.
        test_ratio: fraction of samples for test.
        save_path: target passed to ``np.save`` (numpy appends ``.npy`` when the
            name lacks it). Pass ``None`` to skip writing. Because the saved
            object is a dict, reading it back needs ``np.load(..., allow_pickle=True)``.
        seed: optional integer seed for a reproducible shuffle. With ``None`` the
            global ``np.random`` state is used.

    Returns:
        Dict with keys ``X_train, y_train, X_val, y_val, X_test, y_test``.

    Raises:
        ValueError: if ``val_ratio`` or ``test_ratio`` is negative or their sum
            exceeds 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if val_ratio < 0 or test_ratio < 0 or val_ratio + test_ratio > 1 + 1e-9:
        raise ValueError('val_ratio and test_ratio must be non-negative and sum to at most 1')

    records = list(dataset)
    total_n = len(records)
    if seed is None:
        permuted_idx = np.random.permutation(total_n)
    else:
        permuted_idx = np.random.RandomState(seed).permutation(total_n)
    shuffled = [records[i] for i in permuted_idx]

    # Explicit positive sizes: slicing with -0 (e.g. seq[-0:]) selects the WHOLE
    # sequence, which would put every sample into test when the test size rounds to 0.
    n_test = int(total_n * test_ratio)
    n_val = min(total_n, int(total_n * (test_ratio + val_ratio))) - n_test
    n_train = total_n - n_val - n_test

    train = shuffled[:n_train]
    val = shuffled[n_train:n_train + n_val]
    test = shuffled[n_train + n_val:]

    splitted_data = {
        'X_train': [data['abstract'] for data in train],
        'y_train': [data['journal_id'] for data in train],
        'X_val': [data['abstract'] for data in val],
        'y_val': [data['journal_id'] for data in val],
        'X_test': [data['abstract'] for data in test],
        'y_test': [data['journal_id'] for data in test],
    }
    if save_path is not None:
        np.save(save_path, splitted_data)
    return splitted_data
    
    
# Split with the default ratios and write the result under ../data/.
train_val_test_split(dataset)