In [2]:
from tqdm import tqdm, trange
import json
from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
import torch
import os
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.models.word2vec import Word2Vec
import random
from scipy.sparse import csr_matrix
import dgl
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx, remove_self_loops,  from_networkx
import networkx as nx
import pickle as pkl

from k_core import myout

Using backend: pytorch


## format

In [2]:
dpath = '../../raw_data/aps/aps-dataset-metadata-2021/'
all_data = []
for journal in tqdm(os.listdir(dpath)): # [PR, PRA]
    files = os.listdir(dpath + journal + '/') # [1, 2, 3]
    for file in files:
        spath = dpath + journal + '/' + str(file)
        jfiles = os.listdir(spath) # ['PhysRevApplied.9.039901.json']
        for jfile in jfiles:
            with open(spath + '/' + jfile) as fr:
                lines = fr.readlines() # len = 1
            paper = json.loads(lines[0])
            all_data.append(paper)
len(all_data)

100%|██████████| 19/19 [09:39<00:00, 30.52s/it]


700035

In [None]:
"""
# APS
{'id': '10.1103/PhysRevApplied.9.039901', 
'title': {
	'value': 'Erratum: Spin Transport ', 
	'format': 'html+mathml'
	}, 
'publisher': {'name': 'APS'}, 
'journal': {
    'id': 'PRAPPLIED', 
	'abbreviatedName': 'Phys. Rev. Applied', 
	'name': 'Physical Review Applied'
           }, 
'issue': {'number': '3'}, 
'volume': {'number': '9'}, 
'pageStart': '039901', 
'hasArticleId': True, 
'date': '2018-03-06', 
'numPages': 2, 
'articleType': 'erratum', 
'identifiers': {'doi': '10.1103/PhysRevApplied.9.039901'}, 
'rights': {
	'rightsStatement': '© 2018 American Physical Society', 
	'copyrightYear': 2018, 
	'copyrightHolders': [{
		'type': 'organization', 
		'name': 'American Physical Society'
		}]
			}, 
 'authors': [{
      'type': 'Person', 
      'name': 'Tomoyuki Sasaki', 
      'firstname': 'Tomoyuki', 
      'surname': 'Sasaki'
       	 		}], 
 'classificationSchemes': {
       'subjectAreas': [{
       		'id': 'electronics', 
       		'label': 'Electronics'
       				}]
       	}
 }
"""

In [3]:
PAPER = namedtuple('Paper', ['title', 'p_id', 'date', 'aids', 'v_id', 'kids'])
VENUE = namedtuple('Venue', ['v_id','vraw', 'vname'])
AUTHOR = namedtuple('Author', ['aid', 'aname', 'type'])
KEYWORD = namedtuple('Keyword', ['kid', 'kname', 'label'])

cnt, cnt_ign = 0, 0
vdict = defaultdict(lambda: -1)
aname2aid, acnt = defaultdict(lambda: -1), 0
kname2kid, kcnt = defaultdict(lambda: -1), 0
Papers, Authors, Venues, Keys = [], [], [], []

tar = tqdm(all_data)
for paper in tar:
    try: title = str(paper['title']['value'])
    except: cnt_ign += 1; continue
    
    try: p_id = str(paper['id'])
    except: cnt_ign += 1; continue
    
    try: date = str(paper['date'])
    except: cnt_ign += 1; continue
    
    aids = ''
    if 'authors' in paper:
        for author in  paper['authors']:
            try: aname = str(author['name'])
            except: cnt_ign += 1; continue
            
            try: tp = str(author['type'])
            except: tp = ''
            
            if aname2aid['aname'] == -1:
                aname2aid[aname] = acnt
                Authors.append(AUTHOR(acnt, aname, tp))
                acnt += 1
            
            aid = aname2aid[aname]
            aids += f',{aid}' if aids != '' else f'{aid}'
    
    v_id = ''
    if 'journal' in paper and 'id' in paper['journal']:
        v_id = str(paper['journal']['id'])
        try: vraw = str(paper['journal']['abbreviatedName'])
        except: vraw = ''
        try: vname = str(paper['journal']['name'])
        except: vname = ''
        
        if vdict[v_id] == -1:
            vdict[v_id] = 1
            Venues.append(VENUE(v_id, vraw, vname))
            
    kids = ''
    if 'classificationSchemes' in paper and 'subjectAreas' in paper['classificationSchemes']:
        for key in paper['classificationSchemes']['subjectAreas']:
            try: kname = str(key['id'])
            except: cnt_ign += 1; continue
            try: label = str(key['label'])
            except: label = ''
            
            if kname2kid[kname] == -1:
                kname2kid[kname] = kcnt
                Keys.append(KEYWORD(kcnt, kname, label))
                kcnt += 1
            
            kid = kname2kid[kname]
            kids += f',{kid}' if kids != '' else f'{kid}'
        
    Papers.append(PAPER(title, p_id, date, aids, v_id, kids))
    cnt += 1
    if cnt%1000 == 0:
        tar.set_postfix(cnt=cnt, ign = cnt_ign)

100%|██████████| 700035/700035 [00:17<00:00, 39588.47it/s, cnt=7e+5, ign=0]  


## to_csv

### papers

In [4]:
df = pd.DataFrame(columns=['title', 'p_id', 'date', 'aids', 'v_id', 'kids'])
df['title'] = [p.title for p in Papers]
df['p_id'] = [p.p_id for p in Papers]
df['date'] = [p.date for p in Papers]
df['aids'] = [p.aids for p in Papers]
df['v_id'] = [p.v_id for p in Papers]
df['kids'] = [p.kids for p in Papers]
df.sort_values(by=['date', 'title'], inplace=True)
df

Unnamed: 0,title,p_id,date,aids,v_id,kids
90969,A Study of the Transmission Spectra of Certain...,10.1103/PhysRevSeriesI.1.1,1893-07-01,302635,PRI,
90970,New Books,10.1103/PhysRevSeriesI.1.66,1893-07-01,,PRI,
90981,Note. Geometrical Proof of the Three-Ammeter M...,10.1103/PhysRevSeriesI.1.59,1893-07-01,302641302642,PRI,
90991,Notes,10.1103/PhysRevSeriesI.1.62,1893-07-01,,PRI,
90994,On The Infra-red Spectra of the Alkalies,10.1103/PhysRevSeriesI.1.28,1893-07-01,302655,PRI,
...,...,...,...,...,...,...
364424,Thermodynamic uncertainty relations for many-b...,10.1103/PhysRevE.104.064141,2021-12-30,12838041283805,PRE,
560869,Tolman-Ehrenfest-Klein law in non-Riemannian g...,10.1103/PhysRevD.104.124089,2021-12-30,20087712008772,PRD,
364824,Transfer matrix method for light propagation i...,10.1103/PhysRevE.104.064702,2021-12-30,"1285151,1285152,1285153,1285154,1285155,128515...",PRE,
92443,Two-dimensional oxides assembled by <math xmln...,10.1103/PhysRevResearch.3.043231,2021-12-30,306334306335306336306337306338,PRRESEARCH,2305


In [14]:
citations = pd.read_csv('../../raw_data/aps/aps-dataset-citations-2021.csv')
citations

Unnamed: 0,citing_doi,cited_doi
0,10.1103/PhysRevSeriesI.11.215,10.1103/PhysRevSeriesI.1.1
1,10.1103/PhysRevSeriesI.12.121,10.1103/PhysRevSeriesI.1.166
2,10.1103/PhysRevSeriesI.7.93,10.1103/PhysRevSeriesI.1.166
3,10.1103/PhysRevSeriesI.16.267,10.1103/PhysRevSeriesI.2.35
4,10.1103/PhysRevSeriesI.17.65,10.1103/PhysRevSeriesI.2.112
...,...,...
9370281,10.1103/PhysRevB.105.L041402,10.1103/PhysRevResearch.3.043230
9370282,10.1103/PhysRevLett.128.072501,10.1103/PhysRevLett.127.272303
9370283,10.1103/PhysRevB.105.L081405,10.1103/PhysRevX.11.041063
9370284,10.1103/PhysRevX.12.011043,10.1103/PhysRevX.11.041063


In [15]:
pid2pid = defaultdict(lambda: '')
for i in trange(len(citations)):
    pid1, pid2 = citations['citing_doi'].iloc[i], citations['cited_doi'].iloc[i]
    ss = pid2pid[pid1]
    if ss == '':
        pid2pid[pid1] = str(pid2)
    else:
        pid2pid[pid1] += f',{pid2}'
myout(pid2pid)

100%|██████████| 9370286/9370286 [05:04<00:00, 30769.48it/s]

pid2pid : len=674344, dict([10.1103/PhysRevSeriesI.11.215: 10.1103/PhysRevSeriesI.1.1,10.1103/PhysRevSeriesI.2.161, 10.1103/PhysRevSeriesI.12.121: 10.1103/PhysRevSeriesI.1.166, 10.1103/PhysRevSeriesI.7.93: 10.1103/PhysRevSeriesI.1.166, 10.1103/PhysRevSeriesI.16.267: 10.1103/PhysRevSeriesI.2.35,10.1103/PhysRevSeriesI.14.38, 10.1103/PhysRevSeriesI.17.65: 10.1103/PhysRevSeriesI.2.112,10.1103/PhysRevSeriesI.2.161,10.1103/PhysRevSeriesI.13.257,10.1103/PhysRevSeriesI.13.345,10.1103/PhysRevSeriesI.15.306,10.1103/PhysRevSeriesI.16.94, 10.1103/PhysRevSeriesI.13.65: 10.1103/PhysRevSeriesI.2.260, ...])





In [25]:
df['refs'] = df['p_id'].map(pid2pid)
df['year'] = df['date'].apply(lambda u: int(u.split('-')[0]))
df

Unnamed: 0,title,p_id,date,aids,v_id,kids,refs,year
90969,A Study of the Transmission Spectra of Certain...,10.1103/PhysRevSeriesI.1.1,1893-07-01,302635,PRI,,,1893
90970,New Books,10.1103/PhysRevSeriesI.1.66,1893-07-01,,PRI,,,1893
90981,Note. Geometrical Proof of the Three-Ammeter M...,10.1103/PhysRevSeriesI.1.59,1893-07-01,302641302642,PRI,,,1893
90991,Notes,10.1103/PhysRevSeriesI.1.62,1893-07-01,,PRI,,,1893
90994,On The Infra-red Spectra of the Alkalies,10.1103/PhysRevSeriesI.1.28,1893-07-01,302655,PRI,,,1893
...,...,...,...,...,...,...,...,...
364424,Thermodynamic uncertainty relations for many-b...,10.1103/PhysRevE.104.064141,2021-12-30,12838041283805,PRE,,"10.1103/PhysRevE.49.2726,10.1103/PhysRevLett.8...",2021
560869,Tolman-Ehrenfest-Klein law in non-Riemannian g...,10.1103/PhysRevD.104.124089,2021-12-30,20087712008772,PRD,,"10.1103/PhysRev.35.904,10.1103/PhysRev.36.1791...",2021
364824,Transfer matrix method for light propagation i...,10.1103/PhysRevE.104.064702,2021-12-30,"1285151,1285152,1285153,1285154,1285155,128515...",PRE,,"10.1103/PhysRevLett.25.577,10.1103/PhysRevA.9....",2021
92443,Two-dimensional oxides assembled by <math xmln...,10.1103/PhysRevResearch.3.043231,2021-12-30,306334306335306336306337306338,PRRESEARCH,2305,"10.1103/PhysRev.140.A1133,10.1103/PhysRevLett....",2021


In [26]:
dataset = 'aps'
df.to_csv(f'../data/{dataset}/Papers.csv', index=False)

### authors, venues, keywords

In [7]:
adf = pd.DataFrame(columns=['aid', 'aname', 'type'])
adf['aid'] = [p.aid for p in Authors]
adf['aname'] = [p.aname for p in Authors]
adf['type'] = [p.type for p in Authors]
adf.sort_values(by=['aid', 'aname'], inplace=True)
adf

Unnamed: 0,aid,aname,type
0,0,Tomoyuki Sasaki,Person
1,1,Yuichiro Ando,Person
2,2,Makoto Kameno,Person
3,3,Takayuki Tahara,Person
4,4,Hayato Koike,Person
...,...,...,...
2480368,2480368,Mikhail Zobov,Person
2480369,2480369,A. Lorusso,Person
2480370,2480370,F. Gontad,Person
2480371,2480371,A. Perrone,Person


In [8]:
vdf = pd.DataFrame(columns=['v_id', 'vraw', 'vname'])
vdf['v_id'] = [p.v_id for p in Venues]
vdf['vraw'] = [p.vraw for p in Venues]
vdf['vname'] = [p.vname for p in Venues]
vdf['vraw'] = [p.vraw for p in Venues]
vdf.sort_values(by=['v_id', 'vraw'], inplace=True)
vdf

Unnamed: 0,v_id,vraw,vname
8,PR,Phys. Rev.,Physical Review
1,PRA,Phys. Rev. A,Physical Review A
5,PRAB,Phys. Rev. Accel. Beams,Physical Review Accelerators and Beams
0,PRAPPLIED,Phys. Rev. Applied,Physical Review Applied
12,PRB,Phys. Rev. B,Physical Review B
16,PRC,Phys. Rev. C,Physical Review C
15,PRD,Phys. Rev. D,Physical Review D
13,PRE,Phys. Rev. E,Physical Review E
9,PRFLUIDS,Phys. Rev. Fluids,Physical Review Fluids
2,PRI,Phys. Rev. (Series I),Physical Review (Series I)


In [9]:
kdf = pd.DataFrame(columns=['kid', 'kname', 'label'])
kdf['kid'] = [p.kid for p in Keys]
kdf['kname'] = [p.kname for p in Keys]
kdf['label'] = [p.label for p in Keys]
kdf.sort_values(by=['kid', 'kname'], inplace=True)
kdf

Unnamed: 0,kid,kname,label
0,0,electronics,Electronics
1,1,nano,Nanophysics
2,2,spintronics,Spintronics
3,3,cond-matt,Condensed Matter Physics
4,4,energy,Energy Research
5,5,materials,Materials Science
6,6,optoelectronics,Optoelectronics
7,7,photonics,Photonics
8,8,quantum-info,Quantum Information
9,9,astrophysics,Astrophysics


In [10]:
adf.to_csv(f'../data/{dataset}/Authors.csv', index=False)
vdf.to_csv(f'../data/{dataset}/Venues.csv', index=False)
kdf.to_csv(f'../data/{dataset}/Keywords.csv', index=False)

## train word2vec

In [12]:
embed_dim = 128
mpath = f'../word2vec/{dataset}_{embed_dim}.model'
if os.path.exists(mpath):
    model = Word2Vec.load(mpath)
else:
    sents = []
    for i in trange(len(df)):
        title = strip_non_alphanum(str(df['title'].iloc[i]).strip()).split()
        # abs = strip_non_alphanum(str(df['abst'].iloc[i]).strip()).split()
        sents.append(title)
        # sents.append(abs)
    # print(len(sents))

    model = Word2Vec(sents, vector_size= embed_dim, sg = 1)
    model.save(mpath)

In [19]:
p_id2emb = {}
for i in trange(len(df), desc='gen_pid2embed'):
    p_id = str(df['p_id'].iloc[i])
    # abs = strip_non_alphanum(str(df['abst'].iloc[i]).strip()).split()
    title = strip_non_alphanum(str(df['title'].iloc[i]).strip()).split()
    # lst = title + abs

    vals = []
    for ss in title:
        try:
            vals.append(model.wv[ss].astype(np.float32))
        except:
            pass
    # print(i, vals)
    embed = np.mean(np.vstack(vals), axis=0) if len(vals)>0 else np.zeros(embed_dim, dtype=np.float32)
    p_id2emb[p_id] = embed
myout(p_id2emb)

gen_pid2embed: 100%|██████████| 700035/700035 [02:19<00:00, 5004.88it/s]

p_id2emb : len=700035, dict([10.1103/PhysRevSeriesI.1.1: [ 6.37484789e-02 -1.56797662e-01 -5.31356275e-01 -2.94902354e-01
  2.31120452e-01  3.06525081e-01 -1.64697647e-01 -1.35186583e-01
  2.44683966e-01  1.04386955e-01  2.70847946e-01 -6.16349354e-02
  3.17574173e-01 -8.29862654e-02  1.76895708e-01 -1.39169414e-02
  1.54908419e-01 -4.55742106e-02 -1.03417464e-01  1.78361490e-01
 -1.57583714e-01  1.14076614e-01 -2.87553906e-01  1.59453943e-01
  1.34424781e-02  1.54614896e-01  8.59246105e-02  8.55579302e-02
 -1.00014448e-01 -8.96584243e-02  4.31428775e-02 -2.55747199e-01
 -3.47675048e-02  2.10791662e-01 -1.00793615e-01  2.39948973e-01
  1.22995190e-01  1.72893703e-01  2.27474242e-01  1.66270465e-01
 -1.22651057e-02 -1.42175406e-01 -1.30287170e-01  1.24966584e-01
  6.04621395e-02  9.48932301e-03  6.88199252e-02  1.25183612e-01
  1.06229082e-01  1.87827960e-01  2.43721791e-02  1.71982035e-01
 -1.64385721e-01  1.04352988e-01  2.46298879e-01  8.15109089e-02
 -1.89954434e-02  8.51822197e-02 




In [22]:
pkl.dump(p_id2emb, open(f'../data/{dataset}/p_id2emb.pkl', 'wb'))

## build graph

In [None]:
df = pd.read_csv(f'../data/{dataset}/Papers.csv')
df.dropna(subset=['p_id', 'year'], inplace=True)
df = df.fillna({'refs': '', 'aids': '', 'v_id': '', 'kids': ''})
df

In [5]:
dataset = 'aps'
p_id2emb = pkl.load(open(f'../data/{dataset}/p_id2emb.pkl', 'rb'))
myout(p_id2emb)
def update_idx(idx, dic, cnt, feats, rel, feat_dim, no_emb):
    if idx not in dic:
        dic[idx] = cnt
        cnt += 1
        if rel == 0:
            try:
                feats.append(torch.from_numpy(p_id2emb[idx]).to(torch.float32))
            except:
                feats.append(torch.rand(feat_dim).to(torch.float32)-1)
                no_emb += 1
        elif rel == 1:
            feats.append(torch.rand(feat_dim).to(torch.float32)+1)
        elif rel == 2:
            feats.append(torch.rand(feat_dim).to(torch.float32)+2)
        elif rel == 3:
            feats.append(torch.rand(feat_dim).to(torch.float32))
    return dic, cnt, feats, no_emb

p_id2emb : len=700035, dict([10.1103/PhysRevSeriesI.1.1: [ 6.37484789e-02 -1.56797662e-01 -5.31356275e-01 -2.94902354e-01
  2.31120452e-01  3.06525081e-01 -1.64697647e-01 -1.35186583e-01
  2.44683966e-01  1.04386955e-01  2.70847946e-01 -6.16349354e-02
  3.17574173e-01 -8.29862654e-02  1.76895708e-01 -1.39169414e-02
  1.54908419e-01 -4.55742106e-02 -1.03417464e-01  1.78361490e-01
 -1.57583714e-01  1.14076614e-01 -2.87553906e-01  1.59453943e-01
  1.34424781e-02  1.54614896e-01  8.59246105e-02  8.55579302e-02
 -1.00014448e-01 -8.96584243e-02  4.31428775e-02 -2.55747199e-01
 -3.47675048e-02  2.10791662e-01 -1.00793615e-01  2.39948973e-01
  1.22995190e-01  1.72893703e-01  2.27474242e-01  1.66270465e-01
 -1.22651057e-02 -1.42175406e-01 -1.30287170e-01  1.24966584e-01
  6.04621395e-02  9.48932301e-03  6.88199252e-02  1.25183612e-01
  1.06229082e-01  1.87827960e-01  2.43721791e-02  1.71982035e-01
 -1.64385721e-01  1.04352988e-01  2.46298879e-01  8.15109089e-02
 -1.89954434e-02  8.51822197e-02 

In [6]:
start_year, end_year = 2000, 2022
feat_dim = 128

pid2rid, cnt, no_emb = {}, 0, 0
lst, feats = [], []
for i in trange(len(df)):
    ts = int(df['year'].iloc[i])
    p_id = str(df['p_id'].iloc[i])
    refs = df['refs'].iloc[i]
    
    pid2rid, cnt, feats, no_emb = update_idx(p_id, pid2rid, cnt, feats, 0, feat_dim, no_emb)
    if len(refs)>0:
        rlst = refs.split(',')
        for ref in rlst:
            if ref != '':
                ref = str(ref)
                pid2rid, cnt, feats, no_emb = update_idx(ref, pid2rid, cnt, feats, 0, feat_dim, no_emb)
                lst.append((pid2rid[p_id], pid2rid[ref], 0, ts))
        
feat = torch.stack(feats)
src = torch.tensor([item[0] for item in lst])
tgt = torch.tensor([item[1] for item in lst])
rel = torch.tensor([item[2] for item in lst])
tsp = torch.tensor([item[3] for item in lst])
myout(feat, src, tgt, rel, tsp, no_emb)

100%|██████████| 700035/700035 [00:58<00:00, 11971.43it/s]


feat : shape=torch.Size([700492, 128])
tensor([[ 0.0637, -0.1568, -0.5314,  ..., -0.1144,  0.1223,  0.2986],
        [-0.0521, -0.5351, -0.5705,  ..., -0.0148,  0.3100,  0.1798],
        [-0.1255, -0.1741, -0.3433,  ..., -0.1493,  0.1544,  0.1175],
        ...,
        [-0.0355, -0.1025,  0.0216,  ..., -0.2168,  0.0292,  0.0406],
        [-0.0477, -0.3570, -0.1027,  ..., -0.3685,  0.0045,  0.4755],
        [-0.0142, -0.4941, -0.2021,  ..., -0.3956,  0.0186,  0.4762]])
src : shape=torch.Size([9274857]), tensor([    39,    181,    208,  ..., 700491, 700491, 700491])
tgt : shape=torch.Size([9274857]), tensor([    40,     91,    143,  ..., 665309, 665576, 671551])
rel : shape=torch.Size([9274857]), tensor([0, 0, 0,  ..., 0, 0, 0])
tsp : shape=torch.Size([9274857]), tensor([1894, 1897, 1898,  ..., 2021, 2021, 2021])
no_emb = 457


In [27]:
start_year, end_year = 2000, 2022
feat_dim = 128

pid2rid, cnt, no_emb = {}, 0, 0
lst, feats = [], []
for i in trange(len(df)):
    ts = int(df['year'].iloc[i])
    p_id = str(df['p_id'].iloc[i])
    refs = df['refs'].iloc[i]
    
    pid2rid, cnt, feats, no_emb = update_idx(p_id, pid2rid, cnt, feats, 0, feat_dim, no_emb)
    if len(refs)>0:
        rlst = refs.split(',')
        for ref in rlst:
            if ref != '':
                ref = str(ref)
                pid2rid, cnt, feats, no_emb = update_idx(ref, pid2rid, cnt, feats, 0, feat_dim, no_emb)
                lst.append((pid2rid[p_id], pid2rid[ref], 0, ts))
        
feat = torch.stack(feats)
src = torch.tensor([item[0] for item in lst])
tgt = torch.tensor([item[1] for item in lst])
rel = torch.tensor([item[2] for item in lst])
tsp = torch.tensor([item[3] for item in lst])
myout(feat, src, tgt, rel, tsp)

100%|██████████| 700035/700035 [01:04<00:00, 10849.79it/s]


feat : shape=torch.Size([700492, 128])
tensor([[ 0.0637, -0.1568, -0.5314,  ..., -0.1144,  0.1223,  0.2986],
        [-0.0521, -0.5351, -0.5705,  ..., -0.0148,  0.3100,  0.1798],
        [-0.1255, -0.1741, -0.3433,  ..., -0.1493,  0.1544,  0.1175],
        ...,
        [-0.0355, -0.1025,  0.0216,  ..., -0.2168,  0.0292,  0.0406],
        [-0.0477, -0.3570, -0.1027,  ..., -0.3685,  0.0045,  0.4755],
        [-0.0142, -0.4941, -0.2021,  ..., -0.3956,  0.0186,  0.4762]])
src : shape=torch.Size([9274857]), tensor([    39,    181,    208,  ..., 700491, 700491, 700491])
tgt : shape=torch.Size([9274857]), tensor([    40,     91,    143,  ..., 665309, 665576, 671551])
rel : shape=torch.Size([9274857]), tensor([0, 0, 0,  ..., 0, 0, 0])
tsp : shape=torch.Size([9274857]), tensor([1894, 1897, 1898,  ..., 2021, 2021, 2021])


In [28]:
graph = dgl.graph((src, tgt), num_nodes=len(feat))
graph.ndata['feat'] = feat
graph.ndata['raw_nid'] = torch.arange(len(feat))

graph.edata['rel'] = rel
graph.edata['ts'] = tsp
graph

Graph(num_nodes=700492, num_edges=9274857,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'rel': Scheme(shape=(), dtype=torch.int64), 'ts': Scheme(shape=(), dtype=torch.int64)})

In [29]:
dgl.save_graphs(f'../data/{dataset}/raw_graph.bin', [graph])
json.dump(pid2rid, open(f'../data/{dataset}/pid2rid.json', 'w'))

## k-core

In [30]:
train_start, valid_end = 2000, 2012
ts_eids = graph.filter_edges(lambda x: (x.data['ts']>=train_start) & (x.data['ts']<valid_end))
ts_graph = dgl.edge_subgraph(graph, ts_eids)
ts_graph

Graph(num_nodes=359367, num_edges=2881569,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'rel': Scheme(shape=(), dtype=torch.int64), 'ts': Scheme(shape=(), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64)})

In [None]:
g = Data(feat=ts_graph.ndata['feat'], edge_index=torch.stack(ts_graph.edges()), ts=ts_graph.edata['ts'], \
    raw_nid=ts_graph.ndata['raw_nid'])
G = to_networkx(g, node_attrs=['feat', 'raw_nid'], edge_attrs=['ts'], remove_self_loops=True)
g

In [32]:
g_core = nx.k_core(G, k=13)
core_dg = dgl.from_networkx(g_core, node_attrs=['feat', 'raw_nid'], edge_attrs=['ts'])
core_dg, core_dg.edata['ts'].unique()

(Graph(num_nodes=94587, num_edges=1421331,
       ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64)}
       edata_schemes={'ts': Scheme(shape=(), dtype=torch.int64)}),
 tensor([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011]))

## sort graph

In [34]:
def sort_edges(graph, by=['_SRC', '_TGT']):
    """Sort edges of a dgl.graph by edge_attrs, where '_SRC' and '_TGT' are the index of source nodes and target nodes, respectively.

    Args:
        graph (dgl.graph): _description_
        by (list, optional): edge attributes. Defaults to ['_SRC', '_TGT'].

    Returns:
        _type_: dgl.graph
    """
    df = pd.DataFrame({'_SRC': graph.edges()[0].numpy(), '_TGT': graph.edges()[1].numpy()})
    for name, efeat in graph.edata.items():
        df[name] = efeat.numpy()
    df = df.sort_values(by=by)
    
    new_graph = dgl.graph((torch.tensor(df['_SRC'].tolist()), torch.tensor(df['_TGT'].tolist())), \
        num_nodes = graph.number_of_nodes())
    for name, efeat in graph.edata.items():
        new_graph.edata[name] = torch.tensor(df[name].tolist())
    for name, nfeat in graph.ndata.items():
        new_graph.ndata[name] = nfeat
    return new_graph


def relabel_nodes(graph, mapping):
    """Relabel the nodes of the dgl.graph according to a given mapping.

    Args:
        graph (dgl.graph): _description_
        mapping (dictionary): _description_

    Returns:
        graph (dgl.graph): _description_
    """
    src_lst = graph.edges()[0].tolist()
    tgt_lst = graph.edges()[1].tolist()
    new_src = torch.tensor([mapping[item] for item in src_lst])
    new_tgt = torch.tensor([mapping[item] for item in tgt_lst])
    
    new_graph = dgl.graph((new_src, new_tgt))
    for name, efeat in graph.edata.items():
        new_graph.edata[name] = efeat
    for name, nfeat in graph.ndata.items():
        new_graph.ndata[name] = nfeat
    return new_graph

In [36]:
core_dg.edges()

(tensor([    0,     0,     0,  ..., 81021, 81021, 81021]),
 tensor([81029, 81030, 81031,  ..., 77176, 77193, 78364]))

In [37]:
core_dg = sort_edges(core_dg, by=['ts', '_SRC'])
core_dg.edges()

(tensor([    0,     0,     0,  ..., 81021, 81021, 81021]),
 tensor([81029, 81030, 81031,  ..., 77176, 77193, 78364]))

In [39]:
src_lst = core_dg.edges()[0].tolist()
tgt_lst = core_dg.edges()[1].tolist()
cur2new, cnt = {}, 0
for src,tgt in tqdm(zip(src_lst, tgt_lst)):
    if src not in cur2new:
        cur2new[src] = cnt
        cnt += 1
    if tgt not in cur2new:
        cur2new[tgt] = cnt
        cnt += 1
core_dg = relabel_nodes(core_dg, cur2new)
core_dg.edges()

(tensor([    0,     0,     0,  ..., 94586, 94586, 94586]),
 tensor([    1,     2,     3,  ..., 90741, 90758, 91930]))

In [40]:
dgl.save_graphs(f'../data/{dataset}/graph.bin', [core_dg])

## build cites

In [4]:
dataset = 'aps'
core_dg = dgl.load_graphs(f'../data/{dataset}/graph.bin')[0][0]
core_dg, core_dg.edges()

(Graph(num_nodes=94587, num_edges=1421331,
       ndata_schemes={'raw_nid': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(128,), dtype=torch.float32)}
       edata_schemes={'ts': Scheme(shape=(), dtype=torch.int64)}),
 (tensor([    0,     0,     0,  ..., 94586, 94586, 94586]),
  tensor([    1,     2,     3,  ..., 90741, 90758, 91930])))

In [8]:
pid2rid = json.load(open(f'../data/{dataset}/pid2rid.json', 'r'))
myout(pid2rid)

pid2rid : len=700492, dict([10.1103/PhysRevSeriesI.1.1: 0, 10.1103/PhysRevSeriesI.1.66: 1, 10.1103/PhysRevSeriesI.1.59: 2, 10.1103/PhysRevSeriesI.1.62: 3, 10.1103/PhysRevSeriesI.1.28: 4, 10.1103/PhysRevSeriesI.1.19: 5, ...])


In [43]:
def build_cites(dgl_graph):
    # 1. Calculate the citation for each year
    tgt_lst = dgl_graph.edges()[1].tolist()
    tsp_lst = dgl_graph.edata['ts'].tolist()
    start_year, end_year = dgl_graph.edata['ts'].min().item(), dgl_graph.edata['ts'].max().item()+1
    
    cites = {}
    for year in range(start_year, end_year):
        cites[year] = defaultdict(int)
    
    for year, tgt in zip(tsp_lst, tgt_lst):
        cites[year][tgt] += 1

    # 2. Calculate the index range corresponding to each year
    ts_vals, ts_cuts = np.unique(tsp_lst, return_index=True)
    ts_cuts = list(ts_cuts) + [len(tsp_lst)]
    ts_infos = np.stack([ts_vals, ts_cuts[0:len(ts_vals)], ts_cuts[1:len(ts_vals)+1]]).transpose()
    
    # 3. Generate citations of papers for each year in all future years
    nid2id = {nid:idd for nid, idd in enumerate(dgl_graph.ndata['raw_nid'].tolist())}
    labels = {}
    for year in range(start_year, end_year):
        left, right = ts_infos[np.where(ts_infos[:, 0]==year)[0][0], 1:]
        nids = dgl_graph.edges()[0][left:right].unique().tolist()
        ids = [nid2id[nid] for nid in nids]
        
        pdf = pd.DataFrame({'id': ids, 'nid': nids})
        tbar = trange(year+1, end_year, desc=str(year))
        for yy in tbar:
            cdf = pd.DataFrame({'nid': list(cites[yy].keys()), str(yy): list(cites[yy].values())})
            cdf[str(yy)] = cdf[str(yy)].astype('float32')
            
            pdf = pd.merge(pdf, cdf, how='left', on='nid')
            tbar.set_postfix(year=year, pdf=len(pdf))
        pdf.fillna(0, inplace=True)
        labels[year] = pdf
    
    # 4. Calculate the cumulative citations and smooth them by taking log
    def cumulative_log(df):
        colsn = list(df.columns)
        for i in range(3, len(colsn)):
            df[colsn[i]] += df[colsn[i-1]]
        df.iloc[:, 2:] = np.log(df.iloc[:, 2:] + 1)
        return df

    labels_cum_log = {}
    for year in range(start_year, end_year-2): # 2000, 2020
        labels_cum_log[year] = cumulative_log(labels[year])
    labels_cum_log[end_year-2] = labels[end_year-2]
    
    return labels_cum_log

In [5]:
def build_cites(dgl_graph):
    # 1. Calculate the citation for each year
    tgt_lst = dgl_graph.edges()[1].tolist()
    tsp_lst = dgl_graph.edata['ts'].tolist()
    start_year, end_year = dgl_graph.edata['ts'].min().item(), dgl_graph.edata['ts'].max().item()+1
    
    year_nid2cites = {}
    for year in range(start_year, end_year):
        year_nid2cites[year] = defaultdict(int)
    
    for year, tgt in zip(tsp_lst, tgt_lst):
        year_nid2cites[year][tgt] += 1

    # 2. Calculate the index range corresponding to each year
    ts_vals, ts_cuts = np.unique(tsp_lst, return_index=True)
    ts_cuts = list(ts_cuts) + [len(tsp_lst)]
    ts_infos = np.stack([ts_vals, ts_cuts[0:len(ts_vals)], ts_cuts[1:len(ts_vals)+1]]).transpose()
    
    # 3. Generate citations of papers for each year in all future years
    nid2rid = {nid:rid for nid, rid in enumerate(dgl_graph.ndata['raw_nid'].tolist())}
    cites = {}
    for year in range(start_year, end_year):
        left, right = ts_infos[np.where(ts_infos[:, 0]==year)[0][0], 1:]
        nids = dgl_graph.edges()[0][left:right].unique().tolist()
        rids = [nid2rid[nid] for nid in nids]
        
        pdf = pd.DataFrame({'rid': rids, 'nid': nids})
        tbar = trange(year+1, end_year, desc=str(year))
        for yy in tbar:
            cdf = pd.DataFrame({'nid': list(year_nid2cites[yy].keys()), str(yy): list(year_nid2cites[yy].values())})
            cdf[str(yy)] = cdf[str(yy)].astype('float32')
            
            pdf = pd.merge(pdf, cdf, how='left', on='nid')
            tbar.set_postfix(year=year, pdf=len(pdf))
        pdf.fillna(0, inplace=True)
        cites[year] = pdf
    
    # 4. Calculate the cumulative citations and smooth them by taking log
    def cumulative_log(df):
        colsn = list(df.columns)
        for i in range(3, len(colsn)):
            df[colsn[i]] += df[colsn[i-1]]
        df.iloc[:, 2:] = np.log(df.iloc[:, 2:] + 1)
        return df

    cites_cum_log = {}
    for year in range(start_year, end_year-1): # 2000, 2020
        cites_cum_log[year] = cumulative_log(cites[year])
    cites_cum_log[end_year-1] = cites[end_year-1]
    
    return cites_cum_log

In [6]:
labels_cum_log = build_cites(core_dg)

2000: 100%|██████████| 11/11 [00:00<00:00, 30.05it/s, pdf=4480, year=2000]
2001: 100%|██████████| 10/10 [00:00<00:00, 31.82it/s, pdf=5128, year=2001]
2002: 100%|██████████| 9/9 [00:00<00:00, 29.38it/s, pdf=5664, year=2002]
2003: 100%|██████████| 8/8 [00:00<00:00, 29.06it/s, pdf=5697, year=2003]
2004: 100%|██████████| 7/7 [00:00<00:00, 27.81it/s, pdf=6572, year=2004]
2005: 100%|██████████| 6/6 [00:00<00:00, 27.60it/s, pdf=7379, year=2005]
2006: 100%|██████████| 5/5 [00:00<00:00, 26.81it/s, pdf=7440, year=2006]
2007: 100%|██████████| 4/4 [00:00<00:00, 25.94it/s, pdf=7392, year=2007]
2008: 100%|██████████| 3/3 [00:00<00:00, 24.68it/s, pdf=8198, year=2008]
2009: 100%|██████████| 2/2 [00:00<00:00, 25.54it/s, pdf=7853, year=2009]
2010: 100%|██████████| 1/1 [00:00<00:00, 22.78it/s, pdf=7857, year=2010]
2011: 0it [00:00, ?it/s]


In [7]:
labels_cum_log[2005]

Unnamed: 0,rid,nid,2006,2007,2008,2009,2010,2011
0,404245,41025,0.693147,1.098612,1.098612,1.609438,1.791759,1.945910
1,404246,41026,0.693147,1.609438,2.079442,2.302585,2.302585,2.397895
2,404249,41027,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147
3,404250,41028,0.000000,1.098612,1.609438,1.609438,1.609438,1.609438
4,404251,41029,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
7374,422148,48453,0.693147,0.693147,1.386294,1.609438,1.609438,1.609438
7375,422155,48454,0.000000,0.000000,0.000000,0.000000,0.693147,0.693147
7376,422157,48455,0.000000,0.693147,0.693147,1.098612,1.098612,1.098612
7377,422158,48456,0.000000,0.693147,0.693147,1.098612,1.098612,1.386294


In [46]:
pkl.dump(labels_cum_log, open(f'../data/{dataset}/labels_cum_log.pkl', 'wb'))