In [1]:
from tqdm import tqdm, trange
import json
from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
import torch
import os
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.models.word2vec import Word2Vec
import random
from scipy.sparse import csr_matrix
import dgl
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx, remove_self_loops,  from_networkx
import networkx as nx

import pickle as pkl
from k_core import myout

KeyboardInterrupt: 

In [4]:
f = open('../../raw_data/dblp/dblpv13.compress.json', 'r')
all_data = f.readlines()
f.close()
len(all_data)

5354309

## format

In [None]:
"""
{ 
"_id" : "53e99784b7602d9701f3e133", 
"title" : "The relationship between canopy parameters and spectrum of winter wheat under different irrigations in Hebei Province.", 
"authors" : [ { "_id" : "53f45728dabfaec09f209538", "name" : "Peijuan Wang" }, 
            { "_id" : "5601754345cedb3395e59457", "name" : "Jiahua Zhang" }, 
            { "_id" : "53f38438dabfae4b34a08928", "name" : "Donghui Xie" }, 
            { "_id" : "5601754345cedb3395e5945a", "name" : "Yanyan Xu" }, 
            { "_id" : "53f43d25dabfaeecd6995149", "name" : "Yun Xu" } ], 
"venue" : { "_id" : "53a7297d20f7420be8bd4ae7", 
            "name_d" : "International Geoscience and Remote Sensing Symposium",
            "type" : 0, 
            "raw" : "IGARSS" }, 
"year" : 2011, 
"keywords" : [ "canopy parameters", "canopy spectrum", ], 
"fos" : [ "Agronomy", "Moisture"], 
"n_citation" : 0,
"page_start" : "1930", 
"page_end" : "1933",
"lang" : "en", 
"volume" : "null", 
"issue" : "null",
"issn" : "", 
"isbn" : "", 
"doi" : "10.1109/IGARSS.2011.6049503", 
"pdf" : null, 
"url" : [ "http://dx.doi.org/10.1109/IGARSS.2011.6049503" ], 
"abstract" : "Drought is the first place in all the natural disasters in the world.",
'references': ['53e99cf5b7602d97025ace63', '557e8a7a6fee0fe990caa63d']
}
"""

In [20]:
PAPER = namedtuple('Paper', ['title', 'p_id', 'year', 'ncites', 'refs', 'a_ids', 'v_id', 'fids', 'kids', 'abst'])
VENUE = namedtuple('Venue', ['vraw', 'v_id','vname', 'vtype'])
AUTHOR = namedtuple('Author', ['aname', 'a_id'])
FOS = namedtuple('Fos', ['fname', 'fid'])
KEYWORD = namedtuple('Keyword', ['kname', 'kid'])
cnt, cnt_ign = 0, 0
adict, vdict = defaultdict(lambda: -1), defaultdict(lambda: -1)
fname2fid, fcnt = defaultdict(lambda: -1), 0
kname2kid, kcnt = defaultdict(lambda: -1), 0

Papers, Authors, Venues, Foses, Keys = [], [], [], [], []
tar = tqdm(all_data)
for data in tar:
    paper = json.loads(data)
    try:
        title = str(paper['title'])
        p_id = str(paper['_id'])
        year = int(paper['year'])
    except:
        cnt_ign += 1; continue
    refs = ''
    if 'references' in paper:
        for ref in paper['references']:
            refs += f',{ref}' if refs != '' else f'{ref}'     
    a_ids = ''
    if 'authors' in paper:
        for author in paper['authors']:
            try:
                a_id = str(author['_id'])
                aname = str(author['name'])
            except:
                continue
            
            if adict[a_id] == -1:
                adict[a_id] = 1
                Authors.append(AUTHOR(aname, a_id))
    v_id = ''
    if 'venue' in paper:
        venue = paper['venue']
        try: v_id = str(venue['_id'])
        except: v_id = ''
        try: vraw = str(venue['raw'])
        except: vraw = ''
        try: vname = str(venue['name_d'])
        except: vname = ''
        try: vtype = int(venue['type'])
        except: vtype = ''
        
        if vdict[v_id] == -1:
            vdict[v_id] = 1
            Venues.append(VENUE(vraw, v_id, vname, vtype))
    fids = ''
    if 'fos' in paper:
        for fname in paper['fos']:
            fname = fname.lower()
            if fname2fid[fname] == -1:
                fname2fid[fname] = fcnt
                Foses.append(FOS(fname, fcnt))
                fcnt += 1
            fid = fname2fid[fname]
            fids += f',{fid}' if fids != '' else f'{fid}'
    kids = ''
    if 'keywords' in paper:
        for kname in paper['keywords']:
            kname = kname.lower()
            if kname2kid[kname] == -1:
                kname2kid[kname] = kcnt
                Keys.append(KEYWORD(kname, kcnt))
                kcnt += 1
            kid = kname2kid[kname]
            kids += f',{kid}' if kids != '' else f'{kid}'
    
    try: ncites = int(paper['n_citation'])
    except: ncites = -1    
    try: abst = str(paper['abstract'])
    except: abst = ''
    
    Papers.append(PAPER(title, p_id, year, ncites, refs, a_ids, v_id, fids, kids, abst))
    cnt += 1
    if cnt%100000 == 0:
        tar.set_postfix(cnt=cnt, ign = cnt_ign)

100%|██████████| 5354309/5354309 [12:31<00:00, 7122.88it/s, cnt=5.3e+6, ign=405]  


## to _csv

### papers

In [22]:
df = pd.DataFrame(columns=['title', 'p_id', 'year', 'ncites', 'refs', 'a_ids', 'v_id', 'fids', 'kids', 'abst'])
df['title'] = [p.title for p in Papers]
df['p_id'] = [p.p_id for p in Papers]
df['year'] = [p.year for p in Papers]
df['ncites'] = [p.ncites for p in Papers]
df['refs'] = [p.refs for p in Papers]
df['a_ids'] = [p.a_ids for p in Papers]
df['v_id'] = [p.v_id for p in Papers]
df['fids'] = [p.fids for p in Papers]
df['kids'] = [p.kids for p in Papers]
df['abst'] = [p.abst for p in Papers]
df.sort_values(by=['year', 'title'], inplace=True)
df

Unnamed: 0,title,p_id,year,ncites,refs,a_ids,v_id,fids,kids,abst
3090349,&quot;Independence&quot; Day.,565b055b0cf28eea9cbf8bdd,0,0,,,55fa7cc5c35f4fb0d21d515e,,,
3979998,"1st Annual Conference on Robot Learning, CoRL ...",5a260c2e17c44a4ba8a23f3e,0,0,,,,,,
692521,3D Virtual Spaces Supporting Engineering Learn...,53e9a209b7602d9702b0d738,0,4,"53e9ae76b7602d9703894bab,53e9a27ab7602d9702b80...",,,59236145362131478843137642044090,3315718884438262718884441675,Virtual environments constitute the support pl...
2111557,60 Years from Birth of Academician F.G. Filip,53e9b87fb7602d9704451fb4,0,0,53e9a8dbb7602d97032286e5,,,131470520665,,
2680884,8th Mediterranean Conference on Information Sy...,5550467245ce0a409eb5e8c6,0,0,,,5550375f7cea80f9541835e2,,,
...,...,...,...,...,...,...,...,...,...,...
2905452,QoS-Guaranteed Path Selection Algorithm for Se...,558acb3f84ae84d265bfac09,2300,11,"53e99931b7602d970216ce29,53e9aacab7602d9703448...",,55f969d1c35f4fb0d21cf321,"1485,638,1201,13,47,95,142,698,1171,340,52,380...","4848,99259,126,124,3942,6267293,6267294,626729...",Service overlay networking is an emerging appr...
1745265,Stochastic Analysis and File Availability Enha...,53e9b2f5b7602d9703db387c,2300,35,"53e9a37ab7602d9702c8997f,53e9ad63b7602d970374b...",,,909917782877591001314909564007558945652,"259444,3849,14290,17471,2314589,101187,214617,...","In this paper, we present the mathematical ana..."
450744,Supporting Loss Guarantees in Buffer-Limited N...,53e99e28b7602d97026ebec5,2300,8,"557d34e66feeaa8086da8181,558ac89fe4b0b32fcb38d...",,,3054172129713805953336978116448,"53561,44433,4862,46752,1088148,1294314,1294315...",We consider the problem of packet scheduling i...
2333294,Driving Pattern Analysis for Electric Vehicle ...,53e9bbeab7602d970484167a,8509,97,,,,65491433399742779544049302260372459,"439802,297196,34577,35,5100506,16357,87595,510...",In order to facilitate the integration of elec...


In [23]:
df.drop(df[df.title==''].index, inplace=True)
df.drop(df[df.p_id==''].index, inplace=True)

start_year, end_year = 1600, 2022
drop_id = df[(df.year < start_year) | (df.year >= end_year)].index
print(f'Drop {len(drop_id)} rows') # 602
df.drop(drop_id, inplace=True) # 5353302
df

Drop 602 rows


Unnamed: 0,title,p_id,year,ncites,refs,a_ids,v_id,fids,kids,abst
3152029,Kiri Jacob Emile Pereire'le,56d85b67dabfae2eee51d618,1800,0,"53e9b923b7602d970450d298,5736970a6e3b12023e604...",,,163516361316308073956771095859332508152,6891771,
3146237,"[Les Désastres de la guerre, pl. 56 : Al cemen...",56d851a9dabfae2eee07e585,1815,0,53e9ab89b7602d97035361c8,,,290262908112384136541482203205,,
3183012,"Gazeta Warszawska. 1822, nr 24 + dod.",56d88f76dabfae2eeedb33e2,1822,0,"558c1c7184ae6766fdf11d9f,53e9bd23b7602d97049ae...",,,8745127372457298678542846341121120,3007814941726917719374098364981599,
3149616,Urkunden und Abhandlungen zur Geschichte des N...,56d85754dabfae2eee32f23c,1824,0,"53e99e3eb7602d9702703684,53e9acd3b7602d97036b1...",,,"11159,1181,5845,1068,13714,1067,11085,313,2210...",,
3134703,"Kurjer Warszawski. 1831, nr 114",56d83ebedabfae2eee76ccac,1831,0,"53e9bc05b7602d97048649c8,53e9aefcb7602d970392e...",,,176951551666011384129310143392516877887,3007836498177193717269409815991494,
...,...,...,...,...,...,...,...,...,...,...
5323369,“Did you know this camera tracks your mood?”: ...,603877169e795ea1fb778245,2021,0,,,,,,
5323370,“Did you know this camera tracks your mood?”: ...,603877169e795ea1fb778246,2021,0,,,,,,
5342984,“Relationship Between Learning by Teaching wit...,60703a26e4510cd7c8b7ffb7,2021,-1,,,,,899811531043247812088851485961936,Learning through teaching robots has been show...
5287944,ℓp Subspace Embedding in Input Sparsity Time.,6008039a91e011f078795c97,2021,-1,"53e9a7c1b7602d97031002a4,53e9b20cb7602d9703ca7...",,,1976182428120,,


In [24]:
df.to_csv('../data/dblp/Papers.csv', index=False)

## authors, venue, keywords

In [25]:
adf = pd.DataFrame(columns=['a_id', 'aname'])
adf['a_id'] = [p.a_id for p in Authors]
adf['aname'] = [p.aname for p in Authors]
adf.sort_values(by=['aname', 'a_id'], inplace=True)
adf

Unnamed: 0,a_id,aname
2096334,542d6daadabfae11fc46dd35,
1993794,53f44a53dabfaee2a1d44896,3rd Author
1885448,540fca81dabfae450f4a51e6,A. Baz
1963441,53f47e9ddabfaee43ed515b2,A.Tamilarasi
1379181,53f461f5dabfaec09f22e1e4,ACM Case Study
...,...,...
900047,53f43cbfdabfaee02ad0036d,�ric Stindel
1428540,53f4775adabfaee02add7cba,�rica Daiuto
814256,53f437ecdabfaedce553d34c,�rp�d I. Csurgay
2002711,53f45a29dabfaedf43619b70,�scar Lopez


In [26]:
vdf = pd.DataFrame(columns=['vraw', 'v_id','vname', 'vtype'])
vdf['v_id'] = [p.v_id for p in Venues]
vdf['vname'] = [p.vname for p in Venues]
vdf['vtype'] = [p.vtype for p in Venues]
vdf['vraw'] = [p.vraw for p in Venues]
vdf.sort_values(by=['vraw', 'vname'], inplace=True)
vdf

Unnamed: 0,vraw,v_id,vname,vtype
0,,,,0
2692,,539544b48314d630bc7d7d02,,
11026,,53e72c5320f7682861393e51,,
21777,,53907e0720f770854f610946,,
31623,,539544d98314d630bc7d7dac,,
...,...,...,...,...
16276,自然科学进展(英文版),5395464b8314d630bc7e4784,International Conference on Control Applications,0
10424,计算机科学技术学报(英文版),539548758314d630bc809dac,,
33192,计算机辅助绘图设计与制造(英文版),53954af18314d630bc886cf3,International Symposium on Neural Networks,0
2579,重庆大学学报(英文版),539548808314d630bc80c4de,,


In [27]:
fdf = pd.DataFrame(columns=['fid', 'fname'])
fdf['fid'] = [p.fid for p in Foses]
fdf['fname'] = [p.fname for p in Foses]
fdf.sort_values(by=['fname', 'fid'], inplace=True)
fdf

Unnamed: 0,fid,fname
9930,9930,#p-complete
7645,7645,#sat
128825,128825,% abnormal forms
23280,23280,% area reduction
48503,48503,% diameter reduction
...,...,...
165517,165517,σ-compact space
19518,19518,σ-finite measure
103458,103458,ω conotoxin gvia
7598,7598,ω-automaton


In [28]:
kdf = pd.DataFrame(columns=['kid', 'kname'])
kdf['kid'] = [p.kid for p in Keys]
kdf['kname'] = [p.kname for p in Keys]
kdf.sort_values(by=['kname', 'kid'], inplace=True)
kdf

Unnamed: 0,kid,kname
3546,3546,
7355252,7355252,\n\n\n\n distance
7433449,7433449,\n\n\n\n filtering
7469348,7469348,\n\n\n\n matrix
7397757,7397757,\n\n\n\n minimization
...,...,...
434788,434788,�÷ image segmentation
2504007,2504007,��-nearest neighbor (k-nn)
1531824,1531824,��-pseudomonotone mapping
1531825,1531825,��-strongly pseudomonotone mapping


In [29]:
adf.to_csv('../data/dblp/Authors.csv', index=False)
vdf.to_csv('../data/dblp/Venues.csv', index=False)
fdf.to_csv('../data/dblp/Fields.csv', index=False)
kdf.to_csv('../data/dblp/Keywords.csv', index=False)

## train word2vec

In [31]:
embed_dim = 128
dataset = 'dblp'
mpath = f'../word2vec/{dataset}_{embed_dim}.model'
if os.path.exists(mpath):
    model = Word2Vec.load(mpath)
else:
    sents = []
    for i in trange(len(df)):
        title = strip_non_alphanum(str(df['title'].iloc[i]).strip()).split()
        abs = strip_non_alphanum(str(df['abst'].iloc[i]).strip()).split()
        sents.append(title)
        sents.append(abs)
    # print(len(sents))

    model = Word2Vec(sents, vector_size= embed_dim, sg = 1)
    model.save(mpath)

In [30]:
start_year, end_year = 2000, 2022
drop_id = df[(df.year < start_year) | (df.year >= end_year)].index
print(f'Drop {len(drop_id)} rows') # 559749
papers = df.drop(drop_id) # 3547589
papers

Drop 655057 rows


Unnamed: 0,title,p_id,year,ncites,refs,a_ids,v_id,fids,kids,abst
1881147,Human-Robot Interaction through Spoken Langua...,53e9b4f4b7602d970401fd57,2000,23,"53e9bc42b7602d97048b5a74,53e9b043b7602d9703aa6...",,555037837cea80f95418b43e,1518132001592227720496674411710463357,,
65244,The OCoN Approach for Object-Oriented Distrib...,53e9985fb7602d9702097c47,2000,14,"53e9a0d9b7602d97029c1344,53e9b477b7602d9703f7a...",,555036d27cea80f95415a4df,26082646131920659512612713059652,195631974641111475528774645211214,There are many significant problems related t...
97571,Visual Modeling of Object-oriented Distribute...,53e998e1b7602d970211fed9,2000,13,"53e9a432b7602d9702d4d588,558ab1cfe4b037c087589...",,,728575113832201526013152672795783817446,305201,Software engineering for distributed systems i...
1176215,"""...but can you prove it?""",53e9aa3ab7602d97033a5a30,2000,0,,,555037677cea80f954185b1e,15653361820333136537513589021257028846,"641199,1059743,10909,2936917,2936918,10920,487...",Technical writers are called upon for many dif...
1608878,"""A framework for system specification using ch...",53e9b0bcb7602d9703b30ee8,2000,0,,,539ffba2831432abcb5eeb7f,,,
...,...,...,...,...,...,...,...,...,...,...
5323369,“Did you know this camera tracks your mood?”: ...,603877169e795ea1fb778245,2021,0,,,,,,
5323370,“Did you know this camera tracks your mood?”: ...,603877169e795ea1fb778246,2021,0,,,,,,
5342984,“Relationship Between Learning by Teaching wit...,60703a26e4510cd7c8b7ffb7,2021,-1,,,,,899811531043247812088851485961936,Learning through teaching robots has been show...
5287944,ℓp Subspace Embedding in Input Sparsity Time.,6008039a91e011f078795c97,2021,-1,"53e9a7c1b7602d97031002a4,53e9b20cb7602d9703ca7...",,,1976182428120,,


In [33]:
p_id2emb = {}
for i in trange(len(papers), desc='gen_pid2embed'):
    p_id = str(papers['p_id'].iloc[i])
    abs = strip_non_alphanum(str(papers['abst'].iloc[i]).strip()).split()
    title = strip_non_alphanum(str(papers['title'].iloc[i]).strip()).split()
    lst = title + abs

    vals = []
    for ss in lst:
        try:
            vals.append(model.wv[ss].astype(np.float32))
        except:
            pass
    # print(i, vals)
    embed = np.mean(np.vstack(vals), axis=0) if len(vals)>0 else np.zeros(embed_dim, dtype=np.float32)
    p_id2emb[p_id] = embed
myout(p_id2emb)

gen_pid2embed: 100%|██████████| 4698245/4698245 [2:17:27<00:00, 569.63it/s]  


p_id2emb : len=4698245, dict([53e9b4f4b7602d970401fd57: [-0.39271325 -0.23395708 -0.2243906   0.10632287  0.21385862  0.23266879
 -0.15186583 -0.33077523  0.21875001 -0.29088393 -0.10631448  0.04040582
  0.17744419  0.16959433 -0.32078603 -0.2508009   0.23639062 -0.22303557
 -0.2162575   0.23801255  0.10849212 -0.06621432 -0.39036202  0.16581523
 -0.15381451 -0.12568326  0.03212134 -0.05726942  0.0403302   0.10109102
  0.1808125   0.7205696   0.12661073 -0.02290617 -0.14171718 -0.5993334
 -0.10652961  0.05526267 -0.19240616  0.21450715 -0.05246099 -0.05122315
  0.16006424 -0.6263954   0.18364583 -0.41137794 -0.3814463  -0.17004661
  0.39549986  0.12862144 -0.4160815  -0.38757205 -0.23967588  0.5186388
 -0.12669633 -0.22494778 -0.7041512   0.00097592 -0.00897395  0.13706921
 -0.3475398  -0.5211261  -0.36376375  0.04994494  0.18880534  0.03849642
 -0.04626585  0.18080409 -0.22551279  0.40677658  0.39151528  0.22370294
  0.20009948 -0.13557556 -0.3167955  -0.04922279  0.13877061 -0.192816

## build graph

In [34]:
dataset = 'dblp'
def update_idx(idx, dic, cnt, feats, rel, feat_dim, no_emb):
    if idx not in dic:
        dic[idx] = cnt
        cnt += 1
        if rel == 0:
            try:
                feats.append(torch.from_numpy(p_id2emb[idx]).to(torch.float32))
            except:
                feats.append(torch.rand(feat_dim).to(torch.float32)-1)
                no_emb += 1
        elif rel == 1:
            feats.append(torch.rand(feat_dim).to(torch.float32)+1)
        elif rel == 2:
            feats.append(torch.rand(feat_dim).to(torch.float32)+2)
        elif rel == 3:
            feats.append(torch.rand(feat_dim).to(torch.float32))
    return dic, cnt, feats, no_emb

In [35]:
start_year, end_year = 2000, 2022
feat_dim = 128

pid2rid, cnt, no_emb = {}, 0, 0
lst, feats = [], []
for i in trange(len(papers)):
    ts = int(papers['year'].iloc[i])
    p_id = str(papers['p_id'].iloc[i])
    refs = papers['refs'].iloc[i]
    
    pid2rid, cnt, feats, no_emb = update_idx(p_id, pid2rid, cnt, feats, 0, feat_dim, no_emb)
    if len(refs)>0:
        rlst = refs.split(',')
        for ref in rlst:
            if ref != '':
                ref = str(ref)
                pid2rid, cnt, feats, no_emb = update_idx(ref, pid2rid, cnt, feats, 0, feat_dim, no_emb)
                lst.append((pid2rid[p_id], pid2rid[ref], 0, ts))
        
feat = torch.stack(feats)
src = torch.tensor([item[0] for item in lst])
tgt = torch.tensor([item[1] for item in lst])
rel = torch.tensor([item[2] for item in lst])
tsp = torch.tensor([item[3] for item in lst])
myout(feat, src, tgt, rel, tsp)

100%|██████████| 4698245/4698245 [18:07<00:00, 4318.42it/s] 


feat : shape=torch.Size([5029342, 128])
tensor([[-0.3927, -0.2340, -0.2244,  ..., -0.0855,  0.3149, -0.1202],
        [-0.3428, -0.4148, -0.7265,  ..., -0.8266, -0.0691, -0.1985],
        [-0.8120, -0.2227, -0.8948,  ..., -0.5025, -0.9757, -0.8046],
        ...,
        [-0.1403, -0.0404,  0.1263,  ..., -0.0705,  0.0128, -0.2075],
        [-0.1167, -0.3694,  0.0564,  ...,  0.1396,  0.0013, -0.0733],
        [ 0.0457, -0.0281,  0.1176,  ...,  0.0110, -0.0714, -0.2424]])
src : shape=torch.Size([45281685]), tensor([      0,       0,       0,  ..., 5029340, 5029340, 5029340])
tgt : shape=torch.Size([45281685]), tensor([      1,       2,       3,  ..., 1471200, 1212070, 3161810])
rel : shape=torch.Size([45281685]), tensor([0, 0, 0,  ..., 0, 0, 0])
tsp : shape=torch.Size([45281685]), tensor([2000, 2000, 2000,  ..., 2021, 2021, 2021])


In [36]:
graph = dgl.graph((src, tgt), num_nodes=len(feat))
graph.ndata['feat'] = feat
graph.ndata['raw_nid'] = torch.arange(len(feat))

graph.edata['rel'] = rel
graph.edata['ts'] = tsp
graph

Graph(num_nodes=5029342, num_edges=45281685,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'rel': Scheme(shape=(), dtype=torch.int64), 'ts': Scheme(shape=(), dtype=torch.int64)})

In [37]:
dgl.save_graphs(f'../data/{dataset}/raw_graph.bin', [graph])

In [38]:
json.dump(pid2rid, open(f'../data/{dataset}/pid2rid.json', 'w'))

## k-core

In [None]:
def core_number(G):
    if nx.number_of_selfloops(G) > 0:
        msg = (
            "Input graph has self loops which is not permitted; "
            "Consider using G.remove_edges_from(nx.selfloop_edges(G))."
        )
        raise ValueError(msg)
    degrees = dict(G.degree())
    # Sort nodes by degree.
    nodes = sorted(degrees, key=degrees.get)
    bin_boundaries = [0]
    curr_degree = 0
    for i, v in enumerate(nodes):
        if degrees[v] > curr_degree:
            bin_boundaries.extend([i] * (degrees[v] - curr_degree))
            curr_degree = degrees[v]
    node_pos = {v: pos for pos, v in enumerate(nodes)}
    # The initial guess for the core number of a node is its degree.
    core = degrees
    nbrs = {v: list(nx.all_neighbors(G, v)) for v in G}
    for v in tqdm(nodes):
        for u in nbrs[v]:
            if core[u] > core[v]:
                nbrs[u].remove(v)
                pos = node_pos[u]
                bin_start = bin_boundaries[core[u]]
                node_pos[u] = bin_start
                node_pos[nodes[bin_start]] = pos
                nodes[bin_start], nodes[pos] = nodes[pos], nodes[bin_start]
                bin_boundaries[core[u]] += 1
                core[u] -= 1
    return core

In [None]:
gpath = f'../data/{dataset}/nx_graph.pkl'
if os.path.exists(gpath):
    print(f'Loading G from {gpath}')
    G = pkl.load(open(gpath, 'rb'))
else:
    graph = dgl.load_graphs(f'../../../62_seal_hints/preprocess/data/{dataset}/raw_graph.bin')[0][0]
    print(dataset, graph.edata['ts'].unique())
    
    train_start, valid_end = 1998, 2022
    ts_eids = graph.filter_edges(lambda x: (x.data['ts']>=train_start) & (x.data['ts']<valid_end))
    ts_graph = dgl.edge_subgraph(graph, ts_eids)
    print(ts_graph)

    g = Data(feat=ts_graph.ndata['feat'], edge_index=torch.stack(ts_graph.edges()), ts=ts_graph.edata['ts'], \
        raw_nid=ts_graph.ndata['raw_nid'])
    G = to_networkx(g, node_attrs=['feat', 'raw_nid'], edge_attrs=['ts'], remove_self_loops=True)
    pkl.dump(G, open(gpath, 'wb'))

In [None]:
cpath = f'../data/{dataset}/core_dict.json'
if os.path.exists(cpath):
    print(f'Loading core_dict from {cpath}')
    core_dict = json.load(open(cpath, 'r'))
    core_dict = {int(kk): vv for kk,vv in core_dict.items()}
else:
    core_dict = core_number(G)
    json.dump(core_dict, open(cpath, 'w'))
g_core = nx.k_core(G, core_number=core_dict, k=k)
core_dg = dgl.from_networkx(g_core, node_attrs=['feat', 'raw_nid'], edge_attrs=['ts'])
print(core_dg, core_dg.edata['ts'].unique())

In [None]:
dgl.save_graphs(f'../data/{dataset}/{k}-core_graph.bin', [core_dg])

## preprocess graph

In [4]:
dataset = 'dblp'
k = 32
core_dg = dgl.load_graphs(f'../data/{dataset}/{k}-core_graph.bin')[0][0]
core_dg, core_dg.edges()

(Graph(num_nodes=22258, num_edges=734134,
       ndata_schemes={'raw_nid': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(128,), dtype=torch.float32)}
       edata_schemes={'ts': Scheme(shape=(), dtype=torch.int64)}),
 (tensor([14408, 14408, 14408,  ..., 14407, 14407, 14407]),
  tensor([ 9326, 11374, 11709,  ...,  6860,  6861,  7039])))

In [8]:
data = torch.vstack([core_dg.edges()[0], core_dg.edges()[1], core_dg.edata['ts']]).t()
myout(data)

data : shape=torch.Size([734134, 3])
tensor([[14408,  9326,  2018],
        [14408, 11374,  2018],
        [14408, 11709,  2018],
        ...,
        [14407,  6860,  2018],
        [14407,  6861,  2018],
        [14407,  7039,  2018]])


In [12]:
class Namespace(object):
    '''
    helps referencing object in a dictionary as dict.key instead of dict['key']
    '''
    def __init__(self, adict):
        self.__dict__.update(adict)
cols = Namespace({'source': 0, 'target': 1, 'time': 2})
_, data[:,[cols.source,cols.target]] = data[:,[cols.source,cols.target]].unique(return_inverse = True)
myout(data)

data : shape=torch.Size([734134, 3])
tensor([[14408,  9326,  2018],
        [14408, 11374,  2018],
        [14408, 11709,  2018],
        ...,
        [14407,  6860,  2018],
        [14407,  6861,  2018],
        [14407,  7039,  2018]])


In [17]:
core_dg.number_of_nodes()

22258

In [18]:
num_nodes = int(data[:,[cols.source, cols.target]].max()) + 1
num_nodes

22258

In [None]:
def aggregate_by_time(time_vector,time_win_aggr):
        time_vector = time_vector - time_vector.min()
        time_vector = time_vector // time_win_aggr
        return time_vector
aggr_time = 1
data[:,cols.time] = aggregate_by_time(data[:,cols.time], aggr_time)
myout(data)

In [15]:
max_time = data[:,cols.time].max()
min_time = data[:,cols.time].min()
myout(max_time, min_time)

max_time : shape=torch.Size([]), 21
min_time : shape=torch.Size([]), 0


In [16]:
ids = data[:,cols.source] * num_nodes + data[:,cols.target]
num_diff_edges = ids.unique().size(0)
num_non_existing = float(num_nodes**2 - ids.unique().size(0))
myout(num_diff_edges, num_non_existing)

num_diff_edges = 734134
num_non_existing = 494639915.0


In [None]:
edges = {'idx': idx, 'vals': torch.ones(idx.size(0))}

In [21]:
core_dg.ndata['feat'].shape

torch.Size([22258, 128])

In [22]:
import scipy.sparse as sp
nfeat_sp = sp.coo_matrix(core_dg.ndata['feat'])
myout(nfeat_sp)

nfeat_sp =   (0, 0)	-0.79821414
  (0, 1)	-0.72578645
  (0, 2)	-0.0056170225
  (0, 3)	-0.7852371
  (0, 4)	-0.12048197
  (0, 5)	-0.84436375
  (0, 6)	-0.48123223
  (0, 7)	-0.27705693
  (0, 8)	-0.34126866
  (0, 9)	-0.2611583
  (0, 10)	-0.39385068
  (0, 11)	-0.28523022
  (0, 12)	-0.10930079
  (0, 13)	-0.89142096
  (0, 14)	-0.4609
  (0, 15)	-0.1569494
  (0, 16)	-0.994631
  (0, 17)	-0.48759264
  (0, 18)	-0.88931876
  (0, 19)	-0.35733527
  (0, 20)	-0.58147186
  (0, 21)	-0.3639074
  (0, 22)	-0.49886638
  (0, 23)	-0.054841638
  (0, 24)	-0.8956607
  :	:
  (22257, 103)	-0.1979005
  (22257, 104)	0.113062955
  (22257, 105)	-0.09785657
  (22257, 106)	0.1063065
  (22257, 107)	0.024716679
  (22257, 108)	-0.053807978
  (22257, 109)	0.019868016
  (22257, 110)	-0.16932559
  (22257, 111)	0.2303339
  (22257, 112)	-0.14247952
  (22257, 113)	0.1284604
  (22257, 114)	-0.0451481
  (22257, 115)	0.0075372127
  (22257, 116)	-0.042973183
  (22257, 117)	0.3745397
  (22257, 118)	0.155389
  (22257, 119)	-0.0017298675


In [26]:
coords = np.vstack((nfeat_sp.row, nfeat_sp.col)).transpose()
values = nfeat_sp.data
myout(coords, values)

coords : shape=(2849024, 2)
[[    0     0]
 [    0     1]
 [    0     2]
 ...
 [22257   125]
 [22257   126]
 [22257   127]]
values : shape=(2849024,), [-0.79821414 -0.72578645 -0.00561702 ...  0.03209168 -0.0306733
 -0.273934  ]


In [24]:
myout(nfeat_sp.row, nfeat_sp.col)

_23 : shape=(2849024,), [    0     0     0 ... 22257 22257 22257]
 : shape=(2849024,), [  0   1   2 ... 125 126 127]


##  sort graph

In [34]:
def sort_edges(graph, by=['_SRC', '_TGT']):
    """Sort edges of a dgl.graph by edge_attrs, where '_SRC' and '_TGT' are the index of source nodes and target nodes, respectively.

    Args:
        graph (dgl.graph): _description_
        by (list, optional): edge attributes. Defaults to ['_SRC', '_TGT'].

    Returns:
        _type_: dgl.graph
    """
    df = pd.DataFrame({'_SRC': graph.edges()[0].numpy(), '_TGT': graph.edges()[1].numpy()})
    for name, efeat in graph.edata.items():
        df[name] = efeat.numpy()
    df = df.sort_values(by=by)
    
    new_graph = dgl.graph((torch.tensor(df['_SRC'].tolist()), torch.tensor(df['_TGT'].tolist())), \
        num_nodes = graph.number_of_nodes())
    for name, efeat in graph.edata.items():
        new_graph.edata[name] = torch.tensor(df[name].tolist())
    for name, nfeat in graph.ndata.items():
        new_graph.ndata[name] = nfeat
    return new_graph

In [54]:
dgl_graph = sort_edges(core_dg, by=['ts', '_SRC'])
dgl_graph

Graph(num_nodes=100392, num_edges=1813863,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'ts': Scheme(shape=(), dtype=torch.int64)})

In [56]:
src_lst = dgl_graph.edges()[0].tolist()
tgt_lst = dgl_graph.edges()[1].tolist()
cur2new, cnt = {}, 0
for src,tgt in tqdm(zip(src_lst, tgt_lst)):
    if src not in cur2new:
        cur2new[src] = cnt
        cnt += 1
    if tgt not in cur2new:
        cur2new[tgt] = cnt
        cnt += 1
myout(cur2new)

1813863it [00:02, 711025.53it/s]

cur2new : len=100392, dict([0: 0, 90539: 1, 90540: 2, 90541: 3, 90542: 4, 90543: 5, ...])





In [58]:
def relabel_nodes(graph, mapping):
    src_lst = graph.edges()[0].tolist()
    tgt_lst = graph.edges()[1].tolist()
    new_src = torch.tensor([mapping[item] for item in src_lst])
    new_tgt = torch.tensor([mapping[item] for item in tgt_lst])
    
    new_graph = dgl.graph((new_src, new_tgt))
    for name, efeat in graph.edata.items():
        new_graph.edata[name] = efeat
    for name, nfeat in graph.ndata.items():
        new_graph.ndata[name] = nfeat
    return new_graph

In [59]:
dgl_graph = relabel_nodes(dgl_graph, cur2new)
dgl_graph, dgl_graph.edges()

(Graph(num_nodes=100392, num_edges=1813863,
       ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64)}
       edata_schemes={'ts': Scheme(shape=(), dtype=torch.int64)}),
 (tensor([     0,      0,      0,  ..., 100391, 100391, 100391]),
  tensor([    1,     2,     3,  ..., 74823, 11512, 85951])))

In [70]:
dgl.save_graphs('../data/dblp/graph.bin', [dgl_graph])

## build cites

In [60]:
# 1. Calculate the citation for each year
tgt_lst = dgl_graph.edges()[1].tolist()
tsp_lst = dgl_graph.edata['ts'].tolist()
start_year, end_year = dgl_graph.edata['ts'].min().item(), dgl_graph.edata['ts'].max().item()+1

cites = {}
for year in range(start_year, end_year):
    cites[year] = defaultdict(int)

for year, tgt in zip(tsp_lst, tgt_lst):
    cites[year][tgt] += 1
myout(tgt_lst, tsp_lst, start_year, end_year)

tgt_lst : len=1813863, list([1, 2, 3, ..., 74823, 11512, 85951])
tsp_lst : len=1813863, list([2000, 2000, 2000, ..., 2011, 2011, 2011])
start_year = 2000
end_year = 2012


In [61]:
# 2. Calculate the index range corresponding to each year
ts_vals, ts_cuts = np.unique(tsp_lst, return_index=True)
ts_cuts = list(ts_cuts) + [len(tsp_lst)]
ts_infos = np.stack([ts_vals, ts_cuts[0:len(ts_vals)], ts_cuts[1:len(ts_vals)+1]]).transpose()
myout(ts_cuts, ts_vals, ts_infos)

ts_cuts : len=13, list([0, 36391, 85142, ..., 1327762, 1571190, 1813863])
ts_vals : shape=(12,), [2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011]
ts_infos : shape=(12, 3)
[[   2000       0   36391]
 [   2001   36391   85142]
 [   2002   85142  149900]
 [   2003  149900  242037]
 [   2004  242037  360033]
 [   2005  360033  509238]
 [   2006  509238  683138]
 [   2007  683138  883000]
 [   2008  883000 1099193]
 [   2009 1099193 1327762]
 [   2010 1327762 1571190]
 [   2011 1571190 1813863]]


In [63]:
# 3. Generate citations of papers for each year in all future years
nid2id = {nid:idd for nid, idd in enumerate(dgl_graph.ndata['raw_nid'].tolist())}
labels = {}
for year in range(start_year, end_year):
    left, right = ts_infos[np.where(ts_infos[:, 0]==year)[0][0], 1:]
    nids = dgl_graph.edges()[0][left:right].unique().tolist()
    ids = [nid2id[nid] for nid in nids]
    
    pdf = pd.DataFrame({'id': ids, 'nid': nids})
    tbar = trange(year+1, end_year, desc=str(year))
    for yy in tbar:
        cdf = pd.DataFrame({'nid': list(cites[yy].keys()), str(yy): list(cites[yy].values())})
        cdf[str(yy)] = cdf[str(yy)].astype('float32')
        
        pdf = pd.merge(pdf, cdf, how='left', on='nid')
        tbar.set_postfix(year=year, pdf=len(pdf))
    pdf.fillna(0, inplace=True)
    labels[year] = pdf

2000: 100%|██████████| 11/11 [00:00<00:00, 20.59it/s, pdf=3101, year=2000]
2001: 100%|██████████| 10/10 [00:00<00:00, 21.19it/s, pdf=3780, year=2001]
2002: 100%|██████████| 9/9 [00:00<00:00, 17.63it/s, pdf=4571, year=2002]
2003: 100%|██████████| 8/8 [00:00<00:00, 13.70it/s, pdf=5910, year=2003]
2004: 100%|██████████| 7/7 [00:00<00:00, 12.74it/s, pdf=7077, year=2004]
2005: 100%|██████████| 6/6 [00:00<00:00, 12.75it/s, pdf=8205, year=2005]
2006: 100%|██████████| 5/5 [00:00<00:00, 12.23it/s, pdf=8939, year=2006]
2007: 100%|██████████| 4/4 [00:00<00:00, 11.83it/s, pdf=9652, year=2007]
2008: 100%|██████████| 3/3 [00:00<00:00, 11.15it/s, pdf=9791, year=2008]
2009: 100%|██████████| 2/2 [00:00<00:00, 11.07it/s, pdf=9881, year=2009]
2010: 100%|██████████| 1/1 [00:00<00:00, 11.24it/s, pdf=9960, year=2010]
2011: 0it [00:00, ?it/s]


In [64]:
labels[2005]

Unnamed: 0,id,nid,2006,2007,2008,2009,2010,2011
0,58477,406,1.0,2.0,5.0,7.0,3.0,5.0
1,123107,1234,2.0,1.0,1.0,3.0,1.0,0.0
2,123219,1237,1.0,1.0,2.0,3.0,1.0,1.0
3,136892,1413,4.0,7.0,8.0,5.0,2.0,5.0
4,157600,1798,0.0,0.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
8200,1179282,44485,0.0,1.0,1.0,0.0,1.0,0.0
8201,1127884,44486,3.0,3.0,7.0,13.0,8.0,4.0
8202,934053,44487,0.0,0.0,0.0,1.0,0.0,0.0
8203,1179333,44488,3.0,7.0,4.0,10.0,7.0,4.0


In [66]:
dataset = 'dblp'
pkl.dump(labels, open(f'../data/{dataset}/labels.pkl', 'wb'))

In [67]:
# 4. Calculate the cumulative citations and smooth them by taking log
def cumulative_log(df):
    colsn = list(df.columns)
    for i in range(3, len(colsn)):
        df[colsn[i]] += df[colsn[i-1]]
    df.iloc[:, 2:] = np.log(df.iloc[:, 2:] + 1)
    return df

labels_cum_log = {}
for year in range(start_year, end_year-2): # 2000, 2020
    labels_cum_log[year] = cumulative_log(labels[year])
labels_cum_log[end_year-2] = labels[end_year-2]

In [68]:
labels_cum_log[2005]

Unnamed: 0,id,nid,2006,2007,2008,2009,2010,2011
0,58477,406,0.693147,1.386294,2.197225,2.772589,2.944439,3.178054
1,123107,1234,1.098612,1.386294,1.609438,2.079442,2.197225,2.197225
2,123219,1237,0.693147,1.098612,1.609438,2.079442,2.197225,2.302585
3,136892,1413,1.609438,2.484907,2.995732,3.218876,3.295837,3.465736
4,157600,1798,0.000000,0.000000,1.098612,1.098612,1.098612,1.098612
...,...,...,...,...,...,...,...,...
8200,1179282,44485,0.000000,0.693147,1.098612,1.098612,1.386294,1.386294
8201,1127884,44486,1.386294,1.945910,2.639057,3.295837,3.555348,3.663562
8202,934053,44487,0.000000,0.000000,0.000000,0.693147,0.693147,0.693147
8203,1179333,44488,1.386294,2.397895,2.708050,3.218876,3.465736,3.583519


In [69]:
pkl.dump(labels_cum_log, open(f'../data/{dataset}/labels_cum_log.pkl', 'wb'))