In [1]:
from tqdm import tqdm, trange
import json
from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
import torch
import os
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.models.word2vec import Word2Vec
import random
from scipy.sparse import csr_matrix
import dgl

import pickle as pkl
import sys
sys.path.append('/mnt/nfs/zhangtl/utils')
from util import myout

Using backend: pytorch


## load

In [2]:
# Read the raw paper table for the selected dataset and normalise missing values.
dataset = 'acm'
df = (
    pd.read_csv(f'../../../01_process/data/{dataset}/Papers.csv')
    # a paper without an id or a publication year cannot be placed in the graph
    .dropna(subset=['p_id', 'year'])
    # text-like columns get '' instead of NaN so later string handling works
    .fillna({'refs': '', 'a_ids': '', 'v_id': '', 'fids': '', 'abst': ''})
)
df

Unnamed: 0,title,p_id,year,refs,anames,vname,abst
0,Projective transformations in two complex vari...,716944,1936,,J. W. Hahn,Projective transformations in two complex vari...,
1,A symbolic analysis of relay and switching cir...,837643,1938,,Claude Elwood Shannon,A symbolic analysis of relay and switching cir...,
2,A linear algebraic theory of complexes,683819,1941,,Lloyd Wayne Johnson,A linear algebraic theory of complexes,
3,First draft of a report on the EDVAC,840606,1945,,John von Newmann,First draft of a report on the EDVAC,
4,The embedding of products and joins of complex...,683918,1947,,"Louis Bryant Tuckerman,III",The embedding of products and joins of complex...,
...,...,...,...,...,...,...,...
2376588,m -ary partitions with no gaps,2381976,2016,2381731,"George E. Andrews,Aviezri S. Fraenkel,James A....",Discrete Mathematics,"In a recent work, the authors provided the fir..."
2376589,n -consistent density estimation in semiparame...,2383644,2016,2381731,"Shuo Li,Yundong Tu",Computational Statistics & Data Analysis,The authors propose an estimator for the densi...
2376590,α-shapes for local feature detection,2382812,2016,2381731,"Christos Varytimidis,Konstantinos Rapantzikos,...",Pattern Recognition,Local image features are routinely used in sta...
2376591,ε constrained differential evolution with pre-...,2382796,2016,2381731,"Wenchao Yi,Xinyu Li,Liang Gao,Yinzhi Zhou,Jida...",Expert Systems with Applications: An Internati...,An improved algorithm is proposed for constrai...


In [3]:
# Restrict the corpus to papers published in the half-open range [start_year, end_year).
start_year, end_year = 2000, 2017
outside_range = (df.year < start_year) | (df.year >= end_year)
drop_id = df.loc[outside_range].index
print(f'Drop {len(drop_id)} rows')
papers = df.drop(drop_id)
papers

Drop 691496 rows


Unnamed: 0,title,p_id,year,refs,anames,vname,abst
691496,"""'Andreas, Rauber'? Conference Pages Are over ...",588797,2000,,"Andreas Rauber,Harald Bina",DEXA '00 Proceedings of the 11th International...,With the massive advance of electronic documen...
691497,"""...but can you prove it?""",344190,2000,,Raymond E. Floyd,IPCC/SIGDOC '00 Proceedings of IEEE profession...,Technical writers are called upon for many dif...
691498,"""Bloat"": the objective and subject dimensions",446619,2000,2581973184442135000,Joanna McGrenere,CHI '00 Extended Abstracts on Human Factors in...,"""Bloat"", a term that has existed in the techni..."
691499,"""Boosting'' a Positive-Data-Only Learner",466731,2000,,Andrew R. Mitchell,ICML '00 Proceedings of the Seventeenth Intern...,
691500,"""Cool low power"" 1GHz multi-port register file...",308321,2000,2528812135000,"R. V. Joshi,W. Hwang,S. C. Wilson,C. T. Chuang",ISLPED '00 Proceedings of the 2000 internation...,This paper describes power analysis at sub-zer...
...,...,...,...,...,...,...,...
2376588,m -ary partitions with no gaps,2381976,2016,2381731,"George E. Andrews,Aviezri S. Fraenkel,James A....",Discrete Mathematics,"In a recent work, the authors provided the fir..."
2376589,n -consistent density estimation in semiparame...,2383644,2016,2381731,"Shuo Li,Yundong Tu",Computational Statistics & Data Analysis,The authors propose an estimator for the densi...
2376590,α-shapes for local feature detection,2382812,2016,2381731,"Christos Varytimidis,Konstantinos Rapantzikos,...",Pattern Recognition,Local image features are routinely used in sta...
2376591,ε constrained differential evolution with pre-...,2382796,2016,2381731,"Wenchao Yi,Xinyu Li,Liang Gao,Yinzhi Zhou,Jida...",Expert Systems with Applications: An Internati...,An improved algorithm is proposed for constrai...


## build graph

In [11]:
dataset = 'acm'
# Pre-computed paper embeddings; the summary printed below shows a dict keyed by
# paper id whose values are numeric vectors.
# NOTE(review): pickle can execute arbitrary code while loading -- only point this
# at files produced by our own preprocessing step.
# The context manager closes the file handle as soon as loading finishes.
with open(f'../../../01_process/data_papers/{dataset}/p_id2emb.pkl', 'rb') as emb_file:
    p_id2emb = pkl.load(emb_file)
myout(p_id2emb)
def update_idx(idx, dic, cnt, feats, rel, feat_dim, no_emb, emb=None):
    """Register ``idx`` as a new node (if unseen) and append its feature vector.

    Args:
        idx: raw identifier of the node.
        dic: mapping raw id -> consecutive node index; updated in place.
        cnt: next free node index.
        feats: list of per-node feature tensors; updated in place.
        rel: node kind selector. 0 looks the feature up in the embedding table,
            1 / 2 / 3 draw uniform random features shifted by +1 / +2 / +0.
        feat_dim: length of a randomly generated feature vector.
        no_emb: running count of ``rel == 0`` nodes without a stored embedding.
        emb: optional mapping raw id -> numpy vector used for ``rel == 0``.
            Defaults to the notebook-level ``p_id2emb``.

    Returns:
        The tuple ``(dic, cnt, feats, no_emb)`` with the updated values.

    Note:
        For a ``rel`` outside 0..3 the id is still indexed but no feature is
        appended, exactly as before -- callers must pass a supported value.
    """
    # Already indexed: nothing to do.
    if idx in dic:
        return dic, cnt, feats, no_emb

    dic[idx] = cnt
    cnt += 1
    if rel == 0:
        # Resolve the table lazily so other node kinds never touch the global.
        table = p_id2emb if emb is None else emb
        try:
            vec = torch.from_numpy(table[idx]).to(torch.float32)
        except KeyError:
            # Only a *missing* embedding falls back to random values in [-1, 0);
            # any other failure (bad table, wrong value type) now surfaces
            # instead of being silently replaced by noise.
            vec = torch.rand(feat_dim).to(torch.float32) - 1
            no_emb += 1
        feats.append(vec)
    elif rel == 1:
        feats.append(torch.rand(feat_dim).to(torch.float32) + 1)
    elif rel == 2:
        feats.append(torch.rand(feat_dim).to(torch.float32) + 2)
    elif rel == 3:
        feats.append(torch.rand(feat_dim).to(torch.float32))
    return dic, cnt, feats, no_emb

p_id2emb : len=2376584, dict([716944: [-0.1282777   0.00240916  0.26668826  0.08978894  0.02635199 -0.09615058
 -0.12949519 -0.10491749  0.08488696  0.19583608  0.1557116   0.12255639
  0.02203467 -0.03785538 -0.1569959   0.04092474 -0.02563533  0.24861842
 -0.21112294 -0.2027284   0.26484483 -0.01800103 -0.0080198  -0.1617472
 -0.07168785  0.17009293 -0.13218856  0.02995733 -0.16780156  0.11179141
 -0.1347297   0.1215682  -0.09124589  0.00165576 -0.10124022  0.15212093
  0.18311232  0.17506236 -0.01218926  0.10354797  0.06927218  0.18704677
 -0.12665646 -0.01577885  0.24689744  0.01915004 -0.13572101 -0.1046448
 -0.28096476  0.10962244  0.2807999  -0.16602735  0.15189801  0.26751837
  0.20711492 -0.01239756  0.26561913 -0.14206833 -0.10044965  0.09344733
  0.00841188 -0.31207678  0.3116748   0.0933522  -0.12571825  0.00942059
 -0.00977049 -0.39590684 -0.10471779 -0.39753333  0.04896852  0.114576
 -0.11572983 -0.26378155 -0.2568041  -0.18115757  0.1754563  -0.15294732
 -0.02564991 -0.0

In [12]:
# Assign consecutive node indices to papers and collect the citation edge list.
start_year, end_year = 2000, 2017
feat_dim = 128

id2nid, cnt, no_emb = {}, 0, 0
lst, feats = [], []
rows = zip(papers['year'], papers['p_id'], papers['refs'])
for ts, p_id, refs in tqdm(rows, total=len(papers)):
    ts, p_id = int(ts), int(p_id)

    # The citing paper always becomes a node, even when it cites nothing.
    id2nid, cnt, feats, no_emb = update_idx(p_id, id2nid, cnt, feats, 0, feat_dim, no_emb)
    if len(refs) == 0:
        continue
    for ref in refs.split(','):
        if ref == '':
            continue
        ref = int(ref)
        id2nid, cnt, feats, no_emb = update_idx(ref, id2nid, cnt, feats, 0, feat_dim, no_emb)
        # edge record: (citing node, cited node, relation type 0, citing year)
        lst.append((id2nid[p_id], id2nid[ref], 0, ts))

feat = torch.stack(feats)
# One tensor per field of the edge records, in the order they were appended.
src, tgt, rel, tsp = (torch.tensor([edge[k] for edge in lst]) for k in range(4))
myout(feat, src, tgt, rel, tsp)

  0%|          | 0/1685097 [00:00<?, ?it/s]  0%|          | 775/1685097 [00:00<03:38, 7715.64it/s]  0%|          | 1547/1685097 [00:00<03:38, 7717.67it/s]  0%|          | 3244/1685097 [00:00<02:20, 11936.31it/s]  0%|          | 5455/1685097 [00:00<01:45, 15946.79it/s]  0%|          | 7330/1685097 [00:00<01:38, 16954.69it/s]  1%|          | 9349/1685097 [00:00<01:32, 18052.07it/s]  1%|          | 11818/1685097 [00:00<01:22, 20219.48it/s]  1%|          | 14766/1685097 [00:00<01:12, 23165.86it/s]  1%|          | 17488/1685097 [00:00<01:08, 24431.59it/s]  1%|          | 19979/1685097 [00:01<01:07, 24577.81it/s]  1%|▏         | 22554/1685097 [00:01<01:06, 24934.23it/s]  1%|▏         | 25215/1685097 [00:01<01:05, 25440.79it/s]  2%|▏         | 27760/1685097 [00:01<01:09, 23768.36it/s]  2%|▏         | 30739/1685097 [00:01<01:04, 25499.35it/s]  2%|▏         | 33313/1685097 [00:01<01:04, 25463.87it/s]  2%|▏         | 35923/1685097 [00:01<01:04, 25649.11it/s]  2%|▏         | 3892

feat : shape=torch.Size([1911467, 128])
tensor([[-0.0930, -0.0533,  0.0899,  ...,  0.0846, -0.1764,  0.0554],
        [-0.0878, -0.1269,  0.0976,  ...,  0.1330, -0.1693,  0.0564],
        [-0.0856, -0.1327,  0.1453,  ...,  0.0682, -0.1142, -0.0091],
        ...,
        [-0.0024, -0.0801,  0.1826,  ...,  0.0349, -0.1280,  0.0271],
        [-0.1185, -0.0426,  0.1020,  ..., -0.0362, -0.1182,  0.0422],
        [-0.1131, -0.2047,  0.1509,  ...,  0.1121, -0.1424,  0.0003]])
src : shape=torch.Size([8606074]), tensor([      2,       2,       2,  ..., 1911464, 1911465, 1911466])
tgt : shape=torch.Size([8606074]), tensor([  3,   4,   5,  ..., 445, 445, 445])
rel : shape=torch.Size([8606074]), tensor([0, 0, 0,  ..., 0, 0, 0])
tsp : shape=torch.Size([8606074]), tensor([2000, 2000, 2000,  ..., 2016, 2016, 2016])


In [13]:
# Assemble the DGL graph from the edge tensors and attach node / edge data.
num_nodes = feat.shape[0]
graph = dgl.graph((src, tgt), num_nodes=num_nodes)

# Inverse of id2nid: node index -> raw paper id.
nid2id = dict(zip(id2nid.values(), id2nid.keys()))

# node data
graph.ndata['feat'] = feat
graph.ndata['raw_nid'] = torch.arange(num_nodes)

# edge data: relation type and the citing paper's year
graph.edata['rel'] = rel
graph.edata['ts'] = tsp
graph

Graph(num_nodes=1911467, num_edges=8606074,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'rel': Scheme(shape=(), dtype=torch.int64), 'ts': Scheme(shape=(), dtype=torch.int64)})

In [14]:
# Create the output directory first so a fresh checkout does not fail on the
# write after the (slow) graph construction has already finished.
os.makedirs(f'../data/{dataset}', exist_ok=True)
dgl.save_graphs(f'../data/{dataset}/graph.bin', [graph])

In [15]:
# Persist the raw-id -> node-index mapping. JSON object keys are always strings,
# so the integer paper ids come back as str keys when this file is reloaded.
# The context manager guarantees the file is flushed and closed.
with open(f'../data/{dataset}/id2nid.json', 'w') as fout:
    json.dump(id2nid, fout)

## Generate per-year citation counts

In [16]:
# cites[year][paper_id] = number of citations paper_id received from papers
# published in `year`.
print(start_year, end_year)
cites = {year: defaultdict(int) for year in range(start_year, end_year)}

# Work on a NaN-free view of the reference column without mutating `papers`.
refs_col = papers['refs'].fillna('')
for year, refs in tqdm(zip(papers['year'], refs_col), total=len(papers)):
    year = int(year)
    for ref in refs.split(','):
        # Skip empty items *before* converting: int('') would raise, and the
        # graph-building cell applies the same rule to the same strings.
        if ref == '':
            continue
        cites[year][int(ref)] += 1

2000 2017


  0%|          | 0/1685097 [00:00<?, ?it/s]  0%|          | 4761/1685097 [00:00<00:35, 47597.34it/s]  1%|          | 10079/1685097 [00:00<00:32, 50876.93it/s]  1%|          | 16744/1685097 [00:00<00:28, 58073.48it/s]  1%|▏         | 22877/1685097 [00:00<00:28, 59357.04it/s]  2%|▏         | 28813/1685097 [00:00<00:30, 54352.52it/s]  2%|▏         | 34314/1685097 [00:00<00:30, 53818.34it/s]  2%|▏         | 40141/1685097 [00:00<00:29, 55200.51it/s]  3%|▎         | 45696/1685097 [00:00<00:29, 54747.69it/s]  3%|▎         | 51631/1685097 [00:00<00:29, 56140.76it/s]  3%|▎         | 57616/1685097 [00:01<00:28, 57259.89it/s]  4%|▍         | 63359/1685097 [00:01<00:28, 56594.94it/s]  4%|▍         | 69252/1685097 [00:01<00:28, 57290.51it/s]  4%|▍         | 75008/1685097 [00:01<00:28, 57370.26it/s]  5%|▍         | 81044/1685097 [00:01<00:27, 58263.68it/s]  5%|▌         | 86877/1685097 [00:01<00:28, 56154.18it/s]  5%|▌         | 92512/1685097 [00:01<00:28, 55561.01it/s]  6%|▌       

In [17]:
# Locate, for every year, the contiguous [start, end) range of edge positions.
tsp = graph.edata['ts']
ts_arr = tsp.numpy()

# np.unique(..., return_index=True) reports the FIRST position of each value;
# those positions are only valid segment boundaries if edges are ordered by year.
assert (np.diff(ts_arr) >= 0).all(), 'edge timestamps must be sorted in non-decreasing order'

ts_vals, ts_cuts = np.unique(ts_arr, return_index=True)
ts_cuts = list(ts_cuts) + [len(ts_arr)]

num_ts = len(ts_vals)
# one row per year: (year, first edge position, one-past-last edge position)
ts_infos = np.stack([ts_vals, ts_cuts[0:num_ts], ts_cuts[1:num_ts+1]]).transpose()
myout(ts_cuts, ts_vals, ts_infos)

ts_cuts : len=18, list([0, 185995, 416281, ..., 8508576, 8601989, 8606074])
ts_vals : shape=(17,), [2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
 2014 2015 2016]
ts_infos : shape=(17, 3)
[[   2000       0  185995]
 [   2001  185995  416281]
 [   2002  416281  738496]
 [   2003  738496 1039514]
 [   2004 1039514 1352971]
 [   2005 1352971 1904924]
 [   2006 1904924 2493378]
 [   2007 2493378 3134492]
 [   2008 3134492 3795993]
 [   2009 3795993 4708222]
 [   2010 4708222 5561958]
 [   2011 5561958 6367322]
 [   2012 6367322 7201629]
 [   2013 7201629 7913435]
 [   2014 7913435 8508576]
 [   2015 8508576 8601989]
 [   2016 8601989 8606074]]


In [18]:
# labels[year]: one row per source node of the edges stamped with `year`, plus one
# column per later year holding the citations that node received in that year.
labels = {}
nid2id = {v:k for k,v in id2nid.items()}

# The citation-count frame of a citing year does not depend on the publication
# year being processed, so build each one once instead of once per (year, yy) pair.
cite_frames = {}
for yy in range(start_year + 1, end_year):
    cdf = pd.DataFrame({'id': list(cites[yy].keys()), str(yy): list(cites[yy].values())})
    cdf[str(yy)] = cdf[str(yy)].astype('float32')
    cite_frames[yy] = cdf

# Source node of every edge; also loop-invariant.
src_nodes = graph.edges()[0]

for year in range(start_year, end_year):
    # [left, right) is the slice of edge positions stamped with this year
    left, right = ts_infos[np.where(ts_infos[:, 0]==year)[0][0], 1:]
    nids = src_nodes[left:right].unique().tolist()
    ids = [nid2id[nid] for nid in nids]

    pdf = pd.DataFrame({'id': ids, 'nid': nids})
    tbar = trange(year+1, end_year, desc=str(year))
    for yy in tbar:
        # left join keeps every paper; papers not cited in yy get NaN here
        pdf = pd.merge(pdf, cite_frames[yy], how='left', on='id')
        tbar.set_postfix(year=year, pdf=len(pdf))
    # a missing count means "not cited that year"
    pdf = pdf.fillna(0)
    labels[year] = pdf
    

2000:   0%|          | 0/16 [00:00<?, ?it/s]2000:   0%|          | 0/16 [00:00<?, ?it/s, pdf=20415, year=2000]2000:   0%|          | 0/16 [00:00<?, ?it/s, pdf=20415, year=2000]2000:  12%|█▎        | 2/16 [00:00<00:00, 17.54it/s, pdf=20415, year=2000]2000:  12%|█▎        | 2/16 [00:00<00:00, 17.54it/s, pdf=20415, year=2000]2000:  12%|█▎        | 2/16 [00:00<00:00, 17.54it/s, pdf=20415, year=2000]2000:  25%|██▌       | 4/16 [00:00<00:00, 17.52it/s, pdf=20415, year=2000]2000:  25%|██▌       | 4/16 [00:00<00:00, 17.52it/s, pdf=20415, year=2000]2000:  25%|██▌       | 4/16 [00:00<00:00, 17.52it/s, pdf=20415, year=2000]2000:  38%|███▊      | 6/16 [00:00<00:00, 13.64it/s, pdf=20415, year=2000]2000:  38%|███▊      | 6/16 [00:00<00:00, 13.64it/s, pdf=20415, year=2000]2000:  38%|███▊      | 6/16 [00:00<00:00, 13.64it/s, pdf=20415, year=2000]2000:  50%|█████     | 8/16 [00:00<00:00, 11.70it/s, pdf=20415, year=2000]2000:  50%|█████     | 8/16 [00:00<00:00, 11.70it/s, pdf=20415, year=20

In [19]:
# Spot-check one year: rows are the source nodes of edges stamped 2005 (raw id and
# node index), followed by one raw citation-count column per later year.
labels[2005]

Unnamed: 0,id,nid,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,802791,10577,1.0,1.0,1.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0
1,805887,10578,3.0,2.0,4.0,1.0,3.0,0.0,2.0,2.0,1.0,0.0,0.0
2,818370,10579,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,823907,10580,3.0,8.0,5.0,6.0,3.0,5.0,3.0,2.0,1.0,0.0,0.0
4,958585,11901,0.0,2.0,4.0,11.0,3.0,3.0,7.0,1.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64503,1721364,618022,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64504,1738897,618023,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64505,1717800,618025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64506,1219539,618026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Persist the raw (untransformed) per-year citation labels; the context manager
# makes sure the pickle is fully flushed and the handle closed.
with open(f'../data/{dataset}/labels.pkl', 'wb') as fout:
    pkl.dump(labels, fout)

## Cumulative log-transformed labels

In [21]:
def cumulative_log(df):
    """Return a copy of ``df`` whose label columns hold ``log(1 + running total)``.

    The first two columns are treated as identifiers and left unchanged. Every
    later column is replaced by the left-to-right cumulative sum of the label
    columns up to and including itself, followed by ``log(x + 1)``.

    Args:
        df: label frame with two identifier columns followed by numeric columns.

    Returns:
        A new DataFrame; the input frame is not modified, so calling this more
        than once on the same frame always gives the same result.
    """
    # Work on a copy: mutating the caller's frame made re-runs of the notebook
    # cell apply the transform a second time to already-transformed values.
    out = df.copy()
    cols = list(out.columns)
    # Running total: each label column absorbs the accumulated column to its left.
    for prev_col, cur_col in zip(cols[2:], cols[3:]):
        out[cur_col] = out[cur_col] + out[prev_col]
    out.iloc[:, 2:] = np.log(out.iloc[:, 2:] + 1)
    return out

# Cumulative-log version of the labels for years start_year .. end_year-3.
# Copies are passed so the raw `labels` frames are never modified or shared,
# which keeps this cell safe to re-run.
labels_cum_log = {}
for year in range(start_year, end_year-2):
    labels_cum_log[year] = cumulative_log(labels[year].copy())
# NOTE(review): the entry for end_year-2 is stored WITHOUT the cumulative-log
# transform, so it is on a different scale from every other year -- confirm this
# is intended before training on it.
labels_cum_log[end_year-2] = labels[end_year-2].copy()
print(len(labels_cum_log))
labels_cum_log[2005]

16


Unnamed: 0,id,nid,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,802791,10577,0.693147,1.098612,1.386294,1.791759,1.945910,2.197225,2.197225,2.197225,2.197225,2.197225,2.197225
1,805887,10578,1.386294,1.791759,2.302585,2.397895,2.639057,2.639057,2.772589,2.890372,2.944439,2.944439,2.944439
2,818370,10579,0.000000,0.000000,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.609438,1.609438,1.609438
3,823907,10580,1.386294,2.484907,2.833213,3.135494,3.258096,3.433987,3.526361,3.583519,3.610918,3.610918,3.610918
4,958585,11901,0.000000,1.098612,1.945910,2.890372,3.044523,3.178054,3.433987,3.465736,3.583519,3.583519,3.583519
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64503,1721364,618022,0.000000,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
64504,1738897,618023,0.693147,1.098612,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294
64505,1717800,618025,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
64506,1219539,618026,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
# Persist the cumulative-log labels; the context manager makes sure the pickle
# is fully flushed and the handle closed.
with open(f'../data/{dataset}/labels_cum_log.pkl', 'wb') as fout:
    pkl.dump(labels_cum_log, fout)