In [15]:
import pandas as pd
from tqdm import trange, tqdm
import numpy as np
import torch
import sys
from collections import defaultdict
sys.path.append('/mnt/nfs/zhangtl/utils/')
from util import myout
import pickle as pkl
import json

import dgl

## load

In [2]:
# Load the raw edge list (no header row in the CSV), name its four columns and
# order the rows chronologically, breaking ties by source then target.
papers = (
    pd.read_csv('../../raw_data/soc-sign-bitcoinalpha.csv', header=None)
    .set_axis(['source', 'target', 'rating', 'time'], axis=1)
    .sort_values(by=['time', 'source', 'target'])
)
papers

Unnamed: 0,source,target,rating,time
1276,2,402,1,1289192400
4005,10,271,8,1289192400
4004,10,970,8,1289192400
10469,113,54,4,1289192400
10502,54,119,5,1289365200
...,...,...,...,...
19115,906,279,1,1452920400
21812,1202,604,1,1452920400
14393,114,7370,-1,1453006800
5382,15,3451,1,1453438800


In [4]:
# Inspect the distinct timestamp values present in the edge list.
myout(papers['time'].unique())

 : shape=(1647,), [1289192400 1289365200 1289451600 ... 1452920400 1453006800 1453438800]


## cut timestamps into equal-width time bins

In [3]:
# Width of one time bin: the full timestamp span divided into 20 equal steps.
time_span = papers['time'].max() - papers['time'].min()
cut_step = time_span / 20
cut_step

8212320.0

In [5]:
# Map every timestamp to an integer time-bin index: its offset from the
# earliest timestamp, measured in units of cut_step and truncated.
# The minimum is computed once here instead of once per row inside a lambda.
# Note: the latest timestamp sits exactly 20 steps after the earliest, so it
# gets index 20 and the bin indices run 0..20 inclusive (21 bins).
t_min = papers['time'].min()
papers['new_time'] = ((papers['time'] - t_min) / cut_step).astype('int64')
papers

Unnamed: 0,source,target,rating,time,new_time
1276,2,402,1,1289192400,0
4005,10,271,8,1289192400,0
4004,10,970,8,1289192400,0
10469,113,54,4,1289192400,0
10502,54,119,5,1289365200,0
...,...,...,...,...,...
19115,906,279,1,1452920400,19
21812,1202,604,1,1452920400,19
14393,114,7370,-1,1453006800,19
5382,15,3451,1,1453438800,20


In [6]:
# Print how many rows fall into each of the 21 time bins (indices 0..20).
bin_ids = papers['new_time'].to_numpy()
for ii in range(21):
    print(ii, np.count_nonzero(bin_ids == ii))

0 221
1 1988
2 4071
3 934
4 1316
5 1789
6 2046
7 2081
8 1835
9 2168
10 1326
11 1236
12 783
13 840
14 715
15 474
16 71
17 145
18 72
19 73
20 2


In [7]:
# Re-sort with the time bin as the primary key so that all rows of one bin are
# contiguous. The edge list is built in this row order, and the per-bin edge
# ranges computed later (np.unique(..., return_index=True) on the edge
# timestamps) are only valid slices when the bins are contiguous.
papers = papers.sort_values(by=['new_time', 'source', 'target'])
papers

Unnamed: 0,source,target,rating,time,new_time
883,1,2,1,1291093200,0
469,1,113,2,1291006800,0
471,1,625,1,1293426000,0
470,1,744,1,1293426000,0
1150,2,10,3,1291006800,0
...,...,...,...,...,...
22882,7386,902,-1,1450414800,19
22889,7386,906,1,1450328400,19
24071,7386,7335,1,1450328400,19
5382,15,3451,1,1453438800,20


## build graph

In [8]:
def update_idx(idx, dic, cnt, feats, feat_dim, no_emb):
    """Register ``idx`` in the id mapping if it has not been seen before.

    A new ``idx`` is assigned the next consecutive index ``cnt`` and a fresh
    random float32 feature vector of length ``feat_dim`` is appended to
    ``feats``. An already-known ``idx`` changes nothing.

    Args:
        idx: raw identifier to register.
        dic: mapping raw identifier -> consecutive index (mutated in place).
        cnt: next free consecutive index.
        feats: list of per-node feature tensors (mutated in place).
        feat_dim: length of the feature vector created for a new node.
        no_emb: passed through unchanged.

    Returns:
        Tuple ``(dic, cnt, feats, no_emb)`` where ``cnt`` is incremented by one
        when ``idx`` was new.
    """
    if idx in dic:
        return dic, cnt, feats, no_emb

    dic[idx] = cnt
    feats.append(torch.rand(feat_dim).to(torch.float32))
    return dic, cnt + 1, feats, no_emb

In [9]:
start_year, end_year = 0, 21
feat_dim = 128

# Node features are random; seed the torch RNG so that re-running the notebook
# regenerates exactly the same features (and therefore the same saved graph).
torch.manual_seed(0)

id2nid, cnt, no_emb = {}, 0, 0
lst, feats = [], []

# Walk the edge list once, in the current row order of `papers`. Columns are
# pulled out as arrays up front instead of doing four Series.iloc lookups per row.
edge_rows = zip(
    papers['source'].to_numpy(),
    papers['target'].to_numpy(),
    papers['rating'].to_numpy(),
    papers['new_time'].to_numpy(),
)
for source, target, weight, year in tqdm(edge_rows, total=len(papers)):
    source, target, weight, year = int(source), int(target), int(weight), int(year)

    # Assign consecutive node indices in order of first appearance
    # (source before target) and create a feature vector for each new node.
    id2nid, cnt, feats, no_emb = update_idx(source, id2nid, cnt, feats, feat_dim, no_emb)
    id2nid, cnt, feats, no_emb = update_idx(target, id2nid, cnt, feats, feat_dim, no_emb)

    lst.append((id2nid[source], id2nid[target], weight, year))

# One row of `feat` per node index; one entry of src/tgt/rel/tsp per edge.
feat = torch.stack(feats)
src = torch.tensor([item[0] for item in lst])
tgt = torch.tensor([item[1] for item in lst])
rel = torch.tensor([item[2] for item in lst])
tsp = torch.tensor([item[3] for item in lst])

myout(feat, src, tgt, rel, tsp, id2nid)

  0%|          | 0/24186 [00:00<?, ?it/s] 12%|█▏        | 2883/24186 [00:00<00:00, 28825.63it/s] 27%|██▋       | 6475/24186 [00:00<00:00, 32996.97it/s] 40%|████      | 9775/24186 [00:00<00:00, 31269.34it/s] 53%|█████▎    | 12913/24186 [00:00<00:00, 28759.35it/s] 65%|██████▌   | 15817/24186 [00:00<00:00, 28377.78it/s] 77%|███████▋  | 18671/24186 [00:00<00:00, 28038.34it/s] 91%|█████████ | 21973/24186 [00:00<00:00, 29581.73it/s]100%|██████████| 24186/24186 [00:00<00:00, 30046.44it/s]

feat : shape=torch.Size([3783, 128])
tensor([[0.2189, 0.8208, 0.0903,  ..., 0.1846, 0.8206, 0.6839],
        [0.7837, 0.1621, 0.3150,  ..., 0.1508, 0.5584, 0.2051],
        [0.4212, 0.9971, 0.3687,  ..., 0.8543, 0.8824, 0.5630],
        ...,
        [0.8846, 0.2613, 0.8768,  ..., 0.9984, 0.7125, 0.1079],
        [0.2380, 0.7572, 0.5548,  ..., 0.6955, 0.5748, 0.0461],
        [0.5100, 0.1489, 0.6788,  ..., 0.2810, 0.4729, 0.7517]])
src : shape=torch.Size([24186]), tensor([   0,    0,    0,  ..., 3745,  638, 3782])
tgt : shape=torch.Size([24186]), tensor([   1,    2,    3,  ..., 3692, 3782, 1389])
rel : shape=torch.Size([24186]), tensor([1, 2, 1,  ..., 1, 1, 5])
tsp : shape=torch.Size([24186]), tensor([ 0,  0,  0,  ..., 19, 20, 20])
id2nid : len=3783, dict([1: 0, 2: 1, 113: 2, 625: 3, 744: 4, 10: 5, ...])





In [10]:
# Reverse lookup: consecutive node index -> raw id from the CSV.
nid2id = {nid: raw_id for raw_id, nid in id2nid.items()}

# One node per feature row, one edge per (src, tgt) pair.
num_nodes = len(feat)
graph = dgl.graph((src, tgt), num_nodes=num_nodes)

# Node data: the random features, plus each node's own index in this graph
# (0..num_nodes-1) stored under 'raw_nid'.
graph.ndata['feat'] = feat
graph.ndata['raw_nid'] = torch.arange(num_nodes)

# Edge data: the rating as relation value and the time-bin index as timestamp.
graph.edata['rel'] = rel
graph.edata['ts'] = tsp
graph

Graph(num_nodes=3783, num_edges=24186,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'rel': Scheme(shape=(), dtype=torch.int64), 'ts': Scheme(shape=(), dtype=torch.int64)})

In [12]:
# Dataset tag used as the output sub-directory for every artifact saved below.
dataset = 'bca'
graph_path = f'../data/{dataset}/graph.bin'
dgl.save_graphs(graph_path, [graph])

In [16]:
# Persist the raw-id -> node-index mapping. The context manager makes sure the
# file is flushed and closed (a bare open() handle was previously left open).
# Note: JSON object keys are strings, so the integer ids are written as strings.
with open(f'../data/{dataset}/id2nid.json', 'w') as fout:
    json.dump(id2nid, fout)

## gen cites (count incoming edges per target in each time bin)

In [17]:
start_year, end_year = 0, 21

# cites[bin][raw target id] -> number of rows (edges) pointing at that target
# within that time bin. Every bin gets its own counter up front.
cites = {year: defaultdict(int) for year in range(start_year, end_year)}

bin_col = papers['new_time'].to_numpy()
target_col = papers['target'].to_numpy()
for ii in trange(len(papers)):
    cites[int(bin_col[ii])][int(target_col[ii])] += 1
myout(cites[5])

  0%|          | 0/24186 [00:00<?, ?it/s] 29%|██▉       | 6965/24186 [00:00<00:00, 69640.60it/s] 60%|█████▉    | 14500/24186 [00:00<00:00, 72994.52it/s] 90%|█████████ | 21800/24186 [00:00<00:00, 72130.65it/s]100%|██████████| 24186/24186 [00:00<00:00, 70363.62it/s]

 : len=493, dict([35: 4, 117: 29, 259: 6, 276: 4, 301: 2, 519: 6, ...])





In [18]:
# Edge timestamps (time-bin indices) in edge order.
tsp = graph.edata['ts']

# For each distinct bin value, the position of its first edge; appending the
# total edge count turns consecutive entries into [start, end) ranges.
ts_vals, first_pos = np.unique(tsp.numpy(), return_index=True)
ts_cuts = [*first_pos, tsp.shape[0]]

# One row per bin: (bin value, first edge position, one-past-last edge position).
num_ts = len(ts_vals)
starts, ends = ts_cuts[:num_ts], ts_cuts[1:num_ts+1]
ts_infos = np.stack([ts_vals, starts, ends], axis=1)
myout(ts_cuts, ts_vals, ts_infos)

ts_cuts : len=22, list([0, 221, 2209, ..., 24111, 24184, 24186])
ts_vals : shape=(21,), [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
ts_infos : shape=(21, 3)
[[    0     0   221]
 [    1   221  2209]
 [    2  2209  6280]
 [    3  6280  7214]
 [    4  7214  8530]
 [    5  8530 10319]
 [    6 10319 12365]
 [    7 12365 14446]
 [    8 14446 16281]
 [    9 16281 18449]
 [   10 18449 19775]
 [   11 19775 21011]
 [   12 21011 21794]
 [   13 21794 22634]
 [   14 22634 23349]
 [   15 23349 23823]
 [   16 23823 23894]
 [   17 23894 24039]
 [   18 24039 24111]
 [   19 24111 24184]
 [   20 24184 24186]]


In [19]:
# labels[bin] is a DataFrame with one row per node that is the source of at
# least one edge in that bin: columns 'id' (raw id), 'nid' (graph node index)
# and, for every later bin yy, a float32 column str(yy) holding how many edges
# pointed at that raw id during bin yy (0.0 when there were none).
labels = {}
nid2id = {nid: raw_id for raw_id, nid in id2nid.items()}

# Source endpoint of every edge, in edge order; fetched once rather than per bin.
edge_src = graph.edges()[0]

for year in range(start_year, end_year):
    # ts_infos rows are (bin value, first edge position, end edge position).
    # A bin without any edge has no row and therefore contributes no nodes.
    bin_rows = ts_infos[ts_infos[:, 0] == year]
    if len(bin_rows):
        left, right = (int(v) for v in bin_rows[0, 1:])
        nids = edge_src[left:right].unique().tolist()
    else:
        nids = []
    ids = [nid2id[nid] for nid in nids]

    columns = {'id': ids, 'nid': nids}
    tbar = trange(year+1, end_year, desc=str(year))
    for yy in tbar:
        counts = cites[yy]
        # Direct lookup replaces a left-merge per future bin; .get() is used so
        # that missing ids read as 0 without inserting keys into the defaultdict.
        columns[str(yy)] = np.array(
            [counts.get(raw_id, 0) for raw_id in ids], dtype='float32'
        )
        tbar.set_postfix(year=year, pdf=len(ids))

    pdf = pd.DataFrame(columns)
    labels[year] = pdf

0:   0%|          | 0/20 [00:00<?, ?it/s]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=63, year=0]0:   0%|          | 0/20 [00:0

In [20]:
labels[5]

Unnamed: 0,id,nid,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,1,0,22.0,56.0,26.0,44.0,31.0,18.0,11.0,8.0,10.0,14.0,0.0,0.0,0.0,0.0,0.0
1,2,1,16.0,8.0,4.0,4.0,4.0,7.0,8.0,9.0,6.0,3.0,0.0,1.0,0.0,0.0,0.0
2,744,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10,5,34.0,16.0,13.0,18.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,402,9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,2897,2013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
436,2885,2015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
437,2201,2016,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438,2205,2017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Persist the raw per-bin label tables. The context manager guarantees the
# file is flushed and closed (a bare open() handle was previously left open).
with open(f'../data/{dataset}/labels.pkl', 'wb') as fout:
    pkl.dump(labels, fout)

## cumulative log labels: log(1 + running sum of the per-bin counts)

In [22]:
def cumulative_log(df):
    """Turn per-column counts into log(1 + running total), in place.

    The first two columns are left untouched. From the third column onwards,
    each column is replaced by the left-to-right running sum of the columns
    up to and including it, and then by ``log(value + 1)``.

    Note that ``df`` itself is modified; the returned object is the same
    DataFrame that was passed in.
    """
    names = list(df.columns)
    # Accumulate left to right: every column from the fourth onwards absorbs
    # the (already accumulated) column immediately before it.
    for prev_col, cur_col in zip(names[2:], names[3:]):
        df[cur_col] += df[prev_col]

    shifted = df.iloc[:, 2:] + 1
    df.iloc[:, 2:] = np.log(shifted)
    return df

# Cumulative-log version of every label table that has at least one future
# column (bins start_year .. end_year-2).
#
# cumulative_log() modifies the frame it is given, so each table is copied
# first: `labels` keeps its raw counts and this cell can be re-run without
# transforming the data twice.
#
# The former trailing `labels_cum_log[end_year-2] = labels[end_year-2]` is
# dropped: that frame had already been transformed in place, so the assignment
# stored the very same object again. The resulting dictionary is unchanged.
labels_cum_log = {
    year: cumulative_log(labels[year].copy())
    for year in range(start_year, end_year-1)
}
print(len(labels_cum_log))
labels_cum_log[5]

20


Unnamed: 0,id,nid,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,1,0,3.135494,4.369448,4.653960,5.003946,5.192957,5.288267,5.342334,5.379898,5.424950,5.484797,5.484797,5.484797,5.484797,5.484797,5.484797
1,2,1,2.833213,3.218876,3.367296,3.496508,3.610918,3.784190,3.951244,4.110874,4.204693,4.248495,4.248495,4.262680,4.262680,4.262680,4.262680
2,744,4,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
3,10,5,3.555348,3.931826,4.158883,4.406719,4.430817,4.442651,4.442651,4.454347,4.465908,4.477337,4.477337,4.477337,4.477337,4.477337,4.477337
4,402,9,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,2897,2013,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
436,2885,2015,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
437,2201,2016,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147
438,2205,2017,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [62]:
# Persist the cumulative-log label tables. The context manager guarantees the
# file is flushed and closed (a bare open() handle was previously left open).
with open(f'../data/{dataset}/labels_cum_log.pkl', 'wb') as fout:
    pkl.dump(labels_cum_log, fout)