In [1]:
import pandas as pd
from tqdm import trange, tqdm
import numpy as np
import torch
import sys
from collections import defaultdict
sys.path.append('/mnt/nfs/zhangtl/utils/')
from util import myout
import pickle as pkl
import json

import dgl

Using backend: pytorch


## Load the raw Bitcoin OTC edge list

In [2]:
# Read the header-less edge list; each row is one rating from `source` to `target`.
raw_csv_path = '../../raw_data/soc-sign-bitcoinotc.csv'
papers = pd.read_csv(raw_csv_path, header=None)
papers.columns = ['source', 'target', 'rating', 'time']

# Order rows chronologically, breaking ties by the two endpoint ids.
chronological_keys = ['time', 'source', 'target']
papers = papers.sort_values(by=chronological_keys)
papers

Unnamed: 0,source,target,rating,time
0,6,2,4,1.289242e+09
1,6,5,2,1.289242e+09
2,1,15,1,1.289243e+09
3,4,3,7,1.289245e+09
4,13,16,8,1.289254e+09
...,...,...,...,...
35587,4499,1810,1,1.453612e+09
35588,2731,3901,5,1.453679e+09
35589,2731,4897,5,1.453679e+09
35590,13,1128,1,1.453680e+09


In [3]:
# Summarise the distinct raw timestamps with the project-local `myout` helper.
myout(papers['time'].unique())

 : shape=(35592,), [1.28924191e+09 1.28924194e+09 1.28924314e+09 ... 1.45367943e+09
 1.45367963e+09 1.45368432e+09]


## Cut the timeline into equal-width time buckets

In [4]:
# Width of one time bucket: the full observed time span split into 20 equal parts.
time_span = papers['time'].max() - papers['time'].min()
cut_step = time_span / 20
cut_step

8222120.601446008

In [5]:
# Assign every edge to a time bucket: (time - earliest time) / cut_step, truncated
# to an integer. The earliest timestamp is computed once and the division is done
# column-wise, instead of re-scanning the whole column for each row.
# Note: an edge at the very latest timestamp can land in an extra bucket with
# index 20, so up to 21 distinct bucket ids may occur.
earliest_time = papers['time'].min()
papers['new_time'] = ((papers['time'] - earliest_time) / cut_step).astype('int64')

# Re-order by bucket first so that edges of one bucket are contiguous.
papers = papers.sort_values(by=['new_time', 'source', 'target'])
papers

Unnamed: 0,source,target,rating,time,new_time
254,1,2,8,1.296629e+09,0
28,1,5,4,1.289711e+09,0
56,1,13,3,1.291091e+09,0
2,1,15,1,1.289243e+09,0
52,1,17,9,1.290969e+09,0
...,...,...,...,...,...
35474,5995,35,1,1.446130e+09,19
35494,5996,5949,3,1.447008e+09,19
35520,5999,3878,8,1.449651e+09,19
35530,6000,6002,1,1.450279e+09,19


In [6]:
# Print how many edges fall into each time bucket (bucket id, edge count).
bucket_ids = papers['new_time'].to_numpy()
for ii in range(0, 21):
    print(ii, np.count_nonzero(bucket_ids == ii))

0 321
1 2064
2 4092
3 935
4 1331
5 1800
6 2650
7 3426
8 2902
9 4365
10 3865
11 2422
12 1441
13 1377
14 958
15 621
16 356
17 340
18 200
19 125
20 1


## Build the graph

In [7]:
def update_idx(idx, dic, cnt, feats, feat_dim, no_emb):
    """Register `idx` in the id -> node-index mapping if it is not there yet.

    A new id receives the next free index (`cnt`) and a freshly drawn random
    float32 feature vector of length `feat_dim`, appended to `feats`.
    `dic` and `feats` are modified in place; `no_emb` is passed through untouched.

    Returns:
        (dic, cnt, feats, no_emb) with `cnt` advanced by one when `idx` was new.
    """
    # Already registered: nothing to add.
    if idx in dic:
        return dic, cnt, feats, no_emb

    dic[idx] = cnt
    new_feature = torch.rand(feat_dim).to(torch.float32)
    feats.append(new_feature)
    return dic, cnt + 1, feats, no_emb

In [8]:
start_year, end_year = 0, 21
feat_dim = 128

# Node features are random vectors; fix the seed so that a fresh
# "Restart & Run All" reproduces the same features (and the same saved graph).
torch.manual_seed(0)

# id2nid: original id from the CSV -> consecutive node index (in order of first appearance).
# cnt:    next free node index.
# no_emb: passed through update_idx unchanged.
id2nid, cnt, no_emb = {}, 0, 0
# lst:   one (src_index, tgt_index, rating, time_bucket) tuple per edge, in row order.
# feats: one random feature vector per node, in node-index order.
lst, feats = [], []

# Walk the rows once, column-wise, instead of doing four `.iloc` lookups per row.
edge_rows = zip(papers['new_time'], papers['source'], papers['target'], papers['rating'])
for year, source, target, weight in tqdm(edge_rows, total=len(papers)):
    year, source, target, weight = int(year), int(source), int(target), int(weight)

    # Register the source before the target so indices follow first appearance.
    id2nid, cnt, feats, no_emb = update_idx(source, id2nid, cnt, feats, feat_dim, no_emb)
    id2nid, cnt, feats, no_emb = update_idx(target, id2nid, cnt, feats, feat_dim, no_emb)

    lst.append((id2nid[source], id2nid[target], weight, year))

feat = torch.stack(feats)
# Transpose the list of edge tuples into four parallel 1-D tensors.
src, tgt, rel, tsp = (torch.tensor(column) for column in zip(*lst))

myout(feat, src, tgt, rel, tsp, id2nid)

  0%|          | 0/35592 [00:00<?, ?it/s]  9%|▉         | 3197/35592 [00:00<00:01, 31961.65it/s] 18%|█▊        | 6554/35592 [00:00<00:00, 32905.99it/s] 28%|██▊       | 10090/35592 [00:00<00:00, 34024.83it/s] 38%|███▊      | 13694/35592 [00:00<00:00, 34817.96it/s] 48%|████▊     | 17215/35592 [00:00<00:00, 34958.82it/s] 59%|█████▊    | 20829/35592 [00:00<00:00, 35356.36it/s] 69%|██████▊   | 24441/35592 [00:00<00:00, 35603.47it/s] 79%|███████▊  | 28002/35592 [00:00<00:00, 35375.77it/s] 89%|████████▉ | 31608/35592 [00:00<00:00, 35586.73it/s] 99%|█████████▉| 35167/35592 [00:01<00:00, 35015.73it/s]100%|██████████| 35592/35592 [00:01<00:00, 34896.14it/s]

feat : shape=torch.Size([5881, 128])
tensor([[0.2451, 0.8762, 0.3147,  ..., 0.2771, 0.6220, 0.7476],
        [0.8084, 0.2200, 0.8136,  ..., 0.6358, 0.5316, 0.7204],
        [0.7264, 0.9562, 0.5385,  ..., 0.2514, 0.5723, 0.6432],
        ...,
        [0.9198, 0.2454, 0.8495,  ..., 0.1329, 0.1673, 0.6062],
        [0.8241, 0.6911, 0.2247,  ..., 0.9572, 0.1423, 0.2652],
        [0.4410, 0.3676, 0.2284,  ..., 0.0628, 0.7513, 0.8939]])
src : shape=torch.Size([35592]), tensor([   0,    0,    0,  ..., 5875, 5879,  822])
tgt : shape=torch.Size([35592]), tensor([   1,    2,    3,  ..., 3721, 5880,    3])
rel : shape=torch.Size([35592]), tensor([8, 4, 3,  ..., 8, 1, 2])
tsp : shape=torch.Size([35592]), tensor([ 0,  0,  0,  ..., 19, 19, 20])
id2nid : len=5881, dict([1: 0, 2: 1, 5: 2, 13: 3, 15: 4, 17: 5, ...])





In [9]:
# One node per registered id, one directed edge per CSV row (source -> target).
node_count = len(feat)
graph = dgl.graph((src, tgt), num_nodes=node_count)
graph.ndata['feat'] = feat

# Inverse lookup: graph node index -> original id from the CSV.
nid2id = {node_idx: raw_id for raw_id, node_idx in id2nid.items()}
# `raw_nid` stores each node's own index 0..N-1.
# NOTE(review): despite the name this is not the original CSV id (that is `nid2id`);
# presumably kept so node indices survive subgraph extraction -- confirm.
graph.ndata['raw_nid'] = torch.arange(node_count)

# Per-edge payload: the rating value and the time bucket of the edge.
graph.edata['rel'] = rel
graph.edata['ts'] = tsp
graph

Graph(num_nodes=5881, num_edges=35592,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'rel': Scheme(shape=(), dtype=torch.int64), 'ts': Scheme(shape=(), dtype=torch.int64)})

In [10]:
# Short dataset tag used as the output sub-directory name.
dataset = 'bco'
graph_path = f'../data/{dataset}/graph.bin'
dgl.save_graphs(graph_path, [graph])

In [11]:
# Persist the original-id -> node-index mapping. The `with` block guarantees the
# file is flushed and closed. Note that JSON object keys are always strings, so
# the integer ids come back as `str` keys when this file is loaded.
with open(f'../data/{dataset}/id2nid.json', 'w') as fout:
    json.dump(id2nid, fout)

## Generate per-bucket incoming-edge counts (cites)

In [12]:
start_year, end_year = 0, 21

# cites[bucket][original target id] -> number of edges pointing at that id in that bucket.
cites = {bucket: defaultdict(int) for bucket in range(start_year, end_year)}

bucket_col = papers['new_time']
target_col = papers['target']
for ii in trange(len(papers)):
    year = int(bucket_col.iloc[ii])
    target = int(target_col.iloc[ii])
    cites[year][target] = cites[year][target] + 1
myout(cites[5])

  0%|          | 0/35592 [00:00<?, ?it/s] 16%|█▋        | 5862/35592 [00:00<00:00, 58610.83it/s] 33%|███▎      | 11724/35592 [00:00<00:00, 58349.34it/s] 49%|████▉     | 17560/35592 [00:00<00:00, 56190.13it/s] 70%|██████▉   | 24775/35592 [00:00<00:00, 62336.18it/s] 88%|████████▊ | 31493/35592 [00:00<00:00, 64054.00it/s]100%|██████████| 35592/35592 [00:00<00:00, 63018.24it/s]

 : len=494, dict([304: 8, 540: 4, 823: 5, 1053: 5, 1201: 10, 1316: 7, ...])





In [13]:
# For every time bucket, find the half-open range [start, end) of edge positions.
# NOTE: the ranges are only contiguous if edges are stored in non-decreasing `ts`
# order -- np.unique merely reports the first position of each distinct value.
tsp = graph.edata['ts']
tsp_np = tsp.numpy()
ts_vals, ts_cuts = np.unique(tsp_np, return_index=True)
# Append the total edge count so the last bucket has an end boundary too.
ts_cuts = list(ts_cuts) + [len(tsp_np)]

num_ts = len(ts_vals)
range_starts = ts_cuts[:num_ts]
range_ends = ts_cuts[1:num_ts + 1]
# One row per bucket: (bucket id, first edge position, one-past-last edge position).
ts_infos = np.column_stack((ts_vals, range_starts, range_ends))
myout(ts_cuts, ts_vals, ts_infos)

ts_cuts : len=22, list([0, 321, 2385, ..., 35466, 35591, 35592])
ts_vals : shape=(21,), [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
ts_infos : shape=(21, 3)
[[    0     0   321]
 [    1   321  2385]
 [    2  2385  6477]
 [    3  6477  7412]
 [    4  7412  8743]
 [    5  8743 10543]
 [    6 10543 13193]
 [    7 13193 16619]
 [    8 16619 19521]
 [    9 19521 23886]
 [   10 23886 27751]
 [   11 27751 30173]
 [   12 30173 31614]
 [   13 31614 32991]
 [   14 32991 33949]
 [   15 33949 34570]
 [   16 34570 34926]
 [   17 34926 35266]
 [   18 35266 35466]
 [   19 35466 35591]
 [   20 35591 35592]]


In [15]:
# labels[bucket] is a table with one row per node that is the source of at least
# one edge in that bucket, and one column per later bucket holding how many edges
# pointed at that node in the later bucket (0.0 when there were none).
labels = {}
nid2id = {node_idx: raw_id for raw_id, node_idx in id2nid.items()}
for year in range(start_year, end_year):
    # Row of ts_infos for this bucket: (bucket id, first edge pos, one-past-last edge pos).
    bucket_row = ts_infos[ts_infos[:, 0] == year][0]
    left, right = bucket_row[1:]

    # Distinct source nodes of this bucket's edges, plus their original ids.
    nids = graph.edges()[0][left:right].unique().tolist()
    ids = list(map(nid2id.__getitem__, nids))

    pdf = pd.DataFrame({'id': ids, 'nid': nids})
    tbar = trange(year+1, end_year, desc=str(year))
    for yy in tbar:
        col = str(yy)
        counts = cites[yy]
        cdf = pd.DataFrame({'id': list(counts.keys()), col: list(counts.values())})
        cdf = cdf.astype({col: 'float32'})

        # Left join keeps every node of this bucket; nodes without counts get NaN for now.
        pdf = pdf.merge(cdf, how='left', on='id')
        tbar.set_postfix(year=year, pdf=len(pdf))
    pdf = pdf.fillna(0)
    labels[year] = pdf

0:   0%|          | 0/20 [00:00<?, ?it/s]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:00<?, ?it/s, pdf=90, year=0]0:   0%|          | 0/20 [00:0

In [16]:
# Spot-check one bucket's label table: `id`, `nid`, then one count column per later bucket.
labels[5]

Unnamed: 0,id,nid,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,1,0,16.0,11.0,5.0,5.0,5.0,7.0,8.0,12.0,5.0,2.0,1.0,3.0,0.0,0.0,0.0
1,13,3,33.0,17.0,15.0,27.0,3.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,4.0,2.0,1.0
2,15,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,22,11.0,1.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26,27,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,2086,2064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
437,2088,2065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
438,2119,2066,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
439,2111,2068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Persist the raw per-bucket label tables; the `with` block guarantees the file
# is flushed and closed.
with open(f'../data/{dataset}/labels.pkl', 'wb') as fout:
    pkl.dump(labels, fout)

In [18]:
def cumulative_log(df):
    """Return a copy of `df` whose value columns hold log(1 + running total).

    The first two columns are carried over unchanged. Every remaining column is
    replaced by the running sum of the value columns from left to right,
    followed by log(x + 1).

    The input frame is NOT modified: mutating it in place would silently
    overwrite the caller's raw tables and make a second call compound the
    transform.
    """
    key_part = df.iloc[:, :2]
    value_part = df.iloc[:, 2:]
    transformed = np.log(value_part.cumsum(axis=1) + 1)
    return pd.concat([key_part, transformed], axis=1)

# Transform every bucket that has at least one later bucket to predict
# (start_year .. end_year-2); `labels` itself is left untouched.
# The former extra assignment for bucket end_year-2 is not needed: the loop
# already produces that entry.
# NOTE(review): if that bucket was meant to stay un-transformed, this is the
# place to change it -- confirm the intent.
labels_cum_log = {}
for year in range(start_year, end_year-1):
    labels_cum_log[year] = cumulative_log(labels[year])
print(len(labels_cum_log))
labels_cum_log[5]

20


Unnamed: 0,id,nid,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,1,0,2.833213,3.332205,3.496508,3.637586,3.761200,3.912023,4.060443,4.248495,4.317488,4.343805,4.356709,4.394449,4.394449,4.394449,4.394449
1,13,3,3.526361,3.931826,4.189655,4.532599,4.564348,4.574711,4.574711,4.584968,4.595120,4.605170,4.624973,4.634729,4.672829,4.691348,4.700480
2,15,4,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612,1.098612
3,4,22,2.484907,2.564949,2.564949,2.772589,2.833213,2.890372,2.890372,2.890372,2.890372,2.890372,2.890372,2.890372,2.890372,2.890372,2.890372
4,26,27,1.386294,1.386294,1.386294,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,2086,2064,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
437,2088,2065,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
438,2119,2066,1.609438,1.609438,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759,1.791759
439,2111,2068,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [19]:
# Persist the cumulative-log label tables; the `with` block guarantees the file
# is flushed and closed.
with open(f'../data/{dataset}/labels_cum_log.pkl', 'wb') as fout:
    pkl.dump(labels_cum_log, fout)