In [1]:
import pandas as pd
from tqdm import trange, tqdm
import numpy as np
import torch
import sys
from collections import defaultdict
sys.path.append('/mnt/nfs/zhangtl/utils/')
from util import myout
import pickle as pkl
import json

import dgl

## Load raw data

In [4]:
# Edge table of the Elliptic dataset: one row per edge, with columns txId1
# (used as the edge source below) and txId2 (used as the edge target).
edgelist = pd.read_csv('../../raw_data/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv')
edgelist

Unnamed: 0,txId1,txId2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206
...,...,...
234350,158365409,157930723
234351,188708874,188708879
234352,157659064,157659046
234353,87414554,106877725


In [48]:
# Inspect the distinct transaction ids that occur in the txId2 (target) column.
myout(edgelist['txId2'].unique())

 : shape=(148447,), [  5530458 232438397 230459870 ... 161422725 188708879 158589457]


In [14]:
# Every transaction id that appears on either end of an edge.
tmp = set(edgelist['txId1'].unique().tolist()).union(
    set(edgelist['txId2'].unique().tolist())
)
myout(tmp)

tmp : len=203769, set([395313180, 339738664, 48234540, ..., 87031790, 87031793, 87031794])


In [3]:
# Per-transaction feature table. The file has no header row, so columns are
# numbered: column 0 is the txId, column 1 is the time step (both are used that
# way in the cells below) and the remaining columns are the feature values.
features = pd.read_csv('../../raw_data/elliptic_bitcoin_dataset/elliptic_txs_features.csv', header=None)
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.293750,0.178136,0.179117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203764,173077460,49,-0.145771,-0.163752,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.135803,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203765,158577750,49,-0.165920,-0.123607,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.156418,...,0.162722,0.010822,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984
203766,158375402,49,-0.172014,-0.078182,1.018602,0.028105,-0.043875,0.054722,-0.061584,-0.163626,...,1.261246,1.985050,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792
203767,158654197,49,-0.172842,-0.176622,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.163501,...,-0.397749,-0.411776,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399


## Assign a time step to every edge

In [8]:
# txId (column 0) -> time step (column 1).
# Select the single column *before* converting: calling .to_dict() on the whole
# frame would first turn every feature column into nested Python dicts only to
# throw all but one of them away.
txid2ts = features.set_index(0)[1].to_dict()
myout(txid2ts)

txid2ts : len=203769, dict([230425980: 1, 5530458: 1, 232022460: 1, 232438397: 1, 230460314: 1, 230459870: 1, ...])


In [11]:
# tsp: time step of every edge whose two endpoints share the same time step.
# lst: row positions of edges whose endpoints disagree. Such edges get no entry
#      in tsp, so tsp only lines up row-for-row with `edgelist` when lst is empty.
tsp = []
lst = []
# Iterate over plain Python lists of the two id columns instead of doing a
# scalar .iloc lookup per row and per column.
for ii, (source, target) in enumerate(
    tqdm(zip(edgelist['txId1'].tolist(), edgelist['txId2'].tolist()), total=len(edgelist))
):
    ts1, ts2 = txid2ts[source], txid2ts[target]

    if ts1 == ts2:
        tsp.append(ts1)
    else:
        lst.append(ii)
myout(tsp, lst)

100%|██████████| 234355/234355 [00:03<00:00, 68603.11it/s]

tsp : len=234355, list([1, 1, 1, ..., 49, 49, 49])
lst : len=0, []





In [12]:
# Attach the per-edge time step as a new 'time' column (modifies `edgelist` in
# place; later cells read edgelist['time']). `tsp` must hold exactly one entry
# per edge row, i.e. the previous cell must have found no mismatching edges.
edgelist['time'] = tsp
edgelist

Unnamed: 0,txId1,txId2,time
0,230425980,5530458,1
1,232022460,232438397,1
2,230460314,230459870,1
3,230333930,230595899,1
4,232013274,232029206,1
...,...,...,...
234350,158365409,157930723,49
234351,188708874,188708879,49
234352,157659064,157659046,49
234353,87414554,106877725,49


## Build the DGL graph

In [15]:
# txId -> row position in `features`, and therefore in `tx_feats`, which keeps
# every row. Enumerating the raw column (rather than its de-duplicated
# .unique() values) guarantees each index is a real row position of that txId
# even if an id were repeated; with all ids distinct the result is unchanged.
txid2idx = {txid: row for row, txid in enumerate(features[0].tolist())}
# Feature matrix: every column after txId (0) and time step (1).
tx_feats = features.iloc[:, 2:].values
myout(txid2idx, tx_feats)

txid2idx : len=203769, dict([230425980: 0, 5530458: 1, 232022460: 2, 232438397: 3, 230460314: 4, 230459870: 5, ...])
tx_feats : shape=(203769, 165)
[[-0.17146929 -0.18466755 -1.2013688  ... -0.09752359 -0.12061341
  -0.11979246]
 [-0.17148421 -0.18466755 -1.2013688  ... -0.09752359 -0.12061341
  -0.11979246]
 [-0.17210694 -0.18466755 -1.2013688  ... -0.18367056 -0.12061341
  -0.11979246]
 ...
 [-0.17201373 -0.07818172  1.0186019  ... -0.09752359 -0.12061341
  -0.11979246]
 [-0.17284167 -0.17662248  1.0186019  ... -0.14059708  1.51969962
   1.52139948]
 [-0.01203726 -0.13227626  0.46360923 ... -0.14059708  1.51969962
   1.52139948]]


In [17]:
def update_idx(idx, dic, cnt, feats, feat_dim, no_emb):
    """Register transaction id ``idx`` as a graph node if it is not known yet.

    A new id is stored in ``dic`` under the next free node index ``cnt`` and a
    float32 feature vector is appended to ``feats``: the matching row of the
    module-level ``tx_feats`` when ``idx`` is listed in the module-level
    ``txid2idx``, otherwise a ``torch.rand`` vector of length ``feat_dim``
    (in which case ``no_emb`` is incremented as well).

    ``dic`` and ``feats`` are modified in place. Ids already present in ``dic``
    leave everything untouched.

    Returns:
        The tuple ``(dic, cnt, feats, no_emb)`` with the updated counters.
    """
    # Known node: nothing to do.
    if idx in dic:
        return dic, cnt, feats, no_emb

    dic[idx] = cnt

    if idx not in txid2idx:
        # No stored features for this id: fall back to a random vector.
        feats.append(torch.rand(feat_dim).float())
        return dic, cnt + 1, feats, no_emb + 1

    feats.append(torch.tensor(tx_feats[txid2idx[idx]]).float())
    return dic, cnt + 1, feats, no_emb

In [19]:
# Length of the random fallback vector used by update_idx for ids that have no
# row in tx_feats; 165 matches the column count of tx_feats printed above.
feat_dim = 165
index = {}  # NOTE(review): not referenced by any cell visible here

# id2nid: txId -> consecutive graph node index, assigned in first-seen order.
# cnt:    next free node index.   no_emb: number of nodes given a random vector.
id2nid, cnt, no_emb = {}, 0, 0
# lst:   one (source node, target node, time step) triple per edge.
# feats: one feature vector per node, in node-index order.
lst, feats = [], []

# Walk the three columns as plain Python lists instead of doing three scalar
# .iloc lookups per edge.
for source, target, year in tqdm(
    zip(edgelist['txId1'].tolist(), edgelist['txId2'].tolist(), edgelist['time'].tolist()),
    total=len(edgelist),
):
    id2nid, cnt, feats, no_emb = update_idx(source, id2nid, cnt, feats, feat_dim, no_emb)
    id2nid, cnt, feats, no_emb = update_idx(target, id2nid, cnt, feats, feat_dim, no_emb)

    lst.append((id2nid[source], id2nid[target], year))

feat = torch.stack(feats)
src = torch.tensor([item[0] for item in lst])
tgt = torch.tensor([item[1] for item in lst])
tsp = torch.tensor([item[2] for item in lst])

# id2nid used to be passed twice here; show it once.
myout(feat, src, tgt, id2nid, no_emb)

100%|██████████| 234355/234355 [00:08<00:00, 26226.23it/s]


feat : shape=torch.Size([203769, 165])
tensor([[-0.1715, -0.1847, -1.2014,  ..., -0.0975, -0.1206, -0.1198],
        [-0.1715, -0.1847, -1.2014,  ..., -0.0975, -0.1206, -0.1198],
        [-0.1721, -0.1847, -1.2014,  ..., -0.1837, -0.1206, -0.1198],
        ...,
        [-0.1730, -0.0768,  1.0186,  ..., -0.1406,  1.5197,  1.5214],
        [-0.1722,  0.5598,  1.0186,  ..., -0.0975, -0.1206, -0.1198],
        [-0.1729, -0.1692,  1.0186,  ..., -0.0975, -0.1206, -0.1198]])
src : shape=torch.Size([234355]), tensor([     0,      2,      4,  ..., 202782, 201480, 202669])
tgt : shape=torch.Size([234355]), tensor([     1,      3,      5,  ..., 202269, 201368, 202139])
id2nid : len=203769, dict([230425980: 0, 5530458: 1, 232022460: 2, 232438397: 3, 230460314: 4, 230459870: 5, ...])
id2nid : len=203769, dict([230425980: 0, 5530458: 1, 232022460: 2, 232438397: 3, 230460314: 4, 230459870: 5, ...])
start_year = 0


In [20]:
# Assemble the graph: one node per feature row, one edge per (src, tgt) pair.
graph = dgl.graph((src, tgt), num_nodes=feat.shape[0])
graph.ndata['feat'] = feat

# Inverse of id2nid: graph node index -> original txId.
nid2id = dict(zip(id2nid.values(), id2nid.keys()))
# NOTE(review): 'raw_nid' is simply 0..num_nodes-1, i.e. the node's own index,
# not the original txId held in nid2id - confirm that this is what is wanted.
graph.ndata['raw_nid'] = torch.arange(feat.shape[0])

# Per-edge time step, in the same order as the edges were added.
graph.edata['ts'] = tsp
graph

Graph(num_nodes=203769, num_edges=234355,
      ndata_schemes={'feat': Scheme(shape=(165,), dtype=torch.float32), 'raw_nid': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'ts': Scheme(shape=(), dtype=torch.int64)})

## Count incoming edges ("cites") per time step

In [30]:
# Time steps are handled as the half-open range [start_year, end_year).
start_year, end_year = 1, 50

# cites[t][txId] = number of edges in time step t whose target (txId2) is txId.
cites = {}
for year in range(start_year, end_year):
    cites[year] = defaultdict(int)

# Single pass over the two columns as Python lists (no per-row .iloc lookups);
# rows are visited in the original order, so dict insertion order is unchanged.
for year, target in tqdm(
    zip(edgelist['time'].tolist(), edgelist['txId2'].tolist()),
    total=len(edgelist),
):
    cites[year][target] += 1
myout(cites[5])

100%|██████████| 234355/234355 [00:03<00:00, 63926.16it/s]

 : len=5128, dict([226703245: 12, 224991509: 1, 134009152: 2, 225297773: 1, 224744873: 1, 225703422: 10, ...])





In [31]:
tsp = graph.edata['ts']

# np.unique(..., return_index=True) reports the FIRST position of each value.
# Those positions are only usable as slice boundaries if the edges are stored
# in non-decreasing time-step order, so check that before relying on it.
assert bool((tsp[1:] >= tsp[:-1]).all()), \
    "edge time steps must be sorted for ts_cuts to be valid slice boundaries"

ts_vals, ts_cuts = np.unique(tsp.numpy(), return_index=True)
# Append the total edge count so that consecutive entries form [start, end) pairs.
ts_cuts = list(ts_cuts) + [len(tsp)]

num_ts = len(ts_vals)
# One row per time step: (time step, first edge position, one-past-last edge position).
ts_infos = np.stack([ts_vals, ts_cuts[0:num_ts], ts_cuts[1:num_ts+1]]).transpose()
myout(ts_cuts, ts_vals, ts_infos)

ts_cuts : len=50, list([0, 9164, 14405, ..., 228484, 231768, 234355])
ts_vals : shape=(49,), [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49]
ts_infos : shape=(49, 3)
[[     1      0   9164]
 [     2   9164  14405]
 [     3  14405  22721]
 [     4  22721  30901]
 [     5  30901  39524]
 [     6  39524  44766]
 [     7  44766  52019]
 [     8  52019  57205]
 [     9  57205  63144]
 [    10  63144  71732]
 [    11  71732  76388]
 [    12  76388  78601]
 [    13  78601  83428]
 [    14  83428  85506]
 [    15  85506  89329]
 [    16  89329  92449]
 [    17  92449  96099]
 [    18  96099  98214]
 [    19  98214 102052]
 [    20 102052 106807]
 [    21 106807 110766]
 [    22 110766 117780]
 [    23 117780 122364]
 [    24 122364 127488]
 [    25 127488 130107]
 [    26 130107 132797]
 [    27 132797 133965]
 [    28 133965 135682]
 [    29 135682 140223]
 [    30 140223 142784]
 [    31 14

In [49]:
# labels[t]: one row per node that is the source of an edge in time step t,
# with one float32 column per time step t..end_year-1 holding that
# transaction's count from `cites` (0 where it has none).
labels = {}
nid2id = dict(zip(id2nid.values(), id2nid.keys()))
for year in range(start_year, end_year):
    # ts_infos row for this time step: (time step, first edge, one-past-last edge).
    left, right = ts_infos[ts_infos[:, 0] == year][0, 1:]
    nids = graph.edges()[0][left:right].unique().tolist()
    ids = [nid2id[nid] for nid in nids]

    pdf = pd.DataFrame({'id': ids, 'nid': nids})
    # Columns start at `year` itself, not at year + 1.
    tbar = trange(year, end_year, desc=str(year))
    for yy in tbar:
        cdf = pd.DataFrame({'id': list(cites[yy].keys()), str(yy): list(cites[yy].values())})
        cdf = cdf.astype({str(yy): 'float32'})

        # Left join keeps every row of pdf; ids without a count get NaN here.
        pdf = pdf.merge(cdf, how='left', on='id')
        tbar.set_postfix(year=year, pdf=len(pdf))
    # Missing counts become 0.
    pdf = pdf.fillna(0)
    labels[year] = pdf

1: 100%|██████████| 49/49 [00:00<00:00, 127.92it/s, pdf=6672, year=1]
2: 100%|██████████| 48/48 [00:00<00:00, 152.84it/s, pdf=3716, year=2]
3: 100%|██████████| 47/47 [00:00<00:00, 114.58it/s, pdf=5859, year=3]
4: 100%|██████████| 46/46 [00:00<00:00, 141.56it/s, pdf=4815, year=4]
5: 100%|██████████| 45/45 [00:00<00:00, 136.69it/s, pdf=5760, year=5]
6: 100%|██████████| 44/44 [00:00<00:00, 146.37it/s, pdf=3809, year=6]
7: 100%|██████████| 43/43 [00:00<00:00, 154.19it/s, pdf=5015, year=7]
8: 100%|██████████| 42/42 [00:00<00:00, 128.95it/s, pdf=3696, year=8]
9: 100%|██████████| 41/41 [00:00<00:00, 139.43it/s, pdf=4206, year=9]
10: 100%|██████████| 40/40 [00:00<00:00, 138.80it/s, pdf=5537, year=10]
11: 100%|██████████| 39/39 [00:00<00:00, 176.03it/s, pdf=3530, year=11]
12: 100%|██████████| 38/38 [00:00<00:00, 196.91it/s, pdf=1576, year=12]
13: 100%|██████████| 37/37 [00:00<00:00, 160.90it/s, pdf=3841, year=13]
14: 100%|██████████| 36/36 [00:00<00:00, 171.54it/s, pdf=1716, year=14]
15: 100%|█

In [51]:
# Spot check: summary statistics of the label table built for time step 5.
labels[5].describe()

Unnamed: 0,id,nid,5,6,7,8,9,10,11,12,...,40,41,42,43,44,45,46,47,48,49
count,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,...,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0
mean,203162800.0,28035.069097,1.081076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,47421770.0,1929.791441,1.954852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,738120.0,24738.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223894300.0,26371.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,224698600.0,28010.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,225059800.0,29662.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,226729200.0,31539.0,63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Spot check of a single count. Use .get(): cites[45] is a defaultdict, so
# indexing it with a missing key would insert that key with a count of 0.
cites[45].get(225341789, 0)

0