In [1]:
import numpy as np

import cudf
import pandas as pd
import torch
from torch.nn import Linear, LayerNorm, ReLU, Dropout
import torch.nn.functional as F
import torchmetrics.functional as MF
from torchmetrics import AUROC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
import warnings
warnings.filterwarnings("ignore")
from dgl_model import SAGE
from read_data import process_ellipitc
import dgl
from dgl.dataloading import DataLoader, NeighborSampler, MultiLayerFullNeighborSampler
import argparse


def evaluate(model, graph, dataloader):
    """Score ``model`` on every mini-batch of ``dataloader`` and return the AUROC.

    Args:
        model: Module called as ``model(blocks, x)``; switched to eval mode here.
        graph: Not read by this function; kept for signature compatibility.
        dataloader: Iterable yielding ``(input_nodes, output_nodes, blocks)``.

    Returns:
        The value produced by a two-class ``AUROC`` metric over the
        concatenated predictions and labels of all batches.
    """
    model.eval()
    targets, scores = [], []
    for _, _, blocks in dataloader:
        with torch.no_grad():
            # Input features live on the source nodes of the first block,
            # labels on the destination nodes of the last block.
            feats = blocks[0].srcdata['feat']
            targets.append(blocks[-1].dstdata['label'])
            scores.append(model(blocks, feats))
    all_scores = torch.cat(scores)
    all_targets = torch.cat(targets)
    metric = AUROC(num_classes=2)
    return metric(all_scores, all_targets)

def layerwise_infer(device, graph, nid, model, batch_size):
    """Run the model's full-graph inference and return the AUROC on nodes ``nid``.

    Args:
        device: Forwarded to ``model.inference``.
        graph: Graph whose ``ndata['label']`` holds the ground-truth labels.
        nid: Index (e.g. a tensor of node IDs) selecting the nodes to score.
        model: Module exposing ``inference(graph, device, batch_size)``;
            switched to eval mode here.
        batch_size: Forwarded to ``model.inference``.

    Returns:
        The value produced by a two-class ``AUROC`` metric for the selected
        nodes.
    """
    model.eval()
    with torch.no_grad():
        pred = model.inference(graph, device, batch_size)  # pred in buffer_device
        pred = pred[nid]
        # Labels are moved next to the predictions so both live on one device.
        label = graph.ndata['label'][nid].to(pred.device)
        auroc = AUROC(num_classes=2)
        # Score the predictions computed above; the previous code referenced
        # undefined names (y_hats / ys) and raised NameError.
        return auroc(pred, label)


def train(args, device, g, train_idx, val_idx, model,
          num_epochs=10, batch_size=1024, fanouts=(10, 10, 10),
          lr=1e-3, weight_decay=5e-4):
    """Train ``model`` on neighbor-sampled mini-batches, printing loss and AUROC per epoch.

    Args:
        args: Not read by this function; kept so existing call sites keep working.
        device: Device handed to both dataloaders.
        g: Graph object handed to ``DataLoader``; batches must expose
            ``'feat'`` on source nodes and ``'label'`` on destination nodes.
        train_idx: Node IDs used for training.
        val_idx: Node IDs evaluated after every epoch.
        model: Module called as ``model(blocks, x)`` whose output is fed to
            ``F.cross_entropy``.
        num_epochs: Number of passes over ``train_idx``.
        batch_size: Seed nodes per mini-batch (training and validation).
        fanouts: Neighbors sampled per layer, one entry per layer.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        None. One progress line is printed per epoch.
    """
    # create sampler & dataloader
    sampler = NeighborSampler(list(fanouts),  # fanout for [layer-0, layer-1, ...]
                              prefetch_node_feats=['feat'],
                              prefetch_labels=['label'])
    train_dataloader = DataLoader(g, train_idx, sampler, device=device,
                                  batch_size=batch_size, shuffle=True,
                                  drop_last=False, num_workers=0)

    val_dataloader = DataLoader(g, val_idx, sampler, device=device,
                                batch_size=batch_size, shuffle=True,
                                drop_last=False, num_workers=0)

    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for input_nodes, output_nodes, blocks in train_dataloader:
            x = blocks[0].srcdata['feat']
            y = blocks[-1].dstdata['label']
            y_hat = model(blocks, x)
            loss = F.cross_entropy(y_hat, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
            num_batches += 1
        # Count batches explicitly: relying on the loop variable raised
        # NameError when the training loader yielded nothing.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        auroc = evaluate(model, g, val_dataloader)
        print("Epoch {:05d} | Loss {:.4f} | AUROC {:.4f} ".format(epoch, mean_loss, auroc))


# Load the raw Elliptic CSV exports; the features file is read without a header row.
df_features = pd.read_csv('raw/elliptic_txs_features.csv', header=None)
df_edges = pd.read_csv("raw/elliptic_txs_edgelist.csv")
df_classes =  pd.read_csv("raw/elliptic_txs_classes.csv")

# process_ellipitc is a project helper (read_data.py); the meaning of each returned
# value is inferred from its name only — TODO confirm against the helper.
node_features, classified_idx, edge_index, weights, labels, y_train = process_ellipitc(df_features, df_edges, df_classes)
# Split the rows selected by classified_idx into train/validation parts
# (15% validation, stratified on y_train, fixed seed for reproducibility).
# Note: y_train is rebound here to the training portion of the labels.

X_train, X_valid, y_train, y_valid, train_idx, valid_idx = train_test_split(node_features[classified_idx], y_train, classified_idx, test_size=0.15, random_state=42, stratify=y_train)
# The index splits are converted to int64 tensors for use as dataloader seed nodes.
train_idx = torch.LongTensor(train_idx)
valid_idx = torch.LongTensor(valid_idx)


  from .autonotebook import tqdm as notebook_tqdm


torch.Size([2, 234355])


# cuGraph Training

### CuGraphStorage Construction

In [2]:
from torch.utils.dlpack import to_dlpack
import cupy as cp

def convert_to_column_major(t):
    """Return a tensor equal to the 2-D tensor ``t`` but laid out column-major in memory."""
    # Materialise the transpose contiguously, then view it transposed back:
    # shape and values match ``t`` while the strides become column-major.
    transposed_copy = t.t().contiguous()
    return transposed_copy.t()

### Converting from torch tensors to cuDF dataframes via DLPack
# edge_index is transposed first so each row is one edge
# (assumes edge_index has shape (2, num_edges) — TODO confirm).
# The column-major conversion is presumably required by cudf.from_dlpack — TODO confirm.
edge_df = cudf.from_dlpack(to_dlpack(convert_to_column_major(edge_index.t())))
# Add a running edge id, then name all three columns at once.
edge_df['edge_id'] = cp.arange(0,len(edge_df))
edge_df.columns = ['src','dst','edge_id']

# Node features are cast to float32; node_id is the row position.
node_feat_df = cudf.from_dlpack(to_dlpack(convert_to_column_major(node_features))).astype(cp.float32)
node_feat_df['node_id'] = cp.arange(0,len(node_feat_df))


# Labels go into a separate frame keyed by the same positional node_id.
node_label_df = cudf.DataFrame({'label':labels})
node_label_df['node_id'] =  cp.arange(0,len(node_label_df))

##### Creating a graph store from the cuDF dataframes
import cugraph
pg = cugraph.experimental.PropertyGraph()
# create gs from pg
gs = dgl.contrib.cugraph.CuGraphStorage(pg)
gs.add_edge_data(edge_df, ["src", "dst"], "edge_id")

### Register features and labels under node type '_N'
### (per the original note, the DGL default node type)
gs.add_node_data(node_feat_df, "node_id", 'feat', ntype='_N')
gs.add_node_data(node_label_df, "node_id", 'label', ntype='_N')

### Model Creation

In [3]:
# create GraphSAGE model
# Input width is the number of feature columns stored in the graph store.
in_size = gs.ndata['feat'].shape[1]
# Two output classes, matching the two-class AUROC metric used during evaluation.
out_size = 2
# 256 is presumably the hidden layer width — TODO confirm against dgl_model.SAGE.
model = SAGE(in_size, 256, out_size).to(torch.device('cuda'))

### Training on Graph Store

In [4]:
%%time
# Train on the CuGraphStorage-backed graph `gs`.
# train() never reads its first argument, so an empty dict stands in for `args`.
print('Training...')
train({}, torch.device('cuda'), gs, train_idx.to('cuda'), valid_idx.to('cuda'), model)

Training...
Epoch 00000 | Loss 1.0093 | AUROC 0.9047 
Epoch 00001 | Loss 0.3802 | AUROC 0.9287 
Epoch 00002 | Loss 0.3079 | AUROC 0.9372 
Epoch 00003 | Loss 0.2520 | AUROC 0.9446 
Epoch 00004 | Loss 0.2117 | AUROC 0.9502 
Epoch 00005 | Loss 0.2024 | AUROC 0.9564 
Epoch 00006 | Loss 0.1801 | AUROC 0.9582 
Epoch 00007 | Loss 0.1622 | AUROC 0.9623 
Epoch 00008 | Loss 0.1503 | AUROC 0.9662 
Epoch 00009 | Loss 0.1416 | AUROC 0.9682 
CPU times: user 36.1 s, sys: 2.78 s, total: 38.9 s
Wall time: 32.8 s


# DGL Training

### DGL Graph Creation

In [5]:
# Build a native DGL graph from the two rows of edge_index (passed as a pair of endpoint tensors).
g = dgl.graph((edge_index[0],edge_index[1]))
# Attach node features (as float32) and labels (as int64) under the keys train() reads.
g.ndata['feat'] = node_features.to(torch.float)
g.ndata['label'] = torch.LongTensor(labels)
# Move the whole graph, including its node data, to the GPU.
g = g.to('cuda')

### Model Creation

In [9]:
# create GraphSAGE model
# A new model instance is built here, so the DGL run does not reuse the
# weights trained in the cuGraph section; `model` is rebound.
in_size = g.ndata['feat'].shape[1]
# Two output classes, matching the two-class AUROC metric used during evaluation.
out_size = 2
# 256 is presumably the hidden layer width — TODO confirm against dgl_model.SAGE.
model = SAGE(in_size, 256, out_size).to(torch.device('cuda'))

### Training on DGL Graph

In [10]:
%%time
# Train on the native DGL graph `g` with the same train/validation split.
# train() never reads its first argument, so an empty dict stands in for `args`.
print('Training...')
train({}, torch.device('cuda'), g, train_idx.to('cuda'), valid_idx.to('cuda'), model)

Training...
Epoch 00000 | Loss 0.7836 | AUROC 0.9209 
Epoch 00001 | Loss 0.3531 | AUROC 0.9365 
Epoch 00002 | Loss 0.2865 | AUROC 0.9438 
Epoch 00003 | Loss 0.2319 | AUROC 0.9488 
Epoch 00004 | Loss 0.2017 | AUROC 0.9524 
Epoch 00005 | Loss 0.1874 | AUROC 0.9576 
Epoch 00006 | Loss 0.1671 | AUROC 0.9590 
Epoch 00007 | Loss 0.1577 | AUROC 0.9630 
Epoch 00008 | Loss 0.1489 | AUROC 0.9626 
Epoch 00009 | Loss 0.1423 | AUROC 0.9648 
CPU times: user 10.9 s, sys: 257 ms, total: 11.1 s
Wall time: 4.79 s
