### Notebook Setup

In [171]:
user_net_id = 'aks9136'

import warnings
warnings.filterwarnings('ignore')

import dgl 
from dgl.nn.pytorch.conv import SAGEConv
import numpy as np 
import json 
import torch
from tqdm import tqdm 

import sys
sys.path.append('/scratch/' + user_net_id + '/NYU-Zillow-Capstone-2022-Team-A')
import src.datamodules.SAGE as g_train

### Part 1: Node Embedding Prediction

In [2]:
####################################
## Sample with Correct Formatting ##
####################################

dataset = dgl.data.CSVDataset('./graph_csv')
g = dataset[0]

embedding_length = g.ndata['feat'].size()[1]

conv = SAGEConv(embedding_length, embedding_length, 'mean')
res = conv(g, g.ndata['feat'])

Done loading data from cached files.


In [3]:
##################################
## Train -> Evaluation Pipeline ##
##################################

new_node_embeddings = g_train.run(g, 'pool')

Training...
Epoch 00000 | Loss 0.6846 | Distance Reduced 0.0000 %
Epoch 00025 | Loss 0.0418 | Distance Reduced 0.7027 %
Epoch 00050 | Loss 0.0247 | Distance Reduced 0.7657 %
Epoch 00075 | Loss 0.0218 | Distance Reduced 0.7795 %
Epoch 00100 | Loss 0.0208 | Distance Reduced 0.7846 %
Epoch 00125 | Loss 0.0202 | Distance Reduced 0.7874 %
Epoch 00150 | Loss 0.0199 | Distance Reduced 0.7890 %
Epoch 00175 | Loss 0.0198 | Distance Reduced 0.7899 %
Epoch 00200 | Loss 0.0197 | Distance Reduced 0.7906 %
Epoch 00225 | Loss 0.0196 | Distance Reduced 0.7910 %
Epoch 00250 | Loss 0.0195 | Distance Reduced 0.7913 %
Epoch 00275 | Loss 0.0195 | Distance Reduced 0.7915 %
Epoch 00300 | Loss 0.0195 | Distance Reduced 0.7917 %
Epoch 00325 | Loss 0.0194 | Distance Reduced 0.7919 %
Epoch 00350 | Loss 0.0194 | Distance Reduced 0.7920 %
Epoch 00375 | Loss 0.0194 | Distance Reduced 0.7921 %
Epoch 00400 | Loss 0.0194 | Distance Reduced 0.7922 %
Epoch 00425 | Loss 0.0194 | Distance Reduced 0.7923 %
Epoch 00450 | Lo

In [9]:
######################################
## Update Predicted Node Embeddings ##
######################################

## Adding feature (not overwriting)

g.ndata['feat_pred'] = new_node_embeddings
dgl.data.utils.save_graphs('graph_csv/coco_val_graph/coco_val_graph.bin', [g])

### Part 2: Link Prediction

In [6]:
import scipy.sparse as sp

In [208]:
######################
## Data Preperation ##
######################

test_frac = 0.1

## Positive Edges 

# Split edge set for training and testing
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * test_frac)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

In [249]:
## Negative Edges

# Find all negative edges and split them for training and testing
tag_mask = list(([1] * 80) + ([0] * 5000))
tag_loc = torch.ByteTensor(tag_mask)

shortened_adj_matrix = g.adjacency_matrix().clone().to_dense()
shortened_adj_matrix = torch.transpose(torch.transpose(shortened_adj_matrix,0,1)[tag_loc], 0,1)

for i in tqdm(range(shortened_adj_matrix.size()[1], shortened_adj_matrix.size()[0])):
    t = shortened_adj_matrix[i]
    new_vs = (t == 1.0).nonzero(as_tuple=False).type(torch.IntTensor)
    
    try:
        neg_v_tensor = torch.cat((neg_v_tensor, new_vs)).type(torch.IntTensor)  
    except:
        neg_v_tensor = new_vs


    new_us = [i] * new_vs.size(dim=0)
    new_us = torch.Tensor(new_us).type(torch.IntTensor)

    try:
        neg_u_tensor = torch.cat((neg_u_tensor, new_us)).type(torch.IntTensor)
    except:
        neg_u_tensor = new_us

neg_v_tensor = neg_v_tensor.squeeze()

import random
negative_indices = list(range(0, len(neg_v_tensor)))
random.shuffle(negative_indices)

train_indices = negative_indices[test_size:]
test_indices = negative_indices[:test_size]

test_neg_u = neg_u_tensor[test_indices]
test_neg_v = neg_v_tensor[test_indices]

train_neg_u = neg_u_tensor[train_indices]
train_neg_v = neg_v_tensor[train_indices]


100%|██████████| 5000/5000 [00:00<00:00, 21324.81it/s]


In [319]:
###########################
## Create Training Graph ##
###########################

device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_pos_g = dgl.graph((train_pos_u, train_pos_v)).to(device)
train_neg_g = dgl.graph((train_neg_u, train_neg_v)).to(device)

test_pos_g = dgl.graph((test_pos_u, test_pos_v)).to(device)
test_neg_g = dgl.graph((test_neg_u, test_neg_v)).to(device)

train_g = dgl.remove_edges(g, eids[:test_size])

remove_ind = np.random.choice(train_g.num_nodes(), train_g.num_nodes() -  test_pos_g.num_nodes())
train_g.remove_nodes(remove_ind)

remove_ind = np.random.choice(train_pos_g.num_nodes(), train_pos_g.num_nodes() -  test_pos_g.num_nodes())
train_pos_g.remove_nodes(remove_ind)

remove_ind = np.random.choice(train_neg_g.num_nodes(), train_neg_g.num_nodes() -  test_pos_g.num_nodes())
train_neg_g.remove_nodes(remove_ind)

remove_ind = np.random.choice(test_neg_g.num_nodes(), test_neg_g.num_nodes() -  test_pos_g.num_nodes())
test_neg_g.remove_nodes(remove_ind)


In [322]:
########################
## Predicton Function ##
########################

import dgl.function as fn

class DotPredictor(torch.nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g = g.to(device)
            g.ndata['h'] = h.to(device)
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [323]:
###########################
## Training Loop - Setup ##
###########################

import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

model = g_train.SAGE(train_g.ndata['feat'].shape[1], None ,train_g.ndata['feat'].shape[1], 'mean')
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
pred = DotPredictor()

## Note: loss can be greater than one because labels are 1s and 0s 
def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [324]:
#############################
## Training Loop Execution ##
#############################

import itertools

optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.05)

for e in range(300):
    # forward
    h = model(train_g, train_g.ndata['feat']).to(device)
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 100 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

In epoch 0, loss: 22.82440185546875
In epoch 100, loss: 0.481178343296051
In epoch 200, loss: 0.28091341257095337


In [325]:
with torch.no_grad():
    pos_score = pred(test_pos_g.to(device), h.to(device))
    neg_score = pred(test_neg_g.to(device), h.to(device))
    print('AUC', compute_auc(pos_score, neg_score))

AUC 0.5406468827521459
