### Notebook Setup

In [171]:
user_net_id = 'aks9136'

import warnings
warnings.filterwarnings('ignore')

import dgl 
from dgl.nn.pytorch.conv import SAGEConv
import numpy as np 
import json 
import torch
from tqdm import tqdm 

import sys
sys.path.append('/scratch/' + user_net_id + '/NYU-Zillow-Capstone-2022-Team-A')
import src.datamodules.SAGE as g_train

### Part 1: Node Embedding Prediction

In [2]:
####################################
## Sample with Correct Formatting ##
####################################

dataset = dgl.data.CSVDataset('./graph_csv')
g = dataset[0]

embedding_length = g.ndata['feat'].size()[1]

conv = SAGEConv(embedding_length, embedding_length, 'mean')
res = conv(g, g.ndata['feat'])

Done loading data from cached files.


In [3]:
##################################
## Train -> Evaluation Pipeline ##
##################################

new_node_embeddings = g_train.run(g, 'pool')

Training...
Epoch 00000 | Loss 0.6846 | Distance Reduced 0.0000 %
Epoch 00025 | Loss 0.0418 | Distance Reduced 0.7027 %
Epoch 00050 | Loss 0.0247 | Distance Reduced 0.7657 %
Epoch 00075 | Loss 0.0218 | Distance Reduced 0.7795 %
Epoch 00100 | Loss 0.0208 | Distance Reduced 0.7846 %
Epoch 00125 | Loss 0.0202 | Distance Reduced 0.7874 %
Epoch 00150 | Loss 0.0199 | Distance Reduced 0.7890 %
Epoch 00175 | Loss 0.0198 | Distance Reduced 0.7899 %
Epoch 00200 | Loss 0.0197 | Distance Reduced 0.7906 %
Epoch 00225 | Loss 0.0196 | Distance Reduced 0.7910 %
Epoch 00250 | Loss 0.0195 | Distance Reduced 0.7913 %
Epoch 00275 | Loss 0.0195 | Distance Reduced 0.7915 %
Epoch 00300 | Loss 0.0195 | Distance Reduced 0.7917 %
Epoch 00325 | Loss 0.0194 | Distance Reduced 0.7919 %
Epoch 00350 | Loss 0.0194 | Distance Reduced 0.7920 %
Epoch 00375 | Loss 0.0194 | Distance Reduced 0.7921 %
Epoch 00400 | Loss 0.0194 | Distance Reduced 0.7922 %
Epoch 00425 | Loss 0.0194 | Distance Reduced 0.7923 %
Epoch 00450 | Lo

In [9]:
######################################
## Update Predicted Node Embeddings ##
######################################

## Adding feature (not overwriting)

g.ndata['feat_pred'] = new_node_embeddings
dgl.data.utils.save_graphs('graph_csv/coco_val_graph/coco_val_graph.bin', [g])

### Part 2: Link Prediction

In [6]:
import scipy.sparse as sp

In [208]:
######################
## Data Preperation ##
######################

test_frac = 0.1

## Positive Edges 

# Split edge set for training and testing
u, v = g.edges()

eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * test_frac)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

In [220]:
## Negative Edges

# Find all negative edges and split them for training and testing
tag_mask = list(([1] * 80) + ([0] * 5000))
tag_loc = torch.ByteTensor(tag_mask)

shortened_adj_matrix = g.adjacency_matrix().clone().to_dense()
shortened_adj_matrix = torch.transpose(torch.transpose(shortened_adj_matrix,0,1)[tag_loc], 0,1)

for i in tqdm(range(shortened_adj_matrix.size()[1], shortened_adj_matrix.size()[0])):
    t = shortened_adj_matrix[i]
    new_vs = (t == 1.0).nonzero(as_tuple=False)
    
    try:
        neg_v_tensor = torch.cat((neg_v_tensor, new_vs))  
    except:
        neg_v_tensor = new_vs


    new_us = [i] * new_vs.size(dim=0)

    try:
        neg_u_tensor = torch.cat((neg_u_tensor, torch.Tensor(new_us)))  
    except:
        neg_u_tensor = torch.Tensor(new_us)

neg_v_tensor = neg_v_tensor.squeeze()

import random
negative_indices = list(range(0, len(neg_v_tensor)))
random.shuffle(negative_indices)

train_indices = negative_indices[test_size:]
test_indices = negative_indices[:test_size]

test_neg_u = neg_u_tensor[test_indices]
test_neg_v = neg_v_tensor[test_indices]

train_neg_u = neg_u_tensor[train_indices]
train_neg_v = neg_v_tensor[train_indices]


100%|██████████| 5000/5000 [00:00<00:00, 35365.72it/s]


In [None]:
###########################
## Create Training Graph ##
###########################

train_g = dgl.remove_edges(g, eids[:test_size])


