In [3]:
import argparse
import os
import sys

# Simulate having cfg available by loading in hydra config as dict
import yaml
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

import dgl
import dgl.function as fn
import hydra
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics import MeanMetric
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

# Resolve the current user's NetID. os.getlogin() requires a controlling terminal and
# raises OSError without one (for example in batch jobs or some notebook kernels), so
# fall back to getpass.getuser(), which consults the environment / password database.
try:
    user_net_id = os.getlogin()
except OSError:
    import getpass

    user_net_id = getpass.getuser()

# Project checkout under /scratch/<user>; appended to sys.path (once) so that the
# `src.*` imports that follow can be resolved.
home_path = '/scratch/' + user_net_id + '/projects/NYU-Zillow-Capstone-2022-Team-A'
if home_path not in sys.path:
    sys.path.append(home_path)

from src.datamodules.negative_sampler import NegativeSampler
from src.model.SAGE import SAGE


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Simulate having cfg available by loading in hydra config as dict
from types import SimpleNamespace

class NestedNamespace(SimpleNamespace):
    """Attribute-style view over a (possibly nested) dictionary.

    Every key of ``dictionary`` becomes an attribute. Values that are themselves
    dicts are wrapped recursively, so ``ns.section.option`` works. Dicts nested
    inside other containers (e.g. lists) are stored unchanged.

    Args:
        dictionary: mapping whose items become attributes.
        **kwargs: extra attributes forwarded to ``SimpleNamespace``; they are set
            first, so a key in ``dictionary`` with the same name overrides them.
    """

    def __init__(self, dictionary, **kwargs):
        super().__init__(**kwargs)
        for key, value in dictionary.items():
            if isinstance(value, dict):
                value = NestedNamespace(value)
            # Use the setattr() builtin rather than calling self.__setattr__
            # directly: the builtin always dispatches through the type, so it keeps
            # working even if a key happens to shadow "__setattr__" on the instance.
            setattr(self, key, value)

# Load the project's YAML config and expose it with attribute access (cfg.section.key).
# A context manager is used so the file handle is closed as soon as parsing finishes.
# NOTE(review): yaml.load with the full Loader can construct arbitrary Python objects.
# That is acceptable for a trusted, in-repo config file, but switch to a safe loader
# if this path could ever point at untrusted input.
with open('../conf/config.yaml') as config_file:
    cfg = NestedNamespace(yaml.load(config_file, Loader=Loader))

In [5]:
def to_bidirected_with_reverse_mapping(g):
    """Return a simple, bidirected copy of ``g`` plus a reverse-edge lookup tensor.

    The result is ``(g_simple, reverse_mapping)`` where ``reverse_mapping[i]`` is the
    ID (in ``g_simple``) of the edge that runs opposite to edge ``i``. Graphs that
    contain self-loops are not supported.
    """
    n_orig_edges = g.num_edges()

    # Append a reverse copy of every edge, then merge parallel edges. The "count" edge
    # feature holds how many input edges were merged into each simple edge, and
    # ``old_to_new`` maps edge IDs of the doubled graph to edge IDs of ``g_simple``.
    doubled = dgl.add_reverse_edges(g)
    g_simple, old_to_new = dgl.to_simple(
        doubled, return_counts="count", writeback_mapping=True
    )
    merged_counts = g_simple.edata["count"]

    # Prefix sums of the counts (with a leading zero): group_start[k] is the position
    # at which simple edge k's group begins once the doubled-graph edge IDs are
    # ordered by the simple edge they map to.
    group_start = torch.zeros(g_simple.num_edges() + 1, dtype=g_simple.idtype)
    group_start[1:] = merged_counts.cumsum(0)

    # One representative doubled-graph edge ID per simple edge.
    ids_by_group = old_to_new.argsort()
    representative = ids_by_group[group_start[:-1]]

    # This relies on add_reverse_edges storing the reverse of edge i at ID
    # i + n_orig_edges, so shifting by n_orig_edges in the right direction yields the
    # opposite edge's ID in the doubled graph.
    opposite = torch.where(
        representative >= n_orig_edges,
        representative - n_orig_edges,
        representative + n_orig_edges,
    )
    reverse_mapping = old_to_new[opposite]

    # Sanity check: endpoints of each edge and of its mapped reverse must be swapped.
    fwd_src, fwd_dst = g_simple.edges()
    rev_src, rev_dst = g_simple.find_edges(reverse_mapping)
    assert torch.equal(fwd_src, rev_dst)
    assert torch.equal(rev_src, fwd_dst)
    return g_simple, reverse_mapping

In [6]:
class DataModule(LightningDataModule):
    """Lightning data module serving edge-prediction mini-batches from a DGL CSV dataset.

    On construction the first graph of the dataset is made bidirected with
    ``to_bidirected_with_reverse_mapping``, moved to ``device``, and paired with a
    multi-layer neighbor sampler that prefetches the ``"feat"`` node feature.

    Args:
        csv_dataset_root: path handed to ``dgl.data.CSVDataset``.
        data_cpu: accepted (and recorded by ``save_hyperparameters``) but not
            otherwise used by this class.
        fan_out: per-layer neighbor fan-outs; each entry is converted with ``int``.
            The default list is only read, never mutated.
        device: device the graph, ID tensors and data loaders are placed on.
        batch_size: number of seed indices per mini-batch.
        num_workers: stored on the instance but currently not forwarded to the
            DGL data loaders.
        force_reload: forwarded to ``dgl.data.CSVDataset``.

    NOTE(review): ``train_nid`` / ``val_nid`` are *node* indices derived from the
    node masks ``g.ndata["train_mask"]`` / ``g.ndata["val_mask"]``, yet they are
    used as the seed indices of an edge-prediction data loader, which presumably
    interprets them as edge IDs. Confirm this is intended.
    """

    def __init__(
        self,
        csv_dataset_root,
        data_cpu=False,
        fan_out=[10, 25],
        device="cpu",
        batch_size=1024,
        num_workers=4,
        force_reload=False,
    ):
        super().__init__()
        self.save_hyperparameters()
        dataset = dgl.data.CSVDataset(csv_dataset_root, force_reload=force_reload)
        g = dataset[0]
        g, reverse_eids = to_bidirected_with_reverse_mapping(g)
        g = g.to(device)
        reverse_eids = reverse_eids.to(device)

        # Indices where each mask is set; the test split is everything that is in
        # neither the train nor the validation mask.
        train_nid = torch.nonzero(g.ndata["train_mask"], as_tuple=True)[0].to(device)
        val_nid = torch.nonzero(g.ndata["val_mask"], as_tuple=True)[0].to(device)
        test_nid = torch.nonzero(
            ~(g.ndata["train_mask"] | g.ndata["val_mask"]), as_tuple=True
        )[0].to(device)

        sampler = dgl.dataloading.MultiLayerNeighborSampler(
            [int(n) for n in fan_out], prefetch_node_feats=["feat"]
        )

        self.g = g
        self.train_nid, self.val_nid, self.test_nid = train_nid, val_nid, test_nid
        self.sampler = sampler
        self.device = device
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.in_dim = g.ndata["feat"].shape[1]
        self.reverse_eids = reverse_eids

    def _edge_dataloader(self, seed_ids, k, shuffle):
        """Build an edge-prediction DataLoader over ``seed_ids``.

        ``k`` is passed as the second argument of the project's ``NegativeSampler``
        (presumably the number of negatives drawn per positive; confirm against
        ``src.datamodules.negative_sampler``). Reverse edges of the sampled seeds are
        excluded from neighbor sampling via ``exclude="reverse_id"``.
        """
        sampler = dgl.dataloading.as_edge_prediction_sampler(
            self.sampler,
            exclude="reverse_id",
            reverse_eids=self.reverse_eids,
            negative_sampler=NegativeSampler(self.g, k),
        )
        return dgl.dataloading.DataLoader(
            self.g,
            seed_ids,
            sampler,
            device=self.device,
            batch_size=self.batch_size,
            shuffle=shuffle,
            drop_last=False,
        )

    def train_dataloader(self):
        """Shuffled loader over the training indices, with NegativeSampler(g, 5)."""
        return self._edge_dataloader(self.train_nid, 5, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation indices, with NegativeSampler(g, 1)."""
        return self._edge_dataloader(self.val_nid, 1, shuffle=False)

# train(cfg)
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the data module from the dataset root and batch size given in the config.
datamodule = DataModule(
    cfg.data.zillow_root,
    device=device,
    batch_size=cfg.training.batch_size,
)


Done loading data from cached files.


In [7]:
# Exploring DataModule class
# Repeats the dataset-loading lines of DataModule.__init__ at notebook level so that
# the raw graph `g` (before reverse edges are added) can be inspected in later cells.
csv_dataset_root = cfg.data.zillow_root
force_reload=False
dataset = dgl.data.CSVDataset(csv_dataset_root, force_reload=force_reload)
g = dataset[0]

Done loading data from cached files.


In [8]:
# Show the (source, destination) endpoint tensors of the graph as loaded.
g.edges()

(tensor([79966, 79966, 79966,  ..., 57469, 57469, 76505]),
 tensor([43923, 58296, 32091,  ..., 58296,   873, 69961]))

In [14]:
# Walk through what add_reverse_edges and to_simple do to individual edges.
g_reverses = dgl.add_reverse_edges(g)
src_rv, dst_rv = g_reverses.edges()

num_edges = g.num_edges()
# Edge 0 is printed as an original edge; the edge at index num_edges is printed as
# the reverse edge that add_reverse_edges appended for it.
print('example original edge:', src_rv[0], dst_rv[0])
print('example reverse edge that got added:', src_rv[num_edges], dst_rv[num_edges])

g_simple, mapping = dgl.to_simple(g_reverses, return_counts="count", writeback_mapping=True)

src_simple, dst_simple = g_simple.edges()
# Read both endpoints from g_simple. Pairing src_simple with dst_rv would combine
# endpoints from two different graphs and print an edge that exists in neither.
# NOTE(review): to_simple may renumber edges, so index num_edges-1 here is not
# necessarily the same edge as index num_edges-1 in g_reverses; use `mapping` to
# follow a specific edge across the two graphs.
print('example original edge after to_simple:', src_simple[num_edges-1], dst_simple[num_edges-1])


tensor(76505) tensor(69961)
tensor(69961) tensor(76505)


In [9]:
g_simple, mapping = dgl.to_simple(
    dgl.add_reverse_edges(g),
    return_counts="count",
    writeback_mapping=True,
)

# dgl.add_reverse_edges() appends a reverse copy of every edge, so the intermediate
# graph has more edges than g (index num_edges is a valid edge ID there).
# dgl.to_simple() then merges parallel edges and assigns a new set of edge IDs to the
# result: `mapping` records where each edge of the intermediate graph ended up in
# g_simple, and the "count" edge feature records how many edges were merged into each.

g_simple.edges()

(tensor([    1,     5,     7,  ..., 84244, 84244, 84244]),
 tensor([58768, 61026,  9183,  ..., 55507, 61636, 67164]))

In [1]:
# NOTE(review): this cell carries execution count 1 and its saved output is a
# NameError, so it was presumably run on a fresh kernel before g_simple existed.
# It only works after the cell that creates g_simple has been executed.
g_simple.edges()

NameError: name 'g_simple' is not defined

In [42]:
# Display the write-back mapping tensor returned by dgl.to_simple.
mapping


tensor([ 22767,  18275,  85675,  ..., 109849,  62083, 117347])

In [43]:
# Per-edge "count" feature written by dgl.to_simple(return_counts="count").
c = g_simple.edata['count']

# Prefix sums of the counts with a leading zero, one entry more than there are edges.
# (A bare `mapping_offset` expression in the middle of the cell is never displayed,
# since only a cell's final expression is rendered, so it has been dropped.)
mapping_offset = torch.zeros(g_simple.num_edges() + 1, dtype=g_simple.idtype)
mapping_offset[1:] = c.cumsum(0)

# Order the pre-to_simple edge IDs by the simple edge they map to, then take the
# first ID of each group as that simple edge's representative.
idx = mapping.argsort()
idx_uniq = idx[mapping_offset[:-1]]
idx_uniq


tensor([ 22767,  18275,  85675,  ..., 109849,  62083, 117347])

In [44]:
num_edges = g.num_edges()

# For each representative edge ID, shift by num_edges to reach its counterpart:
# IDs at or above num_edges move down, IDs below num_edges move up.
reverse_idx = torch.where(
    idx_uniq >= num_edges,
    idx_uniq - num_edges,
    idx_uniq + num_edges,
)

tensor([     0,      1,      2,  ..., 122993, 122994, 122995])