In [26]:
import os.path as osp
import os

import torch
from sklearn.metrics import average_precision_score, roc_auc_score
from torch.nn import Linear

from overflowDataset import OpenFlowDataset
from torch_geometric.loader import TemporalDataLoader
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import NeighborLoader
from tqdm import tqdm
from neighbor_sampler import NeighborSampler


In [2]:


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
__file__ = os.path.abspath('')
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'OpenFlow')
dataset = OpenFlowDataset(path)
data = dataset[0]

print(data)

# # Ensure to only sample actual destination nodes as negatives.
# min_dst_idx, max_dst_idx = int(data.dst.min()), int(data.dst.max())
# train_data, val_data, test_data = data.train_val_test_split(
#     val_ratio=0.15, test_ratio=0.15)

# train_loader = TemporalDataLoader(train_data, batch_size=200)
# val_loader = TemporalDataLoader(val_data, batch_size=200)
# test_loader = TemporalDataLoader(test_data, batch_size=200)

Data(x=[63497050, 128], edge_index=[2, 63497050], edge_attr=[63497050])


In [5]:
data.n_id = torch.arange(data.num_nodes)

In [8]:
data

Data(x=[63497050, 128], edge_index=[2, 63497050], edge_attr=[63497050], n_id=[63497050])

In [6]:
train_loader = NeighborLoader(data, input_nodes=None, num_neighbors=[0], shuffle=False, batch_size=1)

In [9]:
cnt = 0
for b in train_loader:
  cnt += 1
  print(b.n_id)
  if cnt ==10:
    break
print(cnt)

tensor([0])
tensor([      1,   42368,    3619,   18255,   18061,   97614,      17,   17174,
          42303,   61027,    3407,  113471, 1185555,  112079,   51816,  163338,
         194476,   70289,   32498,    1242,    5358,   83132, 1960071,  266882,
           8945,  288393,  503734,   47341,  162167,  159270,     146,   93468,
         353729,  994771,  479491,    6258,   32303,   84704,   13523,   80901,
         146077,   23354, 3629119, 3159971,  302908,  571407, 5157280,  345708,
          73070,     115,  231704,  176877,   15497,  127259,  264074,   89771,
          11374,   17034, 1164573,  990666,   18174,   44620,   68589,   38567,
          42082,    2443,  563577, 4819357,  111929,   68204,    1043,   48015,
         483347,  202325,   63651,    8722, 2507850,  398630,   45433,   47064,
         653866,  187606,   58839, 1302881,  110337,  683352,    8220,  125756,
         252591])
tensor([      2,  143473,     615,     227,   10117,  281614,  149341,  291907,
          

In [29]:
sampled_data = next(iter(train_loader))
print(sampled_data.n_id)
sampled_data = next(iter(train_loader))
print(sampled_data.n_id)

tensor([0])
tensor([0])


In [24]:
edge_idx = torch.stack([data.src, data.dst], dim=0)
edge_idx.size()

torch.Size([2, 63497050])

In [9]:
class LinkPredictor(torch.nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.lin_src = Linear(in_channels, in_channels)
        self.lin_dst = Linear(in_channels, in_channels)
        self.lin_final = Linear(in_channels, 1)

    def forward(self, z_src, z_dst):
        h = self.lin_src(z_src) + self.lin_dst(z_dst)
        h = h.relu()
        return self.lin_final(h)

In [10]:
class SAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        self.convs.append(SAGEConv(hidden_channels, out_channels))

    def forward(self, x, edge_index):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i < len(self.convs) - 1:
                x = x.relu_()
                x = F.dropout(x, p=0.5, training=self.training)
        return x

    @torch.no_grad()
    def inference(self, x_all, subgraph_loader):
        pbar = tqdm(total=len(subgraph_loader.dataset) * len(self.convs))
        pbar.set_description('Evaluating')

        # Compute representations of nodes layer by layer, using *all*
        # available edges. This leads to faster computation in contrast to
        # immediately computing the final representations of each batch:
        for i, conv in enumerate(self.convs):
            xs = []
            for batch in subgraph_loader:
                x = x_all[batch.n_id.to(x_all.device)].to(device)
                x = conv(x, batch.edge_index.to(device))
                if i < len(self.convs) - 1:
                    x = x.relu_()
                xs.append(x[:batch.batch_size].cpu())
                pbar.update(batch.batch_size)
            x_all = torch.cat(xs, dim=0)
        pbar.close()
        return x_all

In [17]:
sampleNum = 100000

In [18]:
train_loader = NeighborSampler(data.edge_index, sizes=[10,10], node_idx=data.edge_index[0][:sampleNum])

In [19]:
from functools import lru_cache

cacheMiss = 0

@lru_cache(maxsize=200000)
def get_value(key):
    global cacheMiss
    cacheMiss +=1

pbar = tqdm(total=sampleNum)
for step, (batch_size, ids, adjs) in enumerate(train_loader):
    for i in ids:
        get_value(i)
    pbar.update(1)
pbar.close()


100%|██████████| 100000/100000 [00:38<00:00, 2594.56it/s]


In [20]:
cacheMiss

7383305

In [21]:
import random

indices = torch.tensor(random.sample(range(len(data.edge_index[0])), sampleNum))
indices = torch.tensor(indices)
sampled_values = data.edge_index[0][indices]


  indices = torch.tensor(indices)


In [22]:
random_loader = NeighborSampler(data.edge_index, sizes=[10,10], node_idx=sampled_values)

In [23]:
from functools import lru_cache

cacheMiss = 0

@lru_cache(maxsize=200000)
def get_value(key):
    global cacheMiss
    cacheMiss +=1

pbar = tqdm(total=sampleNum)
for step, (batch_size, ids, adjs) in enumerate(random_loader):
    for i in ids:
        get_value(i)
    pbar.update(1)
pbar.close()

print(cacheMiss)

100%|██████████| 100000/100000 [00:34<00:00, 2918.50it/s]

5735639



