### Imports

In [24]:
import argparse
import os
import time

# The backend must be selected before DGL is imported: DGL picks its backend
# at import time, so setting the variable afterwards has no effect.
os.environ["DGLBACKEND"] = "pytorch"

import dgl

import model
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn.functional as F
from dgl.data import (
    CiteseerGraphDataset,
    CoraGraphDataset,
    DGLDataset,
    PubmedGraphDataset,
)
# from input_data import load_data

from preprocess import (
    mask_test_edges,
    mask_test_edges_dgl,
    preprocess_graph,
    sparse_to_tuple,
)
from sklearn.metrics import average_precision_score, roc_auc_score

time: 1.61 ms (started: 2023-05-18 22:11:36 -05:00)


In [25]:
from tqdm import tqdm
import pandas as pd
import networkx as nx
import numpy as np

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1.03 ms (started: 2023-05-18 22:11:37 -05:00)


In [26]:
import sys
sys.path.append("../")

from utils.utils import *

time: 757 µs (started: 2023-05-18 22:11:37 -05:00)


In [27]:
# Seed both RNGs used by the notebook with the same value so that repeated
# runs start from the same random state.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

time: 2.63 ms (started: 2023-05-18 22:11:37 -05:00)


### Node embeddings

In [28]:
# Project root: two directory levels above the current working directory.
dir = os.path.dirname(os.path.dirname(os.getcwd()))
print(dir)

# Embedding method; index 1 selects "vgae" out of the available names.
method = ["dgi", "vgae"][1]
print("Method:\t\t", method)

# Experimental group as (name, number of subgroups); index 0 selects "WT".
group = [("WT", 5), ("zwf1^", 3), ("pck1^", 2)][0]
print("Group:\t\t", group)

# Subgroup identifiers are the 1-based indices, rendered as strings.
subgroups = list(map(str, range(1, group[1] + 1)))
print("Subgroup:\t", subgroups)

# Dimensions to iterate over; other candidates: [2, 4, 8, 16, 32, 64, 128, 256]
dimensions = [3]
print("Dimensions:\t", dimensions)

/home/ealvarez/Project/GNN_Filter
Method:		 vgae
Group:		 ('WT', 5)
Subgroup:	 ['1', '2', '3', '4', '5']
Dimensions:	 [3]
time: 2.18 ms (started: 2023-05-18 22:11:37 -05:00)


In [29]:
# Custom dataset

class CustomDataset(DGLDataset):
    """Single-graph dataset assembled from a node CSV and an edge CSV.

    Parameters
    ----------
    name : str
        Dataset name forwarded to ``DGLDataset``.
    dir : str
        Root directory that contains ``output_preprocessing/graph_data``.
    group : str
        Group prefix used in the CSV file names.
    subgroup : int or str
        Subgroup suffix used in the CSV file names.
    """

    def __init__(self, name, dir, group, subgroup):
        # Stored before the base initializer runs because process() reads
        # them (the base class is expected to trigger process() — confirm
        # against the DGLDataset documentation).
        self.dir = dir
        self.group = group
        self.subgroup = subgroup
        super().__init__(name=name)

    def process(self):
        """Read both CSV files and build ``self.graph`` with features, labels and masks."""
        nodes_path = "{}/output_preprocessing/graph_data/{}_nodes_data_{}.csv".format(
            self.dir, self.group, self.subgroup
        )
        edges_path = "{}/output_preprocessing/graph_data/{}_edges_data_{}.csv".format(
            self.dir, self.group, self.subgroup
        )
        nodes_data = pd.read_csv(nodes_path)
        edges_data = pd.read_csv(edges_path)
        num_nodes = nodes_data.shape[0]

        # Node feature: log10 of the "degree" column as a float32 column vector.
        log_degree = np.log10(nodes_data["degree"].to_numpy())
        node_features = torch.from_numpy(log_degree).to(torch.float32).reshape(-1, 1)

        # Node label: the "ionMz" column as float32.
        node_labels = torch.from_numpy(nodes_data["ionMz"].to_numpy()).to(torch.float32)

        # Edge endpoints and per-edge weight (dtype is whatever the CSV yields).
        edges_src, edges_dst = (
            torch.from_numpy(edges_data[column].to_numpy())
            for column in ("source", "target")
        )
        edge_features = torch.from_numpy(edges_data["weight"].to_numpy())

        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=num_nodes)
        self.graph.ndata["feat"] = node_features
        self.graph.ndata["label"] = node_labels
        self.graph.edata["weight"] = edge_features

        # Contiguous 60% / 20% / remainder split by node order, stored as
        # boolean node masks (only needed for node-classification style use).
        n_train = int(num_nodes * 0.6)
        n_val = int(num_nodes * 0.2)
        split_bounds = {
            "train_mask": (0, n_train),
            "val_mask": (n_train, n_train + n_val),
            "test_mask": (n_train + n_val, num_nodes),
        }
        for mask_name, (start, stop) in split_bounds.items():
            mask = torch.zeros(num_nodes, dtype=torch.bool)
            mask[start:stop] = True
            self.graph.ndata[mask_name] = mask

    def __getitem__(self, i):
        """Return the single graph; the index ``i`` is ignored."""
        return self.graph

    def __len__(self):
        """The dataset always holds exactly one graph."""
        return 1

time: 1.81 ms (started: 2023-05-18 22:11:38 -05:00)


In [30]:
# Smoke test: build the dataset for subgroup 1 of the selected group and
# print the summary of its single graph.
dataset = CustomDataset(name="g1", dir=dir, group=group[0], subgroup=1)
graph = dataset[0]

print(graph)

Graph(num_nodes=6234, num_edges=1243057,
      ndata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
time: 214 ms (started: 2023-05-18 22:11:38 -05:00)


In [31]:
# Intel OpenMP setting; presumably a workaround for the "duplicate OpenMP
# runtime" abort on some installations — TODO confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# Hyper-parameters are declared through argparse so the cell mirrors a
# command-line script; inside the notebook only the defaults are used.
parser = argparse.ArgumentParser(description="Variational Graph Auto-Encoder")
parser.add_argument(
    "--learning_rate", type=float, default=0.01, help="Initial learning rate."
)
parser.add_argument(
    "--epochs", "-e", type=int, default=100, help="Number of epochs to train."
)
parser.add_argument(
    "--hidden1",
    "-h1",
    type=int,
    default=32,
    help="Number of units in hidden layer 1.",
)
parser.add_argument(
    "--hidden2",
    "-h2",
    type=int,
    default=3,
    help="Number of units in hidden layer 2.",
)
parser.add_argument(
    "--datasrc",
    "-s",
    type=str,
    default="dgl",
    help="Dataset download from dgl Dataset or website.",
)
parser.add_argument(
    "--dataset", "-d", type=str, default="cora", help="Dataset string."
)
parser.add_argument("--gpu_id", type=int, default=0, help="GPU id to use.")
# An explicit empty argument list keeps argparse away from the kernel's argv.
args = parser.parse_args([])


# Device selection. The notebook always runs on the CPU; the override is an
# explicit switch rather than a second assignment that silently discards the
# CUDA check. Set FORCE_CPU = False to use "cuda:<gpu_id>" when available.
FORCE_CPU = True
use_cuda = torch.cuda.is_available() and not FORCE_CPU
device = "cuda:{}".format(args.gpu_id) if use_cuda else "cpu"
device
# roc_means = []
# ap_means = []

'cpu'

time: 7.72 ms (started: 2023-05-18 22:11:38 -05:00)


In [32]:
def compute_loss_para(adj):
    """Compute the per-entry weights and the scale factor for the reconstruction loss.

    Parameters
    ----------
    adj : torch.Tensor
        Square dense adjacency matrix; entries equal to 1 are treated as
        positive (edge) entries.

    Returns
    -------
    weight_tensor : torch.Tensor
        1-D tensor with one weight per flattened entry of ``adj``:
        ``(N*N - adj.sum()) / adj.sum()`` where ``adj == 1`` and 1 elsewhere.
        It is created on the same device as ``adj``.
    norm : float
        ``N*N / (2 * (N*N - adj.sum()))`` with ``N = adj.shape[0]``.
    """
    n_entries = adj.shape[0] * adj.shape[0]
    n_positive = adj.sum()
    n_negative = n_entries - n_positive

    pos_weight = n_negative / n_positive
    norm = n_entries / float(n_negative * 2)

    weight_mask = adj.view(-1) == 1
    # Allocate directly on adj's device instead of relying on a notebook-level
    # `device` variable, so the weights can never land on a different device
    # than the matrix they belong to.
    weight_tensor = torch.ones(weight_mask.size(0), device=adj.device)
    weight_tensor[weight_mask] = pos_weight
    return weight_tensor, norm


def get_acc(adj_rec, adj_label):
    """Fraction of adjacency entries predicted correctly at a 0.5 threshold.

    ``adj_rec`` holds the reconstructed scores and ``adj_label`` the target
    adjacency; both are flattened and compared element-wise. The result is a
    0-dim float tensor.
    """
    truth = adj_label.view(-1).long()
    predicted = (adj_rec > 0.5).view(-1).long()
    n_correct = (predicted == truth).sum().float()
    return n_correct / truth.size(0)


def get_scores(edges_pos, edges_neg, adj_rec):
    """Score held-out edges against the reconstructed adjacency matrix.

    Parameters
    ----------
    edges_pos : iterable of index pairs
        Edges that exist in the graph; each item ``e`` is read as
        ``adj_rec[e[0], e[1]]``.
    edges_neg : iterable of index pairs
        Node pairs that are not edges, read the same way.
    adj_rec : torch.Tensor
        Reconstructed adjacency scores (moved to the CPU here).

    Returns
    -------
    (roc_score, ap_score) : tuple of float
        ROC-AUC and average precision with positives labelled 1 and
        negatives labelled 0.
    """

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    adj_rec = adj_rec.cpu()

    # Both lists hold plain Python floats (via .item()), so stacking them
    # never mixes floats with tensors. The sigmoid is monotonic, so it does
    # not change the ranking-based metrics below.
    preds = [sigmoid(adj_rec[e[0], e[1]].item()) for e in edges_pos]
    preds_neg = [sigmoid(adj_rec[e[0], e[1]].item()) for e in edges_neg]

    preds_all = np.hstack([preds, preds_neg])
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)

    return roc_score, ap_score

time: 1.29 ms (started: 2023-05-18 22:11:39 -05:00)


In [33]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

time: 369 µs (started: 2023-05-18 22:11:39 -05:00)


In [34]:
# Get node embeddings

# list_embeddings_time = []
# For every embedding dimension and every subgroup: train a VGAE on the
# subgroup graph and write the resulting node embeddings to CSV.
for dimension in tqdm(dimensions, desc="dimension"):
    for i in tqdm(subgroups, desc="subgroup"):
        # Read dataset; the name carries the subgroup id so each dataset
        # instance is distinguishable.
        data = CustomDataset("g{}".format(i), dir, group[0], i)
        graph = data[0]
        print(graph)

        # Extract node features
        feats = graph.ndata.pop("feat").to(device)
        in_dim = feats.shape[-1]

        # Dense adjacency of the full graph, used to draw the edge split
        adj_orig = graph.adj_external().to_dense()

        # Split edges. Only the training edge ids are used below; the
        # validation/test pairs are kept so get_scores() can be applied to
        # them if link-prediction metrics are wanted.
        (
            train_edge_idx,
            val_edges,
            val_edges_false,
            test_edges,
            test_edges_false,
        ) = mask_test_edges_dgl(graph, adj_orig)

        graph = graph.to(device)

        # Training graph: same node set, restricted to the training edges
        train_edge_idx = torch.tensor(train_edge_idx).to(device)
        train_graph = dgl.edge_subgraph(graph, train_edge_idx, relabel_nodes=False)
        train_graph = train_graph.to(device)
        adj = train_graph.adj_external().to_dense().to(device)

        # Loss parameters
        weight_tensor, norm = compute_loss_para(adj)

        # The size of the second hidden layer is the embedding size, so it
        # follows the loop variable; this keeps the saved file name and the
        # actual number of embedding columns in agreement.
        vgae_model = model.VGAEModel(in_dim, args.hidden1, dimension)
        vgae_model = vgae_model.to(device)

        optimizer = torch.optim.Adam(vgae_model.parameters(), lr=args.learning_rate)
        print(
            "Total Parameters:",
            sum([p.nelement() for p in vgae_model.parameters()]),
        )

        # Full-batch training
        for epoch in tqdm(range(args.epochs), desc="epoch", leave=False):
            vgae_model.train()

            # Calling the module (rather than .forward) runs registered hooks.
            logits = vgae_model(graph, feats)

            # Weighted reconstruction loss against the training adjacency
            loss = norm * F.binary_cross_entropy(
                logits.view(-1), adj.view(-1), weight=weight_tensor
            )
            # KL term built from the model's mean and log_std attributes
            kl_divergence = (
                0.5
                / logits.size(0)
                * (
                    1
                    + 2 * vgae_model.log_std
                    - vgae_model.mean**2
                    - torch.exp(vgae_model.log_std) ** 2
                )
                .sum(1)
                .mean()
            )
            loss -= kl_divergence

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Extract embeddings in inference mode without tracking gradients.
        vgae_model.eval()
        with torch.no_grad():
            embeds = vgae_model.encoder(graph, feats)
        embeds = embeds.cpu().detach()

        # Hand pandas a NumPy array rather than a tensor.
        df_node_embeddings = pd.DataFrame(data=embeds.numpy())

        # Save, creating the target directory on first use.
        out_dir = "{}/output_{}/node_embeddings".format(dir, method)
        os.makedirs(out_dir, exist_ok=True)
        df_node_embeddings.to_csv(
            "{}/{}_node-embeddings_{}_{}.csv".format(out_dir, group[0], dimension, i),
            index=True,
        )
        # print("Save node embeddings")

  0%|                                                     | 0/1 [00:00<?, ?it/s]

Graph(num_nodes=6234, num_edges=1243057,
      ndata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
Total Parameters: 262



  assert input.numel() == input.storage().size(), (

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|█████████████████████████████████████████| 100/100 [00:26<00:00,  3.76it/s]


Graph(num_nodes=6231, num_edges=939038,
      ndata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
Total Parameters: 262



  assert input.numel() == input.storage().size(), (

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|█████████████████████████████████████████| 100/100 [00:26<00:00,  3.81it/s]


Graph(num_nodes=6233, num_edges=2252847,
      ndata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
Total Parameters: 262



  assert input.numel() == input.storage().size(), (

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|█████████████████████████████████████████| 100/100 [00:26<00:00,  3.81it/s]


Graph(num_nodes=6226, num_edges=3728175,
      ndata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
Total Parameters: 262



  assert input.numel() == input.storage().size(), (

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|█████████████████████████████████████████| 100/100 [00:26<00:00,  3.81it/s]


Graph(num_nodes=6226, num_edges=4071713,
      ndata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
Total Parameters: 262



  assert input.numel() == input.storage().size(), (

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|█████████████████████████████████████████| 100/100 [00:26<00:00,  3.83it/s]
100%|█████████████████████████████████████████████| 5/5 [02:20<00:00, 28.13s/it]
100%|████████████████████████████████████████████| 1/1 [02:20<00:00, 140.67s/it]

time: 2min 20s (started: 2023-05-18 22:11:39 -05:00)





In [35]:
# Sanity check: reload the first saved embedding table. The dimension and
# subgroup come from the configuration lists rather than literals, so this
# cell keeps pointing at an existing file when the configuration changes.
df_node_embeddings = pd.read_csv(
    "{}/output_{}/node_embeddings/{}_node-embeddings_{}_{}.csv".format(
        dir, method, group[0], dimensions[0], subgroups[0]
    ),
    index_col=0,
)
df_node_embeddings

Unnamed: 0,0,1,2
0,-0.196577,0.695413,0.450436
1,0.122968,0.364990,-0.862415
2,-0.881289,1.320957,1.625736
3,0.031436,1.718528,-0.856446
4,0.053356,-0.113882,0.576290
...,...,...,...
6229,0.320602,0.141998,-0.145007
6230,0.691773,-0.049610,-0.045168
6231,-0.098969,-0.082001,0.264603
6232,0.152756,0.518421,0.969551


time: 19 ms (started: 2023-05-18 22:14:00 -05:00)
