### Imports

In [39]:
import argparse, time

import dgl
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgi import Classifier, DGI
from dgl import DGLGraph
from dgl.data import load_data, register_data_args, DGLDataset

import os


time: 1.16 ms (started: 2023-05-18 20:24:57 -05:00)


In [40]:
from tqdm import tqdm
import pandas as pd

os.environ["DGLBACKEND"] = "pytorch"
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1 ms (started: 2023-05-18 20:24:58 -05:00)


In [41]:
import sys
sys.path.append("../")

from utils.utils import *

time: 723 µs (started: 2023-05-18 20:24:58 -05:00)


In [42]:
torch.manual_seed(42)
np.random.seed(42)

time: 546 µs (started: 2023-05-18 20:24:58 -05:00)


In [43]:
def evaluate(model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

time: 1.14 ms (started: 2023-05-18 20:24:58 -05:00)


### Parameters

In [None]:
import json
  
# Opening JSON file
file = open("parameters.json")
params = json.load(file)

dir = os.path.dirname(os.path.dirname(os.getcwd()))
print(dir)

method = params["method"][params["method_idx"]]
print("Method:\t\t", method)

group = params["group"][params["group_idx"]]
print("Group:\t\t", group)

subgroups = [str(k + 1) for k in range(group[1])]
print("Subgroup:\t", subgroups)

dimensions = params["dimensions"]
print("Dimensions:\t", dimensions)

### Node embeddings

In [45]:
# Custom dataset

class CustomDataset(DGLDataset):
    def __init__(self, name, dir, group, subgroup):
        self.dir = dir
        self.group = group
        self.subgroup = subgroup
        super().__init__(name=name)
       
    def process(self):
        nodes_data = pd.read_csv("{}/output_preprocessing/graph_data/{}_nodes_data_{}.csv".format(self.dir, self.group, self.subgroup))
        edges_data = pd.read_csv("{}/output_preprocessing/graph_data/{}_edges_data_{}.csv".format(self.dir, self.group, self.subgroup))
        
        node_features = torch.from_numpy(nodes_data["degree"].to_numpy())
        node_features = node_features.to(torch.float32)
        node_features = torch.reshape(node_features, (-1, 1))

        node_labels = torch.from_numpy(nodes_data["ionMz"].to_numpy())
        node_labels = node_labels.to(torch.float32)

        edge_features = torch.from_numpy(edges_data["weight"].to_numpy())
        edges_src = torch.from_numpy(edges_data["source"].to_numpy())
        edges_dst = torch.from_numpy(edges_data["target"].to_numpy())

        self.graph = dgl.graph(
            (edges_src, edges_dst), num_nodes=nodes_data.shape[0]
        )
        self.graph.ndata["feat"] = node_features
        self.graph.ndata["label"] = node_labels
        self.graph.edata["weight"] = edge_features

        # If your dataset is a node classification dataset, you will need to assign
        # masks indicating whether a node belongs to training, validation, and test set.
        n_nodes = nodes_data.shape[0]
        n_train = int(n_nodes * 0.6)
        n_val = int(n_nodes * 0.2)
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        val_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[:n_train] = True
        val_mask[n_train : n_train + n_val] = True
        test_mask[n_train + n_val :] = True
        self.graph.ndata["train_mask"] = train_mask
        self.graph.ndata["val_mask"] = val_mask
        self.graph.ndata["test_mask"] = test_mask

    def __getitem__(self, i):
        return self.graph

    def __len__(self):
        return 1

time: 3.88 ms (started: 2023-05-18 20:24:58 -05:00)


In [46]:
dataset = CustomDataset("g1", dir, group[0], 1)
graph = dataset[0]

print(graph)

Graph(num_nodes=6234, num_edges=1243057,
      ndata_schemes={'feat': Scheme(shape=(1,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
time: 222 ms (started: 2023-05-18 20:24:59 -05:00)


In [47]:
# Params

parser = argparse.ArgumentParser(description="DGI")
# register_data_args(parser)
parser.add_argument(
    "--dataset",
    type=str,
    default="cora",
    required=False,
    help="The input dataset. Can be cora, citeseer, pubmed, syn(synthetic dataset) or reddit",
)
parser.add_argument(
    "--dropout", type=float, default=0.0, help="dropout probability"
)
parser.add_argument("--gpu", type=int, default=0, help="gpu")
parser.add_argument(
    "--dgi-lr", type=float, default=1e-3, help="dgi learning rate"
)
parser.add_argument(
    "--classifier-lr",
    type=float,
    default=1e-2,
    help="classifier learning rate",
)
parser.add_argument(
    "--n-dgi-epochs",
    type=int,
    default=300,
    help="number of training epochs",
)
parser.add_argument(
    "--n-classifier-epochs",
    type=int,
    default=300,
    help="number of training epochs",
)
parser.add_argument(
    "--n-hidden", type=int, default=2, help="number of hidden gcn units"
)
parser.add_argument(
    "--n-layers", type=int, default=3, help="number of hidden gcn layers"
)
parser.add_argument(
    "--weight-decay", type=float, default=0.0, help="Weight for L2 loss"
)
parser.add_argument(
    "--patience", type=int, default=20, help="early stop patience condition"
)
parser.add_argument(
    "--self-loop",
    action="store_true",
    help="graph self-loop (default=False)",
)
parser.set_defaults(self_loop=False)
args = parser.parse_args("")
print(args)

Namespace(dataset='cora', dropout=0.0, gpu=0, dgi_lr=0.001, classifier_lr=0.01, n_dgi_epochs=300, n_classifier_epochs=300, n_hidden=2, n_layers=3, weight_decay=0.0, patience=20, self_loop=False)
time: 3.53 ms (started: 2023-05-18 20:24:59 -05:00)


In [48]:
# Get node embeddings

# list_embeddings_time = []
for dimension in dimensions:
    # Get embeddings
    parser.set_defaults(self_loop=True)
    parser.set_defaults(n_hidden=dimension)
    parser.set_defaults(n_layers=3)
    args = parser.parse_args("")
    # print(args)
    
    for i in tqdm(subgroups):
        # Read dataset

        # load and preprocess dataset
        # data = load_data(args)
        data = CustomDataset("g".format(i), dir, group[0], i)

        g = data[0]
        # print("x", g)

        features = torch.FloatTensor(np.log10(g.ndata["feat"]))
        print(features.shape)
        # labels = torch.LongTensor(g.ndata["label"])
        if hasattr(torch, "BoolTensor"):
            train_mask = torch.BoolTensor(g.ndata["train_mask"])
            val_mask = torch.BoolTensor(g.ndata["val_mask"])
            test_mask = torch.BoolTensor(g.ndata["test_mask"])
        else:
            train_mask = torch.ByteTensor(g.ndata["train_mask"])
            val_mask = torch.ByteTensor(g.ndata["val_mask"])
            test_mask = torch.ByteTensor(g.ndata["test_mask"])
        in_feats = features.shape[1]
        # n_classes = data.num_classes
        n_edges = g.num_edges()

        if args.gpu < 0:
            cuda = False
        else:
            cuda = True
            torch.cuda.set_device(args.gpu)
            features = features.cuda()
            # labels = labels.cuda()
            train_mask = train_mask.cuda()
            val_mask = val_mask.cuda()
            test_mask = test_mask.cuda()

        # add self loop
        if args.self_loop:
            print("self_loop")
            g = dgl.remove_self_loop(g)
            g = dgl.add_self_loop(g)
        n_edges = g.num_edges()

        if args.gpu >= 0:
            g = g.to(args.gpu)
        # create DGI model
        dgi = DGI(
            g,
            in_feats,
            args.n_hidden,
            args.n_layers,
            nn.PReLU(args.n_hidden),
            args.dropout,
        )

        if cuda:
            dgi.cuda()

        dgi_optimizer = torch.optim.Adam(
            dgi.parameters(), lr=args.dgi_lr, weight_decay=args.weight_decay
        )

        # train deep graph infomax
        cnt_wait = 0
        best = 1e9
        best_t = 0
        dur = []
        for epoch in range(args.n_dgi_epochs):
            dgi.train()
            if epoch >= 3:
                t0 = time.time()

            dgi_optimizer.zero_grad()
            loss = dgi(features)
            loss.backward()
            dgi_optimizer.step()

            if loss < best:
                best = loss
                best_t = epoch
                cnt_wait = 0
                torch.save(dgi.state_dict(), "best_dgi.pkl")
            else:
                cnt_wait += 1

            if cnt_wait == args.patience:
                print("Early stopping!")
                break

            if epoch >= 3:
                dur.append(time.time() - t0)

            """ print(
                "Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
                "ETputs(KTEPS) {:.2f}".format(
                    epoch, np.mean(dur), loss.item(), n_edges / np.mean(dur) / 1000
                )
            ) """

        embeds = dgi.encoder(features, corrupt=False)
        embeds = embeds.cpu().detach()

        df_node_embeddings = pd.DataFrame(data=embeds)
        df_node_embeddings

        # save
        df_node_embeddings.to_csv("{}/output_{}/node_embeddings/{}_node-embeddings_{}_{}.csv".format(dir, method, group[0], dimension, i), index=True)
        # print("Save node embeddings")

  0%|          | 0/5 [00:00<?, ?it/s]

torch.Size([6234, 1])
self_loop


  assert input.numel() == input.storage().size(), (
  assert input.numel() == input.storage().size(), (


torch.Size([6231, 1])
self_loop


 40%|████      | 2/5 [00:09<00:14,  4.99s/it]

torch.Size([6233, 1])
self_loop


  assert input.numel() == input.storage().size(), (
 60%|██████    | 3/5 [00:15<00:10,  5.11s/it]

torch.Size([6226, 1])
self_loop


  assert input.numel() == input.storage().size(), (
 80%|████████  | 4/5 [00:20<00:05,  5.31s/it]

torch.Size([6226, 1])
self_loop


  assert input.numel() == input.storage().size(), (
100%|██████████| 5/5 [00:26<00:00,  5.29s/it]

time: 26.5 s (started: 2023-05-18 20:24:59 -05:00)





In [49]:
df_node_embeddings = pd.read_csv("{}/output_{}/node_embeddings/{}_node-embeddings_{}_{}.csv".format(dir, method, group[0], 3, 1), index_col=0)
df_node_embeddings

Unnamed: 0,0,1,2
0,0.073444,0.171685,-0.062233
1,0.039632,0.171730,-0.106279
2,0.073368,0.171690,-0.062339
3,0.032038,0.171898,-0.116451
4,0.059988,0.171999,-0.080284
...,...,...,...
6229,-0.025069,0.144772,-0.143681
6230,0.304509,0.137119,0.299618
6231,0.419120,0.147069,0.430878
6232,0.445898,0.127435,0.500044


time: 19.4 ms (started: 2023-05-18 20:25:26 -05:00)
