### Imports

In [15]:
import argparse, time

import dgl
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgi import Classifier, DGI
from dgl import DGLGraph
from dgl.data import load_data, register_data_args, DGLDataset

import os

time: 910 µs (started: 2023-10-20 13:41:57 -05:00)


In [16]:
from tqdm import tqdm
import pandas as pd

os.environ["DGLBACKEND"] = "pytorch"
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 12.2 ms (started: 2023-10-20 13:41:57 -05:00)


In [17]:
import sys
sys.path.append("../")

from utils.utils_go import *

time: 782 µs (started: 2023-10-20 13:41:57 -05:00)


In [18]:
# torch.manual_seed(42)
# np.random.seed(42)

time: 386 µs (started: 2023-10-20 13:41:57 -05:00)


In [19]:
def evaluate(model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

time: 1.24 ms (started: 2023-10-20 13:41:57 -05:00)


### Parameters

In [20]:
import json

# dir = os.path.dirname(os.path.dirname(os.getcwd()))
dir = os.path.dirname(os.getcwd())
print(dir)

# opening JSON file
file = open("{}/parameters.json".format(dir))
params = json.load(file)

exp = params["exp"]
print("Exp:\t\t", exp)

method = "dgi"
print("Method:\t\t", method)

dimension = params["dimension"]
print("Dimension:\t", dimension)

groups_id = params["groups_id"]
print("Groups id:\t", groups_id)

subgroups_id = params["subgroups_id"]
print("Subgroups id:\t", subgroups_id)

option = params["option"]
print("Option:\t\t", option)

options = params["options"]
print("Options:\t", options)

""" if option:
    for group in groups_id:
        subgroups_id[group] = [option]
    print("Subgroups id:\t", subgroups_id) """

/home/ealvarez/Project/GNN_Unsupervised


FileNotFoundError: [Errno 2] No such file or directory: '/home/ealvarez/Project/GNN_Unsupervised/parameters.json'

time: 151 ms (started: 2023-10-20 13:41:57 -05:00)


### Node embeddings

In [None]:
# custom dataset

class CustomDataset(DGLDataset):
    def __init__(self, name, nodes_data, edges_data):
        self.dir = dir
        self.nodes_data = nodes_data
        self.edges_data = edges_data
        super().__init__(name=name)
       
    def process(self):
        node_features = torch.from_numpy(self.nodes_data.to_numpy())
        # node_features = torch.from_numpy(np.log10(self.nodes_data["degree"].to_numpy()))
        node_features = node_features.to(torch.float32)
        # node_features = torch.reshape(node_features, (-1, 1))

        # node_labels = torch.from_numpy(self.nodes_data["id"].to_numpy())
        # node_labels = node_labels.to(torch.float32)

        edge_features = torch.from_numpy(self.edges_data["weight"].to_numpy())
        edges_src = torch.from_numpy(self.edges_data["source"].to_numpy())
        edges_dst = torch.from_numpy(self.edges_data["target"].to_numpy())

        self.graph = dgl.graph(
            (edges_src, edges_dst), num_nodes=self.nodes_data.shape[0]
        )
        self.graph.ndata["feat"] = node_features
        # self.graph.ndata["label"] = node_labels
        self.graph.edata["weight"] = edge_features

        # If your dataset is a node classification dataset, you will need to assign
        # masks indicating whether a node belongs to training, validation, and test set.
        n_nodes = self.nodes_data.shape[0]
        n_train = int(n_nodes * 0.6)
        n_val = int(n_nodes * 0.2)
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        val_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[:n_train] = True
        val_mask[n_train : n_train + n_val] = True
        test_mask[n_train + n_val :] = True
        self.graph.ndata["train_mask"] = train_mask
        self.graph.ndata["val_mask"] = val_mask
        self.graph.ndata["test_mask"] = test_mask

    def __getitem__(self, i):
        return self.graph

    def __len__(self):
        return 1

def train_dgi(graph, args, method, group, subgroup):
    features = torch.FloatTensor(graph.ndata["feat"])
    # print(features.shape)
    # labels = torch.LongTensor(graph.ndata["label"])
    if hasattr(torch, "BoolTensor"):
        train_mask = torch.BoolTensor(graph.ndata["train_mask"])
        val_mask = torch.BoolTensor(graph.ndata["val_mask"])
        test_mask = torch.BoolTensor(graph.ndata["test_mask"])
    else:
        train_mask = torch.ByteTensor(graph.ndata["train_mask"])
        val_mask = torch.ByteTensor(graph.ndata["val_mask"])
        test_mask = torch.ByteTensor(graph.ndata["test_mask"])
    in_feats = features.shape[1]
    # n_classes = data.num_classes
    n_edges = graph.num_edges()

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        # labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # add self loop
    if args.self_loop:
        # print("self_loop")
        graph = dgl.remove_self_loop(graph)
        graph = dgl.add_self_loop(graph)
    n_edges = graph.num_edges()

    if args.gpu >= 0:
        graph = graph.to(args.gpu)
    # create DGI model
    dgi = DGI(
        graph,
        in_feats,
        args.n_hidden,
        args.n_layers,
        nn.PReLU(args.n_hidden),
        args.dropout,
    )

    if cuda:
        dgi.cuda()

    dgi_optimizer = torch.optim.Adam(
        dgi.parameters(), lr=args.dgi_lr, weight_decay=args.weight_decay
    )

    # train deep graph infomax
    cnt_wait = 0
    best = 1e9
    best_t = 0
    dur = []
    for epoch in range(args.n_dgi_epochs):
        dgi.train()
        if epoch >= 3:
            t0 = time.time()

        dgi_optimizer.zero_grad()
        loss = dgi(features)
        loss.backward()
        dgi_optimizer.step()

        if loss < best:
            best = loss
            best_t = epoch
            cnt_wait = 0
            torch.save(dgi.state_dict(), "best_dgi.pkl")
        else:
            cnt_wait += 1

        if cnt_wait == args.patience:
            print("Early stopping!")
            break

        if epoch >= 3:
            dur.append(time.time() - t0)

        """ print(
            "Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(
                epoch, np.mean(dur), loss.item(), n_edges / np.mean(dur) / 1000
            )
        ) """

    embeds = dgi.encoder(features, corrupt=False)
    embeds = embeds.cpu().detach()

    df_node_embeddings = pd.DataFrame(data=embeds)
    df_node_embeddings

    # save
    df_node_embeddings.to_csv("{}/output/{}/node_embeddings/node-embeddings_{}_{}_{}.csv".format(dir, exp, method, group, subgroup), index=True)
    # print("Save node embeddings")

time: 55.2 ms (started: 2023-10-01 23:04:46 -05:00)


In [21]:
nodes_data = pd.read_csv("{}/output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(dir, exp, groups_id[0], subgroups_id[groups_id[0]][0])).iloc[:, 2:]
edges_data = pd.read_csv("{}/output/{}/preprocessing/graphs_data/edges_data_{}_{}.csv".format(dir, exp, groups_id[0], subgroups_id[groups_id[0]][0]))

dataset = CustomDataset("g1", nodes_data, edges_data)
graph = dataset[0]

print(graph)

NameError: name 'exp' is not defined

time: 33.2 ms (started: 2023-10-20 13:41:59 -05:00)


In [20]:
# params

parser = argparse.ArgumentParser(description="DGI")
# register_data_args(parser)
parser.add_argument(
    "--dataset",
    type=str,
    default="cora",
    required=False,
    help="The input dataset. Can be cora, citeseer, pubmed, syn(synthetic dataset) or reddit",
)
parser.add_argument(
    "--dropout", type=float, default=0.0, help="dropout probability"
)
parser.add_argument("--gpu", type=int, default=0, help="gpu")
parser.add_argument(
    "--dgi-lr", type=float, default=1e-3, help="dgi learning rate"
)
parser.add_argument(
    "--classifier-lr",
    type=float,
    default=1e-2,
    help="classifier learning rate",
)
parser.add_argument(
    "--n-dgi-epochs",
    type=int,
    default=300,
    help="number of training epochs",
)
parser.add_argument(
    "--n-classifier-epochs",
    type=int,
    default=300,
    help="number of training epochs",
)
parser.add_argument(
    "--n-hidden", type=int, default=512, help="number of hidden gcn units"
)
parser.add_argument(
    "--n-layers", type=int, default=3, help="number of hidden gcn layers"
)
parser.add_argument(
    "--weight-decay", type=float, default=0.0, help="Weight for L2 loss"
)
parser.add_argument(
    "--patience", type=int, default=20, help="early stop patience condition"
)
parser.add_argument(
    "--self-loop",
    action="store_true",
    help="graph self-loop (default=False)",
)
parser.set_defaults(self_loop=True)
parser.set_defaults(n_hidden=dimension)
parser.set_defaults(n_layers=3)
args = parser.parse_args("")

print(args)

Namespace(dataset='cora', dropout=0.0, gpu=0, dgi_lr=0.001, classifier_lr=0.01, n_dgi_epochs=300, n_classifier_epochs=300, n_hidden=3, n_layers=3, weight_decay=0.0, patience=20, self_loop=True)
time: 73.2 ms (started: 2023-10-01 23:04:46 -05:00)


In [21]:
# get node embeddings
seed = 46
for option in options:
    if option:
        for group in groups_id:
            subgroups_id[group] = [option]
        torch.manual_seed(seed)
        np.random.seed(seed)
    else:
        torch.manual_seed(seed)
        np.random.seed(seed)
        pass
    print("Subgroups id:\t", subgroups_id)
    
    for group in tqdm(groups_id):
        for subgroup in tqdm(subgroups_id[group]):
            nodes_data = pd.read_csv("{}/output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(dir, exp, group, subgroup)).iloc[:, 2:]
            edges_data = pd.read_csv("{}/output/{}/preprocessing/graphs_data/edges_data_{}_{}.csv".format(dir, exp, group, subgroup))

            # read dataset
            # data = load_data(args)
            data = CustomDataset("g_{}_{}".format(group, subgroup), nodes_data, edges_data)
            graph = data[0]

            # train
            train_dgi(graph, args, method, group, subgroup)

Subgroups id:	 {'pck1': ['1', '2', '3'], 'zwf1': ['1', '2'], 'WT': ['1', '2', '3', '4', '5']}


  0%|          | 0/3 [00:00<?, ?it/s]



Early stopping!


100%|██████████| 3/3 [00:11<00:00,  3.78s/it]
 33%|███▎      | 1/3 [00:11<00:22, 11.34s/it]

Early stopping!


100%|██████████| 2/2 [00:08<00:00,  4.07s/it]
 67%|██████▋   | 2/3 [00:19<00:09,  9.46s/it]

Early stopping!




Early stopping!




Early stopping!




Early stopping!




Early stopping!


100%|██████████| 5/5 [00:09<00:00,  1.94s/it]
100%|██████████| 3/3 [00:29<00:00,  9.74s/it]


Early stopping!
Subgroups id:	 {'pck1': ['str'], 'zwf1': ['str'], 'WT': ['str']}


100%|██████████| 1/1 [00:01<00:00,  1.97s/it]
 33%|███▎      | 1/3 [00:01<00:03,  1.98s/it]

Early stopping!


100%|██████████| 1/1 [00:04<00:00,  4.15s/it]
 67%|██████▋   | 2/3 [00:06<00:03,  3.26s/it]

Early stopping!


100%|██████████| 1/1 [00:03<00:00,  3.91s/it]
100%|██████████| 3/3 [00:10<00:00,  3.35s/it]


Early stopping!
Subgroups id:	 {'pck1': ['dyn'], 'zwf1': ['dyn'], 'WT': ['dyn']}


100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
 33%|███▎      | 1/3 [00:01<00:02,  1.29s/it]

Early stopping!


100%|██████████| 1/1 [00:01<00:00,  1.54s/it]
 67%|██████▋   | 2/3 [00:02<00:01,  1.44s/it]

Early stopping!


100%|██████████| 1/1 [00:03<00:00,  3.38s/it]
100%|██████████| 3/3 [00:06<00:00,  2.08s/it]

Early stopping!
time: 45.6 s (started: 2023-10-01 23:04:46 -05:00)





In [22]:
df_node_embeddings = pd.read_csv("{}/output/{}/node_embeddings/node-embeddings_{}_{}_{}.csv".format(dir, exp, method, groups_id[0], 
                                                                                                    subgroups_id[groups_id[0]][0]), index_col=0)
df_node_embeddings

Unnamed: 0,0,1,2
0,0.028216,0.038473,-0.030870
1,0.027113,0.040283,-0.031338
2,0.026278,0.041659,-0.031701
3,0.025606,0.042774,-0.032009
4,0.025040,0.043718,-0.032287
...,...,...,...
355,0.273044,-0.811768,-0.166852
356,0.293061,-0.884483,-0.178111
357,0.311482,-0.950744,-0.188288
358,0.349556,-1.089457,-0.209681


time: 14.3 ms (started: 2023-10-01 23:05:32 -05:00)
