### Imports

In [9]:
import argparse
import os
import time

# DGL chooses its backend while it is being imported, so the environment
# variable must be set before the first `import dgl`; assigning it afterwards
# has no effect on the already-loaded backend.
os.environ["DGLBACKEND"] = "pytorch"

import dgl

from vgae import model
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn.functional as F
from dgl.data import CiteseerGraphDataset, CoraGraphDataset, PubmedGraphDataset
# from input_data import load_data

from dgl.data import DGLDataset

from vgae.preprocess import (
    mask_test_edges,
    mask_test_edges_dgl,
    preprocess_graph,
    sparse_to_tuple,
)
from sklearn.metrics import average_precision_score, roc_auc_score

time: 1.18 ms (started: 2023-10-23 13:47:46 -05:00)


In [10]:
from tqdm import tqdm
from utils.utils_go import *
from vgae.utils_vgae import *

import json
import pandas as pd
import networkx as nx
import numpy as np

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 13 ms (started: 2023-10-23 13:47:46 -05:00)


In [11]:
# torch.manual_seed(42)
# np.random.seed(42)

time: 417 µs (started: 2023-10-23 13:47:46 -05:00)


### Parameters

In [12]:
# Read the experiment pointer, then the parameter file of that experiment.
# Context managers close both handles as soon as the JSON has been parsed
# (the name `file` stays bound, now to a closed handle).
with open("exp.json") as file:
    experiment = json.load(file)
exp_num = experiment["exp"]

with open("output/{}/parameters.json".format(exp_num)) as file:
    params = json.load(file)

exp = params["exp"]
print("Exp:\t\t", exp)

# NOTE(review): the method label is hard-coded to "dgi" while the training
# cell calls train_vgae; the label is passed through to train_vgae and used
# in the embeddings file name read later -- confirm this is intended.
method = "dgi"
print("Method:\t\t", method)

data_variations = params["data_variations"]
print("Data variations:", data_variations)

dimension = params["dimension"]
print("Dimension:\t", dimension)

groups_id = params["groups_id"]
print("Groups id:\t", groups_id)

# Mapping from group id to the list of its subgroup ids.
subgroups_id = params["subgroups_id"]
print("Subgroups id:\t", subgroups_id)

Exp:		 exp3
Method:		 dgi
Data variations: ['none']
Dimension:	 3
Groups id:	 ['pck1', 'zwf1', 'WT']
Subgroups id:	 {'pck1': ['1', '2', '3'], 'zwf1': ['1', '2'], 'WT': ['1', '2', '3', '4', '5']}
time: 4.84 ms (started: 2023-10-23 13:47:47 -05:00)


### Node embeddings

In [13]:
# Build the graph for the first subgroup of the first group and print it.
first_group = groups_id[0]
first_subgroup = subgroups_id[first_group][0]

nodes_path = "output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(exp, first_group, first_subgroup)
edges_path = "output/{}/preprocessing/graphs_data/edges_data_{}_{}.csv".format(exp, first_group, first_subgroup)

# The first two columns of the nodes file are dropped before it is handed
# to the dataset.
nodes_data = pd.read_csv(nodes_path).iloc[:, 2:]
edges_data = pd.read_csv(edges_path)

dataset = CustomDatasetVGAE("g1", nodes_data, edges_data)
graph = dataset[0]

print(graph)

Graph(num_nodes=120, num_edges=6937,
      ndata_schemes={'feat': Scheme(shape=(24,), dtype=torch.float32), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
time: 15.8 ms (started: 2023-10-23 13:47:47 -05:00)


In [14]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

time: 228 µs (started: 2023-10-23 13:47:47 -05:00)


In [15]:
# get node embeddings
# For every data variation, train one model per (group, subgroup) graph via
# train_vgae. The embeddings file read in the next cell is presumably written
# by train_vgae -- confirm against its implementation.
seed = 42
for data_variation in data_variations:
    # Start each variation from the subgroups stored in the parameter file.
    # The mapping is copied rather than edited in place, so a "none" variation
    # that follows another variation (or a re-run of this cell) still sees the
    # original subgroups instead of a previously overwritten mapping.
    subgroups_id = dict(params["subgroups_id"])
    if data_variation != "none":
        # A non-"none" variation replaces each group's subgroups by the
        # variation name itself.
        subgroups_id.update({group: [data_variation] for group in groups_id})

    # Re-seed once per variation so every variation starts from the same
    # random state.
    torch.manual_seed(seed)
    np.random.seed(seed)

    print("Subgroups id:\t", subgroups_id)

    for group in tqdm(groups_id):
        for subgroup in tqdm(subgroups_id[group]):
            # The first two columns of the nodes file are dropped.
            nodes_data = pd.read_csv("output/{}/preprocessing/graphs_data/nodes_data_{}_{}.csv".format(exp, group, subgroup)).iloc[:, 2:]
            edges_data = pd.read_csv("output/{}/preprocessing/graphs_data/edges_data_{}_{}.csv".format(exp, group, subgroup))

            # read dataset
            data = CustomDatasetVGAE("g_{}_{}".format(group, subgroup), nodes_data, edges_data)
            graph = data[0]

            # train
            # NOTE(review): the trailing 0 matches the last field of the
            # embeddings file name read in the next cell -- presumably a run
            # index; confirm.
            args_ = args_vgae(dimension)
            train_vgae(exp, graph, args_, method, group, subgroup, 0)
            

Subgroups id:	 {'pck1': ['1', '2', '3'], 'zwf1': ['1', '2'], 'WT': ['1', '2', '3', '4', '5']}


  0%|          | 0/3 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 105.37it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:03<00:00, 94.48it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:02<00:00, 104.84it/s]
100%|██████████| 3/3 [00:09<00:00,  3.00s/it]
 33%|███▎      | 1/3 [00:09<00:18,  9.01s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:03<00:00, 99.40it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 300/300 [00:03<00:00, 95.67it/s]
100%|██████████| 2/2 [00:06<00:00,  3.11s/it]
 67%|██████▋   | 2/3 [00:15<0

time: 31.5 s (started: 2023-10-23 13:47:47 -05:00)





In [16]:
# Preview the embeddings stored for the first subgroup of the first group.
preview_group = groups_id[0]
preview_subgroup = subgroups_id[preview_group][0]
embeddings_path = "output/{}/node_embeddings/node-embeddings_{}_{}_{}_{}.csv".format(
    exp, method, preview_group, preview_subgroup, 0
)

# The first CSV column is used as the row index.
df_node_embeddings = pd.read_csv(embeddings_path, index_col=0)
df_node_embeddings.head()

Unnamed: 0,0,1,2
0,1.14542,-2.337965,0.531753
1,0.514017,-0.4954,-1.047799
2,-0.160593,-0.605383,-0.268474
3,0.285081,0.334604,-0.310389
4,-0.421201,-0.111309,0.802733


time: 13.6 ms (started: 2023-10-23 13:48:18 -05:00)
