# Unsupervised training
Aca crearemos embeddings paa un grafo de Internet, es decir represenaciones de los SA a partir de la topologia y atributos de los SA.

Para esto tomamos 3 enfoques:


*   Caso 1: Reconstruction Approach autoencoder
*   Caso 2: Reconstruction Approach attribute Masking
*   Caso 3: Task Generation Pre-calculated descriptor

Para cada uno crearemos un ejemplo.


In [1]:
# Importar librerias y RIBs

from modules.gnn import GNN
import scipy.sparse as sp

rib_path = 'sanitized_rib.txt'


  from .autonotebook import tqdm as notebook_tqdm


# Grafo NetworkX

In [2]:
import networkx as nx

# Crear un grafo dirigido (BGP usa rutas direccionales)
nx_graph = nx.DiGraph()

# Leer el archivo y agregar aristas
with open(rib_path, "r") as f:
    for line in f:
        nodos = list(map(int, line.strip().split("|")))  # Convertir a enteros
        edges = zip(nodos, nodos[1:])  # Crear pares consecutivos
        nx_graph.add_edges_from(edges)

# Imprimir información del grafo
print(f"Número de nodos: {nx_graph.number_of_nodes()}")
print(f"Número de aristas: {nx_graph.number_of_edges()}")

Número de nodos: 42887
Número de aristas: 367839


# Grafo DGL

In [8]:
import dgl
import torch

src, dst = [], []

# Leer el archivo y extraer aristas
with open(rib_path, "r") as f:
    for line in f:
        line = line.strip()
        if not line:  # Ignorar líneas vacías
            continue
        try:
            nodos = list(map(int, line.split("|")))
            src.extend(nodos[:-1])  # Nodo de origen
            dst.extend(nodos[1:])   # Nodo de destino
        except ValueError as e:
            print(f"Error al procesar la línea: '{line}'. Error: {e}")
            continue

# Crear el grafo dirigido en DGL
dgl_graph = dgl.graph((torch.tensor(src), torch.tensor(dst)))

# Imprimir información del grafo
print(dgl_graph)


Graph(num_nodes=394239, num_edges=80557668,
      ndata_schemes={}
      edata_schemes={})


# Encode-Decoder 

# Link Prediction

In [9]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.data import CoraGraphDataset
from dgl.dataloading import negative_sampler
import dgl.function as fn
from sklearn.metrics import roc_auc_score
from dgl.nn import SAGEConv, GraphConv, GATConv, GINConv, GatedGCNConv, GatedGraphConv

import numpy as np
import scipy.sparse as sp


In [10]:
# Modelo GCN

class GCN(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats):
        super().__init__()
        self.conv1 = GraphConv(in_feats, hidden_feats)
        self.conv2 = GraphConv(hidden_feats, out_feats)
    
    def encode(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
    
    def decode(self, in_feat, edge_index):
        return (in_feat[edge_index[0]] * in_feat[edge_index[1]]).sum(dim=-1)
    
    def decode_all(self, z):
        return (z @ z.T) > 0

In [11]:
def split_data_link_pred(dgl_graph, test_size=0.1):
    u,v = dgl_graph.edges()

    # IDs de lo edges
    eids = np.arange(dgl_graph.num_edges()) 
    # Shuffle the edges
    eids = np.random.permutation(eids)

    # Tamaño de train y test
    test_size = int(len(eids) * 0.1) 
    train_size = dgl_graph.num_edges() - test_size 

    # Selecciona los edges de test y train
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

    # Matriz de adyacencia
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
    # Matriz de adyacencia negativa
    adj_neg = 1 - adj.todense() - np.eye(dgl_graph.num_nodes())

    # IDs de lso edges negativos
    neg_u, neg_v = np.where(adj_neg != 0) 

    # Selecciono nodos aleatorios (misma cantidad que el numero de edges positivos)
    neg_eids = np.random.choice(len(neg_u), dgl_graph.num_edges())

    test_neg_u, test_neg_v = (
        neg_u[neg_eids[:test_size]],
        neg_v[neg_eids[:test_size]],
    )
    train_neg_u, train_neg_v = (
        neg_u[neg_eids[test_size:]],
        neg_v[neg_eids[test_size:]],
    )

    train_g = dgl.remove_edges(dgl_graph, eids[:test_size])
    
    return  (train_pos_u, train_pos_v) , (test_pos_u, test_pos_v), (train_neg_u, train_neg_v), (test_neg_u, test_neg_v), train_g

In [13]:
import numpy as np
import scipy.sparse as sp
import dgl

def split_data_link_pred(dgl_graph, test_size=0.1):
    """
    Divide los datos del grafo para la predicción de enlaces en conjuntos de entrenamiento y prueba.

    Parameters:
        - dgl_graph: Grafo de DGL.
        - test_size: Proporción del conjunto de prueba.

    Returns:
        - (train_pos_u, train_pos_v): Aristas positivas de entrenamiento.
        - (test_pos_u, test_pos_v): Aristas positivas de prueba.
        - (train_neg_u, train_neg_v): Aristas negativas de entrenamiento.
        - (test_neg_u, test_neg_v): Aristas negativas de prueba.
        - train_g: Subgrafo de entrenamiento.
    """
    # Obtener las aristas del grafo
    u, v = dgl_graph.edges()

    # Crear la matriz de adyacencia dispersa
    adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(dgl_graph.num_nodes(), dgl_graph.num_nodes()))

    # Crear la matriz de adyacencia negativa dispersa
    adj_neg = sp.coo_matrix(1 - adj.toarray() - sp.eye(dgl_graph.num_nodes()))

    # IDs de las aristas negativas
    neg_u, neg_v = adj_neg.nonzero()

    # Dividir las aristas positivas en entrenamiento y prueba
    num_test = int(len(u) * test_size)
    num_train = len(u) - num_test

    perm = np.random.permutation(len(u))
    train_pos_u, train_pos_v = u[perm[:num_train]], v[perm[:num_train]]
    test_pos_u, test_pos_v = u[perm[num_train:]], v[perm[num_train:]]

    # Dividir las aristas negativas en entrenamiento y prueba
    perm_neg = np.random.permutation(len(neg_u))
    train_neg_u, train_neg_v = neg_u[perm_neg[:num_train]], neg_v[perm_neg[:num_train]]
    test_neg_u, test_neg_v = neg_u[perm_neg[num_train:num_train + num_test]], neg_v[perm_neg[num_train:num_train + num_test]]

    # Crear el subgrafo de entrenamiento
    train_g = dgl.remove_edges(dgl_graph, perm[num_train:])

    return (train_pos_u, train_pos_v), (test_pos_u, test_pos_v), (train_neg_u, train_neg_v), (test_neg_u, test_neg_v), train_g

In [15]:
data_path = 'data/DGL_Graph/DiGraph_AllFeatures/'
force_reload = True
dgl_graph = dgl.data.CSVDataset(data_path, force_reload=False)[0]

(train_pos_u, train_pos_v) , (test_pos_u, test_pos_v), (train_neg_u, train_neg_v), (test_neg_u, test_neg_v), train_g = split_data_link_pred(dgl_graph, test_size=0.1)



Done loading data from cached files.


MemoryError: Unable to allocate 41.0 GiB for an array with shape (74145, 74145) and data type float64

Done saving data into cached files.
Graph(num_nodes=74145, num_edges=461889,
      ndata_schemes={'feat': Scheme(shape=(72,), dtype=torch.float32)}
      edata_schemes={'Relationship': Scheme(shape=(), dtype=torch.int64)})
Training edges: 369389
Validation edges: 46486
Test edges: 46014


In [None]:

# Cargar el dataset Cora
data = CoraGraphDataset()
g = data[0]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
g = g.to(device)

# Agregar atributos de nodos y aristas
g.ndata['feat'] = g.ndata['feat'].to(device)

# Dividir datos en entrenamiento, validación y prueba
def split_edges(g, val_ratio=0.05, test_ratio=0.1):
    u, v = g.edges()
    eids = torch.randperm(g.number_of_edges())
    num_val = int(len(eids) * val_ratio)
    num_test = int(len(eids) * test_ratio)
    num_train = len(eids) - num_val - num_test
    
    train_eids = eids[:num_train]
    val_eids = eids[num_train:num_train+num_val]
    test_eids = eids[num_train+num_val:]
    
    train_g = dgl.remove_edges(g, val_eids.tolist() + test_eids.tolist())
    
    return train_g, (u[val_eids], v[val_eids]), (u[test_eids], v[test_eids])

train_g, val_edges, test_edges = split_edges(g)


model = GCN(g.ndata['feat'].shape[1], 128, 64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

def train():
    model.train()
    optimizer.zero_grad()
    
    z = model.encode(train_g, train_g.ndata['feat'])
    
    neg_u, neg_v = negative_sampler.GlobalUniform()(train_g, len(val_edges[0]))
    
#     edge_index = torch.cat([
#         torch.stack(val_edges, dim=0),
#         torch.stack([neg_u, neg_v], dim=0)
#     ], dim=-1)
    
#     edge_labels = torch.cat([
#         torch.ones(val_edges[0].shape[0], device=device),
#         torch.zeros(neg_u.shape[0], device=device)
#     ])
    
#     out = model.decode(z, edge_index).view(-1)
#     loss = criterion(out, edge_labels)
#     loss.backward()
#     optimizer.step()
#     return loss.item()

# def test(edges):
#     model.eval()
#     z = model.encode(g, g.ndata['feat'])
#     out = model.decode(z, torch.stack(edges, dim=0)).view(-1).sigmoid()
#     return roc_auc_score(torch.ones(len(edges[0]), device=device).cpu().numpy(), out.cpu().numpy())

# best_val_auc, final_test_auc = 0, 0

# for epoch in range(1, 101):
#     loss = train()
#     val_auc = test(val_edges)
#     test_auc = test(test_edges)
#     if val_auc > best_val_auc:
#         best_val_auc = val_auc
#         final_test_auc = test_auc
#     print(f'Epoch {epoch}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, Test AUC: {test_auc:.4f}')

# print(f'Final Test AUC: {final_test_auc:.4f}')


  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
