# Unsupervised training
Aca crearemos embeddings paa un grafo de Internet, es decir represenaciones de los SA a partir de la topologia y atributos de los SA.

Para esto tomamos 3 enfoques:


*   Caso 1: Reconstruction Approach autoencoder
*   Caso 2: Reconstruction Approach attribute Masking
*   Caso 3: Task Generation Pre-calculated descriptor

Para cada uno crearemos un ejemplo.


In [1]:
# Importar librerias y RIBs

%load_ext autoreload
%autoreload 2
from modules.gnn import GNN
import scipy.sparse as sp
import numpy as np


rib_path = 'data/sanitized_rib.txt'


  from .autonotebook import tqdm as notebook_tqdm


## Grafo NetworkX

In [22]:
import networkx as nx

# Crear un grafo dirigido (BGP usa rutas direccionales)
nx_graph = nx.DiGraph()

# Leer el archivo y agregar aristas
with open(rib_path, "r") as f:
    for line in f:
        nodos = list(map(int, line.strip().split("|")))  # Convertir a enteros
        edges = zip(nodos, nodos[1:])  # Crear pares consecutivos
        nx_graph.add_edges_from(edges)

# Imprimir información del grafo
print(f"Número de nodos: {nx_graph.number_of_nodes()}")
print(f"Número de aristas: {nx_graph.number_of_edges()}")

ValueError: invalid literal for int() with base 10: ''

## Grafo DGL

In [None]:
import dgl
import torch

src, dst = [], []

# Leer el archivo y extraer aristas
with open(rib_path, "r") as f:
    count_path = 0
    for line in f:
        count_path += 1
        line = line.strip()
        if not line:  # Ignorar líneas vacías
            continue
        try:
            nodos = list(map(int, line.split("|")))
            src.extend(nodos[:-1])  # Nodo de origen
            dst.extend(nodos[1:])   # Nodo de destino
        except ValueError as e:
            print(f"Error al procesar la línea: '{line}'. Error: {e}")
            continue
        if count_path == 500000: #FIXME: cambiar pormientras para debug
            break

# Crear el grafo dirigido en DGL
dgl_graph = dgl.graph((torch.tensor(src), torch.tensor(dst)))

# Imprimir información del grafo
print(dgl_graph)


Graph(num_nodes=394239, num_edges=1676722,
      ndata_schemes={}
      edata_schemes={})


## Encode-Decoder 

### Link Prediction

In [3]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.data import CoraGraphDataset
from dgl.dataloading import negative_sampler
import dgl.function as fn
from sklearn.metrics import roc_auc_score
from dgl.nn import SAGEConv, GraphConv, GATConv, GINConv, GatedGCNConv, GatedGraphConv

import numpy as np
import scipy.sparse as sp


In [4]:
from sklearn.metrics import roc_auc_score

def get_negative_edges(dgl_graph, num_neg_samples): #FIXME: optimizar
    """
    Genera aristas negativas para el grafo dado.
    """
    print(f"Generando {num_neg_samples} aristas negativas...")
    neg_src_u = []
    neg_dst_v = []
    num_nodes = dgl_graph.num_nodes()

    for i in range(num_neg_samples):
        src = np.random.randint(0, num_nodes)
        dst = np.random.randint(0, num_nodes)
        while dgl_graph.has_edges_between(src, dst):
            src = np.random.randint(0, num_nodes)
            dst = np.random.randint(0, num_nodes)
        neg_src_u.append(src)
        neg_dst_v.append(dst)

        if i % 500000 == 0:
            print(f"Aristas negativas generadas: {i}")

    return torch.tensor(neg_src_u), torch.tensor(neg_dst_v)

def compute_auc(pos_score,neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [5]:
# Modelo GCN

class GCN(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats):
        super().__init__()
        self.conv1 = GraphConv(in_feats, hidden_feats)
        self.conv2 = GraphConv(hidden_feats, out_feats)
    
    def encode(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
    
    def decode(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            scores = g.edata['score']
            return scores

    def decode_all(self, z):
        return (z @ z.T) > 0
    
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats):
        super(GraphSAGE).__init__()
        self.conv1 = SAGEConv(in_feats, hidden_feats)
        self.conv2 = SAGEConv(hidden_feats, out_feats)
    
    def encode(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
    
    def decode(self, in_feat, edge_index):
        return (in_feat[edge_index[0]] * in_feat[edge_index[1]]).sum(dim=-1)
    
    def decode_all(self, z):
        return (z @ z.T) > 0

In [6]:

u,v = dgl_graph.edges()

# IDs de lo edges
eids = np.arange(dgl_graph.num_edges()) 
# Shuffle the edges
eids = np.random.permutation(eids)

# Tamaño de train y test
test_size = int(len(eids) * 0.1) 
train_size = dgl_graph.num_edges() - test_size 

# Selecciona los edges de test y train
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Matriz de adyacencia
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))

# Generar aristas negativas 
# neg_u, neg_v = get_negative_edges(dgl_graph, dgl_graph.num_edges())


# # Matriz de adyacencia negativa
# adj_neg = 1 - adj.todense() - np.eye(dgl_graph.num_nodes())

# # IDs de lso edges negativos
# neg_u, neg_v = np.where(adj_neg != 0) 

# # Selecciono nodos aleatorios (misma cantidad que el numero de edges positivos)
# neg_eids = np.random.choice(len(neg_u), dgl_graph.num_edges())

# test_neg_u, test_neg_v = (
#     neg_u[neg_eids[:test_size]],
#     neg_v[neg_eids[:test_size]],
# )
# train_neg_u, train_neg_v = (
#     neg_u[neg_eids[test_size:]],
#     neg_v[neg_eids[test_size:]],
# )

# train_g = dgl.remove_edges(dgl_graph, eids[:test_size])



In [None]:
neg_u, neg_v = get_negative_edges(dgl_graph, dgl_graph.num_edges())
#  167.564 -> 30 seg  -> 
#  334.996 -> 50 seg -> 100.000 paths
#1.676.722 -> 4 min 17 seg -> 500.000 paths


Generando 1676722 aristas negativas...
Aristas negativas generadas: 0
Aristas negativas generadas: 500000
Aristas negativas generadas: 1000000
Aristas negativas generadas: 1500000


In [8]:
test_neg_u, test_neg_v = (
    neg_u[:test_size],
    neg_v[:test_size],
)

train_neg_u, train_neg_v = (
    neg_u[test_size:],
    neg_v[test_size:],
)


In [9]:
# Agregar features aleatorias a los nodos FIXME: cambiar por features reales
dgl_graph.ndata['feat'] = torch.ones(dgl_graph.num_nodes(), 128)
print(dgl_graph.ndata['feat'].shape)

torch.Size([394239, 128])


In [10]:
# Eliminar edges de test
train_g = dgl.remove_edges(dgl_graph, eids[:test_size])
train_g

Graph(num_nodes=394239, num_edges=1509050,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32)}
      edata_schemes={})

In [11]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=dgl_graph.num_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=dgl_graph.num_nodes())    

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=dgl_graph.num_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=dgl_graph.num_nodes())

In [12]:

in_feats = dgl_graph.ndata["feat"].shape[1]
print(f"Input features: {in_feats}")
hidden_feats = 16
out_feats = 16
model = GCN( in_feats,hidden_feats, out_feats)

# Agregar self-loops al grafo de entrenamiento
train_g = dgl.add_self_loop(train_g) #TODO: Investigar sobre esta practica

# ------ Set up loss y optimizer ------

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


# ------ Set up loss y optimizer ------
epochs = 100
for epoch in range(epochs):
    # Forward pass
    model.train()

    # Calcular embeddings
    h = model.encode(train_g, train_g.ndata["feat"])
    
    # Valor a aristas
    pos_score = model.decode(train_pos_g, h)
    neg_score = model.decode(train_neg_g, h)

    

    # loss
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    # Asegurar que labels tenga la misma forma que scores
    labels = labels.view(-1, 1)  # Expandir dimensiones FIXME:

    loss = F.binary_cross_entropy_with_logits(scores, labels)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")


with torch.no_grad():

    pos_score = model.decode(test_pos_g, h)
    neg_score = model.decode(test_neg_g, h)
    
    print(f"AUC",compute_auc(pos_score, neg_score))

Input features: 128
Epoch 1/100, Loss: 8.374835968017578
Epoch 2/100, Loss: 3.7176010608673096
Epoch 3/100, Loss: 1.881535291671753
Epoch 4/100, Loss: 0.8457967638969421
Epoch 5/100, Loss: 0.6943091154098511
Epoch 6/100, Loss: 0.6931784749031067
Epoch 7/100, Loss: 0.6931965351104736
Epoch 8/100, Loss: 0.6932178139686584
Epoch 9/100, Loss: 0.6932422518730164
Epoch 10/100, Loss: 0.6932687163352966
Epoch 11/100, Loss: 0.6932965517044067
Epoch 12/100, Loss: 0.6933256983757019
Epoch 13/100, Loss: 0.6933552026748657
Epoch 14/100, Loss: 0.6933846473693848
Epoch 15/100, Loss: 0.6934139728546143
Epoch 16/100, Loss: 0.6934426426887512
Epoch 17/100, Loss: 0.6934702396392822
Epoch 18/100, Loss: 0.6934968829154968
Epoch 19/100, Loss: 0.6935219764709473
Epoch 20/100, Loss: 0.6935461759567261
Epoch 21/100, Loss: 0.6935684680938721
Epoch 22/100, Loss: 0.6935895681381226
Epoch 23/100, Loss: 0.6936091184616089
Epoch 24/100, Loss: 0.6936269402503967
Epoch 25/100, Loss: 0.6936432123184204
Epoch 26/100, Lo

In [13]:
import pandas as pd

# Guaradar el modelo
torch.save(model.state_dict(), 'model.pth')

# Guardar los embeddings de los nodos después del entrenamiento
with torch.no_grad():
    # Calcular los embeddings finales
    final_embeddings = model.encode(train_g, train_g.ndata["feat"]).detach().cpu().numpy()

    # Crear un DataFrame para guardar los embeddings
    # Obtener ids de los nodos (ASN)
    node_ids = train_g.nodes().numpy()

    emb_df = pd.DataFrame(final_embeddings, index=node_ids)
    emb_df.index.name = "node_id"

    # Guardar en un archivo CSV
    emb_df.to_csv("node_embeddings.csv")
    np.save("node_embeddings.npy", final_embeddings)
    print("Embeddings guardados en 'node_embeddings.csv'")

emb_df

Embeddings guardados en 'node_embeddings.csv'


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0.090226,0.05271,0.031951,-0.037414,0.081488,-0.04819,-0.035787,0.00712,-0.076475,0.031041,-0.036488,-0.063112,-0.028331,0.080137,-0.068534,-0.053136
1,0.090226,0.05271,0.031951,-0.037414,0.081488,-0.04819,-0.035787,0.00712,-0.076475,0.031041,-0.036488,-0.063112,-0.028331,0.080137,-0.068534,-0.053136
2,0.090226,0.05271,0.031951,-0.037414,0.081488,-0.04819,-0.035787,0.00712,-0.076475,0.031041,-0.036488,-0.063112,-0.028331,0.080137,-0.068534,-0.053136
3,0.090226,0.05271,0.031951,-0.037414,0.081488,-0.04819,-0.035787,0.00712,-0.076475,0.031041,-0.036488,-0.063112,-0.028331,0.080137,-0.068534,-0.053136
4,0.090226,0.05271,0.031951,-0.037414,0.081488,-0.04819,-0.035787,0.00712,-0.076475,0.031041,-0.036488,-0.063112,-0.028331,0.080137,-0.068534,-0.053136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394234,0.090226,0.05271,0.031951,-0.037414,0.081488,-0.04819,-0.035787,0.00712,-0.076475,0.031041,-0.036488,-0.063112,-0.028331,0.080137,-0.068534,-0.053136
394235,0.090226,0.05271,0.031951,-0.037414,0.081488,-0.04819,-0.035787,0.00712,-0.076475,0.031041,-0.036488,-0.063112,-0.028331,0.080137,-0.068534,-0.053136
394236,0.090226,0.05271,0.031951,-0.037414,0.081488,-0.04819,-0.035787,0.00712,-0.076475,0.031041,-0.036488,-0.063112,-0.028331,0.080137,-0.068534,-0.053136
394237,0.090226,0.05271,0.031951,-0.037414,0.081488,-0.04819,-0.035787,0.00712,-0.076475,0.031041,-0.036488,-0.063112,-0.028331,0.080137,-0.068534,-0.053136


In [14]:
# Cargar embeddings
embeddings = np.load("node_embeddings.npy")
print("[SHAPE]",embeddings.shape)
print("[EMBEDDING]",embeddings[0])

[SHAPE] (394239, 16)
[EMBEDDING] [ 0.09022641  0.0527095   0.03195057 -0.03741408  0.08148827 -0.04818971
 -0.03578674  0.00711954 -0.07647453  0.03104107 -0.0364881  -0.06311219
 -0.02833056  0.0801375  -0.06853358 -0.05313587]


Cosas a tener consideraciom:
- GNN al crear un grafo, crea tdoos los nodos que se encuentran en el rango entregado. De esta forma pueden quedar nodos aislados, sin embargo ento causa que no se puedan ir 'actualizando' pero al agregar self_loop se actualizan con si mismos y supondremos que con ello algun patron para este tipo de nodos.
- La entrega de embeddings finales esta en orden el ASN de valor inferios hasta el ASN de valor maximo.
- Lo guardamso en un .csv dondde se asocia 'node_id' con su embeddings y donde node_id es el ASN del Sistema Autonomo. 

In [15]:
train_g.ndata['feat'] 

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

# TODO:
Primero:
* Crear grafo GNN a partir de AS rank y los atributos de ese otro dataset de antes.
* Con esa informacion entrenar los embedings y luego pasarle esos embedigs a a la clasificacion

Segundo:
* Probra crear grafo con archivos oix 
* Luego entrenar para clasificacion

Tercero:
* Copiar la Red Neuronal que ocupa BGP2Vec
* Probar con los embeddings anteriores probar la clasificacion. 
Si hay buenos resultados significa que nuestra MLP es muy basica/simple/poco compleja para atrapar las relaciones, identificar patrones. 
