In [1]:
# Install required packages.
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
!pip install -q git+https://github.com/rusty1s/pytorch_geometric.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone


In [2]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import networkx as nx

import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, SAGEConv
from torch_geometric.transforms import RandomLinkSplit
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric import seed_everything
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder


In [3]:
#ESTO ES PARA EJECUTAR EN COLAB
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Numero de features
features=list(G7.nodes(data=True))[0][1]
print(features)
num_features = len(features['feature'])
print('Numero de atributos:',num_features)

In [5]:
# Ruta donde guardaste el archivo gpickle
ruta_guardado = '/content/drive/My Drive/Colab Notebooks/GRAFO RETO/grafo.pkl'

# Carga el grafo desde el archivo gpickle
with open(ruta_guardado, 'rb') as file:
    G7 = pickle.load(file)

In [6]:
# Obtener nodos y características de nodos del grafo
nodos = list(G7.nodes)
conexion = list(G7.edges)
caracteristicas_nodos = [G7.nodes[nodo] for nodo in nodos]

In [7]:
df_caracteristicas_nodos = pd.DataFrame(caracteristicas_nodos, index=nodos)
df_caracteristicas_nodos.head()

Unnamed: 0,feature,label
0,"[7, B11730, warm, 1.1, Formal, 1, smooth, stre...",0
1,"[7, ECE9D6, warm, 3.0, Informal, 2, horizontal...",1
2,"[7, 5F5E5E, warm, 2.0, Informal, 1, smooth, cl...",2
3,"[7, 00008b, warm, 2.0, Informal, 1, horizontal...",3
4,"[7, 000000, cold, 3.1, Informal, 3, sheets, ca...",4


In [8]:
caracteristicas_df = df_caracteristicas_nodos['feature'].apply(pd.Series)
caracteristicas_df.columns = ['season','color','weather','subnivel','formalidad','adventurous','estampado','estilo','fit','application']
caracteristicas_df.head()

Unnamed: 0,season,color,weather,subnivel,formalidad,adventurous,estampado,estilo,fit,application
0,7,B11730,warm,1.1,Formal,1,smooth,street,tight,freetime
1,7,ECE9D6,warm,3.0,Informal,2,horizontal_stripes,classic,straight,work
2,7,5F5E5E,warm,2.0,Informal,1,smooth,classic,straight,freetime
3,7,00008b,warm,2.0,Informal,1,horizontal_stripes,casual,straight,freetime
4,7,000000,cold,3.1,Informal,3,sheets,casual,loose,freetime


In [9]:
df_caracteristicas_nodos = pd.concat([df_caracteristicas_nodos.drop(columns=['feature']), caracteristicas_df], axis=1)
df_caracteristicas_nodos = df_caracteristicas_nodos.drop(columns=['season'])
df_caracteristicas_nodos.head()

Unnamed: 0,label,color,weather,subnivel,formalidad,adventurous,estampado,estilo,fit,application
0,0,B11730,warm,1.1,Formal,1,smooth,street,tight,freetime
1,1,ECE9D6,warm,3.0,Informal,2,horizontal_stripes,classic,straight,work
2,2,5F5E5E,warm,2.0,Informal,1,smooth,classic,straight,freetime
3,3,00008b,warm,2.0,Informal,1,horizontal_stripes,casual,straight,freetime
4,4,000000,cold,3.1,Informal,3,sheets,casual,loose,freetime


In [10]:
df_caracteristicas_nodos.dtypes

Unnamed: 0,0
label,int64
color,object
weather,object
subnivel,float64
formalidad,object
adventurous,int64
estampado,object
estilo,object
fit,object
application,object


In [11]:
for col in df_caracteristicas_nodos.columns:
    print(f'{col}:{len(df_caracteristicas_nodos[col].unique())}')

label:1923
color:38
weather:2
subnivel:6
formalidad:2
adventurous:5
estampado:25
estilo:6
fit:5
application:5


In [13]:
categoricas = df_caracteristicas_nodos.select_dtypes(exclude=['int']).columns
categoricas

Index(['color', 'weather', 'subnivel', 'formalidad', 'estampado', 'estilo',
       'fit', 'application'],
      dtype='object')

In [14]:
# Crear un LabelEncoder para cada columna categórica
df_codificado = pd.get_dummies(df_caracteristicas_nodos, columns=categoricas,drop_first=True)
df_codificado.head()
columnas = df_codificado.columns
df_codificado[columnas] = df_codificado[columnas].astype(int)
df_codificado.head()

Unnamed: 0,label,adventurous,color_000081,color_00008b,color_0000FF,color_008000,color_153668,color_164A0A,color_40E0D0,color_462C0E,...,estilo_minimal,estilo_night,estilo_street,fit_oversize,fit_straight,fit_tight,application_night,application_special_occasion,application_work,application_working_girl
0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,1,2,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,3,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
df_codificado.values

array([[   0,    1,    0, ...,    0,    0,    0],
       [   1,    2,    0, ...,    0,    1,    0],
       [   2,    1,    0, ...,    0,    0,    0],
       ...,
       [1920,    3,    0, ...,    0,    0,    0],
       [1921,    1,    0, ...,    0,    0,    0],
       [1922,    1,    0, ...,    0,    0,    0]])

In [16]:
x_tensor = torch.tensor(df_codificado.drop(columns=['label']).values, dtype=torch.float32)
x_tensor.shape

torch.Size([1923, 80])

In [17]:
x_tensor

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [2., 0., 0.,  ..., 0., 1., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [3., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.]])

In [18]:
edges=list(G7.edges)
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
print(edge_index,edge_index.shape)

tensor([[   0,    0,    0,  ..., 1908, 1908, 1914],
        [  53,   68,   91,  ..., 1910, 1912, 1917]]) torch.Size([2, 150028])


In [19]:
G7_data=Data(x=x_tensor, edge_index=edge_index)
G7_data

Data(x=[1923, 80], edge_index=[2, 150028])

In [20]:
class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):  # Forward pass para obtener los embeddings
        x = self.conv1(x, edge_index).relu()  # Primera capa GCN
        x = self.conv2(x, edge_index)         # Segunda capa GCN
        return x

    def decode(self, z, edge_label_index):
        # Producto escalar entre los embeddings de los nodos de cada arista
        edge_embeddings = (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)
        return edge_embeddings

    def decode_all(self, z):
        # Decodificar todos los posibles enlaces (producto escalar de todos los nodos)
        adjacency_scores = z @ z.T
        return adjacency_scores



In [22]:
from sklearn.metrics import roc_auc_score
from torch_geometric.utils import negative_sampling
def train_link_predictor(
    model, train_data, optimizer, criterion, n_epochs=100
):
    for epoch in range(1, n_epochs + 1):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data.x, train_data.edge_index)

        # Muestreo de negativos por cada época de entrenamiento
        neg_edge_index = negative_sampling(
            edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
            num_neg_samples=train_data.edge_label_index.size(1), method='sparse'
        )

        edge_label_index = torch.cat(
            [train_data.edge_label_index, neg_edge_index],
            dim=-1,
        )
        edge_label = torch.cat([
            train_data.edge_label,
            train_data.edge_label.new_zeros(neg_edge_index.size(1))
        ], dim=0)

        out = model.decode(z, edge_label_index).view(-1)
        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()

        val_auc = eval_link_predictor(model, train_data)

        if epoch % 10 == 0:
            print(f"Epoch: {epoch:03d}, Train Loss: {loss:.3f}, Val AUC: {val_auc:.3f}")

    return model


@torch.no_grad()
def eval_link_predictor(model, data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    out = model.decode(z, data.edge_label_index).view(-1).sigmoid()

    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())


In [27]:
from torch_geometric.transforms import RandomLinkSplit

# Realizar el split en entrenamiento, validación y prueba
split = RandomLinkSplit(
    num_val=0.05,               # 5% de los enlaces se usarán para validación
    num_test=0.1,               # 10% de los enlaces se usarán para prueba
    is_undirected=True,         # Indicar si el grafo es no dirigido
    add_negative_train_samples=False,  # No añadir ejemplos negativos automáticamente
    neg_sampling_ratio=1.0      # Proporción de muestras negativas para el entrenamiento
)

train_data, val_data, test_data = split(G7_data)

In [35]:
# Parámetros del modelo
in_channels = 80  # Número de características por nodo
hidden_channels = 64  # Número de neuronas en la capa oculta
out_channels = 32  # Número de dimensiones para los embeddings

# Crear el modelo
model = Net(in_channels, hidden_channels, out_channels)

# Optimización
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

# Entrenar el modelo
train_link_predictor(model, train_data, optimizer, criterion, n_epochs=100)




Epoch: 010, Train Loss: 0.671, Val AUC: nan




Epoch: 020, Train Loss: 0.655, Val AUC: nan




Epoch: 030, Train Loss: 0.642, Val AUC: nan




Epoch: 040, Train Loss: 0.632, Val AUC: nan




Epoch: 050, Train Loss: 0.626, Val AUC: nan




Epoch: 060, Train Loss: 0.624, Val AUC: nan




Epoch: 070, Train Loss: 0.620, Val AUC: nan




Epoch: 080, Train Loss: 0.620, Val AUC: nan




Epoch: 090, Train Loss: 0.618, Val AUC: nan




Epoch: 100, Train Loss: 0.618, Val AUC: nan




Net(
  (conv1): GCNConv(80, 30)
  (conv2): GCNConv(30, 30)
)

In [36]:
val_auc = eval_link_predictor(model, val_data)
print(f"Validation AUC: {val_auc:.3f}")


Validation AUC: 0.753


In [37]:
test_auc = eval_link_predictor(model, test_data)
print(f"Test AUC: {test_auc:.3f}")


Test AUC: 0.747


PRUEBA DIFERENTES HIPERPARAMETROS

In [38]:
import torch
from torch.optim import Adam
from sklearn.metrics import roc_auc_score
from torch_geometric.utils import negative_sampling
from torch_geometric.data import Data

# Definir una función para probar diferentes combinaciones de hiperparámetros
def grid_search(model_class, graph, learning_rates, hidden_dimensions, embedding_dimensions, n_epochs=100):
    best_model = None
    best_auc = 0
    best_params = {}

    for lr in learning_rates:
        for hidden_dim in hidden_dimensions:
            for embedding_dim in embedding_dimensions:

                # Crear modelo
                model = model_class(in_channels=graph.num_features, hidden_channels=hidden_dim, out_channels=embedding_dim)

                # Definir el optimizador y la función de pérdida
                optimizer = Adam(model.parameters(), lr=lr)
                criterion = torch.nn.BCEWithLogitsLoss()

                # Realizar split de datos
                split = RandomLinkSplit(
                    num_val=0.05,
                    num_test=0.1,
                    is_undirected=True,
                    add_negative_train_samples=False,
                    neg_sampling_ratio=1.0,
                )
                train_data, val_data, test_data = split(graph)

                # Entrenar el modelo
                model = train_link_predictor(model, train_data, optimizer, criterion, n_epochs)

                # Evaluar el modelo
                val_auc = eval_link_predictor(model, val_data)

                # Si el AUC de validación es mejor, actualizar el mejor modelo y parámetros
                if val_auc > best_auc:
                    best_auc = val_auc
                    best_model = model
                    best_params = {'lr': lr, 'hidden_dim': hidden_dim, 'embedding_dim': embedding_dim}

                # Mostrar el resultado para la combinación actual de parámetros
                print(f"LR: {lr}, Hidden Dim: {hidden_dim}, Embedding Dim: {embedding_dim}, Val AUC: {val_auc:.3f}")

      # Ordenar los resultados por AUC de mejor a peor
    results = sorted(results, key=lambda x: x['val_auc'], reverse=True)

    # Mostrar los resultados ordenados
    print("\nResultados ordenados de mejor a peor:")
    for idx, result in enumerate(results):
        print(f"Rank {idx + 1}: LR={result['lr']}, Hidden Dim={result['hidden_dim']}, Embedding Dim={result['embedding_dim']}, Val AUC={result['val_auc']:.3f}")

    return results

In [39]:
# Define los rangos de los hiperparámetros
learning_rates = [0.1, 0.01, 0.001]
hidden_dimensions = [32, 64, 128]
embedding_dimensions = [32, 64, 128]

# Llamar al grid search
best_model, best_params = grid_search(Net, G7_data, learning_rates, hidden_dimensions, embedding_dimensions)



Epoch: 010, Train Loss: 0.836, Val AUC: nan




Epoch: 020, Train Loss: 0.706, Val AUC: nan




Epoch: 030, Train Loss: 0.695, Val AUC: nan




Epoch: 040, Train Loss: 0.695, Val AUC: nan




Epoch: 050, Train Loss: 0.694, Val AUC: nan




Epoch: 060, Train Loss: 0.693, Val AUC: nan




Epoch: 070, Train Loss: 0.693, Val AUC: nan




Epoch: 080, Train Loss: 0.693, Val AUC: nan




Epoch: 090, Train Loss: 0.693, Val AUC: nan




Epoch: 100, Train Loss: 0.693, Val AUC: nan
LR: 0.1, Hidden Dim: 32, Embedding Dim: 32, Val AUC: 0.500




Epoch: 010, Train Loss: 0.917, Val AUC: nan




Epoch: 020, Train Loss: 0.701, Val AUC: nan




Epoch: 030, Train Loss: 0.701, Val AUC: nan




Epoch: 040, Train Loss: 0.695, Val AUC: nan




Epoch: 050, Train Loss: 0.694, Val AUC: nan




Epoch: 060, Train Loss: 0.693, Val AUC: nan




Epoch: 070, Train Loss: 0.693, Val AUC: nan




Epoch: 080, Train Loss: 0.693, Val AUC: nan




Epoch: 090, Train Loss: 0.693, Val AUC: nan




Epoch: 100, Train Loss: 0.693, Val AUC: nan
LR: 0.1, Hidden Dim: 32, Embedding Dim: 64, Val AUC: 0.500




Epoch: 010, Train Loss: 1.329, Val AUC: nan




Epoch: 020, Train Loss: 0.729, Val AUC: nan




Epoch: 030, Train Loss: 0.711, Val AUC: nan




Epoch: 040, Train Loss: 0.698, Val AUC: nan




Epoch: 050, Train Loss: 0.695, Val AUC: nan




Epoch: 060, Train Loss: 0.694, Val AUC: nan




Epoch: 070, Train Loss: 0.693, Val AUC: nan




Epoch: 080, Train Loss: 0.693, Val AUC: nan




Epoch: 090, Train Loss: 0.693, Val AUC: nan




Epoch: 100, Train Loss: 0.693, Val AUC: nan
LR: 0.1, Hidden Dim: 32, Embedding Dim: 128, Val AUC: 0.500




KeyboardInterrupt: 