# Graph Neural Network for clustering

In this notebook, we construct Graph Attention Autoencoder (GATE) architecture from paper *"Graph Attention Auto-Encoders"* to learn the low-dimension embeddings of products from its' time-series features that have been learned by LSTM Autoencoder.
- Graph input: graph built to connect products within subcategories group, we aim to capture the contrastive among those subcategories group.
- Clustering: apply KMeans on low-dimension embeddings of products, learned by GATE.

In [1]:
!pip install torch torch_geometric matplotlib

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m873.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Load embeddings data

In [1]:
import pandas as pd
import torch

embeddings_df = pd.read_csv('data/product_lstm_embeddings.csv')
embeddings_df.set_index('ProductID', inplace=True)

active_products = embeddings_df.index

embeddings_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
707,0.016641,-0.796348,-0.424462,1.146192,-0.341899,-2.166074,0.13477,-0.430464,0.148232,0.014255,...,0.099478,-0.618858,-0.049329,0.050487,0.334427,-0.214072,0.070194,1.170232,0.146769,2.693363
708,0.051593,-0.9022,-0.425973,1.068647,-0.327741,-2.268141,0.422152,-0.364571,0.046346,0.819576,...,0.178698,-0.603789,-0.016058,0.074833,0.315111,-0.254949,0.111341,1.161034,0.196277,2.804454
711,0.011487,-0.725108,-0.424914,1.100861,-0.250419,-2.179343,0.273201,-0.423647,0.102147,0.172975,...,0.096076,-0.5816,-0.055827,0.045036,0.243844,-0.218252,0.064921,1.213822,0.151609,2.70235
712,0.082089,-0.705805,-0.160028,0.621171,0.149819,-2.350944,1.628704,-0.105593,0.171692,1.460141,...,0.279286,-0.620832,0.003077,0.043463,-0.220858,-0.320969,0.189366,1.274447,0.312974,2.582912
713,0.680826,1.924417,-0.55674,0.518928,0.391095,-1.682862,-0.40002,-1.416731,-0.491055,-0.593432,...,-0.037703,0.073459,0.301301,0.201766,1.128518,-2.446335,0.414793,-0.942614,-1.503768,-1.502835


### 1. Construct product-product graph

In [2]:
from torch_geometric.utils import dense_to_sparse

dimProduct_df = pd.read_csv('data/dimProduct.csv')
dimProduct_df = dimProduct_df[dimProduct_df['ProductID'].isin(active_products)]

# Create adjacency matrix for products
# Two products are connected if they belong to the same subcategory

adjacency_matrix = pd.DataFrame(0, index=dimProduct_df['ProductID'], columns=dimProduct_df['ProductID'])

for subcategory, group in dimProduct_df.groupby('ProductSubcategoryID'):
    adjacency_matrix.loc[group['ProductID'], group['ProductID']] = 1

print(f"Adjacency matrix shape: {adjacency_matrix.shape} - Number of connections: {adjacency_matrix.values.sum()}")

# Convert to sparse matrix
adjacency_matrix = torch.tensor(adjacency_matrix.values, dtype=torch.float32)
edge_index, edge_weight = dense_to_sparse(adjacency_matrix)

edge_index

  from .autonotebook import tqdm as notebook_tqdm


Adjacency matrix shape: (166, 166) - Number of connections: 2050


tensor([[  0,   0,   0,  ..., 165, 165, 165],
        [  0,   1,   2,  ..., 163, 164, 165]])

## 2. Build GATE architecture

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from sklearn.cluster import KMeans
import numpy as np

# Set random seed for reproducibility
torch.manual_seed(42)

class GATEncoder(nn.Module):
    """
    Graph Attention Encoder (Simplified 1-Layer Version).
    Directly maps input features to latent space using attention.
    """
    def __init__(self, in_channels: int, hidden_channels: int, out_channels: int, heads=2, dropout=0.2):
        super(GATEncoder, self).__init__()

        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, concat=True, dropout=dropout)
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1, concat=False, dropout=dropout)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, x, edge_index):
        # x: [num_nodes, in_channels]
        x = self.dropout(x)
        x = nn.functional.elu(self.conv1(x, edge_index))
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        
        return x

class FeatureDecoder(nn.Module):
    """
    Reconstructs the original node features (RNN embeddings) from the latent Z.
    """
    def __init__(self, latent_channels: int, hidden_channels: int, out_channels: int):
        super(FeatureDecoder, self).__init__()
        # Simple MLP decoder
        self.lin1 = nn.Linear(latent_channels, hidden_channels)
        self.lin2 = nn.Linear(hidden_channels, out_channels)

    def forward(self, z):
        x = F.relu(self.lin1(z))
        x = self.lin2(x)
        return x

class GATEModel(nn.Module):
    """
    End-to-End Graph Attention Autoencoder.
    """
    def __init__(self, in_channels: int, hidden_channels: int, latent_channels: int):
        super(GATEModel, self).__init__()
        # Removed hidden_channels argument as we only have 1 layer
        self.encoder = GATEncoder(in_channels, hidden_channels, latent_channels)
        self.feature_decoder = FeatureDecoder(latent_channels, hidden_channels, in_channels)

    def forward(self, x: torch.tensor, edge_index: torch.tensor):
        z = self.encoder(x, edge_index)
        return z

    def recon_loss(self, z: torch.tensor, x: torch.tensor, edge_index: torch.tensor, lambda_feat=1, lambda_struct=0.1):
        """
        Dual Reconstruction Loss.
        """
        # 1. Feature Reconstruction Loss - Sum Squared Error
        x_hat = self.feature_decoder(z)
        loss_feat = F.mse_loss(x_hat, x)

        # 2. Structure Reconstruction Loss - Only backprop through connected edges, not unconnected ones
        adj_hat = torch.sigmoid(torch.matmul(z, z.t()))
        loss_struct = adj_hat[edge_index[0], edge_index[1]].mean()

        return (lambda_feat * loss_feat) + (lambda_struct * loss_struct), loss_feat.item(), loss_struct.item()

## 3. Training

In [44]:
from torch.optim import Adam, AdamW
from sklearn.preprocessing import StandardScaler

torch.manual_seed(42)

# Normalize embeddings
embeddings = embeddings_df.to_numpy()
embeddings = StandardScaler().fit_transform(embeddings)
embeddings = torch.tensor(embeddings, dtype=torch.float32)
print(f"Embeddings shape: {embeddings.shape}")


model = GATEModel(in_channels=embeddings.shape[1], hidden_channels=32, latent_channels=16)               # Latent channgels for embeddings, so set to 8
optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
edge_index = torch.tensor(edge_index, dtype=torch.int64)

print(f"Model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} learnable parameters.")

Embeddings shape: torch.Size([166, 48])
Model has 6464 learnable parameters.


  edge_index = torch.tensor(edge_index, dtype=torch.int64)


In [45]:
# Training loop
NUM_EPOCHS = 3000
NOTIFY_EVERY = 200

model.train()
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad()
    z = model(embeddings, edge_index)
    loss, loss_feat, loss_struct = model.recon_loss(z, embeddings, edge_index)
    loss.backward()
    optimizer.step()
    if epoch % NOTIFY_EVERY == 0:
        print(f"Epoch {epoch}: Loss = {loss.item()}, Feature Loss = {loss_feat}, Structure Loss = {loss_struct}")

Epoch 0: Loss = 1.1422063112258911, Feature Loss = 1.0466008186340332, Structure Loss = 0.9560550451278687
Epoch 200: Loss = 0.6432552933692932, Feature Loss = 0.5476865768432617, Structure Loss = 0.9556871652603149
Epoch 400: Loss = 0.633344292640686, Feature Loss = 0.5392248630523682, Structure Loss = 0.9411945343017578
Epoch 600: Loss = 0.6157191395759583, Feature Loss = 0.521350622177124, Structure Loss = 0.9436852931976318
Epoch 800: Loss = 0.6156054139137268, Feature Loss = 0.5243307948112488, Structure Loss = 0.9127463698387146
Epoch 1000: Loss = 0.6031948328018188, Feature Loss = 0.5116922855377197, Structure Loss = 0.915025532245636
Epoch 1200: Loss = 0.6150552034378052, Feature Loss = 0.5226377248764038, Structure Loss = 0.9241746664047241
Epoch 1400: Loss = 0.6171069145202637, Feature Loss = 0.5260453820228577, Structure Loss = 0.9106153845787048
Epoch 1600: Loss = 0.6169811487197876, Feature Loss = 0.5235044360160828, Structure Loss = 0.9347671866416931
Epoch 1800: Loss = 0

### Get the low-dimension representation and perform embeddings

In [46]:
model.eval()
low_embeddings = model(embeddings, edge_index).detach().numpy()
low_embeddings.shape

(166, 16)

## C. Perform clustering

In [56]:
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import PCA

NUM_CLUSTERS = 4

x_compressed = PCA(n_components=15).fit_transform(embeddings_df.to_numpy())
predictions = KMeans(n_clusters=NUM_CLUSTERS, random_state=42).fit_predict(x_compressed)

for i in range(NUM_CLUSTERS):
    cluster_size = np.sum(predictions == i)
    print(f"Cluster {i}: {cluster_size} products")

# Per subcategory distribution
dimProduct_df['Cluster'] = predictions
dimProduct_df.groupby(['ProductSubcategoryID', 'Cluster']).size().unstack(fill_value=0)

Cluster 0: 37 products
Cluster 1: 79 products
Cluster 2: 29 products
Cluster 3: 21 products


  ret = a @ b
  ret = a @ b
  ret = a @ b


Cluster,0,1,2,3
ProductSubcategoryID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,3,11,6,0
2.0,3,0,14,1
3.0,16,2,4,0
4.0,0,5,0,0
5.0,0,2,0,0
6.0,0,2,0,0
7.0,0,1,0,0
8.0,0,2,0,0
9.0,0,2,0,0
12.0,0,13,2,0
