In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

from dataset import create_QM9_nx_datasets
from graph2vec import Graph2Vec


In [2]:
# Build the train/validation/test splits. The helper hands back six
# collections: the three splits as NetworkX graphs first, then the same three
# splits as PyG data objects (the recorded output reports matching sizes for
# each NX/PyG pair).
(
    train_graphs_subset,
    val_graphs_subset,
    test_graphs_subset,
    train_pyg_subset,
    val_pyg_subset,
    test_pyg_subset,
) = create_QM9_nx_datasets(
    subset_size=None,  # no sub-sampling requested
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
    random_seed=42,
)

Original dataset size: 130831
Train NX graphs: 104664, Train PyG graphs: 104664
Validation NX graphs: 13083, Validation PyG graphs: 13083
Test NX graphs: 13084, Test PyG graphs: 13084


In [None]:
# Prefer the GPU when PyTorch can see one; fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Graph2Vec configuration used for this run.
g2v_model = Graph2Vec(
    wl_iterations=3,
    # 'x' selects the QM9 atom features stored on the nodes; None would fall
    # back to node degrees (per the original note on this argument).
    use_node_attribute='x',
    dimensions=128,
    workers=4,
    down_sampling=0.01,
    epochs=10,
    learning_rate=0.1,
    min_count=5,
    seed=42,
    erase_base_features=False,
)

# The embedding model is fitted on the training graphs only.
g2v_model.fit(train_graphs_subset)

# Training embeddings are read back from the fitted model; validation and
# test embeddings are inferred for graphs that were not part of the fit.
train_embeddings = g2v_model.get_embedding()
val_embeddings = g2v_model.infer(val_graphs_subset)
test_embeddings = g2v_model.infer(test_graphs_subset)

# X data: float32 tensors on the selected device, one per split.
x_train, x_val, x_test = (
    torch.tensor(embeddings, dtype=torch.float32).to(device)
    for embeddings in (train_embeddings, val_embeddings, test_embeddings)
)

# Report the shape of every split, one line each.
print('\n'.join(
    f'{label} shape: {features.shape}'
    for label, features in (
        ('x_train', x_train),
        ('x_val', x_val),
        ('x_test', x_test),
    )
))

x_train shape: torch.Size([104664, 128])
x_val shape: torch.Size([13083, 128])
x_test shape: torch.Size([13084, 128])


In [47]:
# Columns of `data.y` used as regression targets.
# Per the original note, HOMO sits at index 2 and LUMO at index 3 (0-indexed).
# The active selection below is wider than that: it takes the first five
# target columns. For the HOMO/LUMO-only experiment use instead:
# target_indices = [2, 3]
target_indices = [0, 1, 2, 3, 4]

# Y data: for every PyG sample take row 0 of `data.y`, keep the selected
# columns, stack the per-sample vectors and move the result to `device`.
y_train, y_val, y_test = (
    torch.stack([sample.y[0, target_indices] for sample in split]).to(device)
    for split in (train_pyg_subset, val_pyg_subset, test_pyg_subset)
)

# Sanity check: the X and Y shapes of each split should agree on the first
# dimension.
print('\n'.join(
    f'x_{split_name} shape: {features.shape}, y_{split_name} shape: {targets.shape}'
    for split_name, features, targets in (
        ('train', x_train, y_train),
        ('val', x_val, y_val),
        ('test', x_test, y_test),
    )
))
print(f'x_train device: {x_train.device}, y_train device: {y_train.device}')

x_train shape: torch.Size([104664, 128]), y_train shape: torch.Size([104664, 5])
x_val shape: torch.Size([13083, 128]), y_val shape: torch.Size([13083, 5])
x_test shape: torch.Size([13084, 128]), y_test shape: torch.Size([13084, 5])
x_train device: cuda:0, y_train device: cuda:0


In [42]:
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import mean_squared_error

class MultiTargetMLP(nn.Module):
    """Fully connected network that regresses several targets at once.

    Every hidden stage is ``Linear -> ReLU -> BatchNorm1d -> Dropout``; the
    stages are followed by one final ``Linear`` layer with no activation, so
    the outputs are unbounded real values.

    Args:
        input_dim: Length of each input feature vector.
        hidden_dims: Widths of the hidden stages, in order. Any iterable of
            ints works; an empty one yields a single ``Linear`` layer.
        output_dim: Number of values predicted per sample.
        dropout: Dropout probability applied at the end of every hidden
            stage.
    """

    def __init__(self, input_dim=15, hidden_dims=(128, 256, 64), output_dim=14, dropout=0.2):
        # The default for hidden_dims is a tuple so that no mutable object is
        # shared between calls; lists passed by callers are still accepted.
        super().__init__()

        layers = []
        prev_dim = input_dim

        for dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, dim))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(dim))
            layers.append(nn.Dropout(dropout))
            prev_dim = dim

        # Output projection: one value per target, no activation.
        layers.append(nn.Linear(prev_dim, output_dim))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the predictions.

        ``x`` is expected to be a batch of feature vectors whose last
        dimension equals ``input_dim``; the result has ``output_dim`` values
        per sample.
        """
        return self.model(x)


In [48]:
# Train the MLP on the Graph2Vec embeddings with early stopping on the
# validation MSE, then score the best checkpoint on the test split.

# The data split and Graph2Vec are both seeded with 42; seed the MLP stage the
# same way so weight init, batch shuffling and dropout are repeatable on a
# given machine.
torch.manual_seed(42)

# File that holds the weights of the best-on-validation epoch.
checkpoint_path = 'best_model.pth'

# Create dataset and dataloader
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# Initialize model and optimizer; layer sizes follow the data shapes so the
# cell works for any embedding width and any number of selected targets.
model = MultiTargetMLP(input_dim=x_train.shape[1], output_dim=y_train.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)
# NOTE(review): the loss averages over all target columns with the targets
# used as-is; if the columns have different scales the largest one dominates.
# Consider standardizing y - confirm against the target units.
criterion = nn.MSELoss()

# Training loop
n_epochs = 200
best_val_mse = float('inf')
patience = 10          # epochs without validation improvement before stopping
counter = 0            # epochs since the last improvement
checkpoint_saved = False  # becomes True once this run has written a checkpoint

for epoch in range(n_epochs):
    model.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader:
        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation: eval mode switches dropout off and makes batch norm use its
    # running statistics; the whole validation split is scored in one pass.
    model.eval()
    with torch.no_grad():
        val_outputs = model(x_val)
        val_loss = criterion(val_outputs, y_val).item()

    # The reported train loss is the mean of the per-batch losses.
    print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val MSE: {val_loss:.4f}")

    # Early stopping
    if val_loss < best_val_mse:
        best_val_mse = val_loss
        counter = 0
        # Save model
        torch.save(model.state_dict(), checkpoint_path)
        checkpoint_saved = True
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

# A NaN validation loss never compares below the best value, so a diverged run
# would otherwise fall through and load a checkpoint left over from an earlier
# experiment. Fail loudly instead of reporting a score for the wrong weights.
if not checkpoint_saved:
    raise RuntimeError(
        "Validation MSE never improved during this run (is the loss NaN?); "
        f"refusing to load a stale '{checkpoint_path}'."
    )

# Load best model and evaluate on test set. map_location keeps the load valid
# even when the checkpoint was written from a different device.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()

with torch.no_grad():
    test_preds = model(x_test).cpu().numpy()
    test_mse = mean_squared_error(y_test.cpu().numpy(), test_preds)
    print(f"Test MSE with MLP: {test_mse:.4f}")

Epoch 1/200, Train Loss: 939.6244, Val MSE: 568.0020
Epoch 2/200, Train Loss: 235.8558, Val MSE: 42.9129
Epoch 3/200, Train Loss: 20.3711, Val MSE: 6.4702
Epoch 4/200, Train Loss: 13.5209, Val MSE: 6.0164
Epoch 5/200, Train Loss: 12.6921, Val MSE: 5.7453
Epoch 6/200, Train Loss: 12.1297, Val MSE: 5.7582
Epoch 7/200, Train Loss: 11.5151, Val MSE: 5.6403
Epoch 8/200, Train Loss: 11.1881, Val MSE: 5.4031
Epoch 9/200, Train Loss: 10.8372, Val MSE: 5.2643
Epoch 10/200, Train Loss: 10.6541, Val MSE: 5.2320
Epoch 11/200, Train Loss: 10.4700, Val MSE: 5.1925
Epoch 12/200, Train Loss: 10.3535, Val MSE: 5.1217
Epoch 13/200, Train Loss: 10.2968, Val MSE: 4.9268
Epoch 14/200, Train Loss: 10.1172, Val MSE: 4.9956
Epoch 15/200, Train Loss: 9.9976, Val MSE: 4.8073
Epoch 16/200, Train Loss: 9.9168, Val MSE: 4.8590
Epoch 17/200, Train Loss: 9.8378, Val MSE: 4.8664
Epoch 18/200, Train Loss: 9.8298, Val MSE: 4.7012
Epoch 19/200, Train Loss: 9.7220, Val MSE: 4.7595
Epoch 20/200, Train Loss: 9.6974, Val MS

HOMO / LUMO targets  
Embedding optimization (the leading `dim` and `lr` on each row are the Graph2Vec embedding settings)  
dim 16, lr 0.025 - Test MSE with MLP: 0.6305    - wl iter 2, downsampling 0.0001  
dim 16, lr 0.05  - Test MSE with MLP: 0.3456    - wl iter 2, downsampling 0.0001  
dim 16, lr 0.1   - Test MSE with MLP: 0.3108    - wl iter 2, downsampling 0.0001  
dim 16, lr 0.5   - Test MSE with MLP: 1.0037    - wl iter 2, downsampling 0.0001  
dim 32, lr 0.1   - Test MSE with MLP: 0.2033    - wl iter 2, downsampling 0.0001  
dim 64, lr 0.1   - Test MSE with MLP: 0.1669    - wl iter 2, downsampling 0.0001  
dim 128, lr 0.1  - Test MSE with MLP: 0.1251    - bigger MLP & wl iter 2, downsampling 0.0001  
dim 128, lr 0.1  - Test MSE with MLP: 0.1004    - bigger MLP & wl iter 3, downsampling 0.0001  
dim 128, lr 0.1  - Test MSE with MLP: 0.0801    - bigger MLP & wl iter 3, downsampling 0.01  
dim 128, lr 0.1  - Test MSE with MLP: 0.0827    - bigger MLP & wl iter 3, downsampling 0.1  

MLP learning rate  
dim 128, lr 0.1  - Test MSE with MLP: 0.1141    but fewer epochs   - bigger MLP with 0.01 lr instead of 0.001 & wl iter 3 instead of 2, downsampling 0.01  
dim 128, lr 0.1  - Test MSE with MLP: 0.0966                       - bigger MLP with 0.005 lr & wl iter 3 instead of 2, downsampling 0.01  
BEST  
dim 128, lr 0.1  - Test MSE with MLP: 0.0798                       - bigger MLP with 0.0005 lr & wl iter 3 instead of 2, downsampling 0.01  

First 5 properties with the same settings  
Test MSE with MLP: 4.4426  