In [1]:
import pandas as pd
import numpy as np
import torch
import torch_geometric
from torch import nn, optim
from torch.nn import functional as Fy
from torch.utils import data
import math
from sklearn.metrics import roc_auc_score
import sys
from os.path import exists
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
from torch_geometric.utils import from_smiles
from torch.utils.data import random_split
from torch_geometric.loader import DataLoader
from torch_geometric.nn.models import AttentiveFP

import build_graphs as bg

In [2]:
from sklearn.preprocessing import StandardScaler

df = pd.read_pickle('mat_bandgap_morgan.pkl')
df = df.drop(['Morgan','confnum','homo','lumo'],axis=1)
mols = [Chem.MolFromSmiles(x) for x in df['smiles']]
print(df)

scaler = StandardScaler()
bandgaps = df['gap'].values.reshape(-1, 1)
bandgaps_scaled = scaler.fit_transform(bandgaps)


                                                smiles    gap
0    COC(=O)/C(=C/c1cc(C)c(c2ccc(c3sc(c4cc5c(s4)c(O...  0.235
1                  Cc1csc(c2ccc(c3cc(C)cs3)c3nsnc23)c1  0.262
2    COC(=O)/C(=C/c1cc(C)c(c2cc(C)c(c3ccc(c4sc(c5sc...  0.234
3    C[Si]1(C)c2ccsc2c2sc(c3nc4sc(c5cc6c(s5)c5sc(c7...  0.245
4    COc1c2ccsc2c(OC)c2cc(c3sc(c4scc5c4[C@@H]4C=C[C...  0.300
..                                                 ...    ...
311  Cc1c2ccsc2c(C)c2cc(c3ccc(c4cnc(c5cccs5)c5nsnc4...  0.236
312       CC(=O)c1cc2c(csc2c2cc3c(s2)c(C)c2ccsc2c3C)s1  0.276
313  Cc1cc(c2ccc(N(c3ccccc3)c3ccccc3)cc2)sc1c1cnc(c...  0.258
314  Cc1ccc(C2(c3ccc(C)cc3)c3ccsc3c3cc4c(cc23)c2sc(...  0.255
315  Cc1cc(c2cc3c4nsnc4c(c4cc(C)c(c5cccs5)s4)cc3c3n...  0.231

[316 rows x 2 columns]


In [3]:
# example for one molecule:

smile = df['smiles'][35]
print(smile)

c1csc(c2ccsc2c2cccs2)c1


In [4]:
# this generates a torch Data object for the molecule

g = bg.from_smiles(smile, with_hydrogen=True, kekulize=True, use_3d=True)
print(g)

Data(x=[23, 16], edge_index=[2, 50], edge_attr=[50, 3], smiles='c1csc(c2ccsc2c2cccs2)c1')


In [5]:
# this prints the node feature matrix - so all the atoms in the molecule and some metrics for them
g.x

tensor([[ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [16,  0,  2,  5,  0,  0,  3,  1,  1,  1,  2,  0,  2, 32,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [16,  0,  2,  5,  0,  0,  3,  1,  1,  1,  2,  0,  2, 32,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [ 6,  0,  3,  5,  0,  0,  3,  1,  1,  1,  4,  0,  4, 12,  0,  5],
        [16,  0,  2,  5,  0,  0,  3,  

In [6]:
# this prints the number of nodes:

g.num_nodes

23

In [7]:
g.num_node_features

16

In [10]:
g.edge_index

tensor([[ 0,  0,  0,  1,  1,  1,  2,  2,  3,  3,  3,  4,  4,  4,  5,  5,  5,  6,
          6,  6,  7,  7,  8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11, 12, 12,
         12, 13, 13, 14, 14, 14, 15, 16, 17, 18, 19, 20, 21, 22],
        [ 1, 14, 15,  0,  2, 16,  1,  3,  2,  4, 14,  3,  5,  8,  4,  6, 17,  5,
          7, 18,  6,  8,  4,  7,  9,  8, 10, 13,  9, 11, 19, 10, 12, 20, 11, 13,
         21,  9, 12,  0,  3, 22,  0,  1,  5,  6, 10, 11, 12, 14]])

In [11]:
g.edge_attr

tensor([[2, 0, 1],
        [1, 0, 1],
        [1, 0, 0],
        [2, 0, 1],
        [1, 0, 1],
        [1, 0, 0],
        [1, 0, 1],
        [1, 0, 1],
        [1, 0, 1],
        [1, 0, 1],
        [2, 0, 1],
        [1, 0, 1],
        [1, 0, 1],
        [2, 0, 1],
        [1, 0, 1],
        [2, 0, 1],
        [1, 0, 0],
        [2, 0, 1],
        [1, 0, 1],
        [1, 0, 0],
        [1, 0, 1],
        [1, 0, 1],
        [2, 0, 1],
        [1, 0, 1],
        [1, 0, 1],
        [1, 0, 1],
        [2, 0, 1],
        [1, 0, 1],
        [2, 0, 1],
        [1, 0, 1],
        [1, 0, 0],
        [1, 0, 1],
        [2, 0, 1],
        [1, 0, 0],
        [2, 0, 1],
        [1, 0, 1],
        [1, 0, 0],
        [1, 0, 1],
        [1, 0, 1],
        [1, 0, 1],
        [2, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0]])

In [12]:
# this generates data objects for all the molecules in the dataset

graph_list = []

for i, smile in enumerate(df['smiles']):
    g = bg.from_smiles(smile)
    g.x = g.x.float()
    y = torch.tensor(bandgaps_scaled[i],dtype=torch.float).view(1,-1)
    g.y = y
    graph_list.append(g)

In [13]:
type(graph_list[0])

torch_geometric.data.data.Data

In [14]:
graph_list[1]

Data(x=[21, 16], edge_index=[2, 48], edge_attr=[48, 3], smiles='Cc1csc(c2ccc(c3cc(C)cs3)c3nsnc23)c1', y=[1, 1])

In [15]:
# train test split

train_ratio = 0.80
dataset_size = len(graph_list)
train_size = int(train_ratio*dataset_size)
test_size = dataset_size-train_size

generator1 = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(graph_list,[train_size,test_size], generator=generator1)

In [16]:
len(train_dataset)
len(test_dataset)

64

In [17]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [18]:
graph_list[0]

Data(x=[66, 16], edge_index=[2, 148], edge_attr=[148, 3], smiles='COC(=O)/C(=C/c1cc(C)c(c2ccc(c3sc(c4cc5c(s4)c(OC)c4cc(c6cc(C)c(c7ccc(c8sc(/C=C(\C#N)/C(=O)OC)cc8C)s7)s6)sc4c5OC)cc3C)s2)s1)/C#N', y=[1, 1])

In [19]:

model = AttentiveFP(in_channels=16,hidden_channels=64,out_channels=1,edge_dim=3,num_layers=2,num_timesteps=2,dropout=0.4)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001,weight_decay=1e-4)

loss_function = nn.MSELoss()  

In [63]:
print(model)

AttentiveFP(in_channels=11, hidden_channels=64, out_channels=1, edge_dim=3, num_layers=2, num_timesteps=2)


In [None]:
# simple training and evaluation loop to train Attentive FP model 

from sklearn.metrics import r2_score

EPOCHS = 600

for i in range(EPOCHS):
    loss_list_train = []
    pred_list_train = []  
    true_list_train = []  
    
    model.train()
    for data in train_loader:
        optimizer.zero_grad()
    
        output = model(data.x, data.edge_index, data.edge_attr, data.batch)

        loss = loss_function(output, data.y)
        loss.backward()  
        loss_list_train.append(loss.item())
        optimizer.step()  
        
        # Store predictions 
        pred_list_train.extend(output.detach().cpu().numpy().flatten())
        true_list_train.extend(data.y.cpu().numpy().flatten())
    
    # Calculate R² 
    r2_train = r2_score(true_list_train, pred_list_train)
        
    loss_list_test = []
    pred_list_test = []  
    true_list_test = []  
    
    model.eval()
    with torch.no_grad():
        for data in test_loader:
            output = model(data.x, data.edge_index, data.edge_attr, data.batch)

            loss = loss_function(output, data.y)
            loss_list_test.append(loss.item())
     
            # Store predictions 
            pred_list_test.extend(output.cpu().numpy().flatten())
            true_list_test.extend(data.y.cpu().numpy().flatten())
    
    # Calculate R²  
    r2_test = r2_score(true_list_test, pred_list_test)
                
    print(i, "Train Loss: %.4f Train R²: %.4f Test Loss: %.4f Test R²: %.4f"
        % (np.mean(loss_list_train), r2_train, 
           np.mean(loss_list_test), r2_test))

0 Train Loss: 27.9217 Train R²: -26.6300 Test Loss: 2.6284 Test R²: -1.8834
1 Train Loss: 5.1445 Train R²: -4.0784 Test Loss: 1.1823 Test R²: -0.2971
2 Train Loss: 1.3380 Train R²: -0.3226 Test Loss: 0.6895 Test R²: 0.2436
3 Train Loss: 1.1303 Train R²: -0.1056 Test Loss: 0.6387 Test R²: 0.2994
4 Train Loss: 0.9978 Train R²: 0.0129 Test Loss: 0.6220 Test R²: 0.3177
5 Train Loss: 0.8521 Train R²: 0.1574 Test Loss: 0.7477 Test R²: 0.1797
6 Train Loss: 0.8738 Train R²: 0.1388 Test Loss: 1.6490 Test R²: -0.8090
7 Train Loss: 0.8334 Train R²: 0.1817 Test Loss: 0.7676 Test R²: 0.1579
8 Train Loss: 0.8403 Train R²: 0.1724 Test Loss: 0.6705 Test R²: 0.2644
9 Train Loss: 0.7780 Train R²: 0.2393 Test Loss: 0.6770 Test R²: 0.2573
10 Train Loss: 0.6508 Train R²: 0.3616 Test Loss: 0.8165 Test R²: 0.1043
11 Train Loss: 0.7319 Train R²: 0.2866 Test Loss: 0.5393 Test R²: 0.4084
12 Train Loss: 0.7553 Train R²: 0.2714 Test Loss: 0.5487 Test R²: 0.3980
13 Train Loss: 0.7637 Train R²: 0.2478 Test Loss: 0.

In [25]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import numpy as np
import copy

fold_models = []

# 5-fold cross validation

# Setup
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []
EPOCHS = 600

for fold, (train_idx, val_idx) in enumerate(kfold.split(graph_list)):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/5")
    print(f"{'='*50}\n")
    
    # Create fresh model for this fold
    model = AttentiveFP(in_channels=16,hidden_channels=200,out_channels=1,edge_dim=3,num_layers=4,num_timesteps=3,dropout=0.095)
    optimizer = torch.optim.Adam(model.parameters(),lr=0.0013,weight_decay=0.00014)
    loss_function = torch.nn.MSELoss()  # Or your loss function
    
#hidden_channels': 200, 'num_layers': 4, 'num_timesteps': 3, 'dropout': 0.09517795284321594, 'lr': 0.0013269356447544063, 'weight_decay': 0.00014218285610078816, 'batch_size': 16

    # Split data for this fold
    train_graphs = [graph_list[i] for i in train_idx]
    val_graphs = [graph_list[i] for i in val_idx]
    
    train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_graphs, batch_size=32, shuffle=False)
    
    # Track best model for this fold
    best_val_loss = float('inf')
    best_model_state = None
    patience_counter = 0
    patience = 50
    
    # Training loop for this fold
    for epoch in range(EPOCHS):
        # TRAINING
        loss_list_train = []
        pred_list_train = []  
        true_list_train = []  
        
        model.train()
        for data in train_loader:
            optimizer.zero_grad()
        
            output = model(data.x, data.edge_index, data.edge_attr, data.batch)
            loss = loss_function(output, data.y)
            loss.backward()  
            loss_list_train.append(loss.item())
            optimizer.step()  
            
            pred_list_train.extend(output.detach().cpu().numpy().flatten())
            true_list_train.extend(data.y.cpu().numpy().flatten())
        
        r2_train = r2_score(true_list_train, pred_list_train)
        mean_train_loss = np.mean(loss_list_train)
            
        # VALIDATION
        loss_list_val = []
        pred_list_val = []  
        true_list_val = []  
        
        model.eval()
        with torch.no_grad():
            for data in val_loader:
                output = model(data.x, data.edge_index, data.edge_attr, data.batch)
                loss = loss_function(output, data.y)
                loss_list_val.append(loss.item())
         
                pred_list_val.extend(output.cpu().numpy().flatten())
                true_list_val.extend(data.y.cpu().numpy().flatten())
        
        r2_val = r2_score(true_list_val, pred_list_val)
        mean_val_loss = np.mean(loss_list_val)
        
        # Save best model
        if mean_val_loss < best_val_loss:
            best_val_loss = mean_val_loss
            best_model_state = copy.deepcopy(model.state_dict())
            best_r2_val = r2_val
            patience_counter = 0
        else:
            patience_counter += 1
        
        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
                    
        if epoch % 20 == 0:  # Print every 20 epochs
            print(f"Epoch {epoch}: Train Loss: {mean_train_loss:.4f} Train R²: {r2_train:.4f} "
                  f"Val Loss: {mean_val_loss:.4f} Val R²: {r2_val:.4f}")
            

    # Load best model state for this fold
    model.load_state_dict(best_model_state)
    
    # Save the model
    torch.save(best_model_state, f'attentivefp_fold_{fold+1}.pt')
    fold_models.append(copy.deepcopy(model))  # Keep in memory too
    
    # Store results for this fold
    fold_results.append({
        'fold': fold + 1,
        'best_val_loss': best_val_loss,
        'best_val_r2': best_r2_val,
        'model': model  # Optional: keep reference
    })
    
    print(f"\nFold {fold + 1} Results:")
    print(f"Best Val Loss: {best_val_loss:.4f}")
    print(f"Best Val R²: {best_r2_val:.4f}")

# Summary across all folds
print(f"\n{'='*50}")
print("CROSS-VALIDATION SUMMARY")
print(f"{'='*50}")
val_losses = [f['best_val_loss'] for f in fold_results]
val_r2s = [f['best_val_r2'] for f in fold_results]

print(f"Mean Val Loss: {np.mean(val_losses):.4f} ± {np.std(val_losses):.4f}")
print(f"Mean Val R²: {np.mean(val_r2s):.4f} ± {np.std(val_r2s):.4f}")
print(f"\nIndividual Folds:")
for result in fold_results:
    print(f"  Fold {result['fold']}: Loss={result['best_val_loss']:.4f}, R²={result['best_val_r2']:.4f}")


FOLD 1/5

Epoch 0: Train Loss: 16.2752 Train R²: -14.7845 Val Loss: 1.3063 Val R²: -0.6209
Epoch 20: Train Loss: 0.5605 Train R²: 0.4689 Val Loss: 0.5173 Val R²: 0.3580
Epoch 40: Train Loss: 0.4841 Train R²: 0.5387 Val Loss: 0.4878 Val R²: 0.3946
Epoch 60: Train Loss: 0.4662 Train R²: 0.5542 Val Loss: 0.5530 Val R²: 0.3138
Epoch 80: Train Loss: 0.3412 Train R²: 0.6729 Val Loss: 0.3029 Val R²: 0.6242
Epoch 100: Train Loss: 0.2735 Train R²: 0.7378 Val Loss: 0.2745 Val R²: 0.6593
Epoch 120: Train Loss: 0.2240 Train R²: 0.7851 Val Loss: 0.2388 Val R²: 0.7037
Epoch 140: Train Loss: 0.2294 Train R²: 0.7792 Val Loss: 0.1934 Val R²: 0.7600
Epoch 160: Train Loss: 0.1613 Train R²: 0.8446 Val Loss: 0.1960 Val R²: 0.7568
Epoch 180: Train Loss: 0.1218 Train R²: 0.8835 Val Loss: 0.1991 Val R²: 0.7529
Early stopping at epoch 197

Fold 1 Results:
Best Val Loss: 0.1679
Best Val R²: 0.7917

FOLD 2/5

Epoch 0: Train Loss: 82.1585 Train R²: -83.7337 Val Loss: 5.7471 Val R²: -4.3391
Epoch 20: Train Loss: 

In [24]:
# See if best_model_state exists
print('best_model_state' in dir())

# Or check fold_results
print(fold_results)

True
[{'fold': 1, 'best_val_loss': np.float64(0.24105320125818253), 'best_val_r2': 0.7008807884380686}, {'fold': 2, 'best_val_loss': np.float64(0.2490873783826828), 'best_val_r2': 0.7682296322880371}, {'fold': 3, 'best_val_loss': np.float64(0.20825927704572678), 'best_val_r2': 0.6877712445998967}, {'fold': 4, 'best_val_loss': np.float64(0.2778569757938385), 'best_val_r2': 0.744636401399734}, {'fold': 5, 'best_val_loss': np.float64(0.4175241142511368), 'best_val_r2': 0.6831592452897886}]


In [22]:
# Hyperparameter tuning with train-test split
import optuna
from optuna.trial import Trial
import torch
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

# Create train-test split once (80-20 split)
train_idx, test_idx = train_test_split(
    range(len(graph_list)), 
    test_size=0.2, 
    random_state=42
)

train_graphs = [graph_list[i] for i in train_idx]
test_graphs = [graph_list[i] for i in test_idx]

def objective_attentivefp(trial: Trial):
    """Optuna objective for AttentiveFP"""
    
    # AttentiveFP-specific hyperparameters
    hidden_channels = trial.suggest_categorical('hidden_channels', [32, 64, 128, 200])
    num_layers = trial.suggest_int('num_layers', 2, 5)
    num_timesteps = trial.suggest_int('num_timesteps', 1, 3)
    dropout = trial.suggest_float('dropout', 0.0, 0.5)
    
    # Training hyperparameters
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    
    # Create AttentiveFP model
    model = AttentiveFP(
        in_channels=16,
        hidden_channels=hidden_channels,
        out_channels=1,
        edge_dim=3,
        num_layers=num_layers,
        num_timesteps=num_timesteps,
        dropout=dropout
    )
    
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=lr,
        weight_decay=weight_decay
    )
    
    # Data loaders
    train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_graphs, batch_size=batch_size, shuffle=False)
    
    # Training
    best_test_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(150):
        # Train
        model.train()
        for data in train_loader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.edge_attr, data.batch)
            loss = torch.nn.functional.mse_loss(out, data.y)
            loss.backward()
            optimizer.step()
        
        # Test
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for data in test_loader:
                out = model(data.x, data.edge_index, data.edge_attr, data.batch)
                loss = torch.nn.functional.mse_loss(out, data.y)
                test_loss += loss.item()
        test_loss /= len(test_loader)
        
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= 20:
            break
        
        # Pruning
        trial.report(test_loss, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return best_test_loss

# Run optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective_attentivefp, n_trials=50)

print("\nBest AttentiveFP hyperparameters:")
print(study.best_params)

[I 2026-01-29 13:38:53,059] A new study created in memory with name: no-name-26591616-0f4a-4834-ae7c-c0eab92185d1
[I 2026-01-29 13:41:27,356] Trial 0 finished with value: 0.2766966037452221 and parameters: {'hidden_channels': 128, 'num_layers': 4, 'num_timesteps': 3, 'dropout': 0.23882434565022131, 'lr': 0.0015923126382256317, 'weight_decay': 7.454973991521727e-05, 'batch_size': 16}. Best is trial 0 with value: 0.2766966037452221.
[I 2026-01-29 13:43:38,188] Trial 1 finished with value: 0.31975647807121277 and parameters: {'hidden_channels': 128, 'num_layers': 2, 'num_timesteps': 3, 'dropout': 0.43611140282317074, 'lr': 0.0005566502852715476, 'weight_decay': 0.0002564288066972483, 'batch_size': 16}. Best is trial 0 with value: 0.2766966037452221.
[I 2026-01-29 13:44:51,441] Trial 2 finished with value: 0.22972967475652695 and parameters: {'hidden_channels': 32, 'num_layers': 4, 'num_timesteps': 3, 'dropout': 0.12261492587333833, 'lr': 0.0019778710959787536, 'weight_decay': 1.3516398871


Best AttentiveFP hyperparameters:
{'hidden_channels': 200, 'num_layers': 4, 'num_timesteps': 3, 'dropout': 0.09517795284321594, 'lr': 0.0013269356447544063, 'weight_decay': 0.00014218285610078816, 'batch_size': 16}
