In [3]:
import pandas as pd
import dgl
from rdkit import Chem
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GraphConv
import torch
from torch.optim import Adam
import warnings
warnings.filterwarnings('ignore')

In [4]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
def atom_features(atom):
    atom_types = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
    return [1 if atom.GetSymbol() == atom_type else 0 for atom_type in atom_types]

def mol_to_graph(mol):
    g = dgl.DGLGraph()
    g.add_nodes(mol.GetNumAtoms())
    
    for bond in mol.GetBonds():
        g.add_edges(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
        g.add_edges(bond.GetEndAtomIdx(), bond.GetBeginAtomIdx())
    
    # Assign atom features to nodes
    g.ndata['h'] = torch.tensor([atom_features(atom) for atom in mol.GetAtoms()])
    
    return g


# Convert SMILES to molecular graphs with features
train_mols = [Chem.MolFromSmiles(smiles) for smiles in train_data['SMILES']]
train_graphs = [mol_to_graph(mol) for mol in train_mols if mol]


In [4]:
labels = train_data[['MLM', 'HLM']].values
labels = torch.tensor(labels, dtype=torch.float32)


In [9]:
class GNNModel(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(GNNModel, self).__init__()
        self.layers = nn.ModuleList([
            GraphConv(in_dim, hidden_dim, activation=F.relu),
            GraphConv(hidden_dim, hidden_dim, activation=F.relu)
        ])
        self.classify = nn.Linear(hidden_dim, out_dim)

    def forward(self, g, features):
        h = features
        # print("Initial feature shape:", h.shape)
        
        for idx, conv in enumerate(self.layers):
            h = conv(g, h)
            # print(f"Feature shape after layer {idx+1}:", h.shape)
        
        g.ndata['h'] = h
        logits = self.classify(dgl.mean_nodes(g, 'h'))
        # print("Output logits shape:", logits.shape)
        
        return logits


# Use the correct input size for the model based on the size of the atom features
input_size = train_graphs[0].ndata['h'].shape[1]
model = GNNModel(in_dim=input_size, hidden_dim=128, out_dim=2)

optimizer = Adam(model.parameters(), lr=1e-5)

In [10]:
mean = labels.mean(dim=0)
std = labels.std(dim=0)
normalized_labels = (labels - mean) / std


In [11]:
criterion = nn.MSELoss()

for epoch in range(10):  # Assume 10 epochs for this example
    model.train()
    total_loss = 0
    for idx, g in enumerate(train_graphs):
        try:
            logits = model(g, g.ndata['h'])
            loss = criterion(logits, normalized_labels[idx].unsqueeze(0))  # Use normalized_labels here
        
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            
            total_loss += loss.item()
        except RuntimeError as e:
            print(f"Error at graph index {idx}. Graph node feature shape: {g.ndata['h'].shape}")
            problematic_mol = train_mols[idx]
            print(Chem.MolToSmiles(problematic_mol))
            break  # Let's break the loop to inspect the problematic molecule
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_graphs)}")

Error at graph index 0. Graph node feature shape: torch.Size([28, 128])
CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC
Epoch 1, Loss: 0.0
Error at graph index 0. Graph node feature shape: torch.Size([28, 128])
CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC
Epoch 2, Loss: 0.0
Error at graph index 0. Graph node feature shape: torch.Size([28, 128])
CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC
Epoch 3, Loss: 0.0
Error at graph index 0. Graph node feature shape: torch.Size([28, 128])
CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC
Epoch 4, Loss: 0.0
Error at graph index 0. Graph node feature shape: torch.Size([28, 128])
CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC
Epoch 5, Loss: 0.0
Error at graph index 0. Graph node feature shape: torch.Size([28, 128])
CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC
Epoch 6, Loss: 0.0
Error at graph index 0. Graph node feature shape: torch.Size([28, 128])
CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC
Epoch 7, Loss: 0.0
Error at graph index 0. Gra

In [12]:
# Extract the problematic graph and its corresponding label
problematic_graph = train_graphs[0]
problematic_label = normalized_labels[0]  # Use normalized_label here

# Forward pass with just the problematic graph
logits = model(problematic_graph, problematic_graph.ndata['h'])
loss = criterion(logits, problematic_label.unsqueeze(0))
print("Logits:", logits)
print("Loss:", loss.item())

Logits: tensor([[0.0833, 0.0045]], grad_fn=<AddmmBackward0>)
Loss: 0.08334333449602127


In [8]:
import deepchem as dc

# Featurize train data
featurizer = dc.feat.ConvMolFeaturizer()
train_features = featurizer.featurize(train_data['SMILES'])
train_dataset = dc.data.NumpyDataset(X=train_features, 
                                     y=train_data[['MLM', 'HLM']].values)

# Featurize test data
test_features = featurizer.featurize(test_data['SMILES'])
test_dataset = dc.data.NumpyDataset(X=test_features)


In [31]:
from deepchem.models.graph_models import GraphConvModel
import tensorflow
model = GraphConvModel(n_tasks=2, mode='regression', dropout=0.2)


ImportError: cannot import name 'KerasModel' from 'deepchem.models' (/Users/syshin/miniforge3/envs/pytorch/lib/python3.9/site-packages/deepchem/models/__init__.py)

In [None]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))


In [21]:
predictions = model.predict(test_dataset)


AttributeError: 'GNNModel' object has no attribute 'predict'