In [None]:
# Environment setup: install the chemistry / graph-learning packages this notebook needs.
# NOTE(review): rdkit is presumably required by the SMILES-to-graph conversion used later — confirm.
!pip install rdkit
import torch
# The exact torch build string matters: it is interpolated into the wheel index URL below.
print(torch.__version__)  # e.g. 2.5.1+cu121 (value depends on the runtime; do not rely on a fixed version)
# `-f` adds the PyG wheel page for this torch build as an extra place to look for
# torch-scatter / torch-sparse; `{torch.__version__}` is filled in from the Python variable.
!pip install torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html

2.5.1+cu121
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu121/torch_scatter-2.1.2%2Bpt25cu121-cp311-cp311-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu121/torch_sparse-0.6.18%2Bpt25cu121-cp311-cp311-linux_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m24.

In [None]:
import torch
from torch import nn
from torch import optim
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

In [None]:
# Load the solubility table straight from the course repository.
SOLUBILITY_CSV_URL = "https://raw.githubusercontent.com/volkamerlab/EDSAI_CADD_intro/refs/heads/main/data/solubility.csv"
df = pd.read_csv(SOLUBILITY_CSV_URL)

# Two successive 80/20 splits with the same seed: first carve off the test set,
# then carve the validation set out of what remains.
split_options = {"test_size": 0.2, "random_state": 42}
df_train, df_test = train_test_split(df, **split_options)
df_train, df_val = train_test_split(df_train, **split_options)

In [25]:
from torch_geometric.utils import from_smiles
from torch_geometric.loader import DataLoader

def process_smiles(row):
    """Turn one dataframe row into a graph object ready for training.

    Args:
        row: a record exposing ``SMILES`` and ``Solubility`` attributes
            (e.g. an item yielded by ``DataFrame.itertuples()``).

    Returns:
        The object produced by ``from_smiles`` with its node features ``x``
        cast to float and the solubility stored as a float tensor in ``y``.
    """
    graph = from_smiles(row.SMILES)
    # Cast node features to floating point so they can feed Linear layers.
    graph.x = graph.x.float()
    # Regression target for this molecule.
    graph.y = torch.tensor(row.Solubility, dtype=torch.float32)
    return graph

# Convert every row of each split into a graph, then wrap the graphs in
# mini-batch loaders. Only the training loader shuffles its samples.
train_graphs = [process_smiles(record) for record in df_train.itertuples()]
train_dataloader = DataLoader(train_graphs, batch_size=32, shuffle=True)

valid_graphs = [process_smiles(record) for record in df_val.itertuples()]
valid_dataloader = DataLoader(valid_graphs, batch_size=32)

test_graphs = [process_smiles(record) for record in df_test.itertuples()]
test_dataloader = DataLoader(test_graphs, batch_size=32)

In [None]:
from torch_geometric.nn import global_mean_pool, GCNConv

import torch
import torch.nn.functional as Fun
from torch.nn import Linear, Sequential, BatchNorm1d, ReLU

from torch_geometric.nn import GCNConv, GINConv
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool, global_add_pool

import torch.nn.functional as Fun


class GIN(torch.nn.Module):
    """Graph Isomorphism Network for graph-level regression.

    Three ``GINConv`` layers, each wrapping a two-layer MLP, compute node
    embeddings. The embeddings are sum-pooled per graph and passed through
    two linear layers to produce one scalar per graph.
    """

    def __init__(self, dim_h, dim_in=9, dropout=0.1):
        """Build the network.

        Args:
            dim_h (int): width of every hidden layer.
            dim_in (int): number of input features per node. Defaults to 9,
                the value that used to be hard-coded in the first layer.
            dropout (float): dropout probability applied before the output
                layer while training. Defaults to 0.1, the previously
                hard-coded value.
        """
        super().__init__()
        self.dropout_p = dropout

        # Layers are created in the same order as before so that parameter
        # names and initialisation order are unchanged.
        self.conv1 = GINConv(self._make_mlp(dim_in, dim_h))
        self.conv2 = GINConv(self._make_mlp(dim_h, dim_h))
        self.conv3 = GINConv(self._make_mlp(dim_h, dim_h))

        self.lin1 = Linear(dim_h, dim_h)
        self.lin2 = Linear(dim_h, 1)

    @staticmethod
    def _make_mlp(dim_in, dim_out):
        """Return the Linear-ReLU-Linear-ReLU block used inside each GINConv."""
        return Sequential(
            Linear(dim_in, dim_out), ReLU(), Linear(dim_out, dim_out), ReLU()
        )

    def forward(self, data):
        """Predict one value per graph in the batch.

        Args:
            data: batched graph object providing ``x`` (node features),
                ``edge_index`` and ``batch`` (graph assignment per node).

        Returns:
            torch.Tensor: output of the final ``Linear(dim_h, 1)`` layer,
            i.e. one row with a single value per pooled graph.
        """
        x = data.x
        edge_index = data.edge_index
        batch = data.batch

        # Node embeddings. Every GIN MLP already ends in ReLU, so no extra
        # activation is needed between the convolutions (ReLU is idempotent).
        h = self.conv1(x, edge_index)
        h = self.conv2(h, edge_index)
        h = self.conv3(h, edge_index)

        # Graph-level readout: sum the node embeddings of each graph.
        h = global_add_pool(h, batch)

        # Regression head.
        h = self.lin1(h)
        h = h.relu()
        h = Fun.dropout(h, p=self.dropout_p, training=self.training)
        h = self.lin2(h)

        return h

# Instantiate the network with 32 hidden units per layer.
model = GIN(32)
# A bare `model` expression in the middle of a cell is discarded, so print the
# architecture explicitly to actually show it.
print(model)
# Adam on all model parameters; mean-squared error as the regression loss.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

def train_step(loader):
    """Train the module-level ``model`` for one pass over ``loader``.

    Relies on the module-level ``model``, ``optimizer`` and ``criterion``.

    Args:
        loader: iterable of batched graphs exposing ``y`` and ``num_graphs``,
            with a ``dataset`` attribute that supports ``len()``.

    Returns:
        float: loss averaged over all graphs in ``loader.dataset``.
    """
    model.train()
    total_loss = 0.0
    for data in loader:
        optimizer.zero_grad()
        # Drop only the trailing size-1 feature axis. A plain squeeze() would
        # also drop the batch axis when a batch holds a single graph, leaving
        # the prediction and target with mismatched shapes.
        out = model(data).squeeze(-1)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        # criterion returns a batch mean; re-weight by batch size so the
        # final division yields a true per-graph average.
        total_loss += loss.item() * data.num_graphs
    # Normalise by the loader that was actually iterated, not a global one.
    return total_loss / len(loader.dataset)

def test_step(loader):
    """Evaluate the module-level ``model`` on ``loader`` without training.

    Relies on the module-level ``model`` and ``criterion``.

    Args:
        loader: iterable of batched graphs exposing ``y`` and ``num_graphs``,
            with a ``dataset`` attribute that supports ``len()``.

    Returns:
        float: loss averaged over all graphs in ``loader.dataset``.
    """
    model.eval()
    total_loss = 0.0
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        for data in loader:
            # Drop only the trailing size-1 feature axis so a single-graph
            # batch keeps the same shape as its target.
            out = model(data).squeeze(-1)
            loss = criterion(out, data.y)
            # Re-weight the batch mean by batch size for a per-graph average.
            total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)

# Training loop: one optimisation pass and one validation pass per epoch.
num_epochs = 100
train_losses, valid_losses = [], []
for epoch in range(num_epochs):
    train_loss = train_step(train_dataloader)
    valid_loss = test_step(valid_dataloader)

    # Record the per-epoch history; these lists were previously created but
    # never filled, so nothing was available for later inspection/plotting.
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'Epoch: {epoch + 1:03d}, Train Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}')

Epoch: 001, Train Loss: 4.2744, Validation Loss: 3.4909
Epoch: 002, Train Loss: 4.0231, Validation Loss: 3.5483
Epoch: 003, Train Loss: 3.6402, Validation Loss: 3.5977
Epoch: 004, Train Loss: 3.2996, Validation Loss: 3.2183
Epoch: 005, Train Loss: 3.1952, Validation Loss: 2.8149
Epoch: 006, Train Loss: 3.3809, Validation Loss: 3.2839
Epoch: 007, Train Loss: 3.4329, Validation Loss: 3.2354
Epoch: 008, Train Loss: 3.2913, Validation Loss: 3.2036
Epoch: 009, Train Loss: 2.9337, Validation Loss: 3.0710
Epoch: 010, Train Loss: 2.9070, Validation Loss: 2.6792
Epoch: 011, Train Loss: 3.2020, Validation Loss: 3.3337
Epoch: 012, Train Loss: 3.3572, Validation Loss: 3.0625
Epoch: 013, Train Loss: 3.1666, Validation Loss: 2.6658
Epoch: 014, Train Loss: 2.7807, Validation Loss: 2.4710
Epoch: 015, Train Loss: 2.8841, Validation Loss: 3.7423
Epoch: 016, Train Loss: 3.4481, Validation Loss: 3.3935
Epoch: 017, Train Loss: 3.3739, Validation Loss: 3.5376
Epoch: 018, Train Loss: 3.5955, Validation Loss: