In [3]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.4-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.4-cp311-cp311-manylinux_2_28_x86_64.whl (34.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.2/34.2 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.4


In [4]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from rdkit.Chem import rdFingerprintGenerator, MolFromSmiles

import numpy as np
import pandas as pd

In [5]:
class SolubilityDataset(Dataset):
    """In-memory dataset of (fingerprint, solubility) pairs built from a DataFrame.

    The frame must provide a 'fingerprint' column holding equally sized
    array-likes and a 'Solubility' column holding numeric targets. Both are
    converted once, up front, into float32 tensors.
    """

    def __init__(self, df):
        # Stack the per-row fingerprint arrays into one 2-D matrix.
        fp_matrix = np.stack(df['fingerprint'])
        targets = df['Solubility'].values

        self.fingerprints = torch.tensor(fp_matrix).float()
        self.labels = torch.tensor(targets).float()
        assert len(self.fingerprints) == len(self.labels)

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the (fingerprint, label) tensors stored at position ``idx``."""
        return self.fingerprints[idx], self.labels[idx]

In [7]:
df = pd.read_csv('solubility.csv')

# Morgan fingerprint generator using RDKit's default settings.
# NOTE(review): SolubilityNet expects 2048 input features, so this relies on the
# generator's default fingerprint size being 2048 - confirm against the RDKit docs.
mfpgen = rdFingerprintGenerator.GetMorganGenerator()


def smiles2fp(smi):
    """Return the Morgan fingerprint of a SMILES string as an integer NumPy array."""
    return mfpgen.GetFingerprintAsNumPy(MolFromSmiles(smi)).astype(int)


df['rdkit_molecule'] = df['SMILES'].map(MolFromSmiles)

# MolFromSmiles yields None for SMILES it cannot parse; fingerprinting None would
# fail with an opaque error, so report and drop those rows before featurising.
unparsed = df['rdkit_molecule'].isna()
if unparsed.any():
    print(f'Dropping {int(unparsed.sum())} rows with unparseable SMILES')
    df = df.loc[~unparsed].copy()

df['fingerprint'] = df['rdkit_molecule'].map(
    lambda mol: mfpgen.GetFingerprintAsNumPy(mol).astype(int)
)

# Hold out 20% for testing, then 20% of the remainder for validation
# (roughly a 64/16/20 train/val/test split); fixed seeds keep the split stable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)

In [8]:
# Single source of truth for the mini-batch size used by all three loaders.
BATCH_SIZE = 32

train_dataset = SolubilityDataset(df_train)
test_dataset = SolubilityDataset(df_test)
val_dataset = SolubilityDataset(df_val)

# Only the training loader shuffles; evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [9]:
class SolubilityNet(nn.Module):
    """Fully connected regressor mapping a feature vector to a single value.

    The network is two hidden ReLU layers of equal width followed by a linear
    output unit.

    Args:
        input_dim: Number of input features per sample (defaults to 2048).
        hidden_dim: Width of both hidden layers (defaults to 512).
    """

    def __init__(self, input_dim=2048, hidden_dim=512):
        super().__init__()
        # Attribute name and layer order are kept so state_dict keys are unchanged.
        self.stack = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        """Run ``x`` through the layer stack; the last dimension of the result is 1."""
        return self.stack(x)

In [10]:
def train_step(model, optimizer, loader, criterion=nn.MSELoss()):
    """Train ``model`` for one pass over ``loader`` and return the mean loss.

    Args:
        model: Module called on each batch of fingerprints.
        optimizer: Optimizer stepped once per batch.
        loader: Iterable yielding ``(fingerprints, labels)`` batches.
        criterion: Loss function; the sample weighting below assumes it returns
            the mean over the batch (true for the default ``nn.MSELoss()``).

    Returns:
        The loss averaged over every sample seen during the pass.
    """
    model.train()
    total_loss = 0.0
    num_samples = 0
    for fingerprints, labels in loader:
        optimizer.zero_grad()
        # Drop only the trailing feature axis. A bare squeeze() would also drop
        # the batch axis of a size-1 batch and mismatch the shape of ``labels``.
        outputs = model(fingerprints).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight each batch mean by its size so a short final batch is not over-counted.
        total_loss += loss.item() * len(labels)
        num_samples += len(labels)
    # Divide by the samples actually processed rather than len(loader.dataset),
    # which stays correct when the loader drops or subsamples data.
    return total_loss / num_samples

def test_step(model, loader, criterion=nn.L1Loss()):
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for fingerprints, labels in loader:
            outputs = model(fingerprints)
            valid_loss += criterion(outputs.squeeze(), labels).item()
    valid_loss /= len(loader)
    return valid_loss

In [11]:
# Seed PyTorch's global RNG before building the model so the initial weights are
# repeatable across runs. NOTE(review): train_loader is created without its own
# generator, so its shuffling is expected to follow this seed too - confirm.
torch.manual_seed(42)

model = SolubilityNet()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
num_epochs = 20
train_losses, valid_losses = list(), list()
for epoch in range(num_epochs):
    train_loss = train_step(model, optimizer, train_loader)
    val_loss = test_step(model, val_loader)

    # Keep the per-epoch history so the curves can be inspected or plotted later.
    # The two numbers are on different scales: train_step defaults to MSE while
    # test_step defaults to L1 (absolute error).
    train_losses.append(train_loss)
    valid_losses.append(val_loss)

    print(f'Epoch: {epoch + 1:03d}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

Epoch: 001, Train Loss: 2.7337, Validation Loss: 0.9858
Epoch: 002, Train Loss: 1.1678, Validation Loss: 0.9532
Epoch: 003, Train Loss: 0.6913, Validation Loss: 0.9419
Epoch: 004, Train Loss: 0.4487, Validation Loss: 0.9067
Epoch: 005, Train Loss: 0.3485, Validation Loss: 0.9263


In [None]:
# Final evaluation on the held-out test split; with test_step's default
# criterion (nn.L1Loss) the value displayed is an absolute-error score.
test_step(model=model, loader=test_loader)