# Drug absorption
## Caco-2 cell effective permeability

https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al

Task: Regression. Given a drug SMILES string, predict the Caco-2 cell effective permeability.

Dataset split: scaffold split -- forces training and test set to have distant molecular structures. This is to help with generalizability, since drug structures of interest evolve over time.

Dataset reference: [Wang, NN et al, ADME Properties Evaluation in Drug Discovery: Prediction of Caco-2 Cell Permeability Using a Combination of NSGA-II and Boosting, Journal of Chemical Information and Modeling 2016 56 (4), 763-773](https://pubmed.ncbi.nlm.nih.gov/27018227/)

In [1]:
import numpy as np
import pandas as pd

# cheminformatics
import rdkit.Chem
from rdkit.Chem import Descriptors

# logging
import tqdm

# data preprocessing
import sklearn.impute
import sklearn.preprocessing

# modeling
import torch
import torch.utils.data

# plotting
import holoviews as hv
hv.extension('bokeh')



### Load dataset

In [2]:
from tdc.single_pred import ADME
data = ADME(name = 'Caco2_Wang')
split = data.get_split()

Found local copy...
Loading...
Done!


### Featurization

In [3]:
def add_descriptor_columns(data: pd.DataFrame) -> pd.DataFrame:
    """
    Use rdkit to get descriptors of each drug in the `data` df.
    Return a Pandas DataFrame with the descriptors as columns in the df and .
    """
    
    # Extract the Drug column
    assert 'Drug' in data.columns, "'Drug' must be a column in the input DataFrame."
    drugs = data['Drug']
    y = data['Y']
    
    # Get the descriptors for each drug
    print("Calculating descriptors...")
    descriptors = []
    for drug, target in tqdm.tqdm(zip(drugs, y)):
        descriptor = Descriptors.CalcMolDescriptors(
            rdkit.Chem.MolFromSmiles(drug)
        )
        descriptor['Drug'] = drug
        descriptor['Y'] = target
        descriptors.append(descriptor)

    # Make a dataframe for the descriptors
    df = pd.DataFrame(descriptors)

    return df

In [4]:
train_df = add_descriptor_columns(split['train'])

Calculating descriptors...


637it [00:28, 22.52it/s]


In [5]:
val_df = add_descriptor_columns(split['valid'])

Calculating descriptors...


91it [00:04, 20.04it/s]


In [6]:
test_df = add_descriptor_columns(split['test'])

Calculating descriptors...


182it [00:07, 24.22it/s]


### Construct DataLoader

In [7]:
def construct_dataloader(
    data: pd.DataFrame,
    imputer=sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean'),
    fit_imputer=True,
    scaler=sklearn.preprocessing.RobustScaler(),
    fit_scaler=True,
) -> torch.utils.data.DataLoader:
    """
    Make a PyTorch DataLoader from a Pandas DataFrame.

    Optionally impute missing values and scale data.
    """

    # Extract target data
    y = data['Y'].values.reshape(-1, 1)
    
    # Create target tensor
    target = torch.tensor(y).to(torch.float32)
    
    # Extract just the feature data
    col_array = np.array(data.columns)
    X = data[col_array[~np.isin(col_array, ['Drug_ID', 'Drug', 'Y'])]].values

    # Impute missing data
    if imputer is not None:
        if fit_imputer:
            X = imputer.fit_transform(X)
        else:
            X = imputer.transform(X)

    # Scale the feature data
    if scaler is not None:
        if fit_scaler:
            X = scaler.fit_transform(X)
        else:
            X = scaler.transform(X)

    # Create features tensor
    features = torch.tensor(X).to(torch.float32)

    # Pass to DataLoader
    dataset = torch.utils.data.TensorDataset(features, target)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=16)

    return dataloader, imputer, scaler

In [8]:
train_dataloader, imputer, scaler = construct_dataloader(train_df)

In [9]:
val_dataloader, _, _ = construct_dataloader(
    val_df,
    imputer=imputer,
    fit_imputer=False,
    scaler=scaler,
    fit_scaler=False,
)

In [10]:
test_dataloader, _, _ = construct_dataloader(
    test_df,
    imputer=imputer,
    fit_imputer=False,
    scaler=scaler,
    fit_scaler=False,
)

### Construct NN

In [11]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [12]:
class NeuralNetwork(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = torch.nn.Flatten()
        self.stack = torch.nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(210, 32),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(32, 16),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(16, 1),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.stack(x)
        return logits

### Train & test functions

In [13]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Forward pass
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Print loss every 200 steps
        if batch % 200 == 0 or batch == len(dataloader) - 1:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

    return loss

In [14]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    
    test_loss = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
    test_loss /= num_batches
    
    print(f"Test Error: \n Avg loss: {test_loss:>8f} \n")

    return test_loss

### Train and test the model

In [15]:
model = NeuralNetwork().to(device)
loss_fn = torch.nn.L1Loss() # MAE
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [16]:
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (stack): Sequential(
    (0): Dropout(p=0.2, inplace=False)
    (1): Linear(in_features=210, out_features=32, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=32, out_features=16, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.2, inplace=False)
    (7): Linear(in_features=16, out_features=1, bias=True)
  )
)


In [17]:
epochs = 500
training_losses = []
test_losses = []
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    training_losses.append(train(train_dataloader, model, loss_fn, optimizer))
    test_losses.append(test(test_dataloader, model, loss_fn))
print("Done!")

Epoch 1
-------------------------------
loss: 4.948452  [   16/  637]
loss: 5.052940  [  520/  637]
Test Error: 
 Avg loss: 4.889187 

Epoch 2
-------------------------------
loss: 4.208209  [   16/  637]
loss: 2.845092  [  520/  637]
Test Error: 
 Avg loss: 3.062905 

Epoch 3
-------------------------------
loss: 2.673915  [   16/  637]
loss: 1.772653  [  520/  637]
Test Error: 
 Avg loss: 4.264842 

Epoch 4
-------------------------------
loss: 1.925599  [   16/  637]
loss: 1.318423  [  520/  637]
Test Error: 
 Avg loss: 3.302973 

Epoch 5
-------------------------------
loss: 1.683722  [   16/  637]
loss: 1.734256  [  520/  637]
Test Error: 
 Avg loss: 2.114633 

Epoch 6
-------------------------------
loss: 1.459931  [   16/  637]
loss: 1.652546  [  520/  637]
Test Error: 
 Avg loss: 1.665875 

Epoch 7
-------------------------------
loss: 1.270940  [   16/  637]
loss: 1.485581  [  520/  637]
Test Error: 
 Avg loss: 0.873332 

Epoch 8
-------------------------------
loss: 1.432155 

In [37]:
hv.extension('matplotlib')

In [38]:
hv.HoloMap(
    {
        'training': hv.Curve(training_losses),
        'test': hv.Curve(test_losses)
    }
).overlay(
    
).opts(
    ylabel='MAE',
    xlabel='epoch'
)

### Evaluate

In [19]:
fit_imputer, fit_scaler = False, False

# Extract just the feature data
col_array = np.array(train_df.columns)
X = train_df[col_array[~np.isin(col_array, ['Drug_ID', 'Drug', 'Y'])]].values

# Impute missing data
if imputer is not None:
    if fit_imputer:
        X = imputer.fit_transform(X)
    else:
        X = imputer.transform(X)

# Scale the feature data
if scaler is not None:
    if fit_scaler:
        X = scaler.fit_transform(X)
    else:
        X = scaler.transform(X)

# Create features tensor
features = torch.tensor(X).to(torch.float32)

In [20]:
y = train_df['Y'].values.reshape(-1, 1)
target = torch.tensor(y).to(torch.float32)

In [21]:
pred = model.forward(features)

In [22]:
MAE = torch.nn.L1Loss()
MAE(pred, target)

tensor(0.1979, grad_fn=<MeanBackward0>)

In [23]:
fit_imputer, fit_scaler = False, False

# Extract just the feature data
col_array = np.array(val_df.columns)
X = val_df[col_array[~np.isin(col_array, ['Drug_ID', 'Drug', 'Y'])]].values

# Impute missing data
if imputer is not None:
    if fit_imputer:
        X = imputer.fit_transform(X)
    else:
        X = imputer.transform(X)

# Scale the feature data
if scaler is not None:
    if fit_scaler:
        X = scaler.fit_transform(X)
    else:
        X = scaler.transform(X)

# Create features tensor
features = torch.tensor(X).to(torch.float32)

In [24]:
y = val_df['Y'].values.reshape(-1, 1)
target = torch.tensor(y).to(torch.float32)

In [25]:
pred = model.forward(features)

In [26]:
MAE = torch.nn.L1Loss()
MAE(pred, target)

tensor(0.3359, grad_fn=<MeanBackward0>)

In [27]:
fit_imputer, fit_scaler = False, False

# Extract just the feature data
col_array = np.array(test_df.columns)
X = test_df[col_array[~np.isin(col_array, ['Drug_ID', 'Drug', 'Y'])]].values

# Impute missing data
if imputer is not None:
    if fit_imputer:
        X = imputer.fit_transform(X)
    else:
        X = imputer.transform(X)

# Scale the feature data
if scaler is not None:
    if fit_scaler:
        X = scaler.fit_transform(X)
    else:
        X = scaler.transform(X)

# Create features tensor
features = torch.tensor(X).to(torch.float32)

In [28]:
y = test_df['Y'].values.reshape(-1, 1)
target = torch.tensor(y).to(torch.float32)

In [29]:
pred = model.forward(features)

In [30]:
MAE = torch.nn.L1Loss()
MAE(pred, target)

tensor(0.3128, grad_fn=<MeanBackward0>)

### Leaderboard benchmark

In [31]:
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')
benchmark = group.get('Caco2_Wang') 
# all benchmark names in a benchmark group are stored in group.dataset_names
name = benchmark['name']
train_data, test_data = benchmark['train_val'], benchmark['test']

Found local copy...


In [32]:
# make training dataloader
train_dataloader, imputer, scaler = construct_dataloader(add_descriptor_columns(train_data))

Calculating descriptors...


728it [00:29, 24.90it/s]


In [33]:
# make test data into correct format
fit_imputer, fit_scaler = False, False

test_data = add_descriptor_columns(test_data)

# Extract just the feature data
col_array = np.array(test_data.columns)
X_test = test_data[col_array[~np.isin(col_array, ['Drug_ID', 'Drug', 'Y'])]].values

# Impute missing data
if imputer is not None:
    if fit_imputer:
        X_test = imputer.fit_transform(X_test)
    else:
        X_test = imputer.transform(X_test)

# Scale the feature data
if scaler is not None:
    if fit_scaler:
        X_test = scaler.fit_transform(X_test)
    else:
        X_test = scaler.transform(X_test)

# Create features tensor
features_test = torch.tensor(X_test).to(torch.float32)

# Create target tensor
y_test = test_data['Y'].values.reshape(-1, 1)
target = torch.tensor(y_test).to(torch.float32)

Calculating descriptors...


182it [00:05, 32.98it/s]


In [34]:
predictions_list = []
importances_lists = []

for trial in tqdm.tqdm([1, 2, 3, 4, 5]):
    print(f"Trial {trial}:\n-------------------------------")
    
    model = NeuralNetwork().to(device)
    loss_fn = torch.nn.L1Loss() # MAE
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    epochs = 500
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer)
    print("Done!")

    y_pred_test = model.forward(features_test)
    y_pred_test = y_pred_test.detach().numpy()

    predictions = {}
    predictions[name] = y_pred_test
    predictions_list.append(predictions)

results = group.evaluate_many(predictions_list)
# {'caco2_wang': [6.328, 0.101]}

  0%|                                                     | 0/5 [00:00<?, ?it/s]

Trial 1:
-------------------------------
Epoch 1
-------------------------------
loss: 5.217315  [   16/  728]
loss: 5.690672  [  368/  728]
Epoch 2
-------------------------------
loss: 5.057850  [   16/  728]
loss: 5.377912  [  368/  728]
Epoch 3
-------------------------------
loss: 4.896375  [   16/  728]
loss: 172.008224  [  368/  728]
Epoch 4
-------------------------------
loss: 4.544419  [   16/  728]
loss: 3.576785  [  368/  728]
Epoch 5
-------------------------------
loss: 3.555260  [   16/  728]
loss: 2.195665  [  368/  728]
Epoch 6
-------------------------------
loss: 1.984303  [   16/  728]
loss: 1.757985  [  368/  728]
Epoch 7
-------------------------------
loss: 1.370661  [   16/  728]
loss: 1.705336  [  368/  728]
Epoch 8
-------------------------------
loss: 1.877199  [   16/  728]
loss: 1.966394  [  368/  728]
Epoch 9
-------------------------------
loss: 0.864912  [   16/  728]
loss: 2.525105  [  368/  728]
Epoch 10
-------------------------------
loss: 1.895245  

 20%|█████████                                    | 1/5 [00:19<01:16, 19.17s/it]

loss: 0.540964  [  368/  728]
Done!
Trial 2:
-------------------------------
Epoch 1
-------------------------------
loss: 4.896502  [   16/  728]
loss: 187.760025  [  368/  728]
Epoch 2
-------------------------------
loss: 3.635041  [   16/  728]
loss: 1.886816  [  368/  728]
Epoch 3
-------------------------------
loss: 2.304061  [   16/  728]
loss: 324.559967  [  368/  728]
Epoch 4
-------------------------------
loss: 1.769761  [   16/  728]
loss: 1.880480  [  368/  728]
Epoch 5
-------------------------------
loss: 1.365659  [   16/  728]
loss: 1.221524  [  368/  728]
Epoch 6
-------------------------------
loss: 1.574308  [   16/  728]
loss: 698.633728  [  368/  728]
Epoch 7
-------------------------------
loss: 1.481204  [   16/  728]
loss: 1907.968872  [  368/  728]
Epoch 8
-------------------------------
loss: 1.162121  [   16/  728]
loss: 1.056444  [  368/  728]
Epoch 9
-------------------------------
loss: 1.491834  [   16/  728]
loss: 2982.411865  [  368/  728]
Epoch 10
--

 40%|██████████████████                           | 2/5 [00:38<00:57, 19.17s/it]

loss: 0.389707  [  368/  728]
Done!
Trial 3:
-------------------------------
Epoch 1
-------------------------------
loss: 5.407807  [   16/  728]
loss: 5.168161  [  368/  728]
Epoch 2
-------------------------------
loss: 4.651819  [   16/  728]
loss: 393.726471  [  368/  728]
Epoch 3
-------------------------------
loss: 2.349666  [   16/  728]
loss: 689.372253  [  368/  728]
Epoch 4
-------------------------------
loss: 2.046422  [   16/  728]
loss: 2497.015137  [  368/  728]
Epoch 5
-------------------------------
loss: 1.496680  [   16/  728]
loss: 2.118964  [  368/  728]
Epoch 6
-------------------------------
loss: 1.462994  [   16/  728]
loss: 699.776733  [  368/  728]
Epoch 7
-------------------------------
loss: 1.341094  [   16/  728]
loss: 520.818054  [  368/  728]
Epoch 8
-------------------------------
loss: 1.666034  [   16/  728]
loss: 1423.583496  [  368/  728]
Epoch 9
-------------------------------
loss: 1.160135  [   16/  728]
loss: 944.887024  [  368/  728]
Epoch 1

 60%|███████████████████████████                  | 3/5 [00:57<00:38, 19.36s/it]

loss: 0.204302  [  368/  728]
Epoch 497
-------------------------------
loss: 0.257358  [   16/  728]
loss: 0.259067  [  368/  728]
Epoch 498
-------------------------------
loss: 0.210452  [   16/  728]
loss: 0.263939  [  368/  728]
Epoch 499
-------------------------------
loss: 0.201757  [   16/  728]
loss: 0.360133  [  368/  728]
Epoch 500
-------------------------------
loss: 0.223772  [   16/  728]
loss: 0.228195  [  368/  728]
Done!
Trial 4:
-------------------------------
Epoch 1
-------------------------------
loss: 5.088980  [   16/  728]
loss: 4548.611816  [  368/  728]
Epoch 2
-------------------------------
loss: 3.791866  [   16/  728]
loss: 623.543396  [  368/  728]
Epoch 3
-------------------------------
loss: 2.063714  [   16/  728]
loss: 2362.058594  [  368/  728]
Epoch 4
-------------------------------
loss: 1.629905  [   16/  728]
loss: 2240.039795  [  368/  728]
Epoch 5
-------------------------------
loss: 1.273923  [   16/  728]
loss: 2.031538  [  368/  728]
Epoc

 80%|████████████████████████████████████         | 4/5 [01:17<00:19, 19.54s/it]

loss: 0.397185  [  368/  728]
Epoch 498
-------------------------------
loss: 0.301112  [   16/  728]
loss: 0.170771  [  368/  728]
Epoch 499
-------------------------------
loss: 0.295820  [   16/  728]
loss: 0.166149  [  368/  728]
Epoch 500
-------------------------------
loss: 0.292695  [   16/  728]
loss: 0.140882  [  368/  728]
Done!
Trial 5:
-------------------------------
Epoch 1
-------------------------------
loss: 5.454250  [   16/  728]
loss: 5.728818  [  368/  728]
Epoch 2
-------------------------------
loss: 4.906533  [   16/  728]
loss: 1285.765137  [  368/  728]
Epoch 3
-------------------------------
loss: 3.734469  [   16/  728]
loss: 2.809154  [  368/  728]
Epoch 4
-------------------------------
loss: 2.347260  [   16/  728]
loss: 800.098267  [  368/  728]
Epoch 5
-------------------------------
loss: 2.157612  [   16/  728]
loss: 483.391876  [  368/  728]
Epoch 6
-------------------------------
loss: 1.776512  [   16/  728]
loss: 556.245178  [  368/  728]
Epoch 7


100%|█████████████████████████████████████████████| 5/5 [01:39<00:00, 19.88s/it]

loss: 0.253100  [  368/  728]
Epoch 500
-------------------------------
loss: 0.198855  [   16/  728]
loss: 0.266451  [  368/  728]
Done!





In [35]:
results

{'caco2_wang': [0.383, 0.023]}

### TODO
- regularization
- early stopping