In [1]:
import os
import json
from collections import defaultdict
import random
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
import deepchem as dc
from deepchem.models import GCNModel

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG and
    PyTorch (CPU generator plus all CUDA devices), then asks cuDNN for
    deterministic kernels with autotuning (``benchmark``) switched off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags: reproducible kernels, no benchmark-based algorithm selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)

In [3]:
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
# Label column (pathway / GO term identifier) modelled in this notebook.
ch = "GO:0098869"
# Directory-safe form of the identifier: drop every ':' regardless of where it
# sits. Slicing around index 2 only worked for two-letter prefixes such as
# "GO:"; a label like "KEGG:hsa04064" would have kept its colon.
ch_dir = ch.replace(":", "")

In [5]:
# Output locations for this label: `directory` is the per-label root and
# `dir_metrics` holds the per-fold metric JSON files.
directory = f'pqqgnn/{ch_dir}'
dir_metrics = f'pqqgnn/{ch_dir}/metrics'

# exist_ok=True makes the cell idempotent without a separate existence check
# (no check-then-create race); missing parent directories are created too.
os.makedirs(directory, exist_ok=True)
os.makedirs(dir_metrics, exist_ok=True)

def json_serializable(item):
    """Convert NumPy values into plain Python objects that ``json`` can encode.

    Intended for use as the ``default=`` hook of ``json.dump`` / ``json.dumps``.

    Parameters
    ----------
    item : object
        The value the JSON encoder could not serialise on its own.

    Returns
    -------
    object
        ``bool`` for NumPy booleans, ``float`` for NumPy floating scalars,
        ``int`` for NumPy integer scalars, a (nested) ``list`` for arrays,
        and ``item`` unchanged for anything else.
    """
    # np.bool_ is neither np.integer nor np.floating, so it needs its own
    # branch; without it the encoder gets the same object back and fails with
    # a misleading "Circular reference detected" error.
    if isinstance(item, np.bool_):
        return bool(item)
    if isinstance(item, np.floating):
        return float(item)
    if isinstance(item, np.integer):
        return int(item)
    if isinstance(item, np.ndarray):
        return item.tolist()
    # Unknown types are passed through untouched (original behaviour).
    return item

In [6]:
def evaluate_metrics(dataset, predictor=None):
    """Score a binary classifier on ``dataset`` and return its metrics.

    Parameters
    ----------
    dataset : object
        Must expose the true labels as ``dataset.y`` and be accepted by the
        predictor's ``predict`` method.
    predictor : object, optional
        Model whose ``predict(dataset)`` output is scored. When omitted, the
        notebook-level global ``model`` is used, which is what the original
        one-argument call did.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1 Score'``,
        ``'ROC AUC'``, ``'ROC Curve Data'`` (a dict with ``'FPR'``, ``'TPR'``
        and ``'Thresholds'``) and ``'Confusion Matrix'``.
    """
    if predictor is None:
        # Backward-compatible fallback: use whatever `model` currently is.
        predictor = model

    y_true = dataset.y
    y_pred = predictor.predict(dataset)

    # Column 1 is used as the positive-class score throughout.
    # NOTE(review): assumes predictions are 2-D with one column per class - confirm.
    positive_scores = y_pred[:, 1]
    y_pred_binary = (positive_scores > 0.5).astype(int)

    # Metrics based on the hard 0/1 decision at the 0.5 threshold.
    accuracy = accuracy_score(y_true, y_pred_binary)
    precision = precision_score(y_true, y_pred_binary)
    recall = recall_score(y_true, y_pred_binary)
    f1 = f1_score(y_true, y_pred_binary)

    # Metrics based on the raw score (threshold-free).
    roc_auc = roc_auc_score(y_true, positive_scores)

    # Get the data needed for plotting the ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, positive_scores)

    # Calculate confusion matrix
    cm = confusion_matrix(y_true, y_pred_binary)

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
        'ROC Curve Data': {'FPR': fpr, 'TPR': tpr, 'Thresholds': thresholds},
        'Confusion Matrix': cm
    }

In [7]:
# Load the labelled training table. As the display below shows, it has a
# SMILES column followed by one 0/1 column per pathway / GO label.
df = pd.read_csv("pqqgnn/raw/train.csv")
df

Unnamed: 0,SMILES,WP:3844,GO:0000165,GO:0004896,KEGG:hsa04064,KEGG:hsa04210,KEGG:hsa04630,GO:0016209,GO:0098869,GO:0072593,GO:0006281
0,C1=CC=C(C(=C1)C2=NC3=CC=CC=C3C(=O)N2)C(=O)O,1,0,0,0,0,0,1,0,0,0
1,CC1=NC2=C(N1)C(=CC(=N2)C3=C(C=CC(=C3)OC)OC)C(=O)O,1,0,0,0,0,0,1,0,0,0
2,COC1=CC=C(C=C1)C2=C3C(=CC(=NC3=NN2)C4=CC(=C(C(...,1,0,0,0,0,0,0,0,0,0
3,C1COCCN1C2=NC3=C(C(=N2)C4=CC(=CC=C4)O)N=C(C=C3...,1,0,0,0,0,0,0,0,0,0
4,CC1=CC(=NC2=C1N=C(N=C2C3=CC(=CC=C3)O)N4CCOCC4)...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
11537,N.Cl[Pt]Cl,0,0,0,0,0,0,0,0,0,1
11538,N.Cl[TeH]1(Cl)(Cl)OCCO1,0,0,1,0,1,0,0,0,0,0
11539,NC12CC3CC(CC(C3)C1)C2,0,0,0,0,1,0,0,0,0,0
11540,NCCCCNCCCN,0,0,0,0,1,0,1,0,0,0


In [8]:
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold


# Split the rows by the 0/1 value of the selected label column.
df_minority = df[df[ch]==1]
df_majority = df[df[ch]==0]

# The names assume label 1 is the rarer class. If it is not, swap the two
# groups so that the larger one is always the one being downsampled;
# otherwise resample(replace=False) would be asked for more rows than exist
# and raise.
if len(df_minority) > len(df_majority):
    df_minority, df_majority = df_majority, df_minority

# Resample the majority class to match the minority class
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority),  # to match minority class
                                   random_state=123) # reproducible results

# Combine minority class with downsampled majority class
df_balanced = pd.concat([df_majority_downsampled, df_minority])

df_balanced

Unnamed: 0,SMILES,WP:3844,GO:0000165,GO:0004896,KEGG:hsa04064,KEGG:hsa04210,KEGG:hsa04630,GO:0016209,GO:0098869,GO:0072593,GO:0006281
11096,OS(O)(=O)=O.OS(O)(=O)=O.CN[C@@H]1[C@@H](O)[C@@...,0,0,0,0,1,0,0,0,0,0
5104,O=C(OCc1ccccc1)c1ccccc1,0,0,0,0,1,0,0,0,0,0
4229,Oc1cc2ccccc2cc1C([O-])=O.C[N+](C)(CCOc1ccccc1)...,0,0,0,0,1,0,0,0,0,0
3709,C1=CC=C(C(=C1)NC(=O)NC2=CC=CC(=C2)C(=O)O)Cl,0,0,0,0,0,1,1,0,0,0
4772,OC(=O)C(=C\c1ccc(O)c(O)c1)\C#N,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4405,CCNC(=O)NC1=NC=C2C(=C1)NN=C2C3=CC=C(C=C3)F,0,0,0,0,0,0,0,1,0,0
4421,COC1=CC=CC=C1C2=NN=C(O2)SCC(=O)NC3=CC=CC=C3,0,0,0,0,0,0,0,1,0,0
4435,C1=CC=C(C=C1)C(CO)NC(=O)C2=CC=C(C=C2)C3=CC=NC=C3,0,0,0,0,0,0,0,1,0,0
4436,C1=CC=C(C=C1)C2=CSC(=N2)NC(=O)C3=CC=CS3,0,0,0,0,0,0,0,1,0,0


In [9]:
# NOTE: rebinds `df` to the class-balanced frame. After this cell the full
# training table is only recoverable by re-running the cell that reads the CSV.
df = df_balanced

In [10]:
# Load the unlabelled test compounds; the display below shows `cid` and
# `SMILES` columns.
df_test = pd.read_csv("pqqgnn/raw/test.csv")
df_test

Unnamed: 0,cid,SMILES
0,19,C1=CC(=C(C(=C1)O)O)C(=O)O
1,127,C1=CC(=CC=C1CC(=O)O)O
2,177,CC=O
3,247,C[N+](C)(C)CC(=O)[O-]
4,264,CCCC(=O)O
...,...,...
203,157009725,COC1=CC(=CC(=C1O)O)C2=[O+]C3=CC(=CC(=C3C=C2OC4...
204,157009726,COC1=CC(=CC(=C1O)OC)C2=[O+]C3=CC(=CC(=C3C=C2OC...
205,157009736,C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)O[C@@H]2[C...
206,157009738,C1=C(C(=C(C(=C1SCC(C(=O)NCC(=O)O)NC(=O)CCC(C(=...


In [11]:
# SMILES strings of the balanced training set, as a NumPy array.
X_feed = df["SMILES"].to_numpy()
X_feed

array(['OS(O)(=O)=O.OS(O)(=O)=O.CN[C@@H]1[C@@H](O)[C@@H](O[C@H]2[C@H](N)C[C@H](N)[C@@H](O[C@H]3O[C@H]([C@@H](C)O)[C@@H](O)[C@H](O)[C@H]3N)[C@@H]2O)OC[C@]1(C)O',
       'O=C(OCc1ccccc1)c1ccccc1',
       'Oc1cc2ccccc2cc1C([O-])=O.C[N+](C)(CCOc1ccccc1)Cc1ccccc1', ...,
       'C1=CC=C(C=C1)C(CO)NC(=O)C2=CC=C(C=C2)C3=CC=NC=C3',
       'C1=CC=C(C=C1)C2=CSC(=N2)NC(=O)C3=CC=CS3',
       'C1=CC=C2C(=C1)C=CC3=C2C=CC4=C3C(=O)C=CC4=O'], dtype=object)

In [12]:
# 0/1 targets for the selected label column, aligned row-for-row with X_feed.
y = df[ch].to_numpy()
y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [13]:
# SMILES strings of the test compounds, as a NumPy array.
X_test_feed = df_test["SMILES"].to_numpy()
X_test_feed

array(['C1=CC(=C(C(=C1)O)O)C(=O)O', 'C1=CC(=CC=C1CC(=O)O)O', 'CC=O',
       'C[N+](C)(C)CC(=O)[O-]', 'CCCC(=O)O', 'C(CCN)CCN', 'C(=O)O',
       'C[N+](C)(C)CCO', 'C1=CC=C(C(=C1)C(=O)O)O',
       'C1=C(C=C(C(=C1O)O)O)C(=O)O', 'CCCCCCCC(=O)O', 'CNC', 'CCO',
       'C(C(CO)O)O', 'C1CCNC(C1)C(=O)O', 'C1(C(C(C(C(C1O)O)O)O)O)O',
       'C1=CC(=CN=C1)C(=O)O', 'C(=O)(C(=O)O)O', 'CCCCCCCCCCCCCCCC(=O)O',
       'C1=C(C2=C(C(=O)C(=O)C3=C2NC(=C3)C(=O)O)N=C1C(=O)O)C(=O)O', 'CCCO',
       'C(CCN)CN', 'CC1=NC=C(C(=C1O)CO)CO', 'CC(=O)C(=O)O',
       'CC1=C(SC=[N+]1CC2=CN=C(N=C2N)C)CCO', 'CN(C)C',
       'COC1=C(C=CC(=C1)C=O)O', 'CCCCCCCCCC(=O)O', 'CCCCCCCCCCCC(=O)O',
       'CCCCCCCCCCCCCCCCCC(=O)O', 'C[N+]1=CC=CC(=C1)C(=O)[O-]',
       'C([C@H]([C@H]([C@@H]([C@H](CO)O)O)O)O)O',
       'C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O',
       'C[C@]12CC[C@H]3[C@H]([C@@H]1CCC2=O)CCC4=C3C=CC(=C4)O',
       'C(CC(=O)N)[C@@H](C(=O)O)N',
       'C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O)CCC4=C3C=CC(=C4)O',
    

In [14]:
# Featurize the training SMILES with DeepChem's MolGraphConvFeaturizer
# (default settings). The same `featurizer` object is reused later for the
# test set so both splits are encoded identically.
# NOTE(review): SMILES that fail to featurize are not filtered here, and `y`
# is not re-aligned - confirm every training molecule featurizes cleanly.
featurizer = dc.feat.MolGraphConvFeaturizer()
X_featurized = featurizer.featurize(X_feed)



In [15]:
# Stratified k-fold cross-validation: a fresh GCN is trained per fold, metrics
# are recorded after every epoch, and the per-fold history is written to JSON.
N_SPLITS = 5
N_EPOCHS = 20
SUMMARY_KEYS = ("Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC")


def _build_fold_model(model_dir):
    """Return a new, untrained GCN classifier that checkpoints into ``model_dir``."""
    return GCNModel(
        model_dir=model_dir,
        n_tasks=1,                   # one label column is predicted
        graph_conv_layers=[64, 64],  # one entry per graph-conv layer (its width)
        activation=None,             # no explicit activation passed
        residual=True,
        batchnorm=True,
        dropout=0.0,                 # dropout inside the graph-conv layers
        predictor_hidden_feats=128,
        predictor_dropout=0.5,
        mode='classification',
        number_atom_features=30,
        n_classes=2,
        self_loop=True,
        device=device,               # torch.device selected earlier in the notebook
    )


skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

for fold, (train_index, val_index) in enumerate(skf.split(X_featurized, y)):
    # Slice features and labels for this fold.
    X_train, y_train = X_featurized[train_index], y[train_index]
    X_val, y_val = X_featurized[val_index], y[val_index]

    train_dataset = dc.data.NumpyDataset(X=X_train, y=y_train)
    val_dataset = dc.data.NumpyDataset(X=X_val, y=y_val)

    model_dir = f'pqqgnn/{ch_dir}/model_fold_{fold}'
    # Must stay bound to the global name `model`: evaluate_metrics reads it.
    model = _build_fold_model(model_dir)

    fold_train_metrics_list = []
    fold_val_metrics_list = []

    for epoch in range(N_EPOCHS):
        # One pass over the training data, then score both splits.
        loss = model.fit(train_dataset, nb_epoch=1)

        train_metrics = evaluate_metrics(train_dataset)
        val_metrics = evaluate_metrics(val_dataset)

        fold_train_metrics_list.append(train_metrics)
        fold_val_metrics_list.append(val_metrics)

        # Print running metrics
        print(f"Fold {fold + 1}, Epoch {epoch + 1}, Loss: {loss}")
        print("Training Metrics: ", *[train_metrics[key] for key in SUMMARY_KEYS])
        print("Validation Metrics: ", *[val_metrics[key] for key in SUMMARY_KEYS])

    # Save metrics for this fold
    train_metrics_path = os.path.join(dir_metrics, f'train_metrics_fold_{fold}.json')
    with open(train_metrics_path, 'w') as f:
        json.dump(fold_train_metrics_list, f, default=json_serializable)

    val_metrics_path = os.path.join(dir_metrics, f'val_metrics_fold_{fold}.json')
    with open(val_metrics_path, 'w') as f:
        json.dump(fold_val_metrics_list, f, default=json_serializable)

    # Keep only the final checkpoint of this fold's model.
    model.save_checkpoint(max_checkpoints_to_keep=1)

  assert input.numel() == input.storage().size(), (


Fold 1, Epoch 1, Loss: 0.5396005967084099
Training Metrics:  0.6977567886658795 0.6283524904214559 0.9681227863046045 0.7620817843866171 0.9034985621869812
Validation Metrics:  0.6839622641509434 0.6196319018404908 0.9528301886792453 0.7509293680297399 0.8936009255962977
Fold 1, Epoch 2, Loss: 0.3747994759503533
Training Metrics:  0.7857142857142857 0.7093425605536332 0.9681227863046045 0.818771842236645 0.9440911669633361
Validation Metrics:  0.7759433962264151 0.7038327526132404 0.9528301886792453 0.8096192384769539 0.9384789960840157
Fold 1, Epoch 3, Loss: 0.31049296435187845
Training Metrics:  0.845926800472255 0.7889546351084813 0.9445100354191264 0.8597528210639441 0.9557407280923435
Validation Metrics:  0.8443396226415094 0.7874015748031497 0.9433962264150944 0.8583690987124463 0.9508054467782129
Fold 1, Epoch 4, Loss: 0.3014644734999713
Training Metrics:  0.8884297520661157 0.8721719457013575 0.910271546635183 0.8908145580589254 0.965231827311896
Validation Metrics:  0.88443396

In [16]:
# Wrap the complete balanced training set (all folds combined) for the final model.
train_dataset_full = dc.data.NumpyDataset(X=X_featurized, y=y)

In [17]:
# Final model, trained on the full balanced set. The hyperparameters repeat
# the ones used for the per-fold models in the cross-validation cell.
model_full = GCNModel(
    model_dir=f'pqqgnn/{ch_dir}/model-full',
    n_tasks=1,  # A single label column is predicted
    graph_conv_layers=[64, 64],  # Two graph-conv layers, 64 output features each (see printed model below)
    activation=None,  # No explicit activation passed; the printed model below shows ReLU in the GraphConv layers
    residual=True,  # Residual connections (the `res_connection` Linear layers in the printed model)
    batchnorm=True,  # Batch normalization after each graph-conv layer (`bn_layer` in the printed model)
    dropout=0.0,  # Dropout rate inside the graph-conv layers
    predictor_hidden_feats=128,  # Hidden size of the prediction head - presumably the dense layer after readout; verify
    predictor_dropout=0.5,  # Dropout rate of the prediction head
    mode='classification',  # Classification rather than regression
    number_atom_features=30,  # Input width of the first GraphConv (in=30); must match the featurizer's atom features
    n_classes=2,  # Binary label
    self_loop=True,  # NOTE(review): presumably adds self-loop edges to each molecular graph - confirm
    device=device  # torch.device selected earlier: 'cuda' when available, else 'cpu'
)

In [18]:
# Train the final model for 20 epochs (the same epoch count as each CV fold);
# the value echoed below is what `fit` returns.
model_full.fit(train_dataset_full, nb_epoch=20)

0.16856104135513306

In [19]:
# Encode the test SMILES with the same featurizer used for training
X_test = featurizer.featurize(X_test_feed)

# Wrap the features in an unlabelled NumpyDataset
test_dataset = dc.data.NumpyDataset(X=X_test)

# Score every test compound with the model trained on the full balanced set
predictions = model_full.predict(test_dataset)

# Column 1 is used as the positive-class score; call it positive above 0.5
positive_scores = predictions[:, 1]
y_pred_binary = (positive_scores > 0.5).astype(int)
y_pred_binary

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [20]:
# Submission table: one row per test compound, holding its `cid` and the 0/1
# prediction stored under the label's column name (`ch`).
df_0098869 = pd.DataFrame({'CID': df_test["cid"], ch: y_pred_binary})
df_0098869

Unnamed: 0,CID,GO:0098869
0,19,0
1,127,0
2,177,0
3,247,0
4,264,0
...,...,...
203,157009725,0
204,157009726,0
205,157009736,0
206,157009738,0


In [24]:
# Persist the predictions next to this label's model artifacts (no index column).
df_0098869.to_csv(f'pqqgnn/{ch_dir}/predictions.csv', index=False)

In [29]:
# Print the underlying torch module of the final model to inspect its layers.
print(model_full.model)

GCN(
  (model): GCNPredictor(
    (gnn): GCN(
      (gnn_layers): ModuleList(
        (0): GCNLayer(
          (graph_conv): GraphConv(in=30, out=64, normalization=none, activation=<function relu at 0x00000251518CE4D0>)
          (dropout): Dropout(p=0.0, inplace=False)
          (res_connection): Linear(in_features=30, out_features=64, bias=True)
          (bn_layer): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): GCNLayer(
          (graph_conv): GraphConv(in=64, out=64, normalization=none, activation=<function relu at 0x00000251518CE4D0>)
          (dropout): Dropout(p=0.0, inplace=False)
          (res_connection): Linear(in_features=64, out_features=64, bias=True)
          (bn_layer): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
    )
    (readout): WeightedSumAndMax(
      (weight_and_sum): WeightAndSum(
        (atom_weighting): Sequential(
          (0): Linear(in_featu