In [None]:
import os
import json
from collections import defaultdict
import random
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
import deepchem as dc
from deepchem.models import GCNModel
import optuna
from optuna.pruners import SuccessiveHalvingPruner

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook touches.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG,
    PyTorch's CPU RNG and the RNG of every CUDA device, and switches cuDNN to
    deterministic mode with benchmark mode turned off.

    Args:
        seed: Integer seed shared by all generators.
    """
    # cuDNN flags are independent of the seeding calls below.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(42)

In [None]:
# Use the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
# Name of the label column to model (formatted like a Gene Ontology term ID).
ch = "GO:0006281"
# Same identifier with every colon stripped out.
ch_dir = "".join(ch.split(":"))

In [None]:
def evaluate_metrics(dataset, model):
    """Score a two-class model on ``dataset``.

    Args:
        dataset: Object exposing the ground-truth labels as ``dataset.y``.
        model: Object whose ``predict(dataset)`` returns a 2-D array; column 1
            is used as the positive-class score.

    Returns:
        dict with the scalar entries 'Accuracy', 'Precision', 'Recall',
        'F1 Score' (all computed on scores thresholded at 0.5) and 'ROC AUC'
        (computed on the raw scores), plus 'ROC Curve Data', a dict holding the
        'FPR', 'TPR' and 'Thresholds' arrays returned by ``roc_curve``.
    """
    labels = dataset.y
    positive_scores = model.predict(dataset)[:, 1]
    hard_calls = (positive_scores > 0.5).astype(int)

    # Threshold-dependent metrics, evaluated in the order they are reported.
    thresholded_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    report = {
        metric_name: scorer(labels, hard_calls)
        for metric_name, scorer in thresholded_scorers
    }

    # Threshold-free metrics work on the raw positive-class scores.
    report['ROC AUC'] = roc_auc_score(labels, positive_scores)

    # Points needed to plot the ROC curve later.
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(labels, positive_scores)
    report['ROC Curve Data'] = {
        'FPR': false_pos_rate,
        'TPR': true_pos_rate,
        'Thresholds': cutoffs,
    }
    return report

In [None]:
# Load the raw training table from a path relative to the notebook's working
# directory. Later cells read its "SMILES" column and the label column named
# by `ch`.
df = pd.read_csv("pqqgnn/raw/training0827v1-remove.csv")
df

In [None]:
from sklearn.utils import resample


# Partition the rows by label: rows labelled 1 are treated as the minority
# class, rows labelled 0 as the majority class.
df_minority = df.loc[df[ch] == 1]
df_majority = df.loc[df[ch] == 0]

# Draw, without replacement, as many majority rows as there are minority rows.
# The fixed random_state keeps the draw repeatable.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=df_minority.shape[0],
    random_state=123,
)

# Stack the down-sampled majority rows on top of all minority rows.
df_balanced = pd.concat([df_majority_downsampled, df_minority])

df_balanced

In [None]:
# From here on `df` refers to the class-balanced frame; the raw table loaded
# from CSV is no longer reachable under this name (re-run the load cell to
# get it back).
df = df_balanced

In [None]:
# SMILES strings of the balanced set, as an array; this is the input handed to
# the featurizer below.
X_feed = df["SMILES"].values
X_feed

In [None]:
# Labels for the balanced set, taken from the column named by `ch`
# (the balancing step kept only rows whose value is 0 or 1).
y = df[ch].values
y

In [None]:
# NOTE(review): `df_test` is not defined by any cell above this one, so this
# cell raises NameError on Restart Kernel -> Run All unless the test frame is
# loaded elsewhere. TODO: add (or restore) the cell that loads `df_test`
# before this point.
X_test_feed = df_test["SMILES"].values
X_test_feed

In [None]:
# Convert every SMILES string in X_feed into a graph representation using
# DeepChem's MolGraphConvFeaturizer with its default settings.
# NOTE(review): the model built in `objective` hard-codes
# number_atom_features=30 -- confirm that matches this featurizer's output.
# NOTE(review): nothing here checks for SMILES that failed to featurize --
# confirm the input is clean or filter failures before splitting.
featurizer = dc.feat.MolGraphConvFeaturizer()
X_featurized = featurizer.featurize(X_feed)

In [None]:
# Hold out 20% of the featurized molecules for validation, stratified on the
# label so both splits keep the same class ratio; the split is seeded.
X_train, X_val, y_train, y_val = train_test_split(
    X_featurized,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Wrap each split in a DeepChem dataset (train first, then validation).
train_dataset, val_dataset = (
    dc.data.NumpyDataset(X=features, y=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
def objective(trial):
    """Optuna objective: train a GCN with sampled hyperparameters.

    Samples the dropout rates, predictor width, number of graph-conv layers
    and per-layer widths from ``trial``, trains a ``GCNModel`` on
    ``train_dataset`` one epoch at a time for 10 epochs, and reports the
    validation ROC AUC after every epoch so the study's pruner can stop the
    trial early.

    Args:
        trial: The ``optuna`` trial supplying hyperparameter suggestions.

    Returns:
        Validation ROC AUC measured after the final epoch.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner decides to stop the trial.
    """
    # --- Hyperparameter search space (suggestion order is kept fixed) ---
    gcn_dropout = trial.suggest_float("dropout_rate", 0, 0.5)
    head_dropout = trial.suggest_float("predictor_dropout", 0, 0.5)
    head_width = trial.suggest_int("predictor_hidden_feats", 32, 256, log=True)

    depth = trial.suggest_int("n_layers", 1, 3)
    layer_widths = []
    for layer_idx in range(depth):
        layer_widths.append(trial.suggest_int(f"layer_{layer_idx}_units", 32, 128))

    # --- Model: one binary classification task, one directory per trial ---
    gcn_settings = dict(
        model_dir=f'tmp_model_{trial.number}',
        n_tasks=1,
        graph_conv_layers=layer_widths,
        activation=None,
        residual=True,
        batchnorm=True,
        dropout=gcn_dropout,
        predictor_hidden_feats=head_width,
        predictor_dropout=head_dropout,
        mode='classification',
        number_atom_features=30,
        n_classes=2,
        self_loop=True,
        device=device,
    )
    model = GCNModel(**gcn_settings)

    # --- Train for 10 epochs, one at a time, so each epoch can be pruned ---
    val_auc = None
    for epoch in range(10):
        model.fit(train_dataset, nb_epoch=1)

        # Report this epoch's validation score as the intermediate value.
        val_auc = evaluate_metrics(val_dataset, model)['ROC AUC']
        trial.report(val_auc, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return val_auc

In [None]:
# Define the study: maximize validation ROC AUC, pruning weak trials with
# successive halving.
# The TPE sampler is seeded explicitly because it keeps its own random state;
# the global seeding done by set_seed() does not make its suggestions
# repeatable, so an unseeded sampler yields a different search on every run.
pruner = SuccessiveHalvingPruner()
study = optuna.create_study(
    direction='maximize',
    pruner=pruner,
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run 150 trials; the objective function is passed in as the first argument.
study.optimize(objective, n_trials=150)

In [None]:
# Report the best hyperparameter set found and the validation score it reached.
best_params, best_score = study.best_params, study.best_value
print(f"Best parameters: {best_params}")
print(f"Best ROC AUC: {best_score}")