## Check out the Polaris and TDC benchmarks and try them out

The goal of this notebook is to write code that extracts all the classification datasets from the Polaris hub, together with their training and test sets, so that I can use them in the ftf project.

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import polaris as po
from polaris.hub.client import PolarisHubClient, PolarisFileSystem

In [2]:
# Single-task classification benchmarks, written as "<owner>/<name>" hub slugs.
# The owner of the PKIS2 benchmarks is "polaris"; the earlier "polars/..." entries
# were typos and would not resolve on the hub.
# Keep "tdcommons/ames" last: the cell that loads a benchmark picks `benchmarks[-1]`.

benchmarks = [
    "polaris/pkis2-egfr-wt-c-1",
    "polaris/pkis2-ret-wt-c-1",
    "polaris/pkis2-kit-wt-c-1",
    "polaris/pkis2-kit-wt-cls-v2",
    "polaris/pkis2-ret-wt-cls-v2",
    "tdcommons/ames",
]

In [3]:
# Load the last entry of `benchmarks` (currently "tdcommons/ames") through Polaris.
benchmark = po.load_benchmark(benchmarks[-1])

⠼ Fetching dataset... . 

[32m2025-01-15 15:42:39.398[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (dev) is different from the currently installed version of Polaris (0.7.9).[0m
  self._color = self._set_color(value) if value else value
[32m2025-01-15 15:42:39.408[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (dev) is different from the currently installed version of Polaris (0.7.9).[0m


✅ SUCCESS: [1mFetched dataset.[0m
 
✅ SUCCESS: [1mFetched benchmark.[0m
 


In [4]:
# Ask the benchmark for its train/test split.
# Below, the training subset is read via `.inputs` and `.targets`; only `.inputs`
# is read from the test subset.
train, test = benchmark.get_train_test_split()

In [5]:
# Build the training table column-by-column so each column keeps its own dtype.
# Stacking the two arrays as rows and transposing (the previous approach) forces
# every column to `object`, including the integer labels.
# Assumes `train.inputs` and `train.targets` are 1-D and equally long — TODO confirm
# for multi-task benchmarks.
train_df = pd.DataFrame(
    {
        "smiles": train.inputs,
        "labels": train.targets.astype(int),
    }
)

In [6]:
# Single-column table holding the test-set inputs under the name "smiles".
test_df = pd.DataFrame({"smiles": test.inputs})

### Try out a simple model on the benchmark

**Just to understand how to parse the results.**

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import AllChem, DataStructs

In [9]:
# Disable RDKit logging for every channel matching 'rdApp.*' so that per-molecule
# messages do not flood the notebook output while fingerprints are computed.
RDLogger.DisableLog('rdApp.*') 

In [30]:
def compute_morgan(smile, radius, n_bits=2048):
    """
    Compute the Morgan fingerprint bit vector for a single SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string describing one molecule.
    radius : int
        Morgan radius forwarded to RDKit.
    n_bits : int, optional
        Length of the fingerprint bit vector. Defaults to 2048, the value that
        was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        1-D array filled by RDKit with the fingerprint bits.

    Raises
    ------
    ValueError
        If RDKit cannot parse `smile` into a molecule.
    """
    molecule = Chem.MolFromSmiles(smile)
    if molecule is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending input instead of an opaque error inside the fingerprinter.
        raise ValueError(f"Could not parse SMILES string: {smile!r}")

    fp_object = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)

    # Start from an empty array; this relies on ConvertToNumpyArray resizing the
    # destination to the fingerprint length while filling it.
    morgan_fp = np.zeros((0, ))
    DataStructs.ConvertToNumpyArray(fp_object, morgan_fp)

    return morgan_fp


In [11]:
# One fingerprint per molecule -> feature matrix of shape (n_molecules, n_bits).
# The earlier `.reshape(-1, 1)` flattened this matrix into a single column, which
# destroyed the per-molecule rows and left X_train with a different number of rows
# than y_train.
# NOTE(review): radius 2 is used here, but the fingerprints fed to the model for
# prediction later in this notebook use radius 3 — confirm which radius the model
# expects before re-enabling training.
X_train = np.array([compute_morgan(smile, 2) for smile in tqdm(train_df["smiles"])])
y_train = train_df["labels"].values.astype(int)

# Every molecule must have exactly one label.
assert len(X_train) == len(y_train), "feature/label row counts differ"

  0%|          | 0/5821 [00:00<?, ?it/s]

In [12]:
# One fingerprint per test molecule -> matrix of shape (n_molecules, n_bits).
# The earlier `.reshape(-1, 1)` collapsed this matrix into a single column, which
# no estimator trained on per-molecule fingerprints could consume.
# NOTE(review): `X_test` is recomputed with radius 3 further down, right before
# prediction; this radius-2 version is only relevant if the model is re-trained
# on the radius-2 `X_train`.
X_test = np.array([compute_morgan(smile, 2) for smile in tqdm(test_df["smiles"])])

  0%|          | 0/1457 [00:00<?, ?it/s]

In [None]:
### Train the model
# Training is disabled in this notebook: the next cell loads an already trained
# model from disk. Uncomment the two lines below to fit a fresh random forest
# on X_train / y_train instead.
# rf = RandomForestClassifier(n_estimators=300, max_depth=64, random_state=0)
# rf.fit(X_train, y_train)

In [None]:
# Load the previously trained model from disk instead of fitting it here.
import pickle
from pathlib import Path

# Location of the serialized model, relative to the notebook's working directory.
MODEL_PATH = Path("../../eGuard/run_pipeline/results/trained_models/ames.pkl")

if not MODEL_PATH.is_file():
    # Fail with the resolved location so a wrong working directory is obvious.
    raise FileNotFoundError(
        f"Trained model not found at {MODEL_PATH.resolve()}; "
        "run the notebook from its own directory or train the model first."
    )

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load model files produced by a trusted pipeline.
with MODEL_PATH.open("rb") as model_file:
    rf = pickle.load(model_file)

In [40]:
# Featurize the test molecules: one radius-3 Morgan fingerprint per SMILES,
# stacked into a single array for the model.
test_fingerprints = [compute_morgan(smiles, 3) for smiles in tqdm(test_df["smiles"])]
X_test = np.array(test_fingerprints)

  0%|          | 0/1457 [00:00<?, ?it/s]

In [41]:
# Score the test set with the loaded model: hard class predictions plus the
# probability taken from column 1 of `predict_proba`.
class_probabilities = rf.predict_proba(X_test)
proba = class_probabilities[:, 1]
predictions = rf.predict(X_test)

In [42]:
# Hand both the predicted labels and the predicted probabilities to the benchmark
# so it can compute its metric(s) on the test set.
results = benchmark.evaluate(y_pred=predictions, y_prob=proba)

In [43]:
# Table of scores attached to the evaluation result (one row per metric in the
# output shown at the end of this notebook).
results_df = results.results

In [44]:
# Short label for each metric, cut out of its repr: the text after the first "."
# and before the following ":".
metrics_name = [
    repr(metric).split(".")[1].split(":")[0]
    for metric in results_df.Metric.tolist()
]

In [45]:
# Label the rows with the short metric names.
# NOTE(review): this modifies `results_df` in place; it presumably is the same
# object as `results.results`, so that table changes too — confirm.
results_df.index = metrics_name

In [46]:
# Display the scores table, now indexed by metric name.
results_df

Unnamed: 0,Test set,Target label,Metric,Score
roc_auc,test,Y,Metric.roc_auc,0.836582
