In [46]:
import subprocess
import os
print('Current conda environment:', os.environ['CONDA_DEFAULT_ENV'])
os.environ['TOKENIZERS_PARALLELISM'] = "false"

cwd = os.getcwd()
print('Working directory:', cwd)

import warnings
warnings.filterwarnings('ignore')

import random
random.seed(42)

Current conda environment: reinvent
Working directory: /home/fts_g_ucla_edu/Projects/rips-relay/experiments


In [74]:
import pandas as pd

from rdkit import Chem

from molscore import MolScore
from moleval.metrics.metrics import GetMetrics

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, roc_auc_score, RocCurveDisplay

from xgboost import XGBClassifier

import pickle

In [17]:
fragments = []

f = open("data/fragments.smi", "r")

for i in range(100):
    mol = f.readline()
    fragments.append(mol[:-1])

In [18]:
fragments[:5]

['C1c2cncnc2NC1',
 'Cc1n[nH]c(N)c1C#N',
 'NCc1cc(Br)cnc1',
 'CC(Nc1nccs1)=O',
 'Cc1c(CO)cncc1']

### Generate a distribution from each fragment for each model

In [19]:
reinvent_distributions, crem_distributions, coati_distributions, safe_distributions = [], [], [], []
models = ['reinvent', 'crem', 'coati', 'safe']

In [26]:
%%capture

for fragment in fragments[:5]:
    for model in models:

        DF_FILEPATH = f'data/{model}_dataframe.csv'

        arg1 = '--model'
        arg2 = '--input_frag'
        arg3 = '--sample'

        args = ['python3', 'generate_analogs.py',
                arg1, model,
                arg2, fragment,
                arg3, '50']

        # Change directory to generate analogs with python script
        %cd ..

        subprocess.run(args,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
                
        # Change directory back to that of the current notebook
        %cd experiments

        df = pd.read_csv(DF_FILEPATH, index_col=0)

        df['Model'] = model

        if model == 'reinvent':
            reinvent_distributions.append(df)
        elif model == 'crem':
            crem_distributions.append(df)
        elif model == 'coati':
            coati_distributions.append(df)
        elif model == 'safe':
            safe_distributions.append(df)

In [49]:
data = {'reinvent' : reinvent_distributions,
        'crem' : crem_distributions,
        'coati' : coati_distributions,
        'safe' : safe_distributions}

In [50]:
with open('lists.pkl', 'wb') as file:
    pickle.dump(data, file)

In [51]:
with open('lists.pkl', 'rb') as file:
    data = pickle.load(file)

In [56]:
reinvent_distributions = data['reinvent']
crem_distributions = data['crem']
coati_distributions = data['coati']
safe_distributions = data['safe']

In [71]:
smiles = reinvent_distributions[0]['SMILES'].to_list()

In [72]:
smiles

['O=[N+]([O-])c1ccc2c(n1)C(O)CN2',
 'CC(C)Oc1ncnc2c1CNC2',
 'Nc1nc2c(nc1Br)CCN2',
 'Nc1nc(Cl)c2c(n1)CNCC2',
 'Cc1nc(N)c2c(n1)CCN2C',
 'Cc1nc2c(nc1Cl)NCC2',
 'CN1CCc2nc(N)c3ncc(N)nc3c2C1',
 'Brc1ccc2[nH]ncc2n1',
 'CC(C)[C@H]1Cc2cncnc2N1',
 'CC1CNc2ncncc2C1',
 'CCC1Cc2cncnc2N1',
 'Nc1cc2c(cn1)CNCC2',
 'CCCNc1nc2c(nc1O)CCN2',
 'c1ncc2c(n1)NC1(CC2)CC1',
 'CC1CNc2cncnc21',
 'c1ncc2c(n1)NC1CCCCC21',
 'O=[N+]([O-])c1cc2c(cn1)C(O)CN2',
 'CN1CCc2cncnc21',
 'c1c[nH]c(-c2ncc3c(n2)NCC3)c1',
 'CC1CNc2ncncc2N1',
 'N#Cc1ncc2c(n1)NCC2',
 'CC1Cc2nc(N)cnc2N1',
 'CC1CCCNc2ncncc21',
 'Nc1nc(N)c2c(n1)CCN2',
 'CC1Cc2nc(N)nc(N)c2N1',
 'O=[N+]([O-])c1cc2c(cn1)CNCC2',
 'CNc1nc2c(nc1C)NCC2',
 'C[C@@H]1Cc2cnc(N)nc2N1',
 'Brc1ncnc2c1CCN2',
 'CC1CCNc2nccnc21',
 'Clc1ccc2[nH]cnc2n1',
 'c1ncc2c(n1)OCCCC2',
 'CCN1CCc2cncnc21',
 'Brc1ncc2c(n1)CCN2',
 'c1cnc2c(c1)CCN2',
 'CC1(C)CNc2ncncc21',
 'COC1CNc2nccnc2CN1',
 'c1cnc2c(n1)CCN2',
 'C[C@@H]1CNc2cncnc2N1',
 'c1ncc2c(n1)OCCC2',
 'O=[N+]([O-])c1ccc2[nH]ncc2c1',
 'c1ccc(

In [75]:
MetricEngine = GetMetrics(
    n_jobs=1,
    device='cpu',
    batch_size=512,
    test=None,
    train=None,
    target=None,
)
metrics = MetricEngine.calculate(
    smiles,
    calc_valid=True,
    calc_unique=True,
    unique_k=10000,
    se_k=1000,
    sp_k=1000,
    properties=True,
)

Cleaning up reference smiles


FileNotFoundError: [Errno 2] No such file or directory: '/home/fts_g_ucla_edu/.local/lib/python3.10/site-packages/moleval/metrics/fcd_torch/ChemNet_v0.13_pretrained.pt'

In [1]:
from molscore import MockGenerator
from moleval.metrics.metrics import GetMetrics

mg = MockGenerator()
GEN_SMILES = mg.sample(50)
TRAIN_SMILES = mg.sample(500)
TEST_SMILES = mg.sample(20)
TARGET_SMILES = mg.sample(20)

MetricEngine = GetMetrics(
    n_jobs=1,
    device='cpu',
    batch_size=512,
    run_fcd=False,
    test=TEST_SMILES,
    train=TRAIN_SMILES,
    target=TARGET_SMILES,
)
metrics = MetricEngine.calculate(
    GEN_SMILES,
    calc_valid=True,
    calc_unique=True,
    unique_k=10000,
    se_k=1000,
    sp_k=1000,
    properties=True,
)

Cleaning up reference smiles
Computing test pre-statistics


FileNotFoundError: [Errno 2] No such file or directory: '/home/fts_g_ucla_edu/.local/lib/python3.10/site-packages/moleval/metrics/NP_Score/publicnp.model.gz'