In [2]:
import subprocess
import os
import warnings
import random

# Report which conda environment the kernel is running in. `.get` keeps this
# cell runnable outside conda (venv, CI, plain Jupyter), where
# CONDA_DEFAULT_ENV is not defined and a direct lookup would raise KeyError.
print('Current conda environment:', os.environ.get('CONDA_DEFAULT_ENV', '<not set>'))

# Environment flags are set here, before the library imports in the next cell.
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # set gpu

cwd = os.getcwd()
print('Working directory:', cwd)

# NOTE(review): this silences every warning category for the whole notebook,
# including pandas/RDKit deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# Seeds only Python's built-in `random` module; pandas sampling further down
# passes its own explicit `random_state`.
random.seed(42)

Current conda environment: reinvent
Working directory: /home/fts_g_ucla_edu/Projects/rips-relay/experiments


In [3]:
import numpy as np
from numpy.linalg import norm

import pandas as pd

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit import DataStructs
import useful_rdkit_utils as uru

import pickle

from molscore import MolScore
from moleval.metrics.metrics import GetMetrics

from fcd import get_fcd

import medchem as mc

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold

In [4]:
# Shared look-and-feel for every seaborn figure and pandas table below.

rc = {
    # Backgrounds
    "figure.facecolor": "#f7f9fc",
    "axes.facecolor": "#f7f9fc",
    # Axes, labels and ticks are drawn in black
    "axes.edgecolor": "#000000",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    # Faint grid
    "grid.color": "#EBEBE7",
    "grid.alpha": 0.4,
    # Typography
    "font.family": "serif",
}

default_palette = 'tab10'

# Apply the overrides on top of seaborn's default theme.
sns.set(rc=rc)

# Table rendering: at most 35 columns, floats shown with thousands separators
# and two decimals.
pd.options.display.max_columns = 35
pd.set_option('display.float_format', '{:,.2f}'.format)

In [5]:
# Number of input fragments to read from the top of the file. A later cell
# pairs fragment i with the i-th DataFrame of every model in `data`, so the
# order of the lines is significant.
n_fragments = 50

fragments = []

# `with` guarantees the handle is closed; the bare open() never closed it.
with open("data/fragments.smi", "r") as f:
    while len(fragments) < n_fragments:
        line = f.readline()
        if not line:
            # End of file reached early: stop instead of padding the list with
            # empty strings that would be silently used as fragments later.
            break
        # Strip only the line terminator. Slicing off the last character
        # (`line[:-1]`) would eat a real character when the final line has no
        # trailing newline and would leave a stray '\r' on CRLF files.
        fragments.append(line.rstrip("\r\n"))
    

In [6]:
# Make pandas print frames in full: no row/column truncation, a wide console,
# centred column headers and two-digit precision.
# Note: the `max_columns = None` here supersedes the limit of 35 configured in
# the plotting-settings cell above.

pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = 1000
pd.options.display.colheader_justify = 'center'
pd.options.display.precision = 2

In [7]:
# Load the pickled generation results into `data`. Later cells index it by
# model name (e.g. data["safe"]) and treat each value as a sequence of
# DataFrames that have a 'SMILES' column.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'lists.pkl' when it comes from a trusted source.
with open('lists.pkl', 'rb') as file:
    data = pickle.load(file)

In [8]:
# Sanity check: number of entries stored under the "safe" key
# (presumably one DataFrame per input fragment — a later cell pairs them with
# `fragments` by position; confirm).
print(len(data["safe"]))

50


In [9]:

# Keys of `data` to analyse; the order here fixes the order of `model_dfs`.
models = ['reinvent', 'crem', 'coati', 'safe']

In [10]:
# One combined DataFrame per model, in the same order as `models`.
#
# pd.concat is called once per model on the whole list of per-fragment frames.
# Growing a frame with concat inside a loop re-copies every accumulated row on
# each iteration (quadratic in the number of rows).
# Row indices of the individual frames are kept as-is (no ignore_index), so
# index labels may repeat across fragments.
# A model with no frames yields an empty DataFrame, because pd.concat raises
# on an empty list.
model_dfs = [
    pd.concat(data[model]) if len(data[model]) > 0 else pd.DataFrame()
    for model in models
]

In [11]:
# Keep only the rows whose SMILES string RDKit can parse and sanitize.
for i, df in enumerate(model_dfs):

    smiles_list = df['SMILES'].to_list()

    # Boolean mask with exactly one entry per row of `df`.
    valid_smiles = []
    # Every rejected SMILES of the current frame (unparsable or raising).
    invalid_smiles = []

    for smiles in smiles_list:

        try:
            molecule = Chem.MolFromSmiles(smiles, sanitize=True)
        except Exception:
            # e.g. a non-string value such as NaN; treat it as invalid.
            molecule = None

        is_valid = molecule is not None
        # The mask must get an entry on every path. Previously the exception
        # path skipped it, leaving the mask shorter than the frame and
        # breaking the boolean indexing below.
        valid_smiles.append(is_valid)
        if not is_valid:
            invalid_smiles.append(smiles)

    # .copy() makes the filtered frame independent of `df`, so later cells can
    # add columns to it without writing into a slice.
    model_dfs[i] = df[valid_smiles].copy()

In [12]:
# Attach RDKit molecules and InChIKeys, then keep the first row per InChIKey.
for i, df in enumerate(model_dfs):

    # Parse each SMILES once and reuse the Mol objects for the InChIKeys.
    mols = [Chem.MolFromSmiles(smi) for smi in df['SMILES']]

    # Build a new frame instead of mutating `df` in place: the frames in
    # `model_dfs` come from a boolean filter, and the previous version also
    # ran drop_duplicates twice (once into an unused `duplicates` variable).
    model_dfs[i] = (
        df.assign(
            ROMol=mols,
            inchi=[Chem.MolToInchiKey(mol) for mol in mols],
        )
        .drop_duplicates(subset="inchi")
    )

In [13]:
# Row count per model after de-duplication, in the order of `models`.
# Iterates over the list instead of hard-coding indices 0-3, so the line keeps
# working when models are added or removed; the output format is unchanged.
print(*(len(df) for df in model_dfs))

9102 8674 5068 8042


In [14]:
def remove_odd_rings(df, min_freq_threshold=100):
    """Drop molecules whose rarest ring system is below a frequency threshold.

    Each SMILES is run through ``uru.RingSystemLookup`` and
    ``uru.get_min_ring_frequency``; rows whose resulting ``min_freq`` is
    smaller than ``min_freq_threshold`` are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``SMILES`` column. It is not modified (the previous
        version added helper columns to the caller's frame in place).
    min_freq_threshold : int, default 100
        Rows with ``min_freq < min_freq_threshold`` are discarded.
        NOTE(review): the library appears to report a fixed frequency for
        ring-free molecules, so raising this value may drop them too — confirm
        against the useful_rdkit_utils documentation.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with the extra columns ``ring_systems``,
        ``min_ring`` and ``min_freq``.
    """
    # Work on a copy so the caller's frame (possibly a slice) stays untouched.
    result = df.copy()

    ring_system_lookup = uru.RingSystemLookup.default()
    result['ring_systems'] = result['SMILES'].apply(ring_system_lookup.process_smiles)
    # Assign from a plain list of pairs: this is positional, so it is not
    # affected by repeated index labels in `result`.
    result[['min_ring', 'min_freq']] = (
        result['ring_systems'].apply(uru.get_min_ring_frequency).to_list()
    )

    # Written as a negated `<` (not `>=`) so that rows with a missing frequency
    # are kept, exactly as before.
    keep = ~(result['min_freq'] < min_freq_threshold)

    return result[keep]

In [15]:
# Tag every per-fragment frame in `data` with the fragment it belongs to.
# Frames are matched to fragments purely by their position in each list.
# NOTE(review): `model_dfs` was already built from these frames with pd.concat
# in an earlier cell, so this column is presumably absent from `model_dfs` —
# confirm the intended cell order.
for model_name in data:

    for position, frame in enumerate(data[model_name]):

        frame['Input Fragment'] = fragments[position]

In [16]:
# Run the ring-system filter over every model's frame, replacing the list
# contents in place so `model_dfs` stays the same list object.
model_dfs[:] = [remove_odd_rings(frame) for frame in model_dfs]

In [17]:
# Row count per model after the ring-system filter, in the order of `models`.
# Iterates over the list instead of hard-coding indices 0-3, so the line keeps
# working when models are added or removed; the output format is unchanged.
print(*(len(df) for df in model_dfs))

8744 7481 4169 7306


In [18]:
# Number of molecules drawn per model for the downstream comparison.
sample_size = 2000

for i, df in enumerate(model_dfs):

    # Remove the helper columns added by the ring-system filter.
    trimmed = df.drop(columns=['ring_systems', 'min_ring', 'min_freq'])

    # Sample from the trimmed frame. The previous version sampled from `df`
    # again, which silently discarded the column drop above.
    # min(...) avoids the ValueError that sample() raises when a frame has
    # fewer rows than requested.
    model_dfs[i] = trimmed.sample(n=min(sample_size, len(trimmed)), random_state=42)

In [19]:
# Read the SMILES column of the ChEMBL reference sets into plain lists.

def _read_smiles_column(path, skip_header=True):
    """Return the first whitespace-separated token of each non-blank line.

    Blank lines are skipped; indexing ``line.split()[0]`` on them would raise
    IndexError. With ``skip_header`` (the default) the first remaining entry
    is dropped, matching the previous ``[1:]`` slice (presumably a header row —
    confirm against the data files).
    """
    with open(path, 'r') as handle:
        tokens = [
            fields[0]
            for fields in (line.split() for line in handle)
            if fields
        ]
    return tokens[1:] if skip_header else tokens


chembl = _read_smiles_column('data/chembl_sample.smi')
drugs = _read_smiles_column('data/chembl_drugs.smi')

In [20]:
from rdkit.Chem import QED

In [21]:
def calculate_qed(smiles_list):
    """Calculate the QED value of every SMILES string in ``smiles_list``.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to score.

    Returns
    -------
    list of float
        One value per input, in input order. A SMILES that RDKit cannot parse
        yields ``nan`` so the result stays aligned with the input; previously
        such an entry made ``QED.qed`` fail on ``None`` and aborted the run.
    """
    qed_values = []
    for smiles in smiles_list:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            # Unparsable SMILES: record a placeholder instead of crashing.
            qed_values.append(float("nan"))
        else:
            qed_values.append(QED.qed(molecule))
    return qed_values

In [None]:
#