## Configuration

In [16]:
import subprocess
import os
print('Current conda environment:', os.environ['CONDA_DEFAULT_ENV'])
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # set gpu

os.environ["PATH"] += ":/usr/local/openeye/bin"
os.environ["OE_LICENSE"] = "/home/fts_g_ucla_edu/Projects/oe_license.txt"

cwd = os.getcwd()
print('Working directory:', cwd)

import warnings
warnings.filterwarnings('ignore')

import random
random.seed(42)

Current conda environment: reinvent
Working directory: /home/fts_g_ucla_edu/Projects/rips-relay/experiments


In [17]:
import numpy as np

import pandas as pd

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolsToGridImage
import useful_rdkit_utils as uru
from rdkit.Chem.rdFMCS import FindMCS

import pickle

import mols2grid

## Generating Data

In [18]:
models = ['reinvent', 'coati', 'safe']
prefixes = ['2zdt', '2qd9', '2ojg']

In [19]:
reinvent_distributions, crem_distributions, coati_distributions, safe_distributions = [], [], [], []

for pdb in prefixes:
    for model in models:

        DF_FILEPATH = f'data/{model}_dataframe.csv'

        arg1 = '--model'
        arg2 = '--sample'
        arg3 = '--remove_odd_rings'
        arg4 = '--dock'
        arg5 = '--pdb'

        args = ['python3', 'generate_analogs.py',
                arg1, model,
                arg2, '200',
                arg3,
                arg4,
                arg5, pdb]

        # Change directory to generate analogs with python script
        %cd ..

        subprocess.run(args,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
                
        # Change directory back to that of the current notebook
        %cd experiments

        df = pd.read_csv(DF_FILEPATH, index_col=0)

        df['Model'] = model

        if model == 'reinvent':
            reinvent_distributions.append(df)
        elif model == 'crem':
            crem_distributions.append(df)
        elif model == 'coati':
            coati_distributions.append(df)
        elif model == 'safe':
            safe_distributions.append(df)

/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments
/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments
/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments
/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments
/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments
/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments
/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments
/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments
/home/fts_g_ucla_edu/Projects/rips-relay
/home/fts_g_ucla_edu/Projects/rips-relay/experiments


In [25]:
data = {'reinvent' : reinvent_distributions,
        'crem' : crem_distributions,
        'coati' : coati_distributions,
        'safe' : safe_distributions}

In [26]:
with open('lists_docking.pkl', 'wb') as file:
    pickle.dump(data, file)

In [27]:
for d in data:
    for df in data[d]:
        print(len(df))

198
198
198
66
66
66
195
128
166


## Defining Utils

In [22]:
def remove_odd_rings(df):

    ring_system_lookup = uru.RingSystemLookup.default()
    df['ring_systems'] = df.SMILES.apply(ring_system_lookup.process_smiles)
    df[['min_ring','min_freq']] = df.ring_systems.apply(uru.get_min_ring_frequency).to_list()

    odd_rings = df['min_freq'] < 100

    df['Odd rings'] = ~ odd_rings

    df = df[df['Odd rings']].drop(columns=['Odd rings'])
    
    return df.loc[:, ['SMILES', 'Input_SMILES', 'Model']]

In [23]:
'''
Removes distributions with zero molecules

Removes invalid SMILES strings

Removes odd ring systems from distributions
'''
def preprocess_data(data):

    for d in data:
        
        for i, df in enumerate(data[d]):
            
            if df.empty:
                pass
            else:
                smiles_list = df['SMILES'].to_list()

                valid_smiles = []
                invalid_smiles = []

                for smiles in smiles_list:
                        
                    try:
                        molecule = Chem.MolFromSmiles(smiles, sanitize=True)
                        if molecule is not None:
                            valid_smiles.append(True)
                        else:
                            valid_smiles.append(False)
                    except Exception as e:
                        invalid_smiles.append(smiles)

                df = remove_odd_rings(df)

                data[d][i] = df
            
        filtered_df_list = [df for df in data[d] if len(df) > 0]
    
        data[d] = filtered_df_list
                
    
    return data

## Data Preprocessing

In [24]:
# data = preprocess_data(data)

## Docking