# Demo

This notebook is a demonstration of how to use FRESCO to score molecules. It is recommended to run this notebook in a conda environment created from the `environment.yml` file in the root directory of the repo.

# Load fragment complexes

In [1]:
import pickle
from os import listdir

import pandas as pd  # imported here so this cell runs on a fresh kernel
from rdkit import Chem
from rdkit.Chem import SaltRemover

# NOTE(review): machine-specific absolute path; point this at your local copy
# of the Mpro data before running.
mpro_dir = '/home/wjm41/ml_physics/frag-pcore-screen/data/Mpro'

# Load the pickled fragment records. pickle.load can execute arbitrary code,
# so only open files from a trusted source.
with open(mpro_dir + '/frags_mpro.pickle', 'rb') as frag_file:
    frags = pickle.load(frag_file)

# The first element of each pickled record is used as the RDKit molecule;
# hydrogens are stripped before writing.
frag_mols = [Chem.RemoveHs(frag_record[0]) for frag_record in frags]

filenames = [f for f in listdir('data/Mpro/frags/') if f[-5:] == '0.pdb']
df = pd.read_csv('data/Mpro/frags/hits_summary.csv')

# Canonicalise the summary-table SMILES the same way as the fragments
# (hydrogens removed, salts stripped) so the two can be matched by string.
remover = SaltRemover.SaltRemover()
df['smiles'] = [
    Chem.MolToSmiles(remover.StripMol(Chem.RemoveHs(Chem.MolFromSmiles(smi))))
    for smi in df['Compound SMILES']
]

# Write every fragment to one SD file, tagging each with the 'Dataset' value
# of the summary row whose SMILES matches. try/finally guarantees the writer
# is closed (and the file flushed) even if a fragment cannot be matched.
w = Chem.SDWriter("mpro_frags.sdf")
try:
    for m in frag_mols:
        frag_smiles = Chem.MolToSmiles(m)
        mpro_id = df.loc[df['smiles'] == frag_smiles, 'Dataset'].values
        if len(mpro_id) == 0:
            raise ValueError(
                f'No row in hits_summary.csv matches fragment SMILES {frag_smiles!r}')
        # this is a specific value which is different per conformer
        m.SetProp("name", str(mpro_id[0]))
        w.write(m)
finally:
    w.close()


INFO:rdkit:Enabling RDKit 2022.03.1 jupyter extensions


In [2]:
import pandas as pd
from rdkit.Chem import PandasTools

# Read the fragment SD file back in as a DataFrame. The molecule objects are
# requested under the 'mol' column, which the featurisation step reads.
sdfFile = 'mpro_frags.sdf'
df_fragments = PandasTools.LoadSDF(
    sdfFile,
    idName='name',
    smilesName='SMILES',
    molColName='mol',
)

# TODO: dump Mpro and Mac1 crystal info into data folders

# Featurise into pharmacophores

In [3]:
from fresco.featurise import (
    calculate_frequency_weights_for_duplicate_fragments,
    return_pcore_dataframe_for_list_of_mols,
)

# Build the pharmacophore DataFrame from the array of fragment molecules
# stored in the 'mol' column.
pcore_df = return_pcore_dataframe_for_list_of_mols(
    df_fragments['mol'].values
)

100%|██████████| 23/23 [00:00<00:00, 180.81it/s]


Calculate the pharmacophore distance histograms and their weight histograms

In [4]:
from fresco.featurise import (
    calculate_pairwise_distances_between_pharmacophores_for_fragment_ensemble,
    return_default_pharmacophore_pairs,
)

interesting_pcores = return_default_pharmacophore_pairs()

# Both dictionaries are keyed by the pair label, a string of the form 'A-B'.
frag_pcore_histogram, frag_pcore_weight = {}, {}
for pcore_pair in interesting_pcores:
    # Split the label into its two pharmacophore types.
    core_a, core_b = pcore_pair.split('-')
    # The helper returns two values for the pair; they are stored as the
    # distance histogram and its weights respectively.
    pair_histogram, pair_weight = calculate_pairwise_distances_between_pharmacophores_for_fragment_ensemble(
        pcore_df, core_a, core_b)
    frag_pcore_histogram[pcore_pair] = pair_histogram
    frag_pcore_weight[pcore_pair] = pair_weight

# Fit FRESCO model

In [7]:
from fresco.model import fit_fresco_on_pcore_histograms

# Fit the model from the per-pair histograms and weights. The result is
# looked up by pharmacophore-pair label when a molecule is scored.
kde = fit_fresco_on_pcore_histograms(
    frag_pcore_histogram,
    interesting_pcores,
    frag_pcore_weight,
)


# Use FRESCO to score a molecule

In [17]:
from fresco.featurise import return_pcore_dataframe_from_single_rdkit_molecule
from rdkit import Chem  # imported here so the cell does not rely on an earlier cell
from rdkit.Chem import AllChem

smiles = 'Cc1ccccc1CNc1ccccc1NC(=O)[C@@H](O)c1cccnc1'
mol = Chem.MolFromSmiles(smiles)
if mol is None:
    # MolFromSmiles returns None (rather than raising) on unparsable input.
    raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
mol = Chem.AddHs(mol)

# Conformer embedding is stochastic; a fixed seed makes the 3D coordinates,
# and therefore the score computed from them, reproducible across runs.
embed_status = AllChem.EmbedMolecule(mol, randomSeed=42)
if embed_status == -1:
    # EmbedMolecule signals failure with -1 and leaves the molecule without
    # a conformer, which would break the featurisation below.
    raise RuntimeError(f'RDKit failed to embed a 3D conformer for {smiles!r}')

mol_pcore_df = return_pcore_dataframe_from_single_rdkit_molecule(mol)

Generate pharmacophore 2-body distribution

In [18]:
from fresco.featurise import (
    calculate_pairwise_distances_between_pharmacophores_for_a_single_ligand,
    return_default_pharmacophore_pairs,
)

pcore_pairs = return_default_pharmacophore_pairs()

# Map each pair label ('A-B') to the value computed for this ligand from its
# pharmacophore DataFrame.
pair_distribution_for_this_ligand = {}
for pcore_pair in pcore_pairs:
    core_a, core_b = pcore_pair.split('-')
    ligand_distances = calculate_pairwise_distances_between_pharmacophores_for_a_single_ligand(
        mol_pcore_df, core_a, core_b)
    pair_distribution_for_this_ligand[pcore_pair] = ligand_distances

Load FRESCO model

In [19]:
# NOTE: from this cell onwards the name `pickle` refers to dill, not the
# standard-library module.
import dill as pickle
from fresco.model import load_kde_model

# NOTE(review): machine-specific absolute path; point this at your local copy
# of the pickled models before running.
pickle_dir = '/home/wjm41/ml_physics/frag-pcore-screen/data/EnamineREAL/pickles/'
xray_kde_mpro = 'kde_dict_spl_mpro.pickle'

# Use a context manager so the file handle is closed once loading finishes.
# Unpickling can execute arbitrary code, so only open trusted files.
# NOTE(review): the scoring cell indexes `kde` (fitted above), not
# `xray_kde_dict`; confirm which model is meant to be used.
with open(pickle_dir + xray_kde_mpro, 'rb') as kde_file:
    xray_kde_dict = pickle.load(kde_file)


Score the molecule with the KDE!

In [20]:
import numpy as np
import pandas as pd
from fresco.model import score_dist

# Score the ligand against the model for every pharmacophore pair, collecting
# the results in a plain dict first. Building the one-row DataFrame in a
# single step avoids growing an empty frame entry by entry.
# The model used here is `kde`, fitted earlier in this notebook.
score_record = {}
for pcore_combination in pcore_pairs:
    kde_for_this_combination = kde[pcore_combination]
    # Reshape the ligand's values for this pair into a single column.
    pcore_dist = pair_distribution_for_this_ligand[pcore_combination].reshape(
        -1, 1)
    pcore_score = score_dist(kde_for_this_combination, pcore_dist)
    score_record[pcore_combination] = pcore_score

score_df_for_this_molecule = pd.DataFrame([score_record], columns=pcore_pairs)

# The molecule's overall score is the mean over all pairs, skipping NaN
# entries.
scores = score_df_for_this_molecule[pcore_pairs].to_numpy().astype(float)
processed_score_for_this_molecule = np.nanmean(scores)
print(processed_score_for_this_molecule)
            
# def score_with_kde(kde_model, col_name):
    
#     df_docking[col_name] = -100.0
#     for i, pair_dist in enumerate(pair_distribution_for_this_ligand):
#         score_df_for_this_molecule = pd.DataFrame(columns=pcore_pairs)

#         for pcore_combination in pcore_pairs:
#                     kde_for_this_combination = kde[pcore_combination]
#                     pcore_dist = pair_dist[pcore_combination].reshape(
#                         -1, 1)
#                     pcore_score = score_dist(kde_for_this_combination, pcore_dist)
#                     score_df_for_this_molecule.at[0, pcore_combination] = pcore_score

#                 scores = score_df_for_this_molecule[pcore_pairs].to_numpy().astype(
#                     float)
#                 processed_score_for_this_molecule = np.nanmean(scores)

#         df_docking.at[i, col_name] = processed_score_for_this_molecule
        
# score_with_kde(xray_kde_dict, 'xray_fresco_score')

# df_docking


-1.9061861647263292
