## Fragment NGram  
2019.10.21  
yoh Noguchi (edited by Stephen Wu on 2019.10.24)

__Start with a small dataset — do not use too much data to begin with!__

Goal of this script: Create new molecules by modifying substructures in a list of initial molecules based on a pre-trained fragment NGram generator.

Steps:
1. Prepare a list of initial molecules (in SMILES format)
2. Fragmentation using RECAP in RDKit
3. Keep the larger fragments as base structures, and extract the smaller fragments for modification
4. Modify the extracted small fragments using a pre-trained NGram
5. Exhaustive combination of all new small fragments generated above with the base structures

In [1]:
from xenonpy.descriptor import Fingerprints
import matplotlib.pyplot as plt
from xenonpy.inverse.iqspr import GaussianLogLikelihood
from xenonpy.inverse.iqspr import NGram
from xenonpy.inverse.iqspr import IQSPR
import csv
import math
import numpy as np
import pandas as pd
import pickle as pk
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Recap
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
import os
from xenonpy.descriptor import FrozenFeaturizer
from xenonpy.descriptor.base import BaseFeaturizer
from xenonpy.descriptor.base import BaseDescriptor
from xenonpy.descriptor import RDKitFP, MACCS, ECFP, AtomPairFP, TopologicalTorsionFP, FCFP
from bayes_opt import BayesianOptimization
from rdkit.Chem import BRICS
from collections import Counter
from scipy.special import comb
from joblib import Parallel, delayed
from tqdm import tqdm

### User parameters
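- `CSV_PATH`: path to the CSV file to import (it must contain a `SMILES` column)
- `NGRAM_PATH`: path to the saved (pickled) NGram model
- `FRAGMENT_LENGTH`: length threshold for small fragments; a SMILES string shorter than this is treated as a small fragment
- `BASE_LENGTH`: length threshold for base structures; a SMILES string that is not a small fragment and is shorter than this is treated as a base structure (everything else is filtered out)
- `CREATED_FRAGMENTS_NUMBER`: number of modified fragments to generate for each extracted small fragment  
※ Total number of new fragments = `len(smis_Fragment)` * `CREATED_FRAGMENTS_NUMBER` (at most)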
- `OUTPUT_FILENAME`: file name of output csv

In [2]:
# Input CSV; it is read with pandas and must provide a 'SMILES' column.
CSV_PATH = 'XXXXX.csv'
# Pickled, pre-trained NGram model.
NGRAM_PATH = 'ngram_reorder_12_O20_peter.obj'
# SMILES strings shorter than this are treated as small fragments.
FRAGMENT_LENGTH = 30
# Remaining SMILES strings shorter than this are treated as base structures.
BASE_LENGTH = 50
# Number of new fragments requested for each small fragment.
CREATED_FRAGMENTS_NUMBER = 5
# File name of the CSV written at the end of the notebook.
OUTPUT_FILENAME = 'test.csv'

### Read in data

In [3]:
# Load the input table, then replace its index with a fresh 0..n-1 range.
data = pd.read_csv(CSV_PATH)
data = data.reset_index(drop=True)

### Verify convertibility of SMILES to RDKit MOL

In [4]:
# One flag per row of data['SMILES']: True only when RDKit can parse the
# SMILES and the string has no '.' (i.e. it is a single component).
check = [
    Chem.MolFromSmiles(smiles) is not None and '.' not in smiles
    for smiles in data['SMILES']
]

### List of SMILES

In [5]:
# Keep only the rows flagged valid above and drop duplicate SMILES.
smiles_list = data.loc[check, 'SMILES'].unique().tolist()

### Fragmentation（RECAP）

In [6]:
# Decompose every input molecule with RECAP and collect the SMILES of the
# nodes stored in `decomp.children` (the first generation of fragments).
frag_list = []
for parent_smiles in smiles_list:
    decomp = Chem.Recap.RecapDecompose(Chem.MolFromSmiles(parent_smiles))
    frag_list.extend(Chem.MolToSmiles(node.mol) for node in decomp.children.values())

# De-duplicate. Sorting gives a reproducible order: plain set iteration over
# strings can change from one interpreter run to the next.
frag_list = sorted(set(frag_list))

### Read in NGram model

In [17]:
# Load the pre-trained NGram model used to modify fragments.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(NGRAM_PATH, 'rb') as f:
    n_gram = pk.load(f)

# Override sampling settings on the loaded model.
n_gram.min_len = 2
n_gram.sample_order = (1, 20)

### Categorize fragments
- `smis_Fragment`: small fragments
- `smis_Base`: base structures
- `smis_Large`: big structures to be filtered out

In [8]:
# Sort every fragment SMILES into one of three buckets by string length.
smis_Fragment, smis_Base, smis_Large = [], [], []
for smi in frag_list:
    n_chars = len(smi)
    # The checks run in the same order as the thresholds are documented:
    # small fragment first, then base structure, otherwise "large".
    bucket = (
        smis_Fragment if n_chars < FRAGMENT_LENGTH
        else smis_Base if n_chars < BASE_LENGTH
        else smis_Large
    )
    bucket.append(smi)

### 【Function】combining fragment with base structure
__\*A B\* -> BA__

In [9]:
def combi_smiles(smis_frag, smis_base):
    """Join two SMILES at their '*' attachment points and return the result.

    ``smis_base`` is re-rooted so that it does not start with '*', and
    ``smis_frag`` is re-rooted so that it starts with '*'. Both are converted
    to xenonpy extended SMILES; the fragment, without its first and last rows
    (the leading '*' and the trailing '!'), then replaces the first '*' row of
    the base.

    Parameters
    ----------
    smis_frag : str
        SMILES of the part to insert; must contain at least one '*' atom.
    smis_base : str
        SMILES of the part receiving the insertion; must contain at least
        one '*' atom.

    Returns
    -------
    str
        The string returned by ``NGram.esmi2smi`` for the combined extended
        SMILES. It is not validated here.

    Raises
    ------
    IndexError
        If either input contains no '*' atom.
    """
    # A fresh, untrained NGram is sufficient: only its SMILES <-> extended
    # SMILES converters are used.
    from xenonpy.inverse.iqspr import NGram
    ngram = NGram()

    # Atom indices of the '*' attachment points in the base.
    mol_base = Chem.MolFromSmiles(smis_base)
    star_atoms_base = [atom.GetIdx() for atom in mol_base.GetAtoms() if atom.GetSymbol() == '*']

    # Re-root the base so that its SMILES does not start with '*'.
    root_atom = 1 if star_atoms_base[0] == 0 else 0
    smis_base_head = Chem.MolToSmiles(mol_base, rootedAtAtom=root_atom)

    # Convert the base to extended SMILES, drop its last row, and locate the
    # first '*' row: that is where the fragment is inserted.
    esmi_base = ngram.smi2esmi(smis_base_head)[:-1]
    insert_at = esmi_base.index[esmi_base['esmi'] == '*'].tolist()[0]

    # Re-root the fragment at its first '*' so that '*' is the leading row.
    mol_frag = Chem.MolFromSmiles(smis_frag)
    star_atoms_frag = [atom.GetIdx() for atom in mol_frag.GetAtoms() if atom.GetSymbol() == '*']
    smis_frag_head = Chem.MolToSmiles(mol_frag, rootedAtAtom=star_atoms_frag[0])

    # Remove the leading '*' and the last '!'. The copy makes the column
    # assignment below act on an independent frame instead of a slice.
    esmi_frag = ngram.smi2esmi(smis_frag_head)[1:-1].copy()

    # Offset the fragment's ring counters by the base's ring count at the
    # insertion point.
    esmi_frag['n_ring'] = esmi_frag['n_ring'] + esmi_base['n_ring'].loc[insert_at]

    # Delete the base's '*' row and splice the fragment in at that position.
    esmi_base = esmi_base.drop(insert_at).reset_index(drop=True)
    ext_smi = pd.concat(
        [esmi_base.iloc[:insert_at], esmi_frag, esmi_base.iloc[insert_at:]]
    ).reset_index(drop=True)

    # NOTE(review): the previous version called
    # `ext_smi.append({'esmi': '!', ...}, ignore_index=True)` and discarded the
    # result, so no terminator row was ever added; on pandas >= 2.0 that call
    # raises AttributeError because DataFrame.append was removed. The call is
    # dropped here, which keeps the output unchanged. Confirm whether
    # `esmi2smi` is meant to receive a trailing '!' row.
    return ngram.esmi2smi(ext_smi)

### 【Function】loop `combi_smiles` over all the base structures and list of fragments

In [10]:
def loop_frag(base_list, frag_list):
    """Combine every base structure with every fragment via ``combi_smiles``.

    Parameters
    ----------
    base_list : iterable of str
        SMILES of the base structures.
    frag_list : list of str
        SMILES of the fragments; iterated once per base structure.

    Returns
    -------
    list
        Results of ``combi_smiles`` for each (base, fragment) pair, ordered
        base by base and, within a base, in ``frag_list`` order.
    """
    comb_smi_list = []
    # Reuse one pool of workers for all bases instead of building a new
    # Parallel instance on every iteration.
    with Parallel(n_jobs=-1) as parallel:
        for smi in tqdm(base_list):
            # NOTE(review): combi_smiles is declared as (smis_frag, smis_base)
            # but receives the base SMILES first, so the roles are swapped
            # inside it. Kept as-is to preserve the existing output; confirm
            # this is intended.
            comb_smi_list.extend(parallel(delayed(combi_smiles)(smi, s) for s in frag_list))
    return comb_smi_list

### 【Function】Generating new fragments based on an initial fragment using pre-trained NGram

In [None]:
def create_Fragments(smi, N_frag, n_gram, max_iter = 100):
    """Generate up to ``N_frag`` distinct new fragments from one seed SMILES.

    The NGram is asked repeatedly for proposals based on ``smi``. A proposal
    is kept only if it contains exactly one '*' and has not been kept before.

    Parameters
    ----------
    smi : str
        Seed fragment SMILES.
    N_frag : int
        Number of fragments wanted.
    n_gram : object
        Generator exposing ``set_params(**kwargs)`` and ``proposal(list)``;
        its parameters are modified by this call.
    max_iter : int, optional
        Maximum number of proposal rounds.

    Returns
    -------
    list of str
        Accepted fragments in the order they were first proposed. May hold
        fewer than ``N_frag`` items if ``max_iter`` rounds were not enough.
    """
    len_smi = len(smi)
    # del_range is set from the seed length: [len // 10, len - 1].
    # NOTE(review): presumably the number of characters the NGram may delete
    # before regrowing — confirm against the xenonpy documentation.
    n_gram.set_params(del_range=[int(len_smi / 10), len_smi - 1], max_len=1500, reorder_prob=0)

    f_list = []
    seen = set()
    for _ in range(max_iter):
        n_missing = N_frag - len(f_list)
        if n_missing <= 0:
            break
        # Request only as many proposals as are still missing.
        for candidate in n_gram.proposal([smi] * n_missing):
            # Skip repeats (also from earlier rounds) and anything that does
            # not have exactly one attachment point.
            if candidate in seen or candidate.count('*') != 1:
                continue
            seen.add(candidate)
            f_list.append(candidate)
            if len(f_list) == N_frag:
                break

    return f_list

### Generate new fragments

In [12]:
Frangments_list = []
escape_num = 0

# Ask for CREATED_FRAGMENTS_NUMBER new fragments per small fragment, running
# the seeds in parallel.
jobs = [delayed(create_Fragments)(s, CREATED_FRAGMENTS_NUMBER, n_gram) for s in smis_Fragment]
results_list = Parallel(n_jobs=-1)(jobs)

# Flatten the per-seed result lists. Duplicates across different seeds are
# kept (no global de-duplication is applied).
for res in results_list:
    Frangments_list += res

### Combine list of fragments with list of base structures

In [13]:
# Combine every base structure with every generated fragment.
res = loop_frag(base_list=smis_Base, frag_list=Frangments_list)

100%|██████████| 5/5 [00:03<00:00,  1.34it/s]


### Output csv file

In [15]:
# Write the combined SMILES as a single-column CSV without the index.
s_result = pd.Series(res, name='SMILES')
# header=True writes the Series name ('SMILES') as the column header; the
# parameter is documented as a bool or a list of str, not a bare string.
s_result.to_csv(OUTPUT_FILENAME, index=False, header=True)