In [None]:
# Step up one directory so the relative `data/...` paths used below resolve
# (presumably the repository root -- confirm for your checkout).
%cd ..

# Prepare USPTO-sm and USPTO-lg for template-relevance prediction

In [2]:
# if not already in the repo, download temprel-fortunato

In [3]:
#export
import requests 

def download_temprel_repo(save_path, chunk_size=128):
    """Download the master-branch zip archive of the template-relevance repo.

    Args:
        save_path: File path the zip archive is written to.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = "https://gitlab.com/mefortunato/template-relevance/-/archive/master/template-relevance-master.zip"
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as r:
        # Fail loudly instead of saving an HTML error page as a ".zip" file.
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)
            
def unzip(path):
    """Extract a zip archive into a directory named after the archive.

    The target directory is ``path`` without its trailing ``.zip`` extension,
    e.g. ``data/repo.zip`` is extracted into ``data/repo``.

    Args:
        path: Path (str) of the zip archive.
    """
    import zipfile
    suffix = '.zip'
    # Strip only the trailing extension: a blanket str.replace would also
    # rewrite a '.zip' that happens to occur earlier in the path.
    target_dir = path[:-len(suffix)] if path.endswith(suffix) else path
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

In [4]:
import os

path = './data/temprel-fortunato.zip'
extracted_dir = path[:-len('.zip')]  # directory that unzip() extracts into

# open(..., 'wb') does not create missing parent directories.
os.makedirs(os.path.dirname(path), exist_ok=True)

# Only download / extract what is not there yet, so re-running the notebook
# does not fetch the archive again. Delete the zip to force a fresh download.
if not os.path.exists(path):
    download_temprel_repo(path)
if not os.path.isdir(extracted_dir):
    unzip(path)

## install template-relevance from fortunato

In [5]:
%%bash
# Abort on the first failing command, so that a failed `cd` can never make
# pip install whatever package lives in the current directory instead.
set -e
cd data/temprel-fortunato/template-relevance-master/
pip install -e .

Obtaining file:///publicwork/seidl/testing_projects/release/mhn-react/data/temprel-fortunato/template-relevance-master
Installing collected packages: temprel
  Attempting uninstall: temprel
    Found existing installation: temprel 1.0
    Uninstalling temprel-1.0:
      Successfully uninstalled temprel-1.0
  Running setup.py develop for temprel
Successfully installed temprel


also make sure you have the right rdchiral version

In [None]:
#!pip install -e "git://github.com/connorcoley/rdchiral.git#egg=rdchiral"

In [6]:
#export
# code from fortunato
# could also import  from temprel.data.download import get_uspto_50k but slightly altered ;)
import os
import gzip
import pickle
import requests
import subprocess
import pandas as pd


def download_file(url, output_path=None):
    """Stream the resource at ``url`` into a local file.

    Args:
        url: Address to fetch.
        output_path: Destination file; when falsy, the last ``/``-separated
            segment of ``url`` is used as the file name.

    Raises:
        requests.HTTPError: If the response carries an error status.
    """
    target = output_path if output_path else url.split('/')[-1]
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(target, 'wb') as sink:
            for block in response.iter_content(chunk_size=8192):
                # skip empty chunks
                if not block:
                    continue
                sink.write(block)

def get_uspto_480k():
    """Download the rexgen_direct train/valid/test reaction files and combine them.

    The three ``*.txt.tar.gz`` archives are downloaded into ``data/raw``
    (created if missing) and unpacked with ``tar``. Every line of the
    extracted text files becomes one row. The combined table is written to
    ``data/raw/uspto_lg_reactions.json.gz``.

    The working directory is switched to ``data/raw`` while the function runs
    and is always restored afterwards, also when a step fails.

    Returns:
        pandas.DataFrame with the columns ``index`` (row position inside the
        respective split file), ``reaction_smiles`` and ``split``.

    Raises:
        requests.HTTPError: If a download answers with an error status.
        subprocess.CalledProcessError: If ``tar`` fails to unpack an archive.
    """
    base_url = 'https://github.com/connorcoley/rexgen_direct/raw/master/rexgen_direct/data/'

    os.makedirs('data/raw', exist_ok=True)
    start_dir = os.getcwd()
    os.chdir('data/raw')
    try:
        frames = []
        for split in ('train', 'valid', 'test'):
            archive = split + '.txt.tar.gz'
            download_file(base_url + archive, archive)
            # check=True: do not silently continue with a stale or missing
            # text file when extraction fails.
            subprocess.run(['tar', 'zxf', archive], check=True)
            with open(split + '.txt') as f:
                frames.append(pd.DataFrame([
                    {'reaction_smiles': line.strip(), 'split': split}
                    for line in f
                ]))

        # reset_index() without drop=True keeps the old per-split index as an
        # 'index' column. This is left unchanged because the notebook drops
        # that column explicitly after calling this function.
        df = pd.concat(frames).reset_index()
        df.to_json('uspto_lg_reactions.json.gz', compression='gzip')
    finally:
        os.chdir(start_dir)
    return df

def get_uspto_50k():
    '''
    Download the USPTO-50k reactions and return them as a DataFrame.

    get SI from:
    Nadine Schneider; Daniel M. Lowe; Roger A. Sayle; Gregory A. Landrum.
    J. Chem. Inf. Model. 2015, 55 (1), 39-53

    The supplementary zip is fetched with ``wget`` into ``data/raw`` (created
    if missing) and unpacked with ``unzip``. The pickled records are read and
    the result is written to ``data/raw/uspto_sm_reactions.json.gz``.

    The working directory is switched to ``data/raw`` while the function runs
    and is always restored afterwards, also when a step fails.

    Returns:
        pandas.DataFrame with the columns ``reaction_smiles``,
        ``reaction_reference`` and ``reaction_class``.

    Raises:
        subprocess.CalledProcessError: If ``wget`` or ``unzip`` fails.
    '''
    os.makedirs('data/raw', exist_ok=True)
    start_dir = os.getcwd()
    os.chdir('data/raw')
    try:
        subprocess.run(
            ['wget', 'https://pubs.acs.org/doi/suppl/10.1021/ci5006614/suppl_file/ci5006614_si_002.zip'],
            check=True
        )
        unzip_result = subprocess.run(['unzip', '-o', 'ci5006614_si_002.zip'])
        # unzip exits with 1 for mere warnings; anything above is a real failure.
        if unzip_result.returncode > 1:
            raise subprocess.CalledProcessError(unzip_result.returncode, unzip_result.args)

        data = []
        # NOTE(security): pickle.load can execute arbitrary code. Only use it
        # on this archive because it comes from the trusted publisher URL above.
        with gzip.open('ChemReactionClassification/data/training_test_set_patent_data.pkl.gz') as f:
            # The file holds many consecutively pickled records; read until EOF.
            while True:
                try:
                    data.append(pickle.load(f))
                except EOFError:
                    break

        # Each record is indexed as (smiles, reference, class).
        df = pd.DataFrame({
            'reaction_smiles': [d[0] for d in data],
            'reaction_reference': [d[1] for d in data],
            'reaction_class': [d[2] for d in data],
        })
        df.to_json('uspto_sm_reactions.json.gz', compression='gzip')
    finally:
        os.chdir(start_dir)
    return df

def get_uspto_golden():
    """ get uspto golden and convert it to smiles dataframe from 
    Lin, Arkadii; Dyubankova, Natalia; Madzhidov, Timur; Nugmanov, Ramil; 
    Rakhimbekova, Assima; Ibragimova, Zarina; Akhmetshin, Tagir; Gimadiev, 
    Timur; Suleymanov, Rail; Verhoeven, Jonas; Wegner, Jörg Kurt; 
    Ceulemans, Hugo; Varnek, Alexandre (2020): 
    Atom-to-Atom Mapping: A Benchmarking Study of Popular Mapping Algorithms and Consensus Strategies. 
    ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.13012679.v1

    If ``data/raw/uspto_golden.json.gz`` already exists it is loaded and
    returned directly. Otherwise ``golden_dataset.zip`` is fetched with
    ``wget`` into ``data/raw`` (created if missing) and unpacked. Every
    reaction of ``golden_dataset.rdf`` is converted to a reaction SMILES and
    the table is written to ``data/raw/uspto_golden.json.gz``.

    The working directory is switched to ``data/raw`` during the conversion
    and is always restored afterwards, also when a step fails.

    Returns:
        pandas.DataFrame with the columns ``reaction_smiles`` and
        ``reaction_reference`` (the ``Reaction_ID`` meta field of the record).

    Raises:
        ImportError: If CGRtools or rdkit is not installed.
        subprocess.CalledProcessError: If ``wget`` or ``unzip`` fails.
    """
    if os.path.exists('data/raw/uspto_golden.json.gz'):
        print('loading precomputed')
        return pd.read_json('data/raw/uspto_golden.json.gz', compression='gzip')

    # Import the optional dependencies before downloading anything, so that a
    # missing package is reported without side effects.
    from CGRtools.files import RDFRead
    import CGRtools
    from rdkit.Chem import AllChem

    def cgr2rxnsmiles(cgr_rx):
        """Build 'reactants>>products' SMILES from a CGRtools reaction."""
        smiles_rx = '.'.join([AllChem.MolToSmiles(CGRtools.to_rdkit_molecule(m)) for m in cgr_rx.reactants])
        smiles_rx += '>>'+'.'.join([AllChem.MolToSmiles(CGRtools.to_rdkit_molecule(m)) for m in cgr_rx.products])
        return smiles_rx

    os.makedirs('data/raw', exist_ok=True)
    start_dir = os.getcwd()
    os.chdir('data/raw')
    try:
        subprocess.run(
            ['wget', 'https://github.com/Laboratoire-de-Chemoinformatique/Reaction_Data_Cleaning/raw/master/data/golden_dataset.zip'],
            check=True
        )
        # yields golden_dataset.rdf
        unzip_result = subprocess.run(['unzip', '-o', 'golden_dataset.zip'])
        # unzip exits with 1 for mere warnings; anything above is a real failure.
        if unzip_result.returncode > 1:
            raise subprocess.CalledProcessError(unzip_result.returncode, unzip_result.args)

        data = {}
        input_file = 'golden_dataset.rdf'
        do_basic_standardization = True
        print('reading and converting the rdf-file')
        with RDFRead(input_file) as f:
            while True:
                # Only the end of the file should stop the loop; keep the
                # handler off the conversion code below.
                try:
                    r = next(f)
                except StopIteration:
                    break
                key = r.meta['Reaction_ID']
                if do_basic_standardization:
                    r.thiele()
                    r.standardize()
                data[key] = cgr2rxnsmiles(r)

        print('saving as a dataframe to data/raw/uspto_golden.json.gz')
        # One row per reaction, in reading order, with a fresh 0..n-1 index.
        df = pd.DataFrame({
            'reaction_smiles': list(data.values()),
            'reaction_reference': list(data.keys()),
        })
        df.to_json('uspto_golden.json.gz', compression='gzip')
    finally:
        os.chdir(start_dir)
    return df

## run the scripts from temprel
for more details see his documentation [readme](https://gitlab.com/mefortunato/template-relevance#step-1-extract-templates)

an alternative is to run the script
```sh
python data/temprel-fortunato/template-relevance-master/bin/get_uspto_50k.py
```

In [7]:
#!python data/temprel-fortunato/template-relevance-master/bin/process.py --reactions data/raw/uspto_sm_reactions.json.gz --output-prefix uspto_sm

In [None]:
# or this code ;)
import os
import time
import argparse
import pandas as pd
from temprel.templates.extract import process_for_training, process_for_askcos, templates_from_reactions

# DataFrame.to_json does not create missing parent directories, so make sure
# the output folder exists before anything is written into it.
os.makedirs('data/processed', exist_ok=True)

reactions_sm = get_uspto_50k() ## get the dataset
# nproc: number of worker processes -- adjust to the machine at hand
templates_sm = templates_from_reactions(reactions_sm, nproc=50)
templates_sm.to_json('data/processed/uspto_sm_templates.df.json.gz', compression='gzip')
process_for_training(templates_sm, output_prefix='data/processed/uspto_sm_', calc_split='stratified')
# standardize templates
process_for_askcos(templates_sm, template_set_name='uspto_sm_', output_prefix='data/processed/uspto_sm_')

### calculate the applicability matrix.. this will also take some time
if needed install mpi4py

In [None]:
# Optional: installs mpi4py from the anaconda channel (presumably required by the mpirun cells below).
# NOTE(review): `!conda` may act on a different environment than the running kernel;
# the `%conda` magic runs conda within the current kernel's environment -- confirm which one is intended.
!conda install -c anaconda mpi4py -y

In [None]:
# either run it with multiple MPI processes (next cell) or in a single process (the cell after that)

In [117]:
# Run temprel's applicability script on 30 MPI processes for the USPTO-sm
# train/valid/test SMILES; inputs are read from, and outputs written with the
# prefix, data/processed/uspto_sm_.
!mpirun -n 30 python data/temprel-fortunato/template-relevance-master/bin/calculate_applicabilty.py \
--templates data/processed/uspto_sm_retro.templates.uspto_sm_.json.gz \
--train-smiles data/processed/uspto_sm_train.input.smiles.npy \
--valid-smiles data/processed/uspto_sm_valid.input.smiles.npy \
--test-smiles data/processed/uspto_sm_test.input.smiles.npy \
--output-prefix data/processed/uspto_sm_

elapsed [read]: 0.2093029022216797
elapsed [scatter]: 0.015007734298706055
elapsed [convert]: 0.046427011489868164
elapsed [appl]: 9.622829675674438
elapsed [gather]: 0.002859830856323242
elapsed [save]: 0.1777646541595459
elapsed [scatter]: 0.006601095199584961
elapsed [convert]: 0.017879962921142578
elapsed [appl]: 7.5379109382629395
elapsed [gather]: 0.001821279525756836
elapsed [save]: 0.13137435913085938
elapsed [scatter]: 0.06669163703918457
elapsed [convert]: 0.11849093437194824
elapsed [appl]: 54.74601745605469
elapsed [gather]: 0.022233247756958008
elapsed [save]: 0.8161919116973877


In [10]:
from mhnreact.data import load_templates
# load in the templates of the 'sm' dataset; `t` is iterated and indexed
# (`t[ti]`) by the applicability cell below
t = load_templates('sm')

In [None]:
# calculate applicability via substructure search -- fast way
# NOTE: relies on `templates_sm` and `t` defined in earlier cells
import numpy as np
from mhnreact.molutils import smarts2appl

prods = np.array(templates_sm.products)
# keep only the text after the last '>>' of every template string in `t`
template_product_smarts = np.array([t[ti].split('>>')[-1] for ti in t])
%time appl = smarts2appl(prods, template_product_smarts, njobs=60) 

# let's do the same for the large dataset
this might take a while ;) grab a coffee

In [None]:
import os
import time
import argparse
import pandas as pd
from temprel.templates.extract import process_for_training, process_for_askcos, templates_from_reactions

# DataFrame.to_json does not create missing parent directories, so make sure
# the output folder exists before anything is written into it.
os.makedirs('data/processed', exist_ok=True)

# correcting for a lg specific bug: get_uspto_480k() returns an extra 'index'
# column. Dropping it on a fresh frame (not in place) keeps the cell re-runnable.
reactions_lg = get_uspto_480k().drop(columns='index') ## get the dataset
# nproc: number of worker processes -- adjust to the machine at hand
templates_lg = templates_from_reactions(reactions_lg, nproc=100)
templates_lg.to_json('data/processed/uspto_lg_templates.df.json.gz', compression='gzip')
process_for_training(templates_lg, output_prefix='data/processed/uspto_lg_', calc_split='stratified')
process_for_askcos(templates_lg, template_set_name='uspto_lg_', output_prefix='data/processed/uspto_lg_')

In [None]:
#or run the script ;) --> won't work --> error with index col
#!python data/temprel-fortunato/template-relevance-master/bin/process.py --reactions data/raw/uspto_lg_reactions.json.gz --nproc 100

In [10]:
import os

# `!export ...` only changes the environment of a throw-away subshell and is
# lost as soon as that line finishes. Setting os.environ changes the kernel's
# own environment, which later `!` commands inherit.
temprel_bin = os.path.join(
    os.getcwd(), 'data', 'temprel-fortunato', 'template-relevance-master', 'bin'
)
path_entries = [p for p in os.environ.get('PATH', '').split(os.pathsep) if p]
# Prepend only once so that re-running the cell does not grow PATH.
if temprel_bin not in path_entries:
    os.environ['PATH'] = os.pathsep.join([temprel_bin] + path_entries)

In [None]:
# was used by fortunato
#!mpirun -n 60 --oversubscribe python

In [None]:
# Run temprel's applicability script on 50 MPI processes for the USPTO-lg
# train/valid/test SMILES; inputs are read from, and outputs written with the
# prefix, data/processed/uspto_lg_.
!mpirun -n 50 python data/temprel-fortunato/template-relevance-master/bin/calculate_applicabilty.py \
--templates data/processed/uspto_lg_retro.templates.uspto_lg_.json.gz \
--train-smiles data/processed/uspto_lg_train.input.smiles.npy \
--valid-smiles data/processed/uspto_lg_valid.input.smiles.npy \
--test-smiles data/processed/uspto_lg_test.input.smiles.npy \
--output-prefix data/processed/uspto_lg_

# finally the data can be loaded 

In [13]:
import os
# NOTE(review): the star import hides where names such as `load_USPTO` (used
# in the following cells) come from; consider importing the needed names explicitly.
from mhnreact.data import *
# list the files currently present in the processed-data folder
os.listdir('data/processed/')

['uspto_sm_test.appl_matrix.npz',
 'uspto_sm_reactions.uspto_sm_.json.gz',
 'uspto_sm_historian.uspto_sm_.json.gz',
 'uspto_sm_train.input.smiles.npy',
 'uspto_sm_valid.input.smiles.npy',
 'uspto_sm_retro.templates.uspto_sm_.json.gz',
 'uspto_sm_templates.df.json.gz',
 'uspto_sm_test.labels.classes.npy',
 'uspto_sm_valid.appl_matrix.npz',
 'uspto_sm_valid.labels.classes.npy',
 'uspto_sm_train.appl_matrix.npz',
 'uspto_sm_train.labels.classes.npy',
 'uspto_sm_test.input.smiles.npy']

In [14]:
# load in the data
# X and y are indexed by split name (e.g. 'train') in the next cell;
# is_appl_matrix=False: load without the applicability matrix
X, y = load_USPTO('sm',is_appl_matrix=False)

train 29816 samples ( 9161 max label)
valid 4482 samples ( 9157 max label)
test 5959 samples ( 9145 max label)


In [15]:
# peek at the first training sample: its input entry and its label
X['train'][0], y['train'][0]

(['[CH3:17][O:18][CH:2]([CH3:3])[c:4]1[n:5][c:6]2[cH:12][cH:11][cH:10][cH:9][c:7]2[nH:8]1'],
 2962)

In [16]:
# load in the applicability matrix
# NOTE: rebinds `X` from the previous load; the second return value is kept under a separate name
X, y_appl = load_USPTO('sm',is_appl_matrix=True)

train 29816 samples ( 9162 max label)
valid 4482 samples ( 9162 max label)
test 5959 samples ( 9162 max label)


In [17]:
# load in the templates
# NOTE(review): same call as in the earlier template-loading cell; `t` is simply rebound here
t = load_templates('sm')