# Read in data

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
import numpy as np
from tqdm import tqdm
from tqdm import trange
# import pyarrow as pa

In [2]:
"""
Disables RDKit whiny logging.
"""
import rdkit.RDLogger as rkl
import rdkit.rdBase as rkrb

# Step 1: raise the threshold of RDKit's Python-side logger to ERROR.
logger = rkl.logger()
logger.setLevel(rkl.ERROR)

# Step 2: switch off the 'rdApp.error' log channel.
# NOTE(review): keep this call after setLevel -- setLevel may toggle the
# rdApp.* channels itself, so the order could matter; confirm before reordering.
rkrb.DisableLog('rdApp.error')

In [3]:
# This is the data preprocessed in USPTO_preprocessing.ipynb.
# There are around 500k reactions, with columns for reactant, product, solvent, reagent, etc.
# So there's quite a bit more data than in Modelling.ipynb.

In [4]:
# Load the cleaned reaction table from its pickle file.
cleaned_df = pd.read_pickle(filepath_or_buffer="data/ORD_USPTO/cleaned_data.pkl")

In [5]:
# Read the reaction classification file: each line is expected to hold
# "<mapped reaction> <reaction class>", separated by a single space.
# NOTE(review): this is an absolute, machine-specific path -- consider moving
# it to a configurable data directory so the notebook runs elsewhere.
rxn_classes_filename = '/Users/dsw46/nextmove/HazELNut/build/data/classified_rxn.smi'

with open(rxn_classes_filename) as f:
    lines = [line.rstrip('\n') for line in f]  # remove the \n at the end of each line

# Build a two-column frame (mapped_rxn, rxn_class) with exactly ONE row per
# input line, so that row i still corresponds to line i of the file.
rxns = []
rxn_classes = []
for line in lines:
    try:
        # Tuple unpacking raises ValueError unless the line splits into
        # exactly two space-separated fields.
        rxn, rxn_class = line.split(' ')
    except ValueError:
        # Malformed line: keep a placeholder row with a missing class instead
        # of skipping it. Skipping would shift every later row by one and
        # break the positional alignment with cleaned_df used further down;
        # rows with a missing class are removed there by dropna().
        rxn, rxn_class = line, None
    rxns.append(rxn)
    rxn_classes.append(rxn_class)

rxn_classes_df = pd.DataFrame({'mapped_rxn': rxns, 'rxn_class': rxn_classes})
    

In [6]:
# Inner-join the cleaned reactions with their classifications, matching
# cleaned_df.mapped_rxn_0 against rxn_classes_df.mapped_rxn, then show how
# many rows survive the join.
data_df_temp = pd.merge(
    cleaned_df,
    rxn_classes_df,
    how='inner',
    left_on='mapped_rxn_0',
    right_on='mapped_rxn',
)
len(data_df_temp)

9

In [7]:
import warnings

# The reaction classification was generated with:
#   ./namerxn -nomap data/mapped_rxn.smi data/classified_rxn.smi
#
# -nomap was expected to leave the atom mapping untouched, yet namerxn clearly
# changed it, so the classified reaction strings cannot be used as a join key.
# Instead the classes are attached by row position, which relies on namerxn
# having preserved the order of the reactions. Reactions that end up without a
# class are removed afterwards.
data_df = cleaned_df.reset_index(drop=True)  # fresh 0..n-1 index for positional alignment

if len(rxn_classes_df) != len(data_df):
    # Attaching by position is only trustworthy when both tables have the same
    # number of rows; make a mismatch visible instead of silently misaligning.
    warnings.warn(
        f"cleaned_df has {len(data_df)} rows but rxn_classes_df has "
        f"{len(rxn_classes_df)}; positional alignment of reaction classes may be wrong"
    )

data_df['rxn_class'] = rxn_classes_df['rxn_class']
# reset_index returns a new frame, so its result must be assigned; drop=True
# avoids adding the old index as an extra column.
data_df = data_df.dropna(subset=['rxn_class']).reset_index(drop=True)
print(len(data_df))

526999


In [8]:
# Remove all the unclassified reactions, i.e. rows whose rxn_class is exactly '0.0'.
# An exact comparison is used instead of str.contains("0.0"): contains() treats
# its pattern as a regular expression by default (so '.' matches any character)
# and matches substrings, which can also discard classified reactions whose
# class string merely contains such a sequence.
remove_unclassified_rxn_data_df = data_df[data_df['rxn_class'] != '0.0']
print(len(remove_unclassified_rxn_data_df))

419295


In [9]:
# Count the reagents_0 entries that mention palladium in any of four spellings.
palladium_markers = ('pd', 'Pd', 'palladium', 'Palladium')
count = 0
for reagent in data_df['reagents_0']:
    try:
        mentions_palladium = any(marker in reagent for marker in palladium_markers)
    except TypeError:
        # Entries that do not support the `in` test (presumably missing values) are skipped.
        continue
    if mentions_palladium:
        count += 1
print('Number of Pd in the reagents columns: ', count )

Number of Pd in the reagents columns:  1205


## Add a cluster column

In [10]:
# Super class = the text before the first '.' of the class string, stored as an integer.
data_df['rxn_super_class'] = data_df['rxn_class'].str.split('.', n=1).str[0].astype(int)

# Split the class string on ';' to inspect how many classes each reaction received:
# 2.5% of reactions have been assigned 2 reaction classes. 3 or 4 reaction classes is very rare.
test_df = data_df['rxn_class'].str.rsplit(pat=';', expand=True)

# Prepare fingerprints

In [11]:
from modelling_2 import calc_fp
from modelling_2 import calc_fp_individual
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
from rdkit.rdBase import BlockLogs
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem

In [12]:
%%time
num_cores = multiprocessing.cpu_count()
inputs = tqdm(data_df['product_0'])
p0 = Parallel(n_jobs=num_cores)(delayed(calc_fp_individual)(i) for i in inputs)

inputs = tqdm(data_df['product_1'])
p1 = Parallel(n_jobs=num_cores)(delayed(calc_fp_individual)(i) for i in inputs)

inputs = tqdm(data_df['product_2'])
p2 = Parallel(n_jobs=num_cores)(delayed(calc_fp_individual)(i) for i in inputs)

inputs = tqdm(data_df['product_3'])
p3 = Parallel(n_jobs=num_cores)(delayed(calc_fp_individual)(i) for i in inputs)

100%|██████████| 526999/526999 [01:08<00:00, 7655.97it/s]
100%|██████████| 526999/526999 [02:21<00:00, 3719.65it/s]
100%|██████████| 526999/526999 [02:29<00:00, 3522.12it/s]
100%|██████████| 526999/526999 [02:30<00:00, 3500.49it/s]


CPU times: user 1min 51s, sys: 1min 27s, total: 3min 18s
Wall time: 8min 32s


In [13]:
%%time
num_cores = multiprocessing.cpu_count()
inputs = tqdm(data_df['reactant_0'])
r0 = Parallel(n_jobs=num_cores)(delayed(calc_fp_individual)(i) for i in inputs)

inputs = tqdm(data_df['reactant_1'])
r1 = Parallel(n_jobs=num_cores)(delayed(calc_fp_individual)(i) for i in inputs)

inputs = tqdm(data_df['reactant_2'])
r2 = Parallel(n_jobs=num_cores)(delayed(calc_fp_individual)(i) for i in inputs)

inputs = tqdm(data_df['reactant_3'])
r3 = Parallel(n_jobs=num_cores)(delayed(calc_fp_individual)(i) for i in inputs)

100%|██████████| 526999/526999 [00:57<00:00, 9145.03it/s] 
100%|██████████| 526999/526999 [00:52<00:00, 10127.89it/s]
100%|██████████| 526999/526999 [02:11<00:00, 4005.06it/s]
100%|██████████| 526999/526999 [02:20<00:00, 3744.13it/s]


CPU times: user 1min 37s, sys: 1min 14s, total: 2min 51s
Wall time: 6min 23s


In [14]:
# Convert the fingerprint lists to arrays, in preparation for the reaction
# difference fingerprint.
# Converting one 500k by 2k list to an array takes roughly 15s, so the whole
# thing should take about 2-3 min.
# The conversions are spread over several cells for memory purposes.
ar_p0 = np.asarray(p0)
ar_p1 = np.asarray(p1)

In [15]:
# Products 2 and 3: fingerprint lists -> arrays.
ar_p2 = np.asarray(p2)
ar_p3 = np.asarray(p3)

In [16]:
# Reactants 0 and 1: fingerprint lists -> arrays.
ar_r0 = np.asarray(r0)
ar_r1 = np.asarray(r1)

In [17]:
# Reactants 2 and 3: fingerprint lists -> arrays.
ar_r2 = np.asarray(r2)
ar_r3 = np.asarray(r3)

In [18]:
# Reaction difference fingerprint: element-wise sum of the four product
# fingerprint arrays minus each of the four reactant fingerprint arrays.
rxn_diff_fp = ar_p0 + ar_p1 + ar_p2 + ar_p3 - ar_r0 - ar_r1 - ar_r2 - ar_r3
# Display the shape as a sanity check (the recorded run shows (526999, 2048)).
rxn_diff_fp.shape

(526999, 2048)

In [20]:
# Persist the reaction difference fingerprints with np.save, which writes
# NumPy's .npy format (not a pickle, despite the ".pkl" in the name).
# NOTE: np.save appends ".npy" to a path that does not already end with it, so
# the file written is "data/ORD_USPTO/USPTO_rxn_diff_fp.pkl.npy".
np.save(file="data/ORD_USPTO/USPTO_rxn_diff_fp.pkl", arr=rxn_diff_fp)

# NN modelling

In [None]:
import torch
import torchmetrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
import os

# Load the reaction difference fingerprints saved earlier in this notebook.
# They were written with np.save under a ".pkl" name; np.save appends ".npy"
# to such a path, so the array is stored at "<name>.pkl.npy". Use the literal
# path if it exists, otherwise fall back to the ".npy"-suffixed file.
rxn_diff_fp_path = "data/ORD_USPTO/USPTO_rxn_diff_fp.pkl"
if not os.path.exists(rxn_diff_fp_path):
    rxn_diff_fp_path += ".npy"

# NOTE(review): allow_pickle=True can execute arbitrary code if the file is
# untrusted; it is kept so existing behaviour does not change, but it is likely
# unnecessary for a plain numeric array -- confirm and drop if so.
rxn_diff_fp = np.load(rxn_diff_fp_path, allow_pickle=True)

In [None]:
# create one-hot encoding of reagent1_list

