# Read in data

In [3]:
import pandas as pd

In [1]:
# This is the data preprocessed in USPTO_preprocessing.ipynb
# There are around 500k reactions, with columns for reactant, product, solvent, reagent, etc.
# So there's quite a bit more data than in Modelling.ipynb

In [4]:
# Read in the pickled clean data.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that were generated by this project - never one from an untrusted source.
cleaned_df = pd.read_pickle("data/ORD_USPTO/cleaned_data.pkl")

In [5]:
# Read in the reaction classes.
# NOTE(review): this is an absolute path on one specific machine; it has to be
# adjusted before the notebook can be re-run anywhere else.
rxn_classes_filename = '/Users/dsw46/nextmove/HazELNut/build/data/classified_rxn.smi'

with open(rxn_classes_filename) as f:
    lines = [line.rstrip('\n') for line in f]  # remove the \n at the end of each line

# Create a df of the reaction classes with 2 columns: mapped_rxn, rxn_class.
# Each line is expected to be "<reaction> <class>" separated by a single space.
# Exactly one row is emitted per input line, so that row i of this frame still
# corresponds to line i of the file (a later cell attaches the classes by position).
rxns = []
rxn_classes = []
for line in lines:
    parts = line.split(' ')
    if len(parts) == 2:
        rxn, rxn_class = parts
    else:
        # A line that does not split into exactly two fields used to raise an
        # uncaught ValueError (str.split never raises AttributeError). Keep the
        # row as a placeholder with no class instead of crashing or skipping it:
        # skipping would shift every following row by one.
        rxn, rxn_class = line, None
    rxns.append(rxn)
    rxn_classes.append(rxn_class)

rxn_classes_df = pd.DataFrame({'mapped_rxn': rxns, 'rxn_class': rxn_classes})
    

In [23]:
# Try to join the two tables on the reaction string: `mapped_rxn_0` on the cleaned
# data side, `mapped_rxn` on the reaction-class side. An inner join keeps only the
# rows whose key is present in both frames.
data_df_temp = pd.merge(
    cleaned_df,
    rxn_classes_df,
    how='inner',
    left_on='mapped_rxn_0',
    right_on='mapped_rxn',
)
# Number of rows that survived the join.
data_df_temp.shape[0]

9

In [67]:
# I used the following command to generate the rxn classification:
# ./namerxn -nomap data/mapped_rxn.smi data/classified_rxn.smi

# I expected -nomap to leave the atom mapping untouched, yet it clearly changed it,
# so the reaction strings can no longer be used as a join key.
# Instead the classes are attached by row position: this trusts that namerxn kept the
# reactions in their original order. Rows that end up without a class are removed.
data_df = cleaned_df.copy().reset_index(drop=True)
# Column assignment aligns on the index, i.e. row i receives the class of line i.
data_df['rxn_class'] = rxn_classes_df['rxn_class']
data_df = data_df.dropna(subset=['rxn_class'])
# reset_index returns a new frame rather than modifying in place, so the result must
# be assigned back; drop=True avoids keeping the old index as an extra column.
data_df = data_df.reset_index(drop=True)
print(len(data_df))

526999


In [72]:
# Remove all the unclassified reactions, i.e. rows whose rxn_class is exactly '0.0'.
# An exact comparison is used on purpose: str.contains("0.0") performs a regex
# substring search, where "." matches any character, so it also discards every
# class code that merely contains such a pattern somewhere inside it.
data_df = data_df[data_df['rxn_class'] != '0.0']
print(len(data_df))

419295


In [80]:
# Count (and print) every entry of the `reagents_0` column that mentions palladium.
# The substrings are tested in this order, stopping at the first one that matches.
palladium_markers = ('pd', 'Pd', 'palladium', 'Palladium')

count = 0
for reagent in data_df['reagents_0']:
    try:
        mentions_palladium = any(marker in reagent for marker in palladium_markers)
    except TypeError:
        # Raised when the entry does not support `in` (e.g. a missing value); skip it.
        continue
    if mentions_palladium:
        count += 1
        print(reagent)

[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd]
[Pd]
[Pd+2]
[Pd]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
Cl[Pd]Cl
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
Cl[Pd]Cl
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd+2]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]
[Pd]

In [81]:
# Report the tally from the loop above; note that only `reagents_0` was scanned.
print('Number of Pd in the reagents columns: ', count)

Number of Pd in the reagents columns:  1008


In [None]:
# 