In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem

In [2]:
# Load the NiCOlit dataset from its CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="data/NiCOlit.csv")

In [14]:
# Keep only the rows whose `review` column is labelled "Research_Article".
is_research_article = df["review"] == "Research_Article"
df2 = df[is_research_article]

# Derive the temperature and time values with the project featurisation helpers.
from descriptors.dft_featurisation import temperatures, times
T, t = temperatures(df2), times(df2)

# Work on a copy so the filtered frame `df2` is left untouched; `time_h` is a
# new column, whereas `temperature` overwrites the column of the same name.
df3 = df2.copy()
df3["time_h"], df3["temperature"] = t, T

# Translate catalyst precursor names into SMILES strings.
from descriptors.dictionnaries import dict_smiles_catalysts
# Missing precursors are filled with the placeholder 0, which the lookup
# table maps to an empty SMILES string.
dict_smiles_catalysts[0] = ''
df3["catalyst_precursor"] = df2["catalyst_precursor"].fillna(0)
df3["smi_catalyst_precursor"] = [
    dict_smiles_catalysts[precursor] for precursor in df3["catalyst_precursor"]
]

# Translate solvent names into SMILES strings (an unknown solvent raises KeyError).
from descriptors.dictionnaries import dict_solvants
smi_solvent = [dict_solvants[name] for name in df3["solvent"]]
df3["smi_solvent"] = smi_solvent

# Translate effective ligand names into SMILES strings.
from descriptors.dictionnaries import dict_ligand
# Missing ligands are filled with the placeholder 0, which the lookup table
# maps to an empty SMILES string.
dict_ligand[0] = ''
df3["effective_ligand"] = df2["effective_ligand"].fillna(0)
df3["smi_effective_ligand"] = [
    dict_ligand[ligand] for ligand in df3["effective_ligand"]
]

# Build a single yield column, giving priority to the isolated yield.
from descriptors.dft_featurisation import process_yield
iso_y = list(map(process_yield, df3["isolated_yield"]))
ana_y = list(map(process_yield, df3["analytical_yield"]))
# Fall back to the analytical yield only where the processed isolated yield is None.
Y = [iso if iso is not None else ana for iso, ana in zip(iso_y, ana_y)]

df3["yield"] = Y

In [15]:
# Display the column labels of the processed frame to check which columns are available.
df3.columns

Index(['substrate', 'coupling_partner', 'effective_coupling_partner',
       'solvent', 'time', 'temperature', 'catalyst_precursor', 'reagents',
       'effective_reagents', 'effective_reagents_covalent', 'reductant',
       'ligand', 'effective_ligand', 'product', 'analytical_yield',
       'isolated_yield', 'coupling_partner_class', 'DOI', 'origin',
       'eq_substrate', 'eq_coupling_partner', 'eq_catalyst', 'eq_ligand',
       'eq_reagent', '2_steps', 'scheme_table', 'review', 'Mechanism',
       'time_h', 'smi_catalyst_precursor', 'smi_solvent',
       'smi_effective_ligand', 'yield'],
      dtype='object')

In [16]:
# Columns to export, in output order: each species is followed by its
# equivalents column where one is listed, then conditions, product and yield.
list_cols = [
    'substrate',
    'eq_substrate',
    'effective_coupling_partner',
    'eq_coupling_partner',
    'smi_catalyst_precursor',
    'eq_catalyst',
    'smi_solvent',
    'effective_reagents',
    'eq_reagent',
    'smi_effective_ligand',
    'eq_ligand',
    'temperature',
    'time_h',
    'product',
    'yield',
]
# Select those columns from the processed frame, in the order given above.
df_ord = df3[list_cols]

In [18]:
# Write the selected columns to disk. `DataFrame.to_csv` does not create
# missing parent directories, so make sure the output folder exists first;
# otherwise a fresh checkout without `data/ORD/` fails at this cell.
ord_csv_path = Path("data/ORD/NiCOlit_for_ORD.csv")
ord_csv_path.parent.mkdir(parents=True, exist_ok=True)
# The frame's index is still written as the first column (to_csv default).
df_ord.to_csv(ord_csv_path)