# Manual Cleaning

The aim of the manual cleaning is to canonicalise the catalysts, reagents, and solvents.
"Canonical" means that there is exactly one representation for each molecule.
Some molecules may be represented in a way that the RDKit canonicalisation won't recognise as the same molecule,
e.g. `[Pd+2]` and `[Pd]`; `[Pd].Ph` and `[Pd].[Pd].Ph.Ph`.

Important note: the examples listed here are only examples; I could handle those cases myself. Your chemistry knowledge goes beyond the examples I have listed, e.g. which metals to look for, or whether mol.mol is the same as mol (e.g. is Ph.Ph the same as Ph? Is Ph.Ph.Pd the same as Ph.Pd?), etc.

In [2]:
import pandas as pd
# Read in the pickled clean data.
# NOTE(security): unpickling can execute arbitrary code, so only load
# "cleaned_data.pkl" when it comes from a trusted source.
# NOTE(review): later cells refer to a 'catalyst_0' column, so df is presumably
# a DataFrame with that column — confirm against the file.
df = pd.read_pickle("cleaned_data.pkl")

### Clean catalysts

In [5]:
# To list every distinct catalyst string in the data, uncomment and run:
# sorted(list(set(df['catalyst_0'].dropna())))

# Entries that are not real catalysts and should simply be removed.
# The data should already have been canonicalised with rdkit.canonsmiles, so
# wrong catalysts are unexpected; if you spot any, collect them here.
catalyst_wrong = []

# Maps a catalyst string as it appears in the data to its manually cleaned SMILES.
catalyst_replacements = {}

In [6]:
# Register the manual replacements in the catalyst_replacements dict.
# Each key is a catalyst string as it appears in the data (including any
# surrounding square brackets or free-text names); each value is the cleaned
# SMILES it should be replaced with.
catalyst_replacements.update({
    # Rhodium acetates
    'CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Rh+3].[Rh+3]': 'CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Rh+2].[Rh+2]',
    '[CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Rh+3]]': 'CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Rh+2].[Rh+2]',
    # Bare [P] bonded to Pd is rewritten as [PH]
    '[CC(C)(C)[P]([Pd][P](C(C)(C)C)(C(C)(C)C)C(C)(C)C)(C(C)(C)C)C(C)(C)C]': 'CC(C)(C)[PH]([Pd][PH](C(C)(C)C)(C(C)(C)C)C(C)(C)C)(C(C)(C)C)C(C)(C)C',
    # Repeated salt units collapse to a single unit
    'CCCC[N+](CCCC)(CCCC)CCCC.CCCC[N+](CCCC)(CCCC)CCCC.CCCC[N+](CCCC)(CCCC)CCCC.[Br-].[Br-].[Br-]': 'CCCC[N+](CCCC)(CCCC)CCCC.[Br-]',
    # Titanium ethoxide variants
    '[CCO.CCO.CCO.CCO.[Ti]]': 'CCO[Ti](OCC)(OCC)OCC',
    '[CC[O-].CC[O-].CC[O-].CC[O-].[Ti+4]]': 'CCO[Ti](OCC)(OCC)OCC',
    # Nickel / palladium chloride complexes
    '[Cl[Ni]Cl.c1ccc(P(CCCP(c2ccccc2)c2ccccc2)c2ccccc2)cc1]': 'Cl[Ni]1(Cl)[P](c2ccccc2)(c2ccccc2)CCC[P]1(c1ccccc1)c1ccccc1',
    '[Cl[Pd](Cl)([P](c1ccccc1)(c1ccccc1)c1ccccc1)[P](c1ccccc1)(c1ccccc1)c1ccccc1]': 'Cl[Pd](Cl)([PH](c1ccccc1)(c1ccccc1)c1ccccc1)[PH](c1ccccc1)(c1ccccc1)c1ccccc1',
    '[Cl[Pd+2](Cl)(Cl)Cl.[Na+].[Na+]]': 'Cl[Pd]Cl',
    # Free-text names for Karstedt's catalyst
    'Karstedt catalyst': 'C[Si](C)(C=C)O[Si](C)(C)C=C.[Pt]',
    "Karstedt's catalyst": 'C[Si](C)(C=C)O[Si](C)(C)C=C.[Pt]',
    # Silver / copper salts and oxides with odd charges
    '[O=C([O-])[O-].[Ag+2]]': 'O=C([O-])[O-].[Ag+].[Ag+]',
    '[O=S(=O)([O-])[O-].[Ag+2]]': 'O=S(=O)([O-])[O-].[Ag+].[Ag+]',
    '[O=[Ag-]]': 'O=[Ag]',
    '[O=[Cu-]]': 'O=[Cu]',
    # Free-text names and abbreviations
    '[Pd on-carbon]': '[C].[Pd]',
    # NOTE(review): "TEA" is ambiguous (triethylamine vs. triethanolamine);
    # the SMILES below contains three CCO arms — confirm this is the intended species.
    '[TEA]': 'OCCN(CCO)CCO',
    '[Ti-superoxide]': 'O=[O-].[Ti]',
    # Palladium / triphenylphosphine variants
    # NOTE(review): this target is dot-separated, whereas the entries below use a
    # Pd-bonded [PH] form — confirm both representations are meant to coexist.
    '[[Pd].c1ccc(P(c2ccccc2)c2ccccc2)cc1]': '[Pd].c1ccc(P(c2ccccc2)c2ccccc2)cc1.c1ccc(P(c2ccccc2)c2ccccc2)cc1.c1ccc(P(c2ccccc2)c2ccccc2)cc1.c1ccc(P(c2ccccc2)c2ccccc2)cc1',
    '[c1ccc([PH](c2ccccc2)(c2ccccc2)[Pd-4]([PH](c2ccccc2)(c2ccccc2)c2ccccc2)([PH](c2ccccc2)(c2ccccc2)c2ccccc2)[PH](c2ccccc2)(c2ccccc2)c2ccccc2)cc1]': 'c1ccc([PH](c2ccccc2)(c2ccccc2)[Pd]([PH](c2ccccc2)(c2ccccc2)c2ccccc2)([PH](c2ccccc2)(c2ccccc2)c2ccccc2)[PH](c2ccccc2)(c2ccccc2)c2ccccc2)cc1',
    '[c1ccc([P]([Pd][P](c2ccccc2)(c2ccccc2)c2ccccc2)(c2ccccc2)c2ccccc2)cc1]': 'c1ccc([PH](c2ccccc2)(c2ccccc2)[Pd]([PH](c2ccccc2)(c2ccccc2)c2ccccc2)([PH](c2ccccc2)(c2ccccc2)c2ccccc2)[PH](c2ccccc2)(c2ccccc2)c2ccccc2)cc1',
    '[c1ccc([P](c2ccccc2)(c2ccccc2)[Pd]([P](c2ccccc2)(c2ccccc2)c2ccccc2)([P](c2ccccc2)(c2ccccc2)c2ccccc2)[P](c2ccccc2)(c2ccccc2)c2ccccc2)cc1]': 'c1ccc([PH](c2ccccc2)(c2ccccc2)[Pd]([PH](c2ccccc2)(c2ccccc2)c2ccccc2)([PH](c2ccccc2)(c2ccccc2)c2ccccc2)[PH](c2ccccc2)(c2ccccc2)c2ccccc2)cc1',
    '[sulfated tin oxide]': 'O=S(O[Sn])(O[Sn])O[Sn]',
    # The key is spelled exactly as written here ("tereakis"); do not correct it.
    '[tereakis(triphenylphosphine)palladium(0)]': 'c1ccc([PH](c2ccccc2)(c2ccccc2)[Pd]([PH](c2ccccc2)(c2ccccc2)c2ccccc2)([PH](c2ccccc2)(c2ccccc2)c2ccccc2)[PH](c2ccccc2)(c2ccccc2)c2ccccc2)cc1',
    '[zeolite]': 'O=[Al]O[Al]=O.O=[Si]=O',
})

# Show the resulting mapping as the cell output.
catalyst_replacements

{'CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Rh+3].[Rh+3]': 'CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Rh+2].[Rh+2]',
 '[CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Rh+3]]': 'CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].CC(=O)[O-].[Rh+2].[Rh+2]',
 '[CC(C)(C)[P]([Pd][P](C(C)(C)C)(C(C)(C)C)C(C)(C)C)(C(C)(C)C)C(C)(C)C]': 'CC(C)(C)[PH]([Pd][PH](C(C)(C)C)(C(C)(C)C)C(C)(C)C)(C(C)(C)C)C(C)(C)C',
 'CCCC[N+](CCCC)(CCCC)CCCC.CCCC[N+](CCCC)(CCCC)CCCC.CCCC[N+](CCCC)(CCCC)CCCC.[Br-].[Br-].[Br-]': 'CCCC[N+](CCCC)(CCCC)CCCC.[Br-]',
 '[CCO.CCO.CCO.CCO.[Ti]]': 'CCO[Ti](OCC)(OCC)OCC',
 '[CC[O-].CC[O-].CC[O-].CC[O-].[Ti+4]]': 'CCO[Ti](OCC)(OCC)OCC',
 '[Cl[Ni]Cl.c1ccc(P(CCCP(c2ccccc2)c2ccccc2)c2ccccc2)cc1]': 'Cl[Ni]1(Cl)[P](c2ccccc2)(c2ccccc2)CCC[P]1(c1ccccc1)c1ccccc1',
 '[Cl[Pd](Cl)([P](c1ccccc1)(c1ccccc1)c1ccccc1)[P](c1ccccc1)(c1ccccc1)c1ccccc1]': 'Cl[Pd](Cl)([PH](c1ccccc1)(c1ccccc1)c1ccccc1)[PH](c1ccccc1)(c1ccccc1)c1ccccc1',
 '[Cl[Pd+2](Cl)(Cl)Cl.[Na+].[Na+]]': 'Cl[Pd]Cl',
 'Karstedt catalyst': 'C[Si](C)(C=C)O[Si](C)(C

In [7]:
# Record any entries that are not real catalysts as you spot them, e.g.
# placeholders and free-text descriptions such as the ones below.
catalyst_wrong.extend([
    'Catalyst A',
    'catalyst',
    'catalyst 1',
    'catalyst A',
    'catalyst VI',
    'reaction mixture',
    'same catalyst',
    'solution',
])

# Show the collected list as the cell output.
catalyst_wrong

['Catalyst A',
 'catalyst',
 'catalyst 1',
 'catalyst A',
 'catalyst VI',
 'reaction mixture',
 'same catalyst',
 'solution']

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6b92abcc-3671-4dc8-8999-6e4ce852d42d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>