In [16]:
from z.utils import read_excel, set_seeds
from z.smi import canonize_rxn, canonize_smi
from pathlib import Path
from collections import defaultdict
import pandas as pd

In [2]:
# Notebook-wide configuration.
SEED = 42  # seed constant for reproducible runs
DIR = Path('raw')  # directory holding the raw Excel workbooks
# The input is split across five workbooks named 1.xlsx ... 5.xlsx inside DIR.
DATAS = [DIR / f'{number}.xlsx' for number in range(1, 6)]

In [3]:
# Seed through the shared SEED constant from the configuration cell instead of
# repeating the literal, so editing SEED really changes the seed that is used.
# (set_seeds is the project helper imported from z.utils.)
set_seeds(SEED)

In [4]:
# Read the workbooks listed in DATAS with the project's read_excel helper,
# report how many rows were loaded, and preview the first few of them.
df = read_excel(DATAS)
print(df.shape[0])
df.head()

6187


Unnamed: 0,reaction,photocatalyst,base,additive,solvent,time(h),yield/%,source,Unnamed: 8,Unnamed: 9,Unnamed: 10,base/acid
0,C=CC1=CC=CC=C1.FC([S+]2C3=C(C=CC=C3)C4=C2C=CC=...,fac-Ir(ppy)3,,,DCE,2.0,78,"Angew. Chem., Int. Ed., 2012, 51, 9567–9571",37.0,1.0,,
1,C=CC1=CC=CC=C1.FC([S+]2C3=C(C=CC=C3)C4=C2C=CC=...,fac-Ir(ppy)3,,,DCE,2.0,75,"Angew. Chem., Int. Ed., 2012, 51, 9567–9571",37.0,2.0,,
2,C=CC1=CC=CC=C1.FC([S+]2C3=C(C=CC=C3)C4=C2C=CC=...,fac-Ir(ppy)3,,,DCE,2.0,76,"Angew. Chem., Int. Ed., 2012, 51, 9567–9571",37.0,3.0,,
3,C=CC1=CC=CC=C1.FC([S+]2C3=C(C=CC=C3)C4=C2C=CC=...,fac-Ir(ppy)3,,,DCE,2.0,84,"Angew. Chem., Int. Ed., 2012, 51, 9567–9571",37.0,4.0,,
4,C=CC1=CC=CC=C1.FC([S+]2C3=C(C=CC=C3)C4=C2C=CC=...,fac-Ir(ppy)3,,,DCE,2.0,51,"Angew. Chem., Int. Ed., 2012, 51, 9567–9571",37.0,5.0,,


In [5]:
# Canonicalize every reaction string and drop the rows for which
# canonize_rxn yields a missing value (reactions that cannot be canonicalized).
# NOTE(review): the original comment also mentions suppressing output, but the
# recorded run still shows RDKit parse errors - confirm whether that is intended.
df = (
    df.assign(canonized_rxns=df['reaction'].apply(canonize_rxn))
      .dropna(subset=['canonized_rxns'])
      .reset_index(drop=True)
)
print(len(df))

RDKit ERROR: [18:54:10] Explicit valence for atom # 3 Br, 3, is greater than permitted
[18:54:10] Explicit valence for atom # 3 Br, 3, is greater than permitted
RDKit ERROR: [18:54:10] Explicit valence for atom # 1 N, 4, is greater than permitted
[18:54:10] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [18:54:10] Explicit valence for atom # 1 N, 4, is greater than permitted
[18:54:10] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [18:54:11] SMILES Parse Error: syntax error while parsing: [H2O18]
RDKit ERROR: [18:54:11] SMILES Parse Error: Failed parsing SMILES '[H2O18]' for input: '[H2O18]'
[18:54:11] SMILES Parse Error: syntax error while parsing: [H2O18]
[18:54:11] SMILES Parse Error: Failed parsing SMILES '[H2O18]' for input: '[H2O18]'
RDKit ERROR: [18:54:11] Explicit valence for atom # 3 Br, 3, is greater than permitted
[18:54:11] Explicit valence for atom # 3 Br, 3, is greater than permitted
RDKit ERROR: [18:54:11] Explicit

6175


In [6]:
"""Numerical for Extra Information"""
class _ZeroForUnknown(dict):
    """Plain dict that answers 0 for a missing key without storing that key."""
    def __missing__(self, key):
        return 0
def get_map_dict(df: pd.DataFrame, col: str):
    """Map every distinct value of ``df[col]`` to a positive integer class id.

    Ids are handed out in the order of ``df[col].value_counts()`` (most
    frequent value first), starting at 1. Id 0 is reserved for "unknown":
    indexing the result with a key it does not contain returns 0.

    Args:
        df: Frame that holds the column to encode.
        col: Name of the column whose distinct values become the keys.

    Returns:
        A dict subclass ``{value: class_id}``. A failed lookup returns 0 but
        does NOT insert the key (a ``defaultdict`` would), so encoding a
        column that contains unseen values such as NaN leaves the mapping -
        and anything later serialized from it - unchanged.
    """
    labels = df[col].value_counts().index.to_list()
    # enumerate from 1 so that 0 stays free for the unknown class
    return _ZeroForUnknown((label, class_id) for class_id, label in enumerate(labels, start=1))

In [7]:
# Encode the photocatalyst label as an integer class id (0 = label not in the map).
# NOTE(review): the recorded value counts list both 'Ir(dF(CF3)ppy)2(bpy)]PF6' and
# '[Ir(dF(CF3)ppy)2(bpy)]PF6'; these look like spelling variants of one catalyst
# but receive different ids here - confirm whether the labels need normalizing.
cat_map_dict = get_map_dict(df, 'photocatalyst')
df['cat_class'] = df['photocatalyst'].apply(cat_map_dict.__getitem__)

In [8]:
# Encode the base label as an integer class id (0 = label not in the map).
base_map_dict = get_map_dict(df, 'base')
df['base_class'] = df['base'].apply(base_map_dict.__getitem__)

In [9]:
# Encode the additive label as an integer class id (0 = label not in the map).
additive_map_dict = get_map_dict(df, 'additive')
df['additive_class'] = df['additive'].apply(additive_map_dict.__getitem__)

In [10]:
# Encode the solvent label as an integer class id (0 = label not in the map).
solvent_map_dict = get_map_dict(df, 'solvent')
df['solvent_class'] = df['solvent'].apply(solvent_map_dict.__getitem__)

In [11]:
import json
# Persist the encoded table; the row count is printed as a quick sanity check.
print(len(df))
df.to_csv('data.csv')
# Save each label -> class id map next to the table as '<name>.json'.
# json.dump returns None, so its result is deliberately not kept.
for stem, mapping in (
    ('cat_map_dict', cat_map_dict),
    ('base_map_dict', base_map_dict),
    ('additive_map_dict', additive_map_dict),
    ('solvent_map_dict', solvent_map_dict),
):
    with open(f'{stem}.json', 'w') as f:
        json.dump(mapping, f)

6175


In [12]:
# Frequency tables of the base and photocatalyst labels. Despite the *_list
# names, both are the Series returned by value_counts().
base_list, cat_list = (df[column].value_counts() for column in ('base', 'photocatalyst'))

In [13]:
# First 20 entries of the base frequency table.
base_list.head(20)

DABCO             283
Cs2CO3            261
K2CO3             190
2,6-lutidine      120
TFA               112
Na2HPO4            93
KOtBu              88
K3PO4              87
Na2CO3             70
/HCl               67
propionic acid     62
DBU                61
TBD                60
Et3N               52
TBADT              51
K2HPO4             48
NaH2PO4            46
AgOAc              46
KH2PO4             44
NaHSO4             44
Name: base, dtype: int64

In [14]:
# First 20 entries of the photocatalyst frequency table.
cat_list.head(20)

fac-Ir(ppy)3                  1422
Autocatalysis                  686
Ru(bpy)3(PF6)2                 451
Ru(bpy)3Cl2                    416
eosin Y                        405
Ir(dF(CF3)ppy)2(bpy)]PF6       344
Mes-Acr+                       261
[Ir(dF(CF3)ppy)2(bpy)]PF6      122
4-CzIPN                        112
Ru(ppy)2(dtbbpy)PF6            106
rose bengal                     89
Ir[dF(CF3)ppy]2(bpy)(PF6)       88
Pd(DPEPhos)Cl2                  79
[Ir(dtbbpy)(ppy)2]PF6           75
Mes-Acr-Me+                     72
TBADT                           67
Mn2(CO)10                       62
Pd(PPh3)4                       60
Acriflavine                     54
5,7,12,14-pentacenetetrone      51
Name: photocatalyst, dtype: int64

In [17]:
# Spot check: canonicalize one reactant mixture and two candidate products.
# The rdkit import is kept from the original cell; nothing below names it directly.
import rdkit
reactant = 'C=CC1=CC=C(C(C)(C)C)C=C1.O=C2N(OC(C(C)(C)C)=O)C(C3=CC=CC=C32)=O'
product1 = 'CC(C)(C)CC(C4=CC=C(C(C)(C)C)C=C4)=O'
product2 = 'CC(C)(C)/C=C/C4=CC=C(C(C)(C)C)C=C4'
for smiles in (reactant, product1, product2):
    print(canonize_smi(smiles))

C=Cc1ccc(C(C)(C)C)cc1.CC(C)(C)C(=O)ON1C(=O)c2ccccc2C1=O
CC(C)(C)CC(=O)c1ccc(C(C)(C)C)cc1
CC(C)(C)/C=C/c1ccc(C(C)(C)C)cc1


In [None]:
# Two example reaction strings built from the reactant and products of the
# previous cell, written as 'reactants|<dot-separated numbers>>>product'.
# As written these are bare string literals: the first is a no-op and only the
# second would be echoed (with quotes) if the cell were run.
# NOTE(review): the output recorded below shows unquoted, canonicalized forms of
# BOTH strings, so this cell presumably once passed them through a canonicalizer
# (e.g. canonize_rxn) and printed the results - confirm and restore if so.
'C=CC1=CC=C(C(C)(C)C)C=C1.O=C2N(OC(C(C)(C)C)=O)C(C3=CC=CC=C32)=O|0.1.0.3>>CC(C)(C)CC(C4=CC=C(C(C)(C)C)C=C4)=O'
'C=CC1=CC=C(C(C)(C)C)C=C1.O=C2N(OC(C(C)(C)C)=O)C(C3=CC=CC=C32)=O|0.1.30.3>>CC(C)(C)/C=C/C4=CC=C(C(C)(C)C)C=C4'

C=Cc1ccc(C(C)(C)C)cc1.CC(C)(C)C(=O)ON1C(=O)c2ccccc2C1=O|0.1.0.3>>CC(C)(C)CC(=O)c1ccc(C(C)(C)C)cc1
C=Cc1ccc(C(C)(C)C)cc1.CC(C)(C)C(=O)ON1C(=O)c2ccccc2C1=O|0.1.30.3>>CC(C)(C)/C=C/c1ccc(C(C)(C)C)cc1