In [1]:
# Load the required modules
import rdkit
from rdkit.Chem import PandasTools
from rdkit import Chem

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('white')

In [2]:
from rdkit.Chem.Descriptors import ExactMolWt

In [3]:
def getMW(smi):
    """Return the RDKit exact molecular weight for an InChI string.

    Despite the parameter name, ``smi`` is parsed with ``Chem.MolFromInchi``,
    so callers must pass an InChI string, not a SMILES.

    Returns:
        The value of ``ExactMolWt`` for the parsed molecule, or ``None`` when
        RDKit does not produce a molecule from the input.
    """
    parsed = Chem.MolFromInchi(smi)
    # A falsy parse result means RDKit rejected the string.
    return ExactMolWt(parsed) if parsed else None

In [4]:
def getNum(smi):
    """Return the atom count of the molecule described by an InChI string.

    Despite the parameter name, ``smi`` is parsed with ``Chem.MolFromInchi``,
    so callers must pass an InChI string, not a SMILES.

    Returns:
        ``mol.GetNumAtoms()`` for the parsed molecule, or ``None`` when RDKit
        does not produce a molecule from the input.
    """
    parsed = Chem.MolFromInchi(smi)
    if not parsed:
        return None
    return parsed.GetNumAtoms()

In [5]:
from collections import Counter, defaultdict

def getEleSpread(smiList):
    """Count, per element symbol, how many input structures contain it.

    Args:
        smiList: iterable of InChI strings. Despite the ``smi`` naming, each
            entry is parsed with ``Chem.MolFromInchi``.

    Returns:
        dict mapping element symbol -> number of entries in ``smiList`` whose
        molecule has at least one atom of that element. Entries RDKit cannot
        parse are skipped; a string that occurs several times in ``smiList``
        is counted each time it occurs.
    """
    element_counts = Counter()
    for smi in smiList:
        mol = Chem.MolFromInchi(smi)
        if mol:
            # Use a set so one molecule contributes at most once per element.
            element_counts.update({atom.GetSymbol() for atom in mol.GetAtoms()})
    # Return a plain dict, as before, rather than exposing the Counter.
    return dict(element_counts)

## 1. PhysProp

In [8]:
# Load the processed PhysProp melting-point table from the intermediate
# datasets directory.
df1 = pd.read_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/physprop_processed.csv')


In [9]:
df1.head()

Unnamed: 0,SMILES,InChI,mp,MW,NumAtoms
0,Cl.NC(N)=N,"InChI=1S/CH5N3.ClH/c2-1(3)4;/h(H5,2,3,4);1H",182.3,95.025025,5
1,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,InChI=1S/C22H29FO5/c1-12-8-16-15-5-4-13-9-14(2...,247.5,392.199902,28
2,CC(=O)OCC(=O)[C@@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=...,InChI=1S/C23H30O6/c1-13(24)29-12-19(27)23(28)9...,222.0,402.204239,29
3,CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,174.0,232.084792,17
4,[Br-].C[N+](CCOC(=O)C(O)(C1CCCCC1)c1ccccc1)(CC)CC,"InChI=1S/C21H34NO3.BrH/c1-4-22(3,5-2)16-17-25-...",191.5,427.172206,26


In [10]:
df1.shape

(8656, 5)

In [11]:
len(set(df1['InChI']))

8656

In [12]:
# (Re)compute both descriptor columns from the InChI strings. The helpers
# return None for any InChI that RDKit cannot parse.
df1['MW'] = df1['InChI'].apply(getMW)
df1['NumAtoms'] = df1['InChI'].apply(getNum)

In [13]:
df1.describe()

Unnamed: 0,mp,MW,NumAtoms
count,8656.0,8656.0,8656.0
mean,80.475925,219.073475,14.484404
std,99.353991,109.143682,7.341059
min,-196.0,16.0313,1.0
25%,16.0,144.042259,9.0
50%,80.0,194.057909,13.0
75%,151.5,272.432019,18.0
max,492.5,1237.920161,85.0


In [14]:
# Label every row with its dataset of origin; the per-InChI aggregation later
# in the notebook collects these labels into the 'sources' column.
df1['source'] = 'PhysProp'

In [16]:
print(getEleSpread(df1['InChI'].tolist()).keys())

dict_keys(['N', 'Cl', 'C', 'O', 'F', 'Br', 'P', 'S', 'I', 'Na', 'B', 'Si', 'As', 'K', 'Se'])


In [18]:
# Write df1 back to the same path it was loaded from, replacing the input
# file; index=False leaves the row index out of the CSV.
# NOTE(review): in notebook order 'source' has already been added to df1, so
# it is written too, whereas the other per-dataset saves run before their
# 'source' column exists. Confirm this difference is intended.
df1.to_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/physprop_processed.csv', index=False)

## 2. bradley

In [6]:
# Load the processed Bradley melting-point table from the intermediate
# datasets directory.
df2 = pd.read_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/bradley_processed.csv')


In [7]:
df2.head()

Unnamed: 0,SMILES,InChI,mp,MW,NumAtoms
0,C1(CCC1)C,"InChI=1S/C5H10/c1-5-3-2-4-5/h5H,2-4H2,1H3",-161.51,70.07825,5
1,[O-][N+]#N,InChI=1S/N2O/c1-2-3,-90.8,44.001063,3
2,FS(F)(=O)=O,"InChI=1S/F2O2S/c1-5(2,3)4",-135.8,101.958707,5
3,CC(C)N(CCC(c1ccccn1)(c2ccccc2)C(N)=O)C(C)C,InChI=1S/C21H29N3O/c1-16(2)24(17(3)4)15-13-21(...,94.8,339.231063,25
4,BrBr,InChI=1S/Br2/c1-2,-7.2,157.836674,2


In [8]:
df2.shape

(3025, 5)

In [20]:
len(set(df2['InChI']))

3022

In [21]:
# (Re)compute both descriptor columns from the InChI strings. The helpers
# return None for any InChI that RDKit cannot parse.
df2['MW'] = df2['InChI'].apply(getMW)
df2['NumAtoms'] = df2['InChI'].apply(getNum)

In [9]:
df2.describe()

Unnamed: 0,mp,MW,NumAtoms
count,3025.0,3025.0,3025.0
mean,62.254496,180.178042,11.89686
std,95.663463,79.889082,5.494701
min,-188.0,16.0313,1.0
25%,5.0,129.15175,8.0
50%,63.0,166.062994,11.0
75%,129.0,214.039672,14.0
max,438.0,949.178286,62.0


In [23]:
print(getEleSpread(df2['InChI'].tolist()).keys())

dict_keys(['C', 'O', 'N', 'F', 'S', 'Br', 'Cl', 'I', 'B', 'P', 'Si'])


In [35]:
# Write df2 back to the same path it was loaded from, replacing the input
# file; index=False leaves the row index out of the CSV.
df2.to_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/bradley_processed.csv', index=False)

In [43]:
# Label every row with its dataset of origin. In notebook order this comes
# after the CSV save above, so the saved Bradley file has no 'source' column.
df2['source'] = 'Bradley'

## 3. zang

In [24]:
# The Zang data is stored as two files (zang_processed.csv and
# test_zang_processed.csv); load both so they can be stacked into one frame.
df3_0 = pd.read_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/zang_processed.csv')
df3_1 = pd.read_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/test_zang_processed.csv')


In [25]:
df3_0.head()

Unnamed: 0,SMILES,InChI,mp
0,Cl.NC(N)=N,"InChI=1S/CH5N3.ClH/c2-1(3)4;/h(H5,2,3,4);1H",182.3
1,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,InChI=1S/C22H29FO5/c1-12-8-16-15-5-4-13-9-14(2...,247.5
2,CC(=O)OCC(=O)[C@@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=...,InChI=1S/C23H30O6/c1-13(24)29-12-19(27)23(28)9...,222.0
3,[Br-].C[N+](CCOC(=O)C(O)(C1CCCCC1)c1ccccc1)(CC)CC,"InChI=1S/C21H34NO3.BrH/c1-4-22(3,5-2)16-17-25-...",191.5
4,CN1C(=O)NC(=O)C(CC)(CC)C1=O,InChI=1S/C9H14N2O3/c1-4-9(5-2)6(12)10-8(14)11(...,150.5


In [26]:
df3_0.shape

(6485, 3)

In [27]:
df3_1.head()

Unnamed: 0,SMILES,InChI,mp
0,CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,174.0
1,CC(C)[C@@H](C)/C=C/[C@@H](C)[C@H]1CC[C@H]2/C(/...,InChI=1S/C28H44O/c1-19(2)20(3)9-10-22(5)26-15-...,116.5
2,C[C@]12CCC(=O)C=C1CC[C@@H]1[C@@H]2[C@@H](O)C[C...,InChI=1S/C21H30O4/c1-20-8-7-13(23)9-12(20)3-4-...,181.0
3,ClC(Cl)(Cl)C(c1ccc(Cl)cc1)c1ccc(Cl)cc1,InChI=1S/C14H9Cl5/c15-11-5-1-9(2-6-11)13(14(17...,108.5
4,CCCCC1C(=O)N(c2ccccc2)N(c2ccccc2)C1=O,InChI=1S/C19H20N2O2/c1-2-3-14-17-18(22)20(15-1...,105.0


In [28]:
df3_1.shape

(2133, 3)

In [29]:
# Stack the two Zang files into one frame. Both parts carry their own
# 0..n-1 index, so renumber the rows; otherwise every label of the shorter
# part appears twice and label-based lookups become ambiguous.
df3 = pd.concat([df3_0, df3_1], ignore_index=True)

In [30]:
df3.shape

(8618, 3)

In [31]:
len(set(df3['InChI']))

8618

In [32]:
# Compute both descriptor columns from the InChI strings. The helpers return
# None for any InChI that RDKit cannot parse.
df3['MW'] = df3['InChI'].apply(getMW)
df3['NumAtoms'] = df3['InChI'].apply(getNum)

In [33]:
df3.describe()

Unnamed: 0,mp,MW,NumAtoms
count,8618.0,8618.0,8618.0
mean,80.414677,218.998748,14.485495
std,99.05439,108.782703,7.33075
min,-196.0,16.0313,1.0
25%,16.0,144.045067,9.0
50%,80.0,194.050281,13.0
75%,151.5,272.226049,18.0
max,385.0,1237.920161,85.0


In [34]:
print(getEleSpread(df3['InChI'].tolist()).keys())

dict_keys(['N', 'Cl', 'C', 'O', 'F', 'Br', 'P', 'I', 'S', 'Na', 'B', 'Si', 'As', 'K', 'Se'])


In [36]:
# Save the combined Zang frame (both input files plus the MW/NumAtoms
# columns) under a new name, zang_all_processed.csv; the two input files are
# left as they were. index=False leaves the row index out of the CSV.
df3.to_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/zang_all_processed.csv', index=False)



In [44]:
# Label every row with its dataset of origin. In notebook order this comes
# after the CSV save above, so the saved Zang file has no 'source' column.
df3['source'] = 'zang'

## 4. ONES

In [35]:
# Load the processed table for the fourth dataset (ons_processed.csv) from
# the intermediate datasets directory.
df4 = pd.read_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/ons_processed.csv')


In [36]:
df4.head()

Unnamed: 0,SMILES,InChI,mp
0,C1(CCC1)C,"InChI=1S/C5H10/c1-5-3-2-4-5/h5H,2-4H2,1H3",-161.505
1,CN(C)C,InChI=1S/C3H9N/c1-4(2)3/h1-3H3,-117.04
2,ClC(Cl)(Cl)Cl,"InChI=1S/CCl4/c2-1(3,4)5",-22.9875
3,C#C,InChI=1S/C2H2/c1-2/h1-2H,-80.75
4,C(=C(Cl)Cl)(C(Cl)(Cl)Cl)Cl,"InChI=1S/C3Cl6/c4-1(2(5)6)3(7,8)9",-72.95


In [37]:
df4.shape

(2701, 3)

In [38]:
len(set(df4['InChI']))

2700

In [39]:
# Compute both descriptor columns from the InChI strings. The helpers return
# None for any InChI that RDKit cannot parse.
df4['MW'] = df4['InChI'].apply(getMW)
df4['NumAtoms'] = df4['InChI'].apply(getNum)

In [40]:
df4.describe()

Unnamed: 0,mp,MW,NumAtoms
count,2701.0,2701.0,2701.0
mean,60.144032,175.689381,11.53906
std,93.468972,75.526257,5.065904
min,-187.75,16.0313,1.0
25%,4.75,128.156501,8.0
50%,60.5,164.08373,11.0
75%,123.4525,208.113086,14.0
max,437.65,949.178286,57.0


In [41]:
print(getEleSpread(df4['InChI'].tolist()).keys())

dict_keys(['C', 'N', 'Cl', 'S', 'O', 'Br', 'F', 'I', 'B', 'P', 'Si'])


In [37]:
# Save df4 with its new MW/NumAtoms columns; index=False leaves the row index
# out of the CSV.
# NOTE(review): this writes ones_processed.csv, but the data was read from
# ons_processed.csv. Confirm the different file name is intentional and not
# a typo, since the input file is left without the new columns.
df4.to_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/ones_processed.csv', index=False)



In [45]:
# Label every row with its dataset of origin. In notebook order this comes
# after the CSV save above, so the saved file has no 'source' column.
df4['source'] = 'ONES'

## 5. Tetko dataset

In [42]:
# Load the processed Tetko melting-point table from the intermediate datasets
# directory.
df5 = pd.read_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/tetko_processed.csv')

In [43]:
df5.shape

(210751, 5)

In [44]:
df5.head()

Unnamed: 0,SMILES,InChI,mp,MW,NumAtoms
0,C(CCC)C1=NC=CC2=C(C=CC=C12)[N+](=O)[O-],InChI=1S/C13H14N2O2/c1-2-3-6-12-10-5-4-7-13(15...,69.25,230.105528,17
1,ClC=1N=CC2=CC=CC(=C2C1)N,InChI=1S/C9H7ClN2/c10-9-4-7-6(5-12-9)2-1-3-8(7...,176.5,178.029776,12
2,C(CCCCCCCCCCC)C=1C(C=CC(C1)=O)=S,InChI=1S/C18H28OS/c1-2-3-4-5-6-7-8-9-10-11-12-...,131.5,292.186087,20
3,[N+](=O)([O-])OC[C@@]12CC([C@@H]3[C@]4(C=CC(C=...,InChI=1S/C21H27NO6/c1-12(23)16-5-6-17-15-4-3-1...,163.0,389.183838,28
4,[N+](=O)([O-])OC[C@@]12CC[C@@H]3[C@]4(C=CC(C=C...,InChI=1S/C21H27NO5/c1-13(23)17-5-6-19-16-4-3-1...,149.25,373.188923,27


In [45]:
len(set(df5['InChI']))

210751

In [46]:
# (Re)compute both descriptor columns from the InChI strings. The helpers
# return None for any InChI that RDKit cannot parse.
df5['MW'] = df5['InChI'].apply(getMW)
df5['NumAtoms'] = df5['InChI'].apply(getNum)



In [47]:
print(getEleSpread(df5['InChI'].tolist()).keys())



dict_keys(['N', 'O', 'C', 'Cl', 'S', 'F', 'Br', 'P', 'I', 'Si', 'H'])


In [62]:
# Write df5 back to the same path it was loaded from, replacing the input
# file; index=False leaves the row index out of the CSV.
df5.to_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/tetko_processed.csv', index=False)



In [60]:
df5.describe()

Unnamed: 0,mp,MW,NumAtoms
count,210751.0,210751.0,210751.0
mean,152.667857,357.566737,24.904247
std,58.941494,110.641,7.95185
min,-270.0,118.041865,9.0
25%,108.5,277.943992,19.0
50%,149.0,347.094916,24.0
75%,193.0,424.299241,30.0
max,457.0,1848.023291,112.0


In [63]:
# Label every row with its dataset of origin. In notebook order this comes
# after the CSV save above, so the saved Tetko file has no 'source' column.
df5['source'] = 'Tetko'

# Combine all

In [48]:
# Stack the five per-source tables into one frame. Each input carries its
# own 0-based index, so renumber the rows; otherwise the same label occurs
# up to five times and label-based lookups become ambiguous.
# Rows are not de-duplicated here: a compound reported by several sources
# appears once per source.
df_all = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

In [50]:
len(set(df_all['InChI']))

219739

In [49]:
df_all.head()

Unnamed: 0,SMILES,InChI,mp,MW,NumAtoms,source
0,Cl.NC(N)=N,"InChI=1S/CH5N3.ClH/c2-1(3)4;/h(H5,2,3,4);1H",182.3,95.025025,5,PhysProp
1,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,InChI=1S/C22H29FO5/c1-12-8-16-15-5-4-13-9-14(2...,247.5,392.199902,28,PhysProp
2,CC(=O)OCC(=O)[C@@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=...,InChI=1S/C23H30O6/c1-13(24)29-12-19(27)23(28)9...,222.0,402.204239,29,PhysProp
3,CCC1(c2ccccc2)C(=O)NC(=O)NC1=O,InChI=1S/C12H12N2O3/c1-2-12(8-6-4-3-5-7-8)9(15...,174.0,232.084792,17,PhysProp
4,[Br-].C[N+](CCOC(=O)C(O)(C1CCCCC1)c1ccccc1)(CC)CC,"InChI=1S/C21H34NO3.BrH/c1-4-22(3,5-2)16-17-25-...",191.5,427.172206,26,PhysProp


In [66]:
# Save the stacked table of all five sources, one row per (compound, source)
# record, without de-duplication. index=False leaves the row index out of
# the CSV.
df_all.to_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/mp_combine_all.csv', index=False)



In [67]:
# Number of non-null melting-point values per source, largest source first.
count = (
    df_all.groupby('source')[['mp']]
    .count()
    .sort_values(by='mp', ascending=False)
)

In [68]:
count

Unnamed: 0_level_0,mp
source,Unnamed: 1_level_1
Tetko,210751
PhysProp,8656
zang,8618
Bradley,3025
ONES,2701


In [69]:
# Collapse the repeated measurements of each compound in one groupby pass.
# Filtering df_all once per distinct InChI would rescan the whole frame for
# every compound, which is quadratic in the size of the dataset.
# groupby drops rows whose InChI is NaN; such rows could not be turned into
# a molecule in the later steps anyway.
by_inchi = df_all.groupby('InChI')

# 'std' is the sample standard deviation, so it is NaN for compounds that
# have a single measurement.
mp_stats = by_inchi['mp'].agg(['mean', 'std'])

# Distinct sources reporting each compound, sorted so the saved 'sources'
# column is reproducible from run to run.
source_lists = by_inchi['source'].agg(lambda s: sorted(set(s)))

# Keep the four parallel lists that the following cell builds its frame from.
inchis = mp_stats.index.tolist()
means = mp_stats['mean'].tolist()
stds = mp_stats['std'].tolist()
sources = source_lists.tolist()

In [70]:
# One row per distinct InChI: mean melting point, spread across the
# measurements, and the list of sources that reported the compound.
df_all_use = pd.DataFrame(
    dict(InChI=inchis, MP=means, std=stds, sources=sources)
)

In [71]:
# A compound with a single measurement has an undefined (NaN) standard
# deviation; record that as 0. Only the 'std' column is filled: a frame-wide
# fillna(0) would also turn a missing mean melting point into the value 0.
df_all_use['std'] = df_all_use['std'].fillna(0)

In [72]:
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

from collections import Counter

In [73]:
# Parse every InChI into an RDKit molecule for the element screening below.
df_all_use['Mol'] = list(map(Chem.MolFromInchi, df_all_use['InChI']))



In [74]:
# Distinct element symbols of each molecule, in order of first appearance
# (dict.fromkeys keeps the first occurrence of every symbol).
df_all_use['element_'] = df_all_use['Mol'].apply(
    lambda mol: list(dict.fromkeys(atom.GetSymbol() for atom in mol.GetAtoms()))
)

In [75]:
# Elements a molecule may contain to be kept in the final dataset.
ALLOWABLE_ATOM_SYMBOLS = ['H', 'C', 'N', 'O', 'S', 'F', 'I', 'P', 'Cl', 'Br']

# Keep a molecule when all of its elements are allowed. Use the subset test
# (<=), not the proper-subset test (<): the latter also rejects a molecule
# whose element set is exactly the whole allow-list.
df_all_use['filter'] = df_all_use['element_'].apply(lambda x: set(x) <= set(ALLOWABLE_ATOM_SYMBOLS))

# Drop the rejected rows and renumber the remaining ones.
df_all_use1 = df_all_use[df_all_use['filter']].reset_index(drop=True)

In [76]:
df_all_use1.shape

(218626, 7)

In [77]:
# Save the filtered per-compound table. The mean melting point lives in the
# 'MP' column; the frame has no 'logS' column, so selecting it raised a
# KeyError and nothing was written. index=False leaves the row index out.
df_all_use1[['InChI', 'MP', 'std', 'sources']].to_csv('/scratch/dz1061/gcn/datasets/EXP_database/MeltingPoint/IntermediateDatasets/mp_combine_all_use.csv', index=False)
