In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
import numpy as np
import pickle
import subprocess

# David Hose's solvents

In [41]:
# Load the raw solvent table; the cells below read its SMILES, Name and
# PC1-PC5 columns.
df = pd.read_csv(filepath_or_buffer="DH_solvents.csv")

In [42]:
# Parse every SMILES string once and derive both identifiers from the same Mol
# object, instead of running the RDKit parser twice per row.
mols = df["SMILES"].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a string it cannot parse; report the
# offending entries explicitly rather than failing inside the converters below.
unparsed = df.loc[mols.isna(), "SMILES"].tolist()
if unparsed:
    raise ValueError("RDKit could not parse these SMILES: {}".format(unparsed))

df["inchikey"] = mols.apply(Chem.MolToInchiKey)
df["smiles"] = mols.apply(Chem.MolToSmiles)

In [43]:
# Select the identifier, name and PC1-PC5 columns, give them the lower-case
# names used in the exported files, keep the first row per InChIKey, and write
# the result to solvents.csv.
out_df = (
    df[["inchikey", "smiles", "Name", "PC1", "PC2", "PC3", "PC4", "PC5"]]
    .rename(
        columns={
            "Name": "name",
            "PC1": "pc_1",
            "PC2": "pc_2",
            "PC3": "pc_3",
            "PC4": "pc_4",
            "PC5": "pc_5",
        }
    )
    .drop_duplicates(subset="inchikey")
)
out_df.to_csv("solvents.csv", index=False)

In [44]:
# Index by InChIKey so that to_dict('index') in the next cell produces one
# record per InChIKey.
out_df = out_df.set_index(keys="inchikey")

In [45]:
# Serialise the table as a nested dict: {index value: {column: value}}.
with open("solvents.pickle", "wb") as handle:
    pickle.dump(out_df.to_dict(orient="index"), handle)

# Standard states

In [4]:
# Load the standard-state table; the cells below read its smiles, name,
# standard_state and molar_volume columns.
df = pd.read_csv(filepath_or_buffer="standard_states.csv")

In [5]:
# Parse every SMILES string once and derive both identifiers from the same Mol
# object, instead of running the RDKit parser twice per row.
mols = df["smiles"].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a string it cannot parse; report the
# offending entries explicitly rather than failing inside the converters below.
unparsed = df.loc[mols.isna(), "smiles"].tolist()
if unparsed:
    raise ValueError("RDKit could not parse these SMILES: {}".format(unparsed))

df["csmiles"] = mols.apply(Chem.MolToSmiles)
df["inchikey"] = mols.apply(Chem.MolToInchiKey)

In [6]:
# Keep the canonical SMILES (exported under the name "smiles") together with
# the standard-state columns, keep the first row per InChIKey, and index the
# result by InChIKey.
out_df = (
    df[["inchikey", "csmiles", "name", "standard_state", "molar_volume"]]
    .rename(columns={"csmiles": "smiles"})
    .drop_duplicates(subset="inchikey")
    .set_index(keys="inchikey")
)

In [7]:
# Serialise the table as a nested dict: {inchikey: {column: value}}.
with open("standard_states.pickle", "wb") as handle:
    pickle.dump(out_df.to_dict(orient="index"), handle)

In [8]:
# Read standard_states.pickle back into `d`.
with open("standard_states.pickle", "rb") as handle:
    d = pickle.load(handle)

# Gaussian solvents

In [53]:
# Load the Gaussian solvent list; the cells below read its name and epsilon
# columns.
gaussian_solvents_df = pd.read_csv(filepath_or_buffer="gaussian_solvents.csv")

In [54]:
# Spell out the "a-" prefix of a-ChloroToluene as "alpha-" before the names are
# written out for OPSIN (presumably so OPSIN can parse it -- TODO confirm).
gaussian_solvents_df.loc[
    gaussian_solvents_df["name"].eq("a-ChloroToluene"), "name"
] = "alpha-ChloroToluene"

In [55]:
# Write one solvent name per line to input.txt, dropping any "-mixture" suffix.
with open("input.txt", "w") as handle:
    handle.writelines(
        solvent_name.replace("-mixture", "") + "\n"
        for solvent_name in gaussian_solvents_df["name"]
    )

In [56]:
# Convert the names in input.txt to SMILES with OPSIN.
# check=True makes a failed run (missing java/jar, non-zero exit) raise
# CalledProcessError; without it the next cell would silently read whatever an
# earlier run left behind in output.txt.
subprocess.run(
    "java -jar opsin-2.4.0-jar-with-dependencies.jar -osmi input.txt output.txt",
    shell=True,
    check=True,
)

CompletedProcess(args='java -jar opsin-2.4.0-jar-with-dependencies.jar -osmi input.txt output.txt', returncode=0)

In [57]:
# Read OPSIN's SMILES output; the context manager closes the file handle, which
# the bare open(...).readlines() call left open.
with open("output.txt") as file:
    smiles = file.readlines()

In [58]:
# Convert the names in input.txt to standard InChIKeys with OPSIN. This
# overwrites the same output.txt that the SMILES run wrote, so check=True is
# needed: if this run failed silently, the next cell would read the SMILES
# output as if it were InChIKeys.
subprocess.run(
    "java -jar opsin-2.4.0-jar-with-dependencies.jar -ostdinchikey input.txt output.txt",
    shell=True,
    check=True,
)

CompletedProcess(args='java -jar opsin-2.4.0-jar-with-dependencies.jar -ostdinchikey input.txt output.txt', returncode=0)

In [59]:
# Read OPSIN's InChIKey output; the context manager closes the file handle,
# which the bare open(...).readlines() call left open.
with open("output.txt") as file:
    inchikey = file.readlines()

In [60]:
# Remove the trailing newline (and any surrounding whitespace) from every
# line read from OPSIN's output.
smiles = list(map(str.strip, smiles))
inchikey = list(map(str.strip, inchikey))

In [61]:
# Attach OPSIN's results as new columns. They are matched to the rows by
# position, i.e. by the order in which the names were written to input.txt.
gaussian_solvents_df = gaussian_solvents_df.assign(
    smiles=smiles,
    inchikey=inchikey,
)

In [62]:
# Build the export table in one chain. `.assign` works on its own copy of the
# column selection, so canonicalising the SMILES no longer assigns into a frame
# derived from gaussian_solvents_df (which raised SettingWithCopyWarning).
new_gaussian_solvents_df = (
    gaussian_solvents_df[["inchikey", "smiles", "name", "epsilon"]]
    .assign(
        # Round-trip through RDKit to get canonical SMILES.
        smiles=lambda frame: frame["smiles"]
        .apply(Chem.MolFromSmiles)
        .apply(Chem.MolToSmiles)
    )
    # Keep the first row for each InChIKey.
    .drop_duplicates(subset="inchikey")
)

In [63]:
# Write the de-duplicated table to CSV without the positional index.
new_gaussian_solvents_df.to_csv(
    path_or_buf="final_gaussian_solvents.csv",
    index=False,
)

In [64]:
# Index by InChIKey and serialise as a nested dict: {inchikey: {column: value}}.
new_gaussian_solvents_df = new_gaussian_solvents_df.set_index(keys="inchikey")
with open("gaussian_solvents.pickle", "wb") as handle:
    pickle.dump(new_gaussian_solvents_df.to_dict(orient="index"), handle)

In [65]:
# Read gaussian_solvents.pickle back into `d`.
with open("gaussian_solvents.pickle", "rb") as handle:
    d = pickle.load(handle)

In [101]:
# Look up one record by InChIKey (the saved output names it Acetonitrile).
# NOTE(review): the saved output below contains h_acidity/h_basicity, which the
# Gaussian table written above does not have; its execution count (In [101])
# suggests `d` held epsilon_h.pickle when this cell was last run.
d["WEVYAHXRMPXWCK-UHFFFAOYSA-N"]

{'smiles': 'CC#N',
 'name': 'Acetonitrile',
 'epsilon': 0.7146352376009412,
 'h_acidity': -0.2014022187135757,
 'h_basicity': -0.18566352217102094}

# XTB solvents

In [69]:
# Parse xtb_solvents.txt, which holds one "<name>, <epsilon>" record per line,
# into two parallel lists.
# The context manager closes the file handle, which the bare
# open(...).readlines() call left open.
with open("xtb_solvents.txt") as file:
    lines = file.readlines()

names = []
epsilons = []
for line in lines:
    record = line.strip()
    if not record:
        # A blank (e.g. trailing) line carries no data; indexing its split
        # result would raise IndexError.
        continue
    # Split once and reuse the fields for both the name and the epsilon.
    fields = record.split(", ")
    names.append(fields[0])
    epsilons.append(float(fields[1]))

In [70]:
# Names handed to OPSIN for the XTB solvents. The table built further down
# pairs them by position with the names read from xtb_solvents.txt, so the
# order here has to follow that file.
trivial_names = [
    "acetone",
    "acetonitrile",
    "dichloromethane",
    "chloroform",
    "carbon disulfide",
    "dimethyl formamide",
    "dimethyl sulfoxide",
    "diethyl ether",
    "water",
    "methanol",
    "n-hexane",
    "tetrahydrofuran",
    "toluene",
]

In [71]:
def _run_opsin(output_format):
    """Run OPSIN on input.txt and return the stripped lines of output.txt.

    Parameters
    ----------
    output_format : str
        Value appended to OPSIN's ``-o`` option (``"smi"`` or
        ``"stdinchikey"`` in this notebook).

    Returns
    -------
    list of str
        One entry per line of output.txt, with surrounding whitespace removed.

    Raises
    ------
    subprocess.CalledProcessError
        If the OPSIN command exits with a non-zero status.
    """
    # check=True: both formats are written to the same output.txt, so a failed
    # run must not be ignored or stale results from the previous run are read.
    subprocess.run(
        "java -jar opsin-2.4.0-jar-with-dependencies.jar -o"
        + output_format
        + " input.txt output.txt",
        shell=True,
        check=True,
    )
    # The context manager closes the handle that open(...).readlines() leaked.
    with open("output.txt") as file:
        return [line.strip() for line in file]


# Write one name per line for OPSIN, dropping any "-mixture" suffix.
with open("input.txt", "w") as file:
    for name in trivial_names:
        file.write(name.replace("-mixture", "") + "\n")

smiles = _run_opsin("smi")
inchikey = _run_opsin("stdinchikey")

In [72]:
# Round-trip every OPSIN SMILES through RDKit to obtain its canonical form.
canonical_smiles = [
    Chem.MolToSmiles(Chem.MolFromSmiles(opsin_smiles)) for opsin_smiles in smiles
]

In [73]:
# Columns are combined by position: inchikey/smiles come from OPSIN's output
# for trivial_names, name/epsilon from xtb_solvents.txt.
xtb_df = pd.DataFrame(
    data={
        "inchikey": inchikey,
        "smiles": canonical_smiles,
        "name": names,
        "epsilon": epsilons,
    }
)

In [97]:
# Scratch check: InChIKey that RDKit computes for the SMILES "CC#N", for
# comparison with the acetonitrile row of the table.
Chem.MolToInchiKey(Chem.MolFromSmiles("CC#N"))

'WEVYAHXRMPXWCK-UHFFFAOYSA-N'

In [75]:
# Display the assembled table so the positional pairing of names and
# identifiers can be checked by eye.
xtb_df

Unnamed: 0,inchikey,smiles,name,epsilon
0,CSCPPACGZOOCGX-UHFFFAOYSA-N,CC(C)=O,acetone,20.7
1,WEVYAHXRMPXWCK-UHFFFAOYSA-N,CC#N,acetonitrile,37.5
2,YMWUJEATGCHHMB-UHFFFAOYSA-N,ClCCl,ch2cl2,8.93
3,HEDRZPFGACZZDS-UHFFFAOYSA-N,ClC(Cl)Cl,chcl3,4.81
4,QGJOPFRUJISHPQ-UHFFFAOYSA-N,S=C=S,cs2,2.64
5,ZMXDDKWLCZADIW-UHFFFAOYSA-N,CN(C)C=O,dmf,37.0
6,IAZDPXIOMUYVGZ-UHFFFAOYSA-N,CS(C)=O,dmso,46.68
7,RTZKZFJDLAIYFH-UHFFFAOYSA-N,CCOCC,ether,4.33
8,XLYOFNOQVPJJNP-UHFFFAOYSA-N,O,water,80.2
9,OKKJLVBELUTLKV-UHFFFAOYSA-N,CO,methanol,32.7


In [76]:
# Write the XTB solvent table to CSV without the positional index.
xtb_df.to_csv(path_or_buf="xtb_solvents.csv", index=False)

In [77]:
# Index by InChIKey and serialise as a nested dict: {inchikey: {column: value}}.
xtb_df_for_dict = xtb_df.set_index(keys="inchikey")
with open("xtb_solvents.pickle", "wb") as handle:
    pickle.dump(xtb_df_for_dict.to_dict(orient="index"), handle)

# Epsilons and hydrogen-bonding (HB) properties

In [85]:
# Load the table of dielectric constants and Abraham parameters; the cells
# below read its SMILES, Name, Dielectric, "Abraham AH" and "Abraham BH" columns.
df = pd.read_csv(filepath_or_buffer="epsilons_and_hb.csv")

In [86]:
# Parse every SMILES string once and derive both identifiers from the same Mol
# object, instead of running the RDKit parser twice per row.
mols = df["SMILES"].apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for a string it cannot parse; report the
# offending entries explicitly rather than failing inside the converters below.
unparsed = df.loc[mols.isna(), "SMILES"].tolist()
if unparsed:
    raise ValueError("RDKit could not parse these SMILES: {}".format(unparsed))

df["csmiles"] = mols.apply(Chem.MolToSmiles)
df["inchikey"] = mols.apply(Chem.MolToInchiKey)

In [87]:
# Select the identifier and property columns, give them the short names used
# in the exported files, and keep the first row per InChIKey.
out_df = (
    df[["inchikey", "csmiles", "Name", "Dielectric", "Abraham AH", "Abraham BH"]]
    .rename(
        columns={
            "csmiles": "smiles",
            "Name": "name",
            "Dielectric": "epsilon",
            "Abraham AH": "h_acidity",
            "Abraham BH": "h_basicity",
        }
    )
    .drop_duplicates(subset="inchikey")
)

### Write out epsilons

In [88]:
# Epsilon-only view: drop the hydrogen-bonding columns first, then discard rows
# that still have a missing value in any of the remaining columns.
out_df_epsilon = out_df[["inchikey", "smiles", "name", "epsilon"]].dropna()

In [89]:
# Index by InChIKey; to_csv keeps the index by default, so the InChIKey is
# written as the first column.
out_df_epsilon = out_df_epsilon.set_index(keys="inchikey")
out_df_epsilon.to_csv(path_or_buf="final_epsilon.csv")

In [90]:
# Serialise the table as a nested dict: {inchikey: {column: value}}.
with open("epsilon.pickle", "wb") as handle:
    pickle.dump(out_df_epsilon.to_dict(orient="index"), handle)

In [91]:
# Read epsilon.pickle back into `d`.
with open("epsilon.pickle", "rb") as handle:
    d = pickle.load(handle)

### Write out epsilons + h-bonding

In [92]:
# Independent copy of the full table restricted to rows with no missing values;
# the explicit copy keeps the later column assignment from touching out_df.
out_df_h = out_df.dropna().copy()

In [93]:
# Standardise the three property columns with a StandardScaler fitted on these
# same rows; the fitted scaler stays available as `std_scale`.
std_scale = StandardScaler()
out_df_h[['epsilon', 'h_acidity', 'h_basicity']] = std_scale.fit_transform(
    out_df_h[['epsilon', 'h_acidity', 'h_basicity']]
)

In [94]:
# Index by InChIKey; to_csv keeps the index by default, so the InChIKey is
# written as the first column.
out_df_h = out_df_h.set_index(keys="inchikey")
out_df_h.to_csv(path_or_buf="final_epsilon_h.csv")

In [95]:
# Serialise the scaled table as a nested dict: {inchikey: {column: value}}.
with open("epsilon_h.pickle", "wb") as handle:
    pickle.dump(out_df_h.to_dict(orient="index"), handle)

In [96]:
# Read epsilon_h.pickle back into `d`.
with open("epsilon_h.pickle", "rb") as handle:
    d = pickle.load(handle)

In [102]:
# Scratch check: InChIKey that RDKit computes for the SMILES "C1CCCC1"
# (a five-membered carbon ring).
Chem.MolToInchiKey(Chem.MolFromSmiles("C1CCCC1"))

'RGSFGYAAUTVSQA-UHFFFAOYSA-N'