In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
import numpy as np
import pickle
import subprocess



# David Hose's solvents

Read in the PCA (principal component) data from David Hose's (DH) article

In [3]:
# Load the solvent table; the cells below read its SMILES, Name and PC1-PC5 columns.
df = pd.read_csv("DH_solvents.csv")

Canonicalize SMILES and get InChIKeys

In [4]:
# Parse every SMILES string once and reuse the molecule for both derived
# columns (previously each string was parsed twice).
dh_mols = df["SMILES"].apply(Chem.MolFromSmiles)

# MolFromSmiles returns None for strings it cannot parse; fail early and name
# the offending inputs instead of erroring inside the RDKit converters.
dh_unparsed = df.loc[dh_mols.isna(), "SMILES"]
if not dh_unparsed.empty:
    raise ValueError(f"Could not parse SMILES: {dh_unparsed.tolist()}")

df["inchikey"] = dh_mols.apply(Chem.MolToInchiKey)
df["smiles"] = dh_mols.apply(Chem.MolToSmiles)

Save data to csv file for use in modelling

In [7]:
# Map the source column names to the snake_case names used in the output files.
dh_column_names = {
    "Name": "name",
    "PC1": "pc_1",
    "PC2": "pc_2",
    "PC3": "pc_3",
    "PC4": "pc_4",
    "PC5": "pc_5",
}

# Select the identifier and PCA columns, rename them, and keep the first row
# for each InChIKey.
out_df = (
    df[["inchikey", "smiles", *dh_column_names]]
    .rename(columns=dh_column_names)
    .drop_duplicates("inchikey")
)
out_df.to_csv("solvents.csv", index=False)

Save also as pickle file

In [8]:
# Index by InChIKey for the pickle export below. The guard keeps the cell
# re-runnable: once "inchikey" has been moved to the index it is no longer a
# column, and a second set_index call would raise KeyError.
if "inchikey" in out_df.columns:
    out_df = out_df.set_index("inchikey")

In [9]:
# Serialize the table as a nested dict keyed by the frame's index.
with open("solvents.pickle", "wb") as solvents_pickle:
    pickle.dump(out_df.to_dict(orient="index"), solvents_pickle)

# Standard states

Read standard states taken from DH data

In [10]:
# Load the standard-state table; the cells below read its smiles, name,
# standard_state and molar_volume columns.
df = pd.read_csv("standard_states.csv")

Canonicalize SMILES and get InChIKeys

In [11]:
# Parse every SMILES string once and reuse the molecule for both derived
# columns (previously each string was parsed twice).
standard_state_mols = df["smiles"].apply(Chem.MolFromSmiles)

# MolFromSmiles returns None for strings it cannot parse; fail early and name
# the offending inputs instead of erroring inside the RDKit converters.
standard_state_unparsed = df.loc[standard_state_mols.isna(), "smiles"]
if not standard_state_unparsed.empty:
    raise ValueError(f"Could not parse SMILES: {standard_state_unparsed.tolist()}")

df["csmiles"] = standard_state_mols.apply(Chem.MolToSmiles)
df["inchikey"] = standard_state_mols.apply(Chem.MolToInchiKey)

Save pickle file

In [12]:
# Keep the canonical SMILES under the plain "smiles" name, keep the first row
# for each InChIKey, and index the result by InChIKey.
out_df = (
    df[["inchikey", "csmiles", "name", "standard_state", "molar_volume"]]
    .rename(columns={"csmiles": "smiles"})
    .drop_duplicates("inchikey")
    .set_index("inchikey")
)

In [13]:
# Serialize the table as a nested dict keyed by the frame's index.
with open("standard_states.pickle", "wb") as standard_states_pickle:
    pickle.dump(out_df.to_dict(orient="index"), standard_states_pickle)

# Gaussian solvents

Read Gaussian solvents and epsilon values from file

In [14]:
# Load the Gaussian solvent table; the cells below read its name and epsilon columns.
gaussian_solvents_df = pd.read_csv("gaussian_solvents.csv")

Rename "a-ChloroToluene" to "alpha-ChloroToluene" so that OPSIN can parse it

In [15]:
# Replace the abbreviated "a-" prefix with the spelled-out "alpha-" for the one
# affected solvent name; every other name is left as is.
needs_rename = gaussian_solvents_df["name"] == "a-ChloroToluene"
gaussian_solvents_df["name"] = gaussian_solvents_df["name"].mask(needs_rename, "alpha-ChloroToluene")

Run Gaussian names through OPSIN to get SMILES

In [17]:
# Write one solvent name per line, dropping the "-mixture" suffix.
with open("input.txt", "w") as opsin_input:
    opsin_input.writelines(
        solvent.replace("-mixture", "") + "\n" for solvent in gaussian_solvents_df["name"]
    )

In [21]:
# Run OPSIN on input.txt with the -osmi (SMILES) output option, writing output.txt.
# An argument list avoids going through the shell, and check=True raises
# CalledProcessError on a non-zero exit so a failed run cannot leave a stale
# output.txt for the next cell to read.
subprocess.run(
    ["java", "-jar", "opsin-2.4.0-jar-with-dependencies.jar", "-osmi", "input.txt", "output.txt"],
    check=True,
)

CompletedProcess(args='java -jar opsin-2.4.0-jar-with-dependencies.jar -osmi input.txt output.txt', returncode=0)

Read the SMILES output, then rerun OPSIN to get InChIKeys

In [22]:
# Read the OPSIN SMILES output. The context manager closes the handle right
# away, before the next OPSIN run overwrites output.txt.
with open("output.txt") as opsin_output:
    smiles = opsin_output.readlines()

In [23]:
# Run OPSIN on input.txt with the -ostdinchikey output option, overwriting output.txt.
# An argument list avoids going through the shell, and check=True raises
# CalledProcessError on a non-zero exit; without it a failed run would leave
# the previous contents of output.txt to be read as InChIKeys.
subprocess.run(
    ["java", "-jar", "opsin-2.4.0-jar-with-dependencies.jar", "-ostdinchikey", "input.txt", "output.txt"],
    check=True,
)

CompletedProcess(args='java -jar opsin-2.4.0-jar-with-dependencies.jar -ostdinchikey input.txt output.txt', returncode=0)

In [24]:
# Read the OPSIN InChIKey output; the context manager closes the file handle.
with open("output.txt") as opsin_output:
    inchikey = opsin_output.readlines()

Process data from OPSIN

In [25]:
# Remove surrounding whitespace (including the trailing newline) from each line.
smiles = list(map(str.strip, smiles))
inchikey = list(map(str.strip, inchikey))

In [26]:
# Attach the OPSIN results as new columns, in row order.
for column_name, opsin_values in {"smiles": smiles, "inchikey": inchikey}.items():
    gaussian_solvents_df[column_name] = opsin_values

Write CSV file

In [27]:
# Build the output table in one chain. Using assign() on the column subset
# produces a new frame, so canonicalizing the SMILES no longer assigns into a
# slice of gaussian_solvents_df (which triggered SettingWithCopyWarning).
new_gaussian_solvents_df = (
    gaussian_solvents_df[["inchikey", "smiles", "name", "epsilon"]]
    .assign(
        # Round-trip through RDKit to get canonical SMILES.
        smiles=lambda frame: frame["smiles"].apply(Chem.MolFromSmiles).apply(Chem.MolToSmiles)
    )
    # Keep the first row for each InChIKey.
    .drop_duplicates("inchikey")
)

In [28]:
# Persist the de-duplicated Gaussian solvent table (inchikey, smiles, name,
# epsilon) without the row index.
new_gaussian_solvents_df.to_csv("final_gaussian_solvents.csv", index=False)

Write pickle file

In [29]:
# Index by InChIKey so the pickled dict is keyed by InChIKey. The guard keeps
# the cell re-runnable: once "inchikey" has been moved to the index it is no
# longer a column, and a second set_index call would raise KeyError.
if "inchikey" in new_gaussian_solvents_df.columns:
    new_gaussian_solvents_df = new_gaussian_solvents_df.set_index("inchikey")
with open("gaussian_solvents.pickle", "wb") as file:
    pickle.dump(new_gaussian_solvents_df.to_dict('index'), file)

# XTB solvents

Read xtb solvents from text file

In [30]:
# Read the xtb solvent list; the context manager closes the file handle.
with open("xtb_solvents.txt") as xtb_file:
    lines = xtb_file.readlines()

# Each record is "<name>, <epsilon>".
names = []
epsilons = []
for line in lines:
    record = line.strip()
    if not record:
        # Skip blank lines (e.g. an empty line at the end of the file), which
        # would otherwise raise IndexError when the epsilon field is read.
        continue
    fields = record.split(", ")  # split once and reuse both fields
    names.append(fields[0])
    epsilons.append(float(fields[1]))

In [31]:
# Solvent names that are sent to OPSIN for the xtb solvents. The SMILES and
# InChIKeys derived from this list are combined positionally with the `names`
# and `epsilons` read from xtb_solvents.txt when xtb_df is built, so the order
# and length of this list must match that file.
trivial_names = ["acetone",
                 "acetonitrile",
                 "dichloromethane",
                 "chloroform",
                 "carbon disulfide",
                 "dimethyl formamide",
                 "dimethyl sulfoxide",
                 "diethyl ether",
                 "water",
                 "methanol",
                 "n-hexane",
                 "tetrahydrofuran",
                 "toluene",
                ]

Calculate SMILES and InChIKeys with OPSIN

In [33]:
# Write one name per line for OPSIN.
with open("input.txt", "w") as file:
    for name in trivial_names:
        file.write(name.replace("-mixture", "") + "\n")

# Shared prefix of both OPSIN invocations. An argument list avoids going through
# the shell.
opsin_command = ["java", "-jar", "opsin-2.4.0-jar-with-dependencies.jar"]

# SMILES pass. check=True raises CalledProcessError on a non-zero exit so that a
# failed run cannot leave a stale output.txt to be read below.
subprocess.run([*opsin_command, "-osmi", "input.txt", "output.txt"], check=True)
with open("output.txt") as opsin_output:  # closed before the next run overwrites it
    smiles = [line.strip() for line in opsin_output]

# InChIKey pass; overwrites output.txt.
subprocess.run([*opsin_command, "-ostdinchikey", "input.txt", "output.txt"], check=True)
with open("output.txt") as opsin_output:
    inchikey = [line.strip() for line in opsin_output]

In [34]:
# Round-trip each OPSIN SMILES through RDKit to get its canonical form.
canonical_smiles = [
    Chem.MolToSmiles(Chem.MolFromSmiles(opsin_smiles)) for opsin_smiles in smiles
]

In [35]:
# Assemble the xtb solvent table from the parallel lists (combined by position).
xtb_columns = {
    "inchikey": inchikey,
    "smiles": canonical_smiles,
    "name": names,
    "epsilon": epsilons,
}
xtb_df = pd.DataFrame(xtb_columns)

Write to CSV

In [36]:
# Persist the xtb solvent table (inchikey, smiles, name, epsilon) without the row index.
xtb_df.to_csv("xtb_solvents.csv", index=False)

Write to Pickle

In [37]:
# Index a copy of the table by InChIKey and pickle it as a nested dict keyed by
# that index.
xtb_df_for_dict = xtb_df.set_index("inchikey")
with open("xtb_solvents.pickle", "wb") as xtb_pickle:
    pickle.dump(xtb_df_for_dict.to_dict(orient="index"), xtb_pickle)

# Epsilons and HB properties

Read data on epsilons and hydrogen bonding properties

In [39]:
# Load the epsilon / hydrogen-bonding table; the cells below read its SMILES,
# Name, Dielectric, Abraham AH and Abraham BH columns.
df = pd.read_csv("epsilons_and_hb.csv")

Canonicalize SMILES and get InChIKeys

In [40]:
# Parse every SMILES string once and reuse the molecule for both derived
# columns (previously each string was parsed twice).
hb_mols = df["SMILES"].apply(Chem.MolFromSmiles)

# MolFromSmiles returns None for strings it cannot parse; fail early and name
# the offending inputs instead of erroring inside the RDKit converters.
hb_unparsed = df.loc[hb_mols.isna(), "SMILES"]
if not hb_unparsed.empty:
    raise ValueError(f"Could not parse SMILES: {hb_unparsed.tolist()}")

df["csmiles"] = hb_mols.apply(Chem.MolToSmiles)
df["inchikey"] = hb_mols.apply(Chem.MolToInchiKey)

In [41]:
# Map the source column names to the names used in the output files.
hb_column_names = {
    "csmiles": "smiles",
    "Name": "name",
    "Dielectric": "epsilon",
    "Abraham AH": "h_acidity",
    "Abraham BH": "h_basicity",
}

# Select, rename, and keep the first row for each InChIKey.
out_df = (
    df[["inchikey", *hb_column_names]]
    .rename(columns=hb_column_names)
    .drop_duplicates("inchikey")
)

Write epsilons to file

In [42]:
# Epsilon-only view of the table; rows with any missing value are dropped.
epsilon_columns = ["inchikey", "smiles", "name", "epsilon"]
out_df_epsilon = out_df[epsilon_columns].dropna()

In [43]:
# Index by InChIKey before writing (the index is written as the first CSV
# column). The guard keeps the cell re-runnable: once "inchikey" has been moved
# to the index it is no longer a column, and a second set_index call would
# raise KeyError.
if "inchikey" in out_df_epsilon.columns:
    out_df_epsilon = out_df_epsilon.set_index("inchikey")
out_df_epsilon.to_csv("final_epsilon.csv")

In [44]:
# Serialize the epsilon table as a nested dict keyed by the frame's index.
with open("epsilon.pickle", "wb") as epsilon_pickle:
    pickle.dump(out_df_epsilon.to_dict(orient="index"), epsilon_pickle)

Write out epsilons + h-bonding to file

In [45]:
# Independent copy of the table restricted to rows with no missing values;
# out_df itself is left untouched.
out_df_h = out_df.dropna().copy()

Standardize data so that meaningful distances can be computed later

In [47]:
# Columns that are standardized in place.
hb_feature_columns = ['epsilon', 'h_acidity', 'h_basicity']

# Fit the scaler on the three feature columns and overwrite them with the
# scaled values; std_scale stays bound to the fitted scaler.
std_scale = StandardScaler()
out_df_h[hb_feature_columns] = std_scale.fit_transform(out_df_h[hb_feature_columns])

In [48]:
# Index by InChIKey before writing (the index is written as the first CSV
# column). The guard keeps the cell re-runnable: once "inchikey" has been moved
# to the index it is no longer a column, and a second set_index call would
# raise KeyError.
if "inchikey" in out_df_h.columns:
    out_df_h = out_df_h.set_index("inchikey")
out_df_h.to_csv("final_epsilon_h.csv")

In [50]:
# Serialize the standardized table as a nested dict keyed by the frame's index.
with open("epsilon_h.pickle", "wb") as epsilon_h_pickle:
    pickle.dump(out_df_h.to_dict(orient="index"), epsilon_h_pickle)