<small><i>This notebook was put together by [Wesley Beckner](http://wesleybeckner.github.io/).</i></small>

<a id='top'></a>

## 2D descriptor selection

Jump to JSON parsing [here](#json)

Jump to descriptor generation [here](#descriptors)

Jump to the dataset check [here](#check)

In [3]:
import json
import os
import sys
import pandas as pd
import numpy as np
from __future__ import print_function

#rdkit modules
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator as Calculator


#scikit modules
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

#plot modules and formatting
import matplotlib.pyplot as plt
%matplotlib inline
# Give every figure a white background.
plt.rc("figure", facecolor="white")

# rc-style settings collected in one dict. This cell only defines `params`;
# it does not apply it to matplotlib.
params = {
    key: 20
    for key in ('axes.labelsize', 'font.size', 'legend.fontsize',
                'xtick.labelsize', 'ytick.labelsize')
}
params['lines.markersize'] = 10
params['text.usetex'] = False
# Twenty RGB colours, given with 0-255 integer channels.
tableau20 = [
    (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
    (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
    (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
    (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
    (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229),
]
# Rescale each channel to the [0, 1] range, which is the format matplotlib
# accepts. The slice assignment updates the same list object in place.
tableau20[:] = [
    (red / 255., green / 255., blue / 255.)
    for (red, green, blue) in tableau20
]

# Make the parent directory importable (once) so the `import salty` that
# follows can find our checkName code.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
import salty

<a id='json'></a>

# Parse JSON files

[back to top](#top)

In [2]:
### Collect every DENSITY JSON file into one table and save it as density.csv.
# One frame is built per file; the frames are gathered in a list and
# concatenated once at the end instead of re-concatenating inside the loops.
N_DENSITY_FILES = 2266  # files are named 1.json ... 2266.json
salt_frames = []

for file_number in range(1, N_DENSITY_FILES + 1):
    with open("../salty/data/DENSITY/%s.json" % file_number) as json_file:
        json_full = json.load(json_file)

    # 'data' holds the measurement rows, 'dhead' the column headers and
    # 'components' the description of the salt.
    json_data = pd.DataFrame(json_full['data'])
    json_headers = json_full['dhead']

    # The salt name is the 4th field of the first component (positional).
    # NOTE(review): presumably this is the component's name field; a label
    # lookup would be safer than a position, which depends on column
    # ordering -- confirm the key against the JSON files.
    salt_name = pd.DataFrame(json_full['components']).iloc[0, 3]
    print(salt_name)

    # Every cell is a sequence whose first entry is kept; the column is named
    # after the first entry of its header. Some of the JSON files are missing
    # pressure data, so the set of columns varies from file to file.
    column_series = [
        pd.Series([cell[0] for cell in json_data.iloc[:, position]],
                  name=json_headers[position][0])
        for position in range(json_data.shape[1])
    ]
    if not column_series:
        # nothing measured in this file
        continue

    # Reversed so the column order matches the previous prepend-style build.
    salt_frame = pd.concat(column_series[::-1], axis=1)
    salt_frame['salt_name'] = salt_name
    salt_frames.append(salt_frame)

# Newest file first, matching the row order of the previous incremental build.
outer_old = pd.concat(salt_frames[::-1], axis=0)
print(outer_old)
outer_old.to_csv('../salty/data/density.csv', index=False)

1-hexyl-3-methylimidazolium bis[(trifluoromethyl)sulfonyl]imide
1-butyl-3-methylimidazolium tetrafluoroborate
1-butyl-3-methylimidazolium methylsulfate
1-ethyl-3-methylimidazolium ethyl sulfate
2-hydroxy-N-methylethanaminium pentanoate
2-hydroxy-N-methylethanaminium propionate
N-methyl-2-hydroxyethylammonium acetate
2-hydroxy-N-methylethanaminium formate
2-hydroxy-N-methylethanaminium isobutyrate
N-methyl-2-hydroxyethylammonium butanoate
1-butyl-3-methylimidazolium bis(trifluoromethylsulfonyl)imide
1-ethyl-3-methylimidazolium ethyl sulfate
1-butyl-3-methylimidazolium tetrafluoroborate
1-butyl-3-methylimidazolium hexafluorophosphate
1-hexyl-3-methylimidazolium hexafluorophosphate
1-ethyl-3-methylimidazolium tetrafluoroborate
1-octyl-3-methylimidazolium hexafluorophosphate
1-hexyl-3-methylimidazolium tetrafluoroborate
tetradecyl(trihexyl)phosphonium dicyanamide
1-butyl-3-methylimidazolium bis(trifluoromethylsulfonyl)imide
1-butyl-3-methylimidazolium octyl sulfate
1-butyl-3-methylimidazol

KeyboardInterrupt: 

In [None]:
### Read in the density csv and add cation_name, anion_name, cation_SMILES
### and anion_SMILES columns. Each salt_name is split on whitespace: the first
### token is the cation, everything after it is the anion. Both names are
### passed to salty.checkName and the result is stored in the SMILES columns.
salts = pd.read_csv('../salty/data/density.csv', delimiter=',')
for new_column in ('cation_name', 'anion_name', 'cation_SMILES', 'anion_SMILES'):
    # object dtype: these columns receive strings, so avoid starting them as
    # float64 NaN columns that must be upcast on first assignment.
    salts[new_column] = pd.Series(np.nan, index=salts.index, dtype=object)

for i in range(salts.shape[0]):
    # read_csv yields a default 0..n-1 index, so label i == position i.
    name_parts = salts['salt_name'].iloc[i].split()

    cation_name = name_parts[0]
    salts.loc[i, 'cation_name'] = cation_name
    try:
        salts.loc[i, 'cation_SMILES'] = salty.checkName(cation_name)
    except Exception as err:
        # Best effort: leave the SMILES empty, but say so. Catching Exception
        # (not a bare except) keeps the loop interruptible.
        print("cation lookup failed for %r: %s" % (cation_name, err))

    anion_name = np.nan
    if len(name_parts) == 2:
        anion_name = name_parts[1]
    elif len(name_parts) == 1:
        print("only one compound listed")
    else:
        print("long anion name")
        # Keep every remaining token; taking only tokens 1 and 2 would
        # truncate anion names made of three or more words.
        anion_name = " ".join(name_parts[1:])
    salts.loc[i, 'anion_name'] = anion_name

    try:
        salts.loc[i, 'anion_SMILES'] = salty.checkName(anion_name)
    except Exception as err:
        print("anion lookup failed for %r: %s" % (anion_name, err))

query Empty DataFrame
Columns: [Unnamed: 0, cation_SMILES, anion_SMILES, salt_SMILES, cation_name, anion_name, salt_name]
Index: [] not found
user has queried with a name
your query has returned C(#N)[N-]C#N
user has queried with a name
your query has returned CCN1C=C[N+](=C1)C
user has queried with a name
your query has returned C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F
user has queried with a name
your query has returned CCCCCCCCCCCCCC[P+](CCCCCC)(CCCCCC)CCCCCC
user has queried with a name
your query has returned [Cl-]
query Empty DataFrame
Columns: [Unnamed: 0, cation_SMILES, anion_SMILES, salt_SMILES, cation_name, anion_name, salt_name]
Index: [] not found
query Empty DataFrame
Columns: [Unnamed: 0, cation_SMILES, anion_SMILES, salt_SMILES, cation_name, anion_name, salt_name]
Index: [] not found
query Empty DataFrame
Columns: [Unnamed: 0, cation_SMILES, anion_SMILES, salt_SMILES, cation_name, anion_name, salt_name]
Index: [] not found
user has queried with a name
your query has retu

In [6]:
# Save the salts table (names plus looked-up SMILES) without the index column.
salts.to_csv('../salty/data/salts_with_smiles.csv', index=False)

In [7]:
### Remove all the salts not found in our database and save the truncated
### salt data back to the same csv.
salts = pd.read_csv('../salty/data/salts_with_smiles.csv', delimiter=',')

smiles_columns = ['cation_SMILES', 'anion_SMILES']
# The csv holds "0" or "0.0" where a lookup came back empty ...
not_found = salts[smiles_columns].isin(["0", "0.0"]).any(axis=1)
# ... and an empty cell (NaN) where no SMILES was stored at all. Those rows
# would otherwise pass the filter and end up with a NaN salt_SMILES.
missing = salts[smiles_columns].isnull().any(axis=1)

# .copy() so the new column is added to an independent frame, not a slice.
salts = salts.loc[~(not_found | missing)].copy()
salts['salt_SMILES'] = salts['cation_SMILES'] + " " + salts['anion_SMILES']
salts.to_csv('../salty/data/salts_with_smiles.csv', index=False)

<a id='descriptors'></a>

# Create Descriptors

[back to top](#top)

In [9]:
###based on coco's Deslist
# Load the salts table and the list of descriptor names (one per line of the
# Deslist file), then build the descriptor calculator from that list.

data = pd.read_csv('../salty/data/salts_with_smiles.csv', delimiter=',')
print(data.columns)
n = data.shape[0]  # number of rows (measurements)

# `with` closes the file once the names are read.
with open('../salty/data/Deslist', 'r') as f:
    Deslist = [line.strip('\n\t') for line in f]

calc = Calculator(Deslist)
D = len(Deslist)  # descriptors per ion
# One descriptor set for the cation, one for the anion, plus six further
# columns.
d = 2 * D + 6
print(n, d)

Index([u'Molar volume, m<SUP>3</SUP>/mol', u'Pressure, kPa',
       u'Specific density, kg/m<SUP>3</SUP>',
       u'Specific volume, m<SUP>3</SUP>/kg', u'Temperature, K', u'salt_name',
       u'cation_name', u'anion_name', u'cation_SMILES', u'anion_SMILES',
       u'salt_SMILES'],
      dtype='object')
9172 194


In [10]:
# Build the descriptor table: D cation descriptors, D anion descriptors, three
# name columns, then temperature, pressure and density (d columns in total).

# Numeric block only. The text columns are added to the DataFrame afterwards,
# so no string is ever written into a float64 column.
descriptor_values = np.zeros((n, 2 * D))
for i in range(n):
    cation = Chem.MolFromSmiles(data['cation_SMILES'][i])
    anion = Chem.MolFromSmiles(data['anion_SMILES'][i])
    descriptor_values[i, :D] = calc.CalcDescriptors(cation)
    descriptor_values[i, D:] = calc.CalcDescriptors(anion)

cols_cat = [s + "-cation" for s in Deslist]
cols_ani = [s + "-anion" for s in Deslist]
cols = cols_cat + cols_ani + ["salt_name" , "NAME_CAT", "NAME_ANI", "Temperature_K" , "Pressure_kPa", "Density_kg/m"]

X = pd.DataFrame(descriptor_values, columns=cols_cat + cols_ani)

# Trailing columns are appended in the same order as in `cols`.
X["salt_name"] = data['salt_name']
X["NAME_CAT"] = data['cation_name']
X["NAME_ANI"] = data['anion_name']
# Cast to float so the values are stored as they were in the all-float array.
X["Temperature_K"] = data['Temperature, K'].astype(float)
X["Pressure_kPa"] = data['Pressure, kPa'].astype(float)
X["Density_kg/m"] = data['Specific density, kg/m<SUP>3</SUP>'].astype(float)

print(X.shape)

X.to_csv('../salty/data/salts_with_descriptors.csv', index=False)

(9172, 194)


<a id='check'></a>

# Check Dataset

[back to top](#top)

In [18]:
# Reload the descriptor table and display only the rows whose cation name
# contains "octyl" (case-insensitive match).
data = pd.read_csv('../salty/data/salts_with_descriptors.csv')
has_octyl = data["NAME_CAT"].str.contains("octyl", case=False)
data = data.loc[has_octyl]
data

Unnamed: 0,steiger-cation,Marsili Partial Charges-cation,BalabanJ-cation,BertzCT-cation,Ipc-cation,HallKierAlpha-cation,Kappa1-cation,Kappa2-cation,Kappa3-cation,Chi0-cation,...,VSA_EState8-anion,VSA_EState9-anion,VSA_EState10-anion,Topliss fragments-anion,salt_name,NAME_CAT,NAME_ANI,Temperature_K,Pressure_kPa,Density_kg/m
2,777.0,777.0,1.988225,240.224698,1955.580095,-0.79,11.285700,6.632366,4.447245,10.225768,...,39.000000,-6.000000,0.000000,777.0,1-methyl-3-octylimidazolium tetrafluoroborate,1-methyl-3-octylimidazolium,tetrafluoroborate,298.15,101.325,1102.0
21,777.0,777.0,1.988225,240.224698,1955.580095,-0.79,11.285700,6.632366,4.447245,10.225768,...,0.000000,0.000000,0.000000,777.0,1-methyl-3-octylimidazolium chloride,1-methyl-3-octylimidazolium,chloride,298.15,101.325,1009.6
43,777.0,777.0,1.988225,240.224698,1955.580095,-0.79,11.285700,6.632366,4.447245,10.225768,...,39.000000,-6.000000,0.000000,777.0,1-methyl-3-octylimidazolium tetrafluoroborate,1-methyl-3-octylimidazolium,tetrafluoroborate,298.10,101.325,1104.2
51,777.0,777.0,1.840326,136.283419,1955.580095,-0.08,11.991839,7.243174,4.965018,10.225768,...,59.194444,-10.657407,0.000000,777.0,1-octyl-3-methylimidazolium hexafluorophosphate,1-octyl-3-methylimidazolium,hexafluorophosphate,298.15,101.325,1235.7
53,777.0,777.0,1.840326,136.283419,1955.580095,-0.08,11.991839,7.243174,4.965018,10.225768,...,59.194444,-10.657407,0.000000,777.0,1-octyl-3-methylimidazolium hexafluorophosphate,1-octyl-3-methylimidazolium,hexafluorophosphate,298.15,101.325,1235.7
54,777.0,777.0,1.840326,136.283419,1955.580095,-0.08,11.991839,7.243174,4.965018,10.225768,...,59.194444,-10.657407,0.000000,777.0,1-octyl-3-methylimidazolium hexafluorophosphate,1-octyl-3-methylimidazolium,hexafluorophosphate,298.15,101.325,1235.7
57,777.0,777.0,1.840326,136.283419,1955.580095,-0.08,11.991839,7.243174,4.965018,10.225768,...,58.902778,-5.645833,-6.090278,777.0,1-octyl-3-methylimidazolium trifluoromethanesu...,1-octyl-3-methylimidazolium,trifluoromethanesulfonate,298.15,101.325,1120.0
58,777.0,777.0,1.840326,136.283419,1955.580095,-0.08,11.991839,7.243174,4.965018,10.225768,...,109.182500,-12.406111,-13.443056,777.0,1-octyl-3-methylimidazolium bis[(trifluorometh...,1-octyl-3-methylimidazolium,bis[(trifluoromethyl)sulfonyl]imide,298.15,101.325,1310.0
61,777.0,777.0,1.840326,136.283419,1955.580095,-0.08,11.991839,7.243174,4.965018,10.225768,...,59.194444,-10.657407,0.000000,777.0,1-octyl-3-methylimidazolium hexafluorophosphate,1-octyl-3-methylimidazolium,hexafluorophosphate,298.15,101.325,1239.0
62,777.0,777.0,1.840326,136.283419,1955.580095,-0.08,11.991839,7.243174,4.965018,10.225768,...,109.182500,-12.406111,-13.443056,777.0,1-octyl-3-methylimidazolium bis[(trifluorometh...,1-octyl-3-methylimidazolium,bis[(trifluoromethyl)sulfonyl]imide,298.15,101.000,1322.4
