### Aqueous Organic Estimator Supplementary Notebook

Authors: Grayson Boyer, Vincent Milesi

A notebook for estimating hydration properties of molecules from scratch. Breaks molecules with known properties into second-order groups, solves for the contribution of each group with multilinear regression, and then estimates the properties of molecules from those groups.

Date modified: 10/13/2020

In [1]:
# --- User configuration -------------------------------------------------------
# CSV read by group_property_estimator via pd.read_csv(..., index_col=0).
file_name = 'data/organic_second_order_alc_test.csv'
# Property columns to regress, one regression per entry (see main()). All of
# these columns are also excluded from the regressors of every regression.
props = ["DhyG", "DhyH", "DhyCp", "V"]
# Passed to group_property_estimator as `sf`.
# NOTE(review): the estimator does not appear to use this value yet - confirm.
sig_figs = 3

fixed_material_point = True # use a fixed material point when estimating group contributions?
estimate_material_point = False # estimate a material point from available data? Valid only if fixed_material_point = False.
# One fixed material-point value per entry of `props`; looked up by property name in main().
material_point_dict = {"DhyG":7.95, "DhyH":-2.29, "DhyCp":0, "V":1.12} # material point value. Valid only if fixed_material_point = True. Values taken from Plyasunov and Shock 2000.

In [2]:
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 27 09:36:15 2018
Modified on Mar 21, 2019 by Grayson

@author: Vincent
@author: Grayson
"""

# =============================================================================
#                               IMPORT PACKAGES
# =============================================================================

import pandas as pd
import numpy as np
import sys
from math import floor, log10 # for significant figure rounding
import statsmodels.api as sm # for multilinear regression
from string import ascii_lowercase
import itertools

# =============================================================================
#                                     MAIN
# =============================================================================
def main():
    """Run the group-contribution regression once for each property in ``props``.

    When ``fixed_material_point`` is set, the property's material point is
    taken from ``material_point_dict``; otherwise 0 is passed to the estimator.
    """
    for prop in props:
        material_point = material_point_dict[prop] if fixed_material_point else 0
        group_property_estimator(file_name, prop, sig_figs, material_point)
    
    
# =============================================================================
#                                  FUNCTIONS
# =============================================================================
def round_to_n(x, n):
    """Round ``x`` to ``n`` significant figures.

    Parameters
    ----------
    x : int or float
        Value to round.
    n : int
        Number of significant figures to keep.

    Returns
    -------
    The rounded value, as produced by the built-in ``round``. Zero is returned
    unchanged, because its order of magnitude (``log10(0)``) is undefined.
    """
    if x == 0:
        return x
    # Position of the leading digit relative to the decimal point.
    magnitude = int(floor(log10(abs(x))))
    return round(x, -magnitude + (n - 1))

def iter_all_strings():
    """Yield lowercase labels endlessly: 'a'..'z', then 'aa', 'ab', ... and so on.

    Labels are produced in order of increasing length, and alphabetically
    within each length.
    """
    length = 1
    while True:
        for letters in itertools.product(ascii_lowercase, repeat=length):
            yield "".join(letters)
        length += 1

def _prediction_errors(X, se_by_group):
    """Return one propagated standard error per row of ``X``.

    For each row the error is ``sqrt(sum(n_group * se_group**2))`` over the
    columns of ``X``, where ``se_group`` is looked up in ``se_by_group``.
    """
    # NOTE(review): textbook propagation for n*g would use (n*se)**2; the
    # n*se**2 form below is the notebook's existing formula, kept unchanged.
    squared_se = [se_by_group[group] ** 2 for group in X.columns.values]
    # Walk the raw rows instead of X.loc[label]: a repeated index label would
    # make .loc return a DataFrame and silently corrupt the sum.
    return [sum([n_group * se2 for n_group, se2 in zip(row, squared_se)]) ** 0.5
            for row in X.values]


def group_property_estimator(file_name, dependent_param, sf, material_point):
    """Regress one property on group counts and print contributions and predictions.

    Reads ``file_name`` (first column used as the index), fits a multilinear
    regression without a separate intercept term of ``dependent_param`` on the
    group-count columns, and prints:

    * the fitted contribution of each group,
    * the standard error of each contribution,
    * actual vs. predicted values (with propagated errors) for the rows used in
      the fit, followed by predictions for rows lacking ``dependent_param``.

    Behaviour also depends on the module-level settings ``props``,
    ``fixed_material_point`` and ``estimate_material_point``.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.
    dependent_param : str
        Name of the property column to regress.
    sf : int
        Number of significant figures. Accepted for interface compatibility;
        not used by this function at present (output is rounded to fixed
        decimal places).
    material_point : float
        Value subtracted from the property before fitting, and added back to
        the predictions, when ``fixed_material_point`` is true. It is reset to
        0 otherwise.

    Returns
    -------
    None
    """
    df_data = pd.read_csv(file_name, index_col=0)

    # keep only rows whose last column holds a finite value
    # (intended to drop rows that contain nothing but NaN)
    last_column = df_data.columns.values[-1]
    df_data = df_data[np.isfinite(df_data[last_column])]

    # remove columns containing 0 groups
    df_data = df_data.loc[:, (df_data != 0).any(axis=0)]

    # split into rows that need a prediction (no y value) and rows to fit on
    has_y = np.isfinite(df_data[dependent_param])
    df_topred = df_data[~has_y]
    df_data = df_data[has_y]

    # every column that is not metadata or a property is a group-count regressor
    non_group_cols = set(["compound", "formula", dependent_param] + props)
    group_cols = [c for c in df_data.columns.values if c not in non_group_cols]

    # copies, so that adding a column below never writes into a slice of df_data
    X = df_data[group_cols].copy()
    X_topred = df_topred[group_cols].copy()

    ## define the dependent parameter
    y = df_data[dependent_param]

    if not fixed_material_point:

        ## add an intercept to the multi reg
        if estimate_material_point:
            X["material point"] = 1
            X_topred["material point"] = 1

        multi_reg = sm.OLS(y, X).fit() # perform the multiple regression
        prediction = multi_reg.predict(X) # make the predictions from the multi_reg
        preds = multi_reg.predict(X_topred)

        # set to 0 regardless of whether material point is being estimated
        material_point = 0

    else:
        import keyword
        from statsmodels.formula.api import ols

        # ols formulas can't handle SMARTS strings as variables, so map each
        # one to a simple generated label. Labels that are Python keywords
        # ('as', 'if', 'in', ...) are skipped because the formula parser
        # cannot evaluate them as variable names.
        X_vars = list(X.columns.values)
        safe_labels = (s for s in iter_all_strings() if not keyword.iskeyword(s))
        label_dict = dict(zip(safe_labels, X_vars))  # {label: SMARTS}

        # inverse label dictionary {SMARTS: label}
        inv_label_dict = {v: k for k, v in label_dict.items()}

        # relabelled copies used only for fitting/predicting; X and X_topred
        # keep their SMARTS column names for the reporting below
        X_fit = X.rename(columns=inv_label_dict)
        X_topred_fit = X_topred.rename(columns=inv_label_dict)

        # subtract material point from y values (dependent params)
        fit_df = X_fit.assign(y_minus_Yo=y - material_point)

        # trailing "-1" removes the intercept: the material point is fixed
        formula_str = "y_minus_Yo ~ " + " + ".join(list(X_fit.columns.values)) + " -1"

        multi_reg = ols(formula=formula_str, data=fit_df).fit()
        prediction = multi_reg.predict(X_fit) # make the predictions from the multi_reg
        preds = multi_reg.predict(X_topred_fit)

    # print(multi_reg.summary())  # uncomment for the full regression statistics

    group_names = X.columns.values
    group_property_dict = dict(zip(group_names, [round(val, 4) for val in multi_reg.params.values]))
    group_property_se_dict = dict(zip(group_names, [round(val, 4) for val in multi_reg.bse.values]))

    if fixed_material_point:
        group_property_dict["material_point"] = material_point
        group_property_se_dict["material_point"] = 0 # material point uncertainty fixed at 0 (see Plyasunov and Shock 2000)

    pred_errs = _prediction_errors(X, group_property_se_dict)
    topred_errs = _prediction_errors(X_topred, group_property_se_dict)

    comp_pred_df = pd.DataFrame({"actual":df_data[dependent_param],
                                 "prediction":[round(pred+material_point, 2) for pred in prediction.values],
                                 "pred errs":[round(err, 2) for err in pred_errs]})

    df_preds = pd.DataFrame({"actual":df_topred[dependent_param],
                             "prediction":[round(pred+material_point, 2) for pred in preds.values],
                             "pred errs":[round(err, 2) for err in topred_errs]})

    # DataFrame.append was removed in pandas 2.0; concat is the supported form
    df_final = pd.concat([comp_pred_df, df_preds])

    print("\nPredicted group contributions for " + dependent_param + ":")
    print(group_property_dict)
    print("\nPredicted standard error of group contributions for " + dependent_param + ":")
    print(group_property_se_dict)
    print("\nPredictions for " + dependent_param + ":")
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df_final)
    


    
# =============================================================================
#                                CALL MAIN
# =============================================================================
# Run the estimator only when this file is executed directly, not when imported.
if __name__=='__main__':
    main()


Predicted group contributions for DhyG:
{'C-[CX4H0](-C)(-C)-C': 0.0, 'C-[CX4H0](-C)(-C)[OH]': -9.7397, 'C-[CX4H1](-C)-C': -1.8282, 'C-[CX4H1](-C)[OH]': -6.4697, 'C-[CX4H2]-C': 0.6802, 'C-[CX4H3]': 3.8044, 'C-[OX2H]': -20.7489, 'O-[CX4H2]-C': -4.1285, 'O-[CX4H3]': -0.4111, 'material_point': 7.95}

Predicted standard error of group contributions for DhyG:
{'C-[CX4H0](-C)(-C)-C': 0.0, 'C-[CX4H0](-C)(-C)[OH]': 0.3819, 'C-[CX4H1](-C)-C': 0.2742, 'C-[CX4H1](-C)[OH]': 0.1678, 'C-[CX4H2]-C': 0.0423, 'C-[CX4H3]': 0.1988, 'C-[OX2H]': 0.2189, 'O-[CX4H2]-C': 0.1446, 'O-[CX4H3]': 0.3226, 'material_point': 0}

Predictions for DhyG:
                         actual  prediction  pred errs
compound                                              
methanol                 -13.21      -13.21       0.39
ethanol                  -13.00      -13.12       0.33
1-propanol               -12.38      -12.44       0.33
2-propanol               -11.93      -11.66       0.39
1-butanol                -11.88      -11.76


Predicted group contributions for DhyCp:
{'C-[CX4H0](-C)(-C)-C': -43.1599, 'C-[CX4H0](-C)(-C)[OH]': -24.8977, 'C-[CX4H1](-C)-C': 27.2356, 'C-[CX4H1](-C)[OH]': 4.968, 'C-[CX4H2]-C': 63.0299, 'C-[CX4H3]': 104.4104, 'C-[OX2H]': 64.6663, 'O-[CX4H2]-C': 35.2623, 'O-[CX4H3]': 49.3337, 'material_point': 0}

Predicted standard error of group contributions for DhyCp:
{'C-[CX4H0](-C)(-C)-C': 15.0115, 'C-[CX4H0](-C)(-C)[OH]': 11.5298, 'C-[CX4H1](-C)-C': 9.4197, 'C-[CX4H1](-C)[OH]': 4.9824, 'C-[CX4H2]-C': 1.2145, 'C-[CX4H3]': 5.0623, 'C-[OX2H]': 5.7679, 'O-[CX4H2]-C': 4.4956, 'O-[CX4H3]': 7.9319, 'material_point': 0}

Predictions for DhyCp:
                         actual  prediction  pred errs
compound                                              
methanol                  114.0      114.00       9.81
ethanol                   199.0      204.34       8.89
1-propanol                268.0      267.37       8.98
2-propanol                272.0      278.46      10.46
1-butanol                 335.0 