### Aqueous Organic Estimator Supplementary Notebook

Authors: Grayson Boyer, Vincent Milesi

A notebook for estimating the hydration properties of molecules from scratch. It breaks molecules with known properties into second-order groups, solves for the contribution of each group by multilinear regression, and then uses those group contributions to estimate the properties of molecules whose values are missing.

Date modified: 7/13/2020

In [1]:
# Base name of the input table; ".csv" is appended when the file is read.
file_name = "organic_second_order_alc_test"
# Property columns to fit; one regression is run per entry.
props = ["DhyG", "DhyH", "DhyCp", "V"]
# Significant-figure setting handed to the estimator on every call.
sig_figs = 3

In [2]:
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 27 09:36:15 2018
Modified on Mar 21, 2019 by Grayson

@author: Vincent
@author: Grayson
"""

# =============================================================================
#                               IMPORT PACKAGES
# =============================================================================

import pandas as pd
import numpy as np
import sys
from math import floor, log10 # for significant figure rounding
import statsmodels.api as sm # for multilinear regression


# =============================================================================
#                                     MAIN
# =============================================================================
def main():
    """Run the group-contribution regression once per property in ``props``."""
    for target_property in props:
        group_property_estimator(file_name, target_property, sig_figs)
    
    
# =============================================================================
#                                  FUNCTIONS
# =============================================================================
def round_to_n(x, n):
    """Round ``x`` to ``n`` significant figures.

    Parameters
    ----------
    x : int or float
        Value to round.
    n : int
        Number of significant figures to keep.

    Returns
    -------
    int or float
        ``x`` rounded so that ``n`` significant digits remain. Zero is
        returned unchanged.
    """
    if x == 0:
        # log10(0) is undefined; zero has no leading digit to anchor on.
        return x
    # Position of the leading digit relative to the decimal point.
    magnitude = int(floor(log10(abs(x))))
    return round(x, (n - 1) - magnitude)

def _row_errors(count_rows, squared_errors):
    """Return one combined error per row of group counts.

    Each row's error is ``sqrt(sum(n_group * se_group**2))``.

    NOTE(review): each group's variance is weighted by its count ``n``
    rather than ``n**2``, and covariance between groups is ignored. The
    formula is kept unchanged so reported values stay the same -- confirm
    that this is the intended error propagation.
    """
    return [
        sum(n_group * sq_err for n_group, sq_err in zip(counts, squared_errors)) ** 0.5
        for counts in count_rows
    ]


def group_property_estimator(file_name, dependent_param, sf):
    """Fit group contributions for one property and print the results.

    Reads ``file_name + '.csv'`` using its first column as the index, fits
    ``dependent_param`` against the remaining group columns with
    ``sm.OLS`` (no constant column is added here), and prints the fitted
    group contributions, their standard errors, and a table of actual
    versus predicted values. Rows without a finite ``dependent_param`` are
    not used for fitting; they are predicted and appended to the table.

    Parameters
    ----------
    file_name : str
        Path of the input table without the ``.csv`` extension.
    dependent_param : str
        Name of the column to fit and predict.
    sf : int
        Currently unused; kept so existing callers continue to work.
        Rounding is fixed at 4 decimals for group values and 2 decimals
        for predictions and errors.

    Returns
    -------
    None
        All results are written to standard output.
    """
    df_data = pd.read_csv(file_name + '.csv', index_col=0)

    # Keep only rows whose last column holds a finite value.
    df_data = df_data[np.isfinite(df_data[df_data.columns[-1]])]

    # Drop columns that are zero in every remaining row.
    df_data = df_data.loc[:, (df_data != 0).any(axis=0)]

    # Split into rows that can be fitted and rows that need a prediction.
    has_target = np.isfinite(df_data[dependent_param])
    df_topred = df_data[~has_target]
    df_data = df_data[has_target]

    # Every column that is not metadata or a property is a regressor.
    excluded = {"compound", "formula", dependent_param, *props}
    group_cols = [col for col in df_data.columns if col not in excluded]

    X = df_data[group_cols]
    y = df_data[dependent_param]
    X_topred = df_topred[group_cols]

    multi_reg = sm.OLS(y, X).fit()  # perform the multiple regression
    prediction = multi_reg.predict(X)  # fitted values for the training rows
    preds = multi_reg.predict(X_topred)  # estimates for rows lacking a value

    group_property_dict = dict(
        zip(group_cols, [round(val, 4) for val in multi_reg.params.values]))
    group_property_se_dict = dict(
        zip(group_cols, [round(val, 4) for val in multi_reg.bse.values]))

    # Iterate rows by position so duplicate index labels cannot turn a
    # single-row lookup into a multi-row frame.
    squared_errors = [group_property_se_dict[group] ** 2 for group in group_cols]
    pred_errs = _row_errors(X.to_numpy(), squared_errors)
    topred_errs = _row_errors(X_topred.to_numpy(), squared_errors)

    comp_pred_df = pd.DataFrame({"actual": df_data[dependent_param],
                                 "prediction": [round(pred, 2) for pred in prediction.values],
                                 "pred errs": [round(err, 2) for err in pred_errs]})

    df_preds = pd.DataFrame({"actual": df_topred[dependent_param],
                             "prediction": [round(pred, 2) for pred in preds.values],
                             "pred errs": [round(err, 2) for err in topred_errs]})

    # DataFrame.append was removed in pandas 2.0; concatenate instead and
    # skip empty frames so they cannot affect the resulting column dtypes.
    frames = [frame for frame in (comp_pred_df, df_preds) if not frame.empty]
    df_final = pd.concat(frames) if frames else comp_pred_df

    print("\nPredicted group contributions for " + dependent_param + ":")
    print(group_property_dict)
    print("\nPredicted standard error of group contributions for " + dependent_param + ":")
    print(group_property_se_dict)
    print("\nPredictions for " + dependent_param + ":")
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df_final)

    
# =============================================================================
#                                CALL MAIN
# =============================================================================
# Run every configured regression when this code is executed directly.
if __name__ == "__main__":
    main()


Predicted group contributions for DhyG:
{'C-[CX4H0](-C)(-C)-C': 0.0, 'C-[CX4H0](-C)(-C)[OH]': 0.5772, 'C-[CX4H1](-C)-C': 3.0649, 'C-[CX4H1](-C)[OH]': -1.1221, 'C-[CX4H2]-C': 0.7191, 'C-[CX4H3]': -0.9938, 'C-[OX2H]': -8.6505, 'O-[CX4H2]-C': -3.546, 'O-[CX4H3]': -4.5595}

Predicted standard error of group contributions for DhyG:
{'C-[CX4H0](-C)(-C)-C': 0.0, 'C-[CX4H0](-C)(-C)[OH]': 0.0923, 'C-[CX4H1](-C)-C': 0.1628, 'C-[CX4H1](-C)[OH]': 0.1102, 'C-[CX4H2]-C': 0.0552, 'C-[CX4H3]': 0.0534, 'C-[OX2H]': 0.1068, 'O-[CX4H2]-C': 0.1233, 'O-[CX4H3]': 0.1657}

Predictions for DhyG:
                       actual  prediction  pred errs
compound                                            
methanol               -13.21      -13.21       0.20
ethanol                -13.00      -13.19       0.17
1-propanol             -12.38      -12.47       0.18
2-propanol             -11.93      -11.76       0.17
1-butanol              -11.88      -11.75       0.19
2-butanol              -11.38      -11.04       0.