### Aqueous Organic Estimator Supplementary Notebook


In [1]:
import pandas as pd
import numpy as np
import sys
from math import floor, log10 # for significant figure rounding
import statsmodels.api as sm # for multilinear regression
from string import ascii_lowercase
import itertools
import copy
from statsmodels.formula.api import ols
import keyword

### Configuration: enable exactly one of the three groups in the next cell (keep the other two commented out)

In [2]:
# Run configuration. Exactly one group below should be uncommented.
#   save_as                 -> prefix of the CSV files written by group_property_estimator
#                              (None disables writing).
#   fixed_material_point    -> True: main() looks the material point up in material_point_dict
#                              and the estimator subtracts it from y before fitting.
#   estimate_material_point -> only consulted when fixed_material_point is False; adds a
#                              constant "material point" column to the regression.
# All three names must be defined, because main() passes estimate_material_point
# unconditionally.

### ESTIMATE A NEW MATERIAL POINT
# save_as = 'all_new_new' 
# fixed_material_point = False 
# estimate_material_point = True 

### USE OLD MATERIAL POINT
# save_as = 'all_new_old' 
# fixed_material_point = True 
# estimate_material_point = False   # NOTE(review): missing in the original group; without it
#                                   # main() raises NameError on a fresh kernel.

### DO NOT USE OR ESTIMATE A MATERIAL POINT
# NOTE(review): the heading says no material point is used, but fixed_material_point = True
# makes main() use the fixed values from material_point_dict. Fitting with no material point
# at all would need fixed_material_point = False and estimate_material_point = False.
# Confirm which of the two is intended before relying on the 'fixed_*' outputs.
save_as = 'fixed'
fixed_material_point = True 
estimate_material_point = False 

In [3]:
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 27 09:36:15 2018
Modified on Mar 21, 2019 by Grayson

@author: Vincent
@author: Grayson
"""
# Input table read by group_property_estimator (new groups).
file_name = "primary_compiled_11-30-22_out.csv"

# Properties to regress; main() runs one fit per entry.
props = ["dGh", "dHh", "Vh", "Cph"]

# Material point value for each property. Only used when fixed_material_point = True.
# Values taken from Plyasunov and Shock 2000.
material_point_dict = {
    "dGh": 7.95,
    "dHh": -2.29,
    "Cph": 0,
    "Vh": 1.12,
}

# Passed through to group_property_estimator as its `sf` argument.
sig_figs = 3
# =============================================================================
#                                     MAIN
# =============================================================================
def main():
    """Run the group-contribution regression once for every property in `props`.

    Reads the module-level settings `props`, `fixed_material_point`,
    `material_point_dict`, `file_name`, `sig_figs` and `estimate_material_point`.
    When `fixed_material_point` is false the estimator is handed a material
    point of 0.
    """
    for prop in props:
        material_point = material_point_dict[prop] if fixed_material_point else 0
        group_property_estimator(
            file_name,
            prop,
            sig_figs,
            material_point,
            estimate_material_point,
        )
    
    
# =============================================================================
#                                  FUNCTIONS
# =============================================================================
def round_to_n(x, n):
    """Round `x` to `n` significant figures.

    Parameters
    ----------
    x : int or float
        Value to round.
    n : int
        Number of significant figures to keep.

    Returns
    -------
    int or float
        `x` rounded with the built-in `round`; zero is returned unchanged.
    """
    # log10(0) is undefined and would raise ValueError; zero has no leading
    # digit to anchor on, so it is already "rounded".
    if x == 0:
        return x
    # Position of the leading digit relative to the decimal point.
    magnitude = int(floor(log10(abs(x))))
    return round(x, (n - 1) - magnitude)

def iter_all_strings():
    """Endlessly yield lowercase labels: 'a'..'z', then 'aa', 'ab', ... and so on.

    Shorter strings come first; strings of equal length follow the order
    produced by `itertools.product` over `ascii_lowercase`.
    """
    length = 1
    while True:
        yield from ("".join(letters)
                    for letters in itertools.product(ascii_lowercase, repeat=length))
        length += 1

def group_property_estimator(file_name, dependent_param, sf, material_point, estimate_material_point):
    """Regress one property on group counts, then print and save the results.

    Rows of the CSV with a finite value for `dependent_param` are used to fit a
    no-intercept least-squares model on the group-count columns. Rows without a
    finite value are predicted, unless they contain a group that never occurs
    in the fitted rows.

    Besides its arguments the function reads three module-level names:
    `props` (property columns to exclude from the predictors),
    `fixed_material_point` (selects the fitting branch) and `save_as`
    (output file prefix, or None to skip writing).

    Parameters
    ----------
    file_name : str
        Path of the CSV to read. Columns other than "compound", "formula" and
        the names in `props` are treated as group counts.
    dependent_param : str
        Name of the property column to regress.
    sf : int
        Accepted for interface compatibility; not used in this function.
    material_point : float
        Constant subtracted from the property before fitting and added back to
        predictions. Ignored (reset to 0) when `fixed_material_point` is false.
    estimate_material_point : bool
        Only consulted when `fixed_material_point` is false: adds a constant
        "material point" column so that an intercept is fitted.

    Returns
    -------
    None
        Results are printed; when `save_as` is not None three CSV files
        (predictions, group values, group standard errors) are written.
    """
    df_all = pd.read_csv(file_name)

    # remove columns containing 0 groups
    df_all = df_all.loc[:, (df_all != 0).any(axis=0)]

    # every column that is not an identifier or a property is a predictor (group count)
    non_group_cols = ["compound", "formula", dependent_param] + props
    group_cols = [col for col in df_all.columns.values if col not in non_group_cols]

    # split into rows with a y value (fitted) and rows that need a prediction;
    # .copy() so that later column assignments do not write into a slice of df_all
    has_value = np.isfinite(df_all[dependent_param])
    df_data = df_all[has_value].copy()
    df_topred = df_all[~has_value]

    # delete rows representing compounds that need a prediction when they contain a group
    # that is not represented in the training set. Only group columns are inspected: the
    # property columns are NaN in the rows to predict and must never trigger a deletion.
    unseen_groups = [col for col in group_cols if (df_data[col] == 0).all()]
    if unseen_groups:
        uses_unseen_group = (df_topred[unseen_groups] != 0).any(axis=1)
        df_topred = df_topred[~uses_unseen_group]

    ## define the independent parameter
    X = df_data[group_cols].copy()

    ## define the dependent parameter
    y = df_data[dependent_param]

    # get X of molecules to predict
    X_topred = df_topred[group_cols].copy()

    if not fixed_material_point:

        ## add an intercept to the multi reg
        if estimate_material_point:
            X["material point"] = 1
            X_topred["material point"] = 1

        multi_reg = sm.OLS(y, X).fit()  # perform the multiple regression
        prediction = multi_reg.predict(X)  # make the predictions from the multi_reg
        preds = multi_reg.predict(X_topred)

        # nothing is added back to the predictions, regardless of whether the
        # material point is being estimated (it is then part of the fitted model)
        material_point = 0

    else:

        # ols formulas can't handle SMARTS strings as variables,
        # so store them in a dictionary of simpler strings (generated iteratively)
        # in the form {label:SMARTS}
        label_dict = {}
        for label, group in zip(iter_all_strings(), X.columns.values):
            if label in keyword.kwlist:  # ols cannot handle formulas containing python keywords like "as" or "in"
                label = label + "1"
            label_dict[label] = group

        # create an inverse label dictionary {SMARTS:label}
        inv_label_dict = {group: label for label, group in label_dict.items()}

        # work on relabelled copies so the original column names never need restoring
        X_labeled = X.rename(columns=inv_label_dict)
        X_topred_labeled = X_topred.rename(columns=inv_label_dict)
        df_fit = df_data.rename(columns=inv_label_dict)

        # subtract material point from y values (dependent params)
        df_fit["y_minus_Yo"] = df_fit[dependent_param] - material_point

        # trailing "-1" removes the intercept: the material point plays that role
        formula_str = "y_minus_Yo ~ " + " + ".join(list(X_labeled.columns.values)) + " -1"

        multi_reg = ols(formula=formula_str, data=df_fit).fit()
        prediction = multi_reg.predict(X_labeled)  # make the predictions from the multi_reg
        preds = multi_reg.predict(X_topred_labeled)

    # coefficients and standard errors keyed by the original column names
    # (relies on the fitted parameters being in the same order as X's columns)
    group_names = X.columns.values
    group_property_dict = dict(zip(group_names, [round(val, 4) for val in multi_reg.params.values]))
    group_property_se_dict = dict(zip(group_names, [round(val, 4) for val in multi_reg.bse.values]))

    if fixed_material_point:
        group_property_dict["material_point"] = material_point
        group_property_se_dict["material_point"] = 0  # material point uncertainty fixed at 0 (see Plyasunov and Shock 2000)

    def _propagated_errs(counts):
        """Per-row error: sqrt(sum(n_group * se_group**2)) using the rounded standard errors."""
        # NOTE(review): propagation for independent coefficient errors is usually written
        # with n_group**2 * se_group**2; the original weighting (n_group) is kept unchanged.
        return [
            sum(n_group * group_property_se_dict[group] ** 2
                for n_group, group in zip(row, counts.columns.values)) ** 0.5
            for _, row in counts.iterrows()
        ]

    pred_errs = _propagated_errs(X)
    topred_errs = _propagated_errs(X_topred)

    comp_pred_df = pd.DataFrame({"compound": list(df_data["compound"]),
                                 "actual": df_data[dependent_param],
                                 "prediction": [round(pred + material_point, 2) for pred in prediction.values],
                                 "pred errs": [round(err, 2) for err in pred_errs]})

    df_preds = pd.DataFrame({"compound": list(df_topred["compound"]),
                             "actual": df_topred[dependent_param],
                             "prediction": [round(pred + material_point, 2) for pred in preds.values],
                             "pred errs": [round(err, 2) for err in topred_errs]})

    # DataFrame.append was removed in pandas 2.0; concat keeps the same row order and index
    df_final = pd.concat([comp_pred_df, df_preds])

    df_group_property = pd.DataFrame(list(group_property_dict.items()), columns=['group', 'value'])
    df_group_se = pd.DataFrame(list(group_property_se_dict.items()), columns=['group', 'std err'])

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print("\nPredicted group contributions for " + dependent_param + ":")
        print(df_group_property)
        print("\nPredicted standard error of group contributions for " + dependent_param + ":")
        print(df_group_se)
        print("\nPredictions for " + dependent_param + ":")
        print(df_final.to_string(index=False))

    if save_as is not None:
        df_final.to_csv(save_as+"_"+dependent_param+".csv", index=False)
        df_group_property.to_csv(save_as+"_"+dependent_param+"_group_property.csv", index=False) #reports 0s when not able to estimate
        df_group_se.to_csv(save_as+"_"+dependent_param+"_group_se.csv", index=False)
        


    
# =============================================================================
#                                CALL MAIN
# =============================================================================
# Run every regression when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Predicted group contributions for dGh:
                          group    value
0                C-[CH2R1!r3]-C   0.7962
1             C-[CHR1!r3](-C)-C  -1.3466
2      C-[CX4H0R1!r3](-C)(-C)-C  -1.6542
3              C=[CX3H1R1!r3]-C  -0.7999
4          C=[CX3H0R1!r3](-C)-C  -4.7447
5                   C-[CH2R2]-C  -1.4519
6                C-[CHR2](-C)-C  -0.8086
7         C-[CX4H0R2](-C)(-C)-C  -0.7851
8                 C=[CX3H1R2]-C  -2.6639
9             C=[CX3H0R2](-C)-C  -2.1083
10               C-[CHR3](-C)-C  -0.2867
11        C-[CX4H0R3](-C)(-C)-C   1.3100
12             [CX3H2R0]=[C!H2]   2.3545
13                C=[CX3H1R0]-C  -0.8882
14            C=[CX3H0R0](-C)-C  -5.0424
15                [CHR0]#[CHR0]  -0.1492
16                  C#[CX2H1R0]   0.1149
17                C-[CX2H0R0]#C  -4.2584
18                  C#[NX1H0R0]  -9.8886
19                C-[CX2H0R0]#N  -9.8886
20              [CH3R0]-[CH3R0]   7.6000
21             [C!H3]-[CX4H3R0]   3.3230
22               

In [None]:
# df = pd.read_csv('ORCHYD_properties_and_groups.csv')
# df