In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

The goal of this project is to process real-life experimental data and provide solvent suggestions based on selected descriptors.

In [45]:
class SolvPredPlus:
    """
    Attach mixture-level solvent descriptors to experimental records.

    A candidate table (one row per solvent, numeric descriptor columns) is loaded
    at construction. load_exp_data() then reads an experimental table, and for each
    row sums the descriptors of every listed solvent weighted by its percentage.
    """

    def __init__(self, all_cand_param_csv):
        """
        all_cand_param_csv: string, file name of the candidate solvent parameter table.
        """
        self.load_cand_param(all_cand_param_csv)

    def load_cand_param(self, all_cand_param_csv):
        """
        load database for all solvent parameters for candidates
        all_cand_param_csv: string, file name.
        The column row in the example includes: solvent, D, P, H, epsilon, pi_star, HBD, HBA, logP_solv
        The result is stored in self.all_cand_param_dict as a list of dictionaries (one per solvent).
        """
        all_solv_cand_df = pd.read_csv(all_cand_param_csv)
        #convert to a list of per-solvent dictionaries
        self.all_cand_param_dict = all_solv_cand_df.to_dict(orient='records')

    def load_exp_data(self, ip_exp_csv):
        """
        ip_exp_csv is the file name of input experimental data.
        The columns of the data file should follow the naming rule as rxn_id, solv_1, percent_1, solv_2, percent_2, Y
        There could be more co-solvents column; the name should follow solv_#, percent_# structure.
        Y is the observable column, in the example is the yield.
        Important rule: the name for solvent must match the name used in the all_cand_param data file.
        Mixtures parameters will be calculated and attached in each entry.

        Returns the list of combined entries (also stored in self.all_comb_exp).
        Raises ValueError if a row names no solvent at all, or names a solvent
        that is absent from the candidate database.
        """
        full_exp_df = pd.read_csv(ip_exp_csv)
        self.all_co_solv_col_name = self._get_co_solv_col_name(full_exp_df)
        full_exp_dict = full_exp_df.to_dict(orient="records")

        #pair every solvent column with its percentage column once, outside the row loop
        solv_percent_cols = list(zip(
            self.all_co_solv_col_name["solv_col"],
            self.all_co_solv_col_name["percent_col"],
        ))

        #operate each row/entry one by one
        #fetch parameters of corresponding solvent from database
        #then scale by percentage
        all_comb_exp = []

        for row_idx, exp in enumerate(full_exp_dict):

            #reset for every row so nothing can leak in from the previous entry,
            #and so the first *present* solvent starts the sum even when solv_1 is blank
            mix_solv_params = None

            for solv_col, percent_col in solv_percent_cols:
                this_solv_name = exp[solv_col]

                #empty cells are read as NaN (a float); only real names are strings
                if not isinstance(this_solv_name, str):
                    continue

                this_solv_percent = exp[percent_col]

                #fetch all parameters for this solvent
                this_solv_params = self._fetch_solv_param(this_solv_name)["full_params"]
                #apply this_solv_percent to all parameters
                this_solv_scaled_params = self._scale_param_by_percent(this_solv_params, this_solv_percent)

                if mix_solv_params is None:
                    mix_solv_params = this_solv_scaled_params
                else:
                    mix_solv_params = self._add_dict(this_solv_scaled_params, mix_solv_params)

            if mix_solv_params is None:
                raise ValueError(
                    "No solvent name was found in row {} of the experimental data. "
                    "Please check the input experimental data file.".format(row_idx)
                )

            #include mix_solv_params to the experimental dataset
            comb_exp_dict = dict(exp)
            comb_exp_dict.update(mix_solv_params)

            all_comb_exp.append(comb_exp_dict)

        self.all_comb_exp = all_comb_exp

        return self.all_comb_exp

    #parse column name and extract solv, percent part from the full columns

    def _get_co_solv_col_name(self, full_exp_df):
        """
        look at how many co-solvents are used in experiment.
        return column names for the solv/percent block

        A column counts as a solvent column when its name contains "solv"
        (case-insensitive); the column immediately to its right is taken as the
        matching percentage column.
        Also records the number of solvent columns in self.total_num_of_solv.
        Raises ValueError when no solvent column exists or when a solvent column
        has no column after it.
        """
        all_col = list(full_exp_df.columns)
        #confirm how many co-solvents to parse
        all_co_solv = {
            "solv_col":[],
            "percent_col":[]
        }

        for i, col in enumerate(all_col):
            if "solv" in col.lower():
                #a solvent column in last position has no percentage partner
                if i + 1 >= len(all_col):
                    raise ValueError("Potential offset was detected in co-solvent and percentage info. Please check the input experimental data file.\n")
                all_co_solv["solv_col"].append(col)
                all_co_solv["percent_col"].append(all_col[i + 1])

        #both lists grow together, so one emptiness check covers both
        if len(all_co_solv["solv_col"]) == 0:
            raise ValueError("Co-solvent or percent column was missing. Please check the input experimental data file.\n")

        #get total number of solvents in the mixture
        self.total_num_of_solv = len(all_co_solv["solv_col"])

        return all_co_solv

    def _scale_param_by_percent(self, full_param_dict, this_solv_percent):
        """
        scale each parameter by the percentage of current solvent
        string-valued items (e.g. the solvent name) are dropped from the result
        """
        return {
            item: this_solv_percent * this_value
            for item, this_value in full_param_dict.items()
            if not isinstance(this_value, str)
        }

    def _add_dict(self, dict_1, dict_2):
        """
        dict_1 and dict_2 have same structure
        returns a new dictionary holding the key-wise sum, using the keys of dict_1
        """
        return {item: dict_1[item] + dict_2[item] for item in dict_1}

    def _fetch_solv_param(self, solv_name):
        """
        fetch solvent parameters by solv_name (case-insensitive match on the "solvent" field)

        Returns {"this_solvent": solv_name, "full_params": <database entry>}.
        If several entries match, the last one is used.
        Raises ValueError if no entry matches.
        """
        this_solv_params = {}
        target_name = solv_name.lower()

        for entry in self.all_cand_param_dict:
            entry_name = entry["solvent"]
            #skip database rows whose name cell is empty (NaN is not a string)
            if isinstance(entry_name, str) and target_name == entry_name.lower():
                this_solv_params["this_solvent"] = solv_name
                this_solv_params["full_params"] = entry

        #.get() so that "no match" reaches the ValueError instead of a KeyError
        if not this_solv_params.get("this_solvent"):
            raise ValueError("Target solvent absent from candidate database. Please check the solvent name or the database entry.")

        return this_solv_params

In [46]:
# File names of the two input CSVs; both are bare relative names.
# Experimental table (read with pd.read_csv in later cells).
ip_exp_csv_name = "input_exp_data.csv"
# Candidate solvent parameter table (passed to SolvPredPlus).
all_cand_param_name = "all_candidates_param_at_rt.csv"

In [47]:
# Build the predictor from the candidate parameter table, then read the
# experimental table; the returned list of per-row dicts is the cell output.
spp = SolvPredPlus(all_cand_param_name)
spp.load_exp_data(ip_exp_csv_name)


[{'rxn_id': 'Xf2346',
  'solv_1': 'DMAC',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.82,
  'D': 16.8,
  'P': 11.5,
  'H': 10.2,
  'epsilon': 37.78,
  'pi_star': 0.88,
  'HBD': 0.0,
  'HBA': 0.76,
  'logP_solv': -0.77},
 {'rxn_id': 'Xf2353',
  'solv_1': 'Acetone',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.72,
  'D': 15.5,
  'P': 10.4,
  'H': 7.0,
  'epsilon': 20.7,
  'pi_star': 0.6,
  'HBD': 0.1,
  'HBA': 0.5,
  'logP_solv': -0.21},
 {'rxn_id': 'Xf2345',
  'solv_1': 'DMSO',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.7,
  'D': 18.4,
  'P': 16.4,
  'H': 10.2,
  'epsilon': 46.45,
  'pi_star': 1.0,
  'HBD': 0.0,
  'HBA': 0.76,
  'logP_solv': -1.38},
 {'rxn_id': 'Xf2344',
  'solv_1': 'DMF',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.62,
  'D': 17.4,
  'P': 13.7,
  'H': 11.3,
  'epsilon': 36.7,
  'pi_star': 0.88,
  'HBD': 0.0,
  'HBA': 0.69,
  'logP_solv': -1.31},
 {'rxn_id': 'Xf2352',
  'solv_1': 'AC

In [28]:
# Inspect the candidate parameter table as a list of dicts, one per row.
all_solv_cand_df = pd.read_csv(all_cand_param_name)
all_solv_cand_df.to_dict(orient = "records")

[{'solvent': 'DMAC',
  'D': 16.8,
  'P': 11.5,
  'H': 10.2,
  'epsilon': 37.78,
  'pi_star': 0.88,
  'HBD': 0.0,
  'HBA': 0.76,
  'logP_solv': -0.77},
 {'solvent': 'Acetone',
  'D': 15.5,
  'P': 10.4,
  'H': 7.0,
  'epsilon': 20.7,
  'pi_star': 0.6,
  'HBD': 0.1,
  'HBA': 0.5,
  'logP_solv': -0.21},
 {'solvent': 'DMSO',
  'D': 18.4,
  'P': 16.4,
  'H': 10.2,
  'epsilon': 46.45,
  'pi_star': 1.0,
  'HBD': 0.0,
  'HBA': 0.76,
  'logP_solv': -1.38},
 {'solvent': 'DMF',
  'D': 17.4,
  'P': 13.7,
  'H': 11.3,
  'epsilon': 36.7,
  'pi_star': 0.88,
  'HBD': 0.0,
  'HBA': 0.69,
  'logP_solv': -1.31},
 {'solvent': 'ACN',
  'D': 15.3,
  'P': 18.0,
  'H': 6.1,
  'epsilon': 37.5,
  'pi_star': 0.75,
  'HBD': 0.19,
  'HBA': 0.4,
  'logP_solv': -0.39},
 {'solvent': 'NPA',
  'D': 16.0,
  'P': 6.8,
  'H': 17.4,
  'epsilon': 20.45,
  'pi_star': 0.52,
  'HBD': 0.84,
  'HBA': 0.9,
  'logP_solv': 0.36},
 {'solvent': 'tBuOH',
  'D': 15.2,
  'P': 5.1,
  'H': 14.7,
  'epsilon': 12.0,
  'pi_star': 0.41,
  'HBD

In [3]:
# Load the raw experimental table and preview its first rows.
# NOTE(review): ip_exp_csv_name is assigned in an earlier cell; run that cell first on a fresh kernel.
full_exp_df = pd.read_csv(ip_exp_csv_name)
full_exp_df.head()

Unnamed: 0,rxn_id,solv_1,percent_1,solv_2,percent_2,Y
0,Xf2346,DMAC,1.0,,,0.82
1,Xf2353,Acetone,1.0,,,0.72
2,Xf2345,DMSO,1.0,,,0.7
3,Xf2344,DMF,1.0,,,0.62
4,Xf2352,ACN,1.0,,,0.52


In [34]:
# Show every experimental row as a dict keyed by column name.
full_exp_df.to_dict(orient = "records")

[{'rxn_id': 'Xf2346',
  'solv_1': 'DMAC',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.82},
 {'rxn_id': 'Xf2353',
  'solv_1': 'Acetone',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.72},
 {'rxn_id': 'Xf2345',
  'solv_1': 'DMSO',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.7},
 {'rxn_id': 'Xf2344',
  'solv_1': 'DMF',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.62},
 {'rxn_id': 'Xf2352',
  'solv_1': 'ACN',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.52},
 {'rxn_id': 'Xf2350',
  'solv_1': 'NPA',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.26},
 {'rxn_id': 'Xf2351',
  'solv_1': 'tBuOH',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.14},
 {'rxn_id': 'Xf2356',
  'solv_1': 'EA',
  'percent_1': 1.0,
  'solv_2': nan,
  'percent_2': nan,
  'Y': 0.01},
 {'rxn_id': 'xf2379',
  'solv_1': 'EA',
  'percent_1': 0.2,
  'solv_2': 'acetone',
  'percent_2': 

In [12]:
# Column labels of the experimental table, scanned below for solvent/percent pairs.
all_col = full_exp_df.columns

In [15]:
# Scratch check: for each column label containing "solv" (case-insensitive),
# print it followed by the label immediately to its right.
for i, col in enumerate(all_col):
    if "solv" in col.lower():
        this_solv = col
        # Positional pairing: takes whatever label comes next.
        # NOTE(review): raises IndexError if a "solv" label is the last column.
        this_precent = all_col[i+1]
        print(this_solv)
        print(this_precent)

solv_1
percent_1
solv_2
percent_2


In [24]:
#parse column names and extract the solv / percent pairs
def parse_co_solv(full_exp_df):
    """
    Locate the solvent columns and their percentage columns in an experimental table.

    A column is treated as a solvent column when its name contains "solv"
    (case-insensitive); the column immediately to its right is taken as the
    matching percentage column.

    full_exp_df: DataFrame holding the experimental data.
    Returns a dict {"solv_col": [...], "percent_col": [...]} whose lists are
    aligned by position (both empty when no solvent column is present).
    Raises ValueError when a solvent column is the last column, i.e. it has no
    percentage column after it.
    """
    all_col = list(full_exp_df.columns)

    all_co_solv = {
        "solv_col":[],
        "percent_col":[]
    }

    for i, col in enumerate(all_col):
        if "solv" in col.lower():
            #guard the look-ahead so a trailing solvent column gives a clear error
            if i + 1 >= len(all_col):
                raise ValueError(
                    "Solvent column '{}' is the last column, so no percentage column follows it. "
                    "Please check the input experimental data file.".format(col)
                )
            all_co_solv["solv_col"].append(col)
            all_co_solv["percent_col"].append(all_col[i + 1])

    return all_co_solv
            

In [25]:
# Run the column parser on the experimental table and print the resulting
# dict of solvent / percentage column names.
all_co_solv = parse_co_solv(full_exp_df)
print(all_co_solv)

{'solv_col': ['solv_1', 'solv_2'], 'percent_col': ['percent_1', 'percent_2']}
