In [1]:
## silence DeprecationWarning noise (e.g. raised by dependencies on Python > 3.11) for the whole session.
## A filter installed inside `warnings.catch_warnings()` is discarded as soon as that block exits,
## so it has to be registered at module level to have any effect on the imports and cells below.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## load packages
import os
import time
import copy
import pickle
import chardet
import numpy as np
import pandas as pd
from rdkit import Chem


from ML_Modeling import ML_Models

In [None]:
# ## get parameters from arguments
# fileName_in = './Kymera_ADME_PK_ALL_pull.csv'    # args.input
# sep = ','    # args.delimiter

# colName_mid = 'Compound Name'    # args.colId
# colName_smi = 'Smiles'    # args.colSmi
# colName_activity = 'ADME AlphaLogD;Mean;AlphaLogD;(Num)'    # args.colAssay

# desc_fps = True
# desc_rdkit = True
# desc_cx = False

# split_method = 'random'
# CV = 10
# rng = 666666
# hasVal = True
# colName_date = 'Created On'

In [2]:
def _load_pickle(path):
    """Return the object un-pickled from the file at `path`.

    SECURITY: `pickle.load` can execute arbitrary code while loading; only use it on
    files produced by a trusted run of this pipeline.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


## restore the artefacts written by the upstream steps (same files, same load order)
ML_dataSet = _load_pickle('./tmp_folder/ML_dataSet.ds')
ML_dataSet_norm = _load_pickle('./tmp_folder/ML_dataSet_norm.ds')
data_processor = _load_pickle('./tmp_folder/processor.model')

In [14]:
# list the distinct values found in the 'low_logD' column of the training targets
sorted(ML_dataSet.y_Training['low_logD'].unique())

[0.0, 1.0]

In [4]:
# show which entries are stored in ML_dataSet_norm
ML_dataSet_norm.keys()

dict_keys(['Training_X', 'Validation_X', 'Test_X', 'Training_y', 'Validation_y', 'Test_y'])

In [9]:
# display the 'Training_y' entry of ML_dataSet_norm
ML_dataSet_norm['Training_y']

Unnamed: 0,y_preprocess
2,
3,
4,
5,
6,
7,
11,
12,
13,
14,


In [None]:
# count the columns of the 'Training_y' entry
len(ML_dataSet_norm['Training_y'].columns)

In [None]:
# round-trip check: raising 10 to log10(100) should give back 100
10**(np.log10(100))

In [None]:
# display the whole 'Training_y' entry of ML_dataSet_norm (consider .head() for a shorter output)
ML_dataSet_norm['Training_y']

In [None]:
# NOTE(review): the original line was a dangling `ML_dataSet_norm.` (SyntaxError, presumably a
# leftover tab-completion probe); completed to list the available entries — confirm the intent.
ML_dataSet_norm.keys()

In [None]:
# display the y_Training attribute of ML_dataSet
ML_dataSet.y_Training

In [None]:
# preview the first 3 rows of ML_dataSet.X_Training
ML_dataSet.X_Training.head(3)

In [None]:
# display the 'Training_X' entry of ML_dataSet_norm
ML_dataSet_norm['Training_X']

In [None]:
# y_train_raw = my_preProcessor.PreProcess_y(y_train, take_log_y=take_log_y, train=False)

In [None]:
class Classification_Model(object):
    """Wrap a scikit-learn style estimator with training, tuning, evaluation and plotting helpers.

    Parameters
    ----------
    myScikitModel : object
        Already-instantiated estimator. `fit` and `predict` are called on it; `set_params`
        is also required when hyper-parameter tuning is used.
    modelName : str
        Stored as `_name`.
    rng : int
        Stored as `_rng`; not used by any method of this class.
    n_jobs : int
        Forwarded to the grid search as its `n_jobs`.

    Attributes
    ----------
    model : the wrapped estimator.
    HPT_Results : dict mapping search method -> {'best_model', 'best_score', 'best_param'}.
    predictions : DataFrame accumulating the rows of every `Evaluate` call (None until then).
    performance : dict mapping dataset label -> metric dict produced by `_CalcScores`.
    plots : dict mapping dataset label -> figure produced by `_Plot_Pred_VS_Expt`.

    NOTE(review): `GridSearchCV` and `plt` are referenced below but are not imported in the
    visible notebook cells; they must be in scope before tuning / plotting is used.
    NOTE(review): the statistics in `_CalcScores` (MAE, correlation coefficients) are
    regression-style although the class is named for classification and tunes on 'roc_auc'
    by default — confirm this is intended.
    """

    ## <===================== model initiation =====================>
    def __init__(self,  myScikitModel=None, modelName='Classification_Model', rng=666666, n_jobs=-1):
        ## explicit raise (same exception type as the former `assert`) so the check survives `python -O`
        if myScikitModel is None:
            raise AssertionError("\tWarning! Please define an initiated scikit-learn ML model")
        self._name = modelName
        self._rng = rng
        self._n_jobs = n_jobs
        self.model = myScikitModel
        self.HPT_Results = {}
        self.predictions = None
        self.performance = {}
        self.plots = {}

    ## <===================== model training =====================>
    def Train(self, X, y, printLog=True, HPT=False, search_space=None):
        """Optionally tune hyper-parameters, then fit the wrapped model on (X, y).

        Parameters
        ----------
        X : feature table passed to the model's `fit`.
        y : targets; a single-column table/array is flattened to 1-D before fitting.
        printLog : bool, print progress / timing messages.
        HPT : bool, run `_HyperParamSearch` before the final fit.
        search_space : parameter grid, required when `HPT` is True.

        Returns
        -------
        None
        """
        beginTime = time.time()

        ## ------------ hyper parameter search ------------
        if HPT:
            self._HyperParamSearch(X, y, search_space=search_space, printLog=printLog)

        ## ------------ fit the model ------------
        ## same 1-D target as the tuning step uses
        self.model.fit(X, self._flatten_target(y))

        if printLog:
            print(f"\tModel construction costs time = {(time.time()-beginTime):.2f} s ................")
        return None

    ## <===================== model evaluation =====================>
    def Evaluate(self, X, y, ds_label='TBD', printLog=True, plotResult=False, saveResults=False):
        """Predict on X, record predictions and statistics under `ds_label`.

        Parameters
        ----------
        X : feature table passed to the model's `predict`.
        y : DataFrame whose first column holds the experimental values.
        ds_label : key used in `performance` / `plots` and value of the 'DataSet' column.
        printLog : bool, print the statistics.
        plotResult : bool, also build the prediction-vs-experiment figure.
        saveResults : currently unused.

        Returns
        -------
        None
        """
        y_pred = self.model.predict(X)

        ## build the per-sample table on a copy so the caller's `y` is left untouched
        expt_col = y.columns[0]
        df_predictions = y.copy()
        df_predictions['Experiment'] = df_predictions[expt_col]
        df_predictions['DataSet'] = ds_label
        df_predictions['Prediction'] = y_pred
        if self.predictions is None:
            self.predictions = df_predictions
        else:
            self.predictions = pd.concat([self.predictions, df_predictions])

        ## calculate statistics against the same column that is stored as 'Experiment'
        if printLog:
            print(f"\tEvaluation results of the {ds_label} dataset:")
        self.performance[ds_label] = self._CalcScores(y_pred=y_pred, y_true=y[expt_col].to_numpy(), printLog=printLog)

        ## plotting
        if plotResult:
            self.plots[ds_label] = self._Plot_Pred_VS_Expt(dataTable=df_predictions,
                                                           label_x='Prediction',
                                                           label_y='Experiment',
                                                           color_by='DataSet',
                                                           figTitle=f"Pred VS Expt ({ds_label})")
        return None

    ## <===================== HPTunning =====================>
    def _HyperParamSearch(self, X, y, search_space=None, search_method='grid', scoring='roc_auc', nFolds=5, printLog=True):
        """Cross-validated grid search; applies the best parameters to `self.model`.

        Parameters
        ----------
        X, y : training data; a single-column `y` is flattened to 1-D.
        search_space : non-empty parameter grid handed to the grid search.
        search_method : only 'grid' is implemented; any other value falls back to it
            (the results are still stored under the given name).
        scoring : scoring name handed to the grid search.
        nFolds : number of cross-validation folds.
        printLog : bool, print progress and the search outcome.

        Raises
        ------
        ValueError
            If `search_space` is None or empty.

        Returns
        -------
        None
        """
        beginTime = time.time()
        if not search_space:
            raise ValueError("search_space must be a non-empty parameter grid when hyper-parameter tuning is requested")

        if printLog:
            print(f"\tStart Hyper-Parameter Tunning ...")
        if search_method != 'grid':
            print(f"\tWarning! Search method '{search_method}' is not implemented, grid search is used instead")

        ## NOTE(review): GridSearchCV must already be in scope (not imported in the visible cells)
        optimizer = GridSearchCV(estimator=self.model, param_grid=search_space, scoring=scoring, cv=nFolds, n_jobs=self._n_jobs)

        ## fit the Optimizer to the Data
        optimizer.fit(X, self._flatten_target(y))

        ## search results
        best_model = optimizer.best_estimator_
        SearchResults = {'best_model': best_model,
                         'best_score': optimizer.best_score_,
                         'best_param': best_model.get_params()}
        self.HPT_Results[search_method] = SearchResults

        ## copy the winning configuration onto the wrapped model (it is re-fitted by `Train`)
        self.model.set_params(**SearchResults['best_param'])

        if printLog:
            print(f"\tThis is the log info")
            print(f"\tThe best {scoring}: {SearchResults['best_score']}")
            print(f"\tThe optimized Params: {SearchResults['best_param']}")
            print(f"\tHyper-parameters Tunning costs time = {(time.time()-beginTime):.2f} s ................")
        return None

    ## <===================== tools =====================>
    @staticmethod
    def _flatten_target(y):
        """Return `y` as a numpy array, flattened to 1-D when it is a single column."""
        y_arr = y.to_numpy() if hasattr(y, 'to_numpy') else np.asarray(y)
        if y_arr.ndim == 2 and y_arr.shape[1] == 1:
            return y_arr.ravel()
        return y_arr

    def _CalcScores(self, y_pred, y_true, printLog=True):
        """Compute MAE and squared Pearson / Spearman / Kendall coefficients.

        Parameters
        ----------
        y_pred, y_true : array-likes with one value per sample.
        printLog : bool, print the statistics.

        Returns
        -------
        dict
            Keys 'MAE', 'Pearson_R2', 'Spearman_R2', 'KendallTau_R2'; a statistic that cannot
            be computed is NaN. An empty dict is returned when the inputs cannot be converted
            to 1-D numeric arrays.
        """
        ## imported here because the notebook's import cell does not provide these names;
        ## previously the resulting NameError was swallowed and every statistic became NaN
        from scipy.stats import kendalltau, spearmanr

        nan = float('nan')
        dataDict_result = {}
        try:
            y_pred = np.asarray(y_pred, dtype=float).reshape((len(y_pred), ))
            y_true = np.asarray(y_true, dtype=float).reshape((len(y_true), ))
        except (ValueError, TypeError) as e:
            print(f"\tError! Cannot reformatting the y_pred and y_true when calculating the statistics")
            print(f"\t\t{type(e).__name__}: {e}")
            return dataDict_result

        def _squared_stat(stat_func):
            ## (statistic**2, p-value) of a scipy rank test, or (NaN, NaN) if it cannot be computed
            try:
                result = stat_func(y_pred, y_true)
                stat, p_val = result[0], result[1]
            except (ValueError, TypeError):
                return nan, nan
            return stat * stat, p_val

        ## mean absolute error
        try:
            dataDict_result['MAE'] = float(np.mean(np.abs(y_true - y_pred)))
        except ValueError:
            dataDict_result['MAE'] = nan

        ## squared Pearson correlation coefficient
        try:
            pr_np = np.corrcoef(y_pred, y_true)[1, 0]
            dataDict_result['Pearson_R2'] = pr_np * pr_np
        except (ValueError, IndexError):
            dataDict_result['Pearson_R2'] = nan

        ## squared rank-order correlation (Spearman's rho) and squared Kendall's tau
        dataDict_result['Spearman_R2'], sp_sp = _squared_stat(spearmanr)
        dataDict_result['KendallTau_R2'], kp_sp = _squared_stat(kendalltau)

        ## print out the results
        if printLog:
            print(f"\t\tData shape: y_pred {y_pred.shape}; y_true {y_true.shape}")
            print(f"\t\tMean absolute error: {dataDict_result['MAE']:.2f}")
            print(f"\t\tPearson-R2: {dataDict_result['Pearson_R2']:.2f}")
            print(f"\t\tSpearman-R2: {dataDict_result['Spearman_R2']:.2f} (p={sp_sp:.2f})")
            print(f"\t\tKendall-R2: {dataDict_result['KendallTau_R2']:.2f} (p={kp_sp:.2f})")
        return dataDict_result

    def _Plot_Pred_VS_Expt(self, dataTable, label_x='Prediction', label_y='Experiment', color_by=None, diagonal=True, sideHist=True, figTitle=None):
        """Scatter `label_x` against `label_y` on equal axes and return the figure.

        Parameters
        ----------
        dataTable : DataFrame holding the columns named below.
        label_x, label_y : column names plotted on the x / y axis.
        color_by : optional column name; one scatter series (and legend entry) per distinct value.
        diagonal : bool, draw the y = x line.
        sideHist : bool, add marginal histograms above and to the right of the scatter.
        figTitle : figure title; defaults to "Pred vs Expt".

        Returns
        -------
        The created matplotlib figure.
        """
        ## NOTE(review): `plt` (matplotlib.pyplot) must already be in scope (not imported in the visible cells)
        x, y = dataTable[label_x], dataTable[label_y]

        ## --------- Start with a square Figure ---------
        fig = plt.figure(figsize=(8, 8))
        if sideHist:
            gs = fig.add_gridspec(2, 2,  width_ratios=(4, 1), height_ratios=(1, 4), left=0.1, right=0.9, bottom=0.1, top=0.9, wspace=0.05, hspace=0.05)
            ax = fig.add_subplot(gs[1, 0])

            ## --------- add hist ---------
            ax_histx = fig.add_subplot(gs[0, 0], sharex=ax)
            ax_histy = fig.add_subplot(gs[1, 1], sharey=ax)

            bins = 10
            ax_histx.hist(x, bins=bins)
            ax_histy.hist(y, bins=bins, orientation='horizontal')

            ax_histx.tick_params(axis="x", labelbottom=False)    # no x labels
            ax_histy.tick_params(axis="y", labelleft=False)    # no y labels

            ax_histx.tick_params(axis='both', which='major', labelsize=16)
            ax_histy.tick_params(axis='both', which='major', labelsize=16)
        else:
            ax = fig.add_subplot()

        ## --------- add plot ---------
        ## no `cmap`: without `c` matplotlib ignores it and only emits a warning
        marker_style = dict(s=40, alpha=0.5, marker='o')
        if color_by is None:
            ax.scatter(x, y, **marker_style)
        else:
            for group in sorted(dataTable[color_by].unique()):
                ## boolean mask instead of index labels: labels may repeat (e.g. after a concat)
                mask = dataTable[color_by] == group
                ax.scatter(x[mask], y[mask], label=group, **marker_style)
            ax.legend(loc="upper left", title=color_by)

        ## figure label and title
        ax.set_xlabel(label_x, fontsize=16)
        ax.set_ylabel(label_y, fontsize=16)

        ## common limits for both axes, padded by 10 % of the data range
        ax_max = max(np.max(x), np.max(y))
        ax_min = min(np.min(x), np.min(y))
        span = ax_max - ax_min
        ax_addon = span/10 if span > 0 else 0.5    # keep a visible window when all values coincide
        ax_max = ax_max + ax_addon
        ax_min = ax_min - ax_addon
        ax.set_xlim([ax_min, ax_max])
        ax.set_ylim([ax_min, ax_max])
        ax.tick_params(axis='both', which='major', labelsize=16)
        ax.grid(alpha=0.75)

        if diagonal:
            ax.plot([ax_min, ax_max], [ax_min, ax_max], c='lightgray', linestyle='-')

        figTitle = "Pred vs Expt" if figTitle is None else figTitle
        fig.suptitle(figTitle, fontsize=24)
        return fig

    @staticmethod
    def ___futureFunctionsTBA():
        """Placeholder for future functionality; returns None."""
        return None

In [None]:
# from sklearn.ensemble import RandomForestRegressor

# search_space = {'max_depth': [2, 4, 6, 8]}

# Model_RF = build_ML_Model(dataDict_ds=dataDict_ds, 
#                             sk_model=RandomForestRegressor(random_state=rng, oob_score=True), 
#                             modelName="RandomForest",
#                             rng=rng, 
#                             n_jobs=-1,
#                             HPT=True,
#                             search_space=search_space,
#                             retrain_by_all=False)

In [None]:
# NOTE(review): `ML_model` is not assigned in any visible cell (the model-building cell is
# commented out) — this raises NameError on a fresh kernel unless it is defined elsewhere.
ML_model.predictions

In [None]:
# show the `performance` attribute; NOTE(review): `ML_model` is not assigned in any visible cell — confirm
ML_model.performance

In [None]:
# show the `HPT_Results` attribute; NOTE(review): `ML_model` is not assigned in any visible cell — confirm
ML_model.HPT_Results

In [None]:
# show the `plots` attribute; NOTE(review): `ML_model` is not assigned in any visible cell — confirm
ML_model.plots

In [None]:
# show the entry stored under 'All' in `plots`; NOTE(review): `ML_model` is not assigned in any visible cell — confirm
ML_model.plots['All']