In [None]:
!pip install rdkit-pypi
!pip install graphviz
!pip install xgboost
!pip install optuna
!pip install padelpy
!wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
!unzip fingerprints_xml.zip

# 5.1.1 Calculate the PaDEL descriptors

In [None]:
import glob
# Fingerprint family names. They are paired with the XML files purely by position,
# so this order is assumed to match the sorted file names — TODO confirm.
FP_list = [
    'AtomPairs2DCount', 'AtomPairs2D', 'EState', 'CDKextended', 'CDK', 'CDKgraphonly',
    'KlekotaRothCount', 'KlekotaRoth', 'MACCS', 'PubChem', 'SubstructureCount', 'Substructure',
]

# Every *.xml file in the working directory, sorted by file name.
xml_files = sorted(glob.glob("*.xml"))

# Fingerprint name -> descriptor XML file (i-th name with i-th sorted file).
fp = {fp_name: xml_path for fp_name, xml_path in zip(FP_list, xml_files)}



In [None]:
import pandas as pd
import numpy as np

# df1 supplies SMILES, STANDARD_VALUE_LN and CHEMBL_ID; df2 supplies the SMILES to keep.
df1 = pd.read_csv('/content/regression_equal_no_ion_5_smaller.csv')
df2 = pd.read_csv('/content/eli_duplcate_hf.csv')

# SMILES -> value and SMILES -> ID lookups built from df1.
d1 = dict(zip(df1['SMILES'], df1['STANDARD_VALUE_LN']))
d3 = dict(zip(df1['SMILES'], df1['CHEMBL_ID']))

# The same lookups restricted to the molecules listed in df2 (KeyError if one is missing in df1).
half_life_values = [d1[smi] for smi in df2['smiles']]
chembl_ids = [d3[smi] for smi in df2['smiles']]
d2 = dict(zip(df2['smiles'], half_life_values))
d4 = dict(zip(df2['smiles'], chembl_ids))

# Assemble the working table column by column, in the original column order.
df4 = pd.DataFrame()
df4['half_life'] = np.array(half_life_values)
df4['SMILES'] = df2['smiles']
df4['CHEML_ID'] = np.array(chembl_ids)

# Write the bare SMILES column (no header, no index) as the PaDEL input file.
df4['SMILES'].to_csv('molecule.smi', sep='\t', index=False, header=False)


In [None]:
from padelpy import padeldescriptor

# Fingerprint families to compute; each name must be a key of the `fp` name -> XML mapping.
fingerprints= ['EState', 'CDKextended', 'CDK', 'KlekotaRoth', 'MACCS', 'PubChem']
for fingerprint in fingerprints:
  fingerprint_output_file = ''.join([fingerprint,'.csv']) #PubChem.csv
  fingerprint_descriptortypes = fp[fingerprint]
  print(fingerprint_descriptortypes)
  padeldescriptor(  mol_dir='molecule.smi',
            d_file=fingerprint_output_file, #'PubChem.csv'
            descriptortypes= fingerprint_descriptortypes,
            detectaromaticity=True,
            standardizenitro=True,
            standardizetautomers=True,
            threads=8,
            removesalt=False,
            log=True,
            fingerprints=True )

# Add SMILES and CHEML_ID columns to the result csv files.
# The loop variable must not be called `fp`: that would replace the name -> XML mapping
# read by the loop above and break the cell on a second run.
# NOTE(review): the ID columns are attached by row position, which assumes PaDEL writes
# rows in input order even with threads=8 — confirm against the Name column.
for fp_name in fingerprints:
  descriptors = pd.read_csv(f'{fp_name}.csv')
  descriptors.insert(loc= 0, column = 'smiles', value = df4['SMILES'])
  descriptors.insert(loc= 0, column = 'Chembl_ID', value = df4['CHEML_ID'])
  print(descriptors)
  descriptors.to_csv(f'{fp_name}_FP_withID.csv')


# 5.1.2 Calculate the RDKit descriptors

In [None]:
# Ref: https://github.com/rdkit/benchmarking_platform/blob/master/scoring/fingerprint_lib.py

from rdkit import Chem
from rdkit.Chem import MACCSkeys, AllChem
from rdkit.Avalon import pyAvalonTools as fpAvalon
from rdkit.Chem.AtomPairs import Pairs, Torsions
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.ChemicalFeatures import BuildFeatureFactory
from rdkit.Chem import rdMolDescriptors

# Fingerprints registered in `fpdict` below (name -> callable taking an RDKit molecule):
#   ecfp0 / ecfp2 / ecfp4 / ecfp6   Morgan bit vectors, radius 0-3, `nbits` bits
#   ecfc0 / ecfc2 / ecfc4 / ecfc6   Morgan fingerprints via GetMorganFingerprint, radius 0-3
#   fcfp2 / fcfp4 / fcfp6           as ecfp*, with useFeatures=True
#   fcfc2 / fcfc4 / fcfc6           as ecfc*, with useFeatures=True
#   lecfp4 / lecfp6 / lfcfp4 / lfcfp6   the bit-vector variants with `longbits` bits
#   maccs                           MACCS keys
#   ap, tt                          atom pairs, topological torsions
#   hashap, hashtt                  hashed atom pairs / topological torsions, `nbits` bits
#   avalon, laval                   Avalon with `nbits` / `longbits` bits
#   rdk5 / rdk6 / rdk7              RDKFingerprint with maxPath 5 / 6 / 7, `nbits` bits
# The original list also named an atom-pair bit vector (apbv) and a 2D pharmacophore
# (pharm); neither is registered in this dictionary.

nbits = 1024
longbits = 16384

# Registry of fingerprint functions. The lambdas look up `nbits` / `longbits` when they
# are called, not when they are defined.
fpdict = {}
fpdict['ecfp0'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 0, nBits=nbits)
fpdict['ecfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=nbits)
fpdict['ecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=nbits)
fpdict['ecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=nbits)
fpdict['ecfc0'] = lambda m: AllChem.GetMorganFingerprint(m, 0)
fpdict['ecfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 1)
fpdict['ecfc4'] = lambda m: AllChem.GetMorganFingerprint(m, 2)
fpdict['ecfc6'] = lambda m: AllChem.GetMorganFingerprint(m, 3)
fpdict['fcfp2'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 1, useFeatures=True, nBits=nbits)
fpdict['fcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, useFeatures=True, nBits=nbits)
fpdict['fcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, useFeatures=True, nBits=nbits)
fpdict['fcfc2'] = lambda m: AllChem.GetMorganFingerprint(m, 1, useFeatures=True)
fpdict['fcfc4'] = lambda m: AllChem.GetMorganFingerprint(m, 2, useFeatures=True)
fpdict['fcfc6'] = lambda m: AllChem.GetMorganFingerprint(m, 3, useFeatures=True)
fpdict['lecfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=longbits)
fpdict['lecfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=longbits)
fpdict['lfcfp4'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, useFeatures=True, nBits=longbits)
fpdict['lfcfp6'] = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 3, useFeatures=True, nBits=longbits)
fpdict['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m)
fpdict['ap'] = lambda m: Pairs.GetAtomPairFingerprint(m)
fpdict['tt'] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
fpdict['hashap'] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(m, nBits=nbits)
fpdict['hashtt'] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=nbits)
fpdict['avalon'] = lambda m: fpAvalon.GetAvalonFP(m, nbits)
fpdict['laval'] = lambda m: fpAvalon.GetAvalonFP(m, longbits)
fpdict['rdk5'] = lambda m: Chem.RDKFingerprint(m, maxPath=5, fpSize=nbits, nBitsPerHash=2)
fpdict['rdk6'] = lambda m: Chem.RDKFingerprint(m, maxPath=6, fpSize=nbits, nBitsPerHash=2)
fpdict['rdk7'] = lambda m: Chem.RDKFingerprint(m, maxPath=7, fpSize=nbits, nBitsPerHash=2)


def CalculateFP(fp_name, smiles):
    """Compute one fingerprint for a molecule given as a SMILES string.

    Args:
        fp_name: key of the module-level `fpdict` selecting the fingerprint function.
        smiles: SMILES string, parsed with Chem.MolFromSmiles.

    Returns:
        Whatever the selected `fpdict` callable returns for the parsed molecule.

    Raises:
        ValueError: if the SMILES string cannot be parsed into a molecule.
        KeyError: if `fp_name` is not a key of `fpdict`.
    """
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        # Build a single message string; passing the SMILES as a second exception
        # argument made the error text render as a tuple.
        raise ValueError(f'SMILES cannot be converted to an RDKit molecule: {smiles}')

    return fpdict[fp_name](m)

# 5.2 Select suitable fingerprint by KL divergence

In [None]:
import random
import os

def set_seed(seed: int = 42) -> None:
  """Seed NumPy's and the stdlib's random generators and export PYTHONHASHSEED.

  Args:
    seed: value passed to both generators and stored (as text) in the environment.
  """
  # Fixed hash seed for the environment variable, then both random generators.
  os.environ["PYTHONHASHSEED"] = str(seed)
  for seeder in (np.random.seed, random.seed):
    seeder(seed)
  print("Random seed set as {}".format(seed))

def cv2arr(df):
  """Convert a table into a list of per-row integer arrays, skipping the first column.

  Args:
    df: DataFrame whose columns after the first hold values castable to int.

  Returns:
    A list with one 1-D integer NumPy array per row of `df`, in row order.
  """
  return [np.array(df.iloc[row_idx, 1:], dtype=int) for row_idx in range(len(df))]

In [None]:
# Load the six PaDEL fingerprint tables ...
fp1, fp2, fp3, fp4, fp5, fp6 = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/PubChem.csv',
        '/content/CDK.csv',
        '/content/CDKextended.csv',
        '/content/EState.csv',
        '/content/KlekotaRoth.csv',
        '/content/MACCS.csv',
    )
]

# ... and turn each one into a list of per-molecule integer arrays.
pubchemfp, cdk_fp, cdkE_fp, Es_fp, kl_fp, macc_fp = [
    cv2arr(table) for table in (fp1, fp2, fp3, fp4, fp5, fp6)
]

In [None]:
import pandas as pd
import numpy as np

# SMILES come from the `df2` table loaded earlier in the notebook.
smi = df2['smiles']

# One NumPy array per molecule for each of the four Morgan-type fingerprints.
fcfp4, fcfp6, ecfp4, ecfp6 = [], [], [], []
fp_buckets = ((fcfp4, 'fcfp4'), (fcfp6, 'fcfp6'), (ecfp4, 'ecfp4'), (ecfp6, 'ecfp6'))
for smiles_string in smi:
  for bucket, fp_key in fp_buckets:
    bucket.append(np.array(CalculateFP(fp_key, smiles_string)))

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def plot_tsne(fp, color):
  """Embed fingerprints with t-SNE (Jaccard metric) and scatter-plot the result.

  Args:
    fp: fingerprint matrix handed to TSNE.fit_transform.
    color: colour specification forwarded to plt.scatter as `c`.

  Prints the fitted model's kl_divergence_ and shows the figure; returns nothing.
  """
  embedder = TSNE(random_state=8, init='random', metric='jaccard')
  coords = embedder.fit_transform(fp)
  # Lower KL divergence is better.
  print('kl_divergence: ', embedder.kl_divergence_)
  xs, ys = coords.T
  plt.scatter(xs, ys, marker=".", s=30, lw=0, alpha=0.7, c=color, edgecolor="k")
  plt.show()

# Label each projection with the fingerprint's name; printing the fingerprint list itself
# dumped every bit vector into the cell output.
fingerprint_sets = [
  ('cdk_fp', cdk_fp), ('cdkE_fp', cdkE_fp), ('Es_fp', Es_fp), ('kl_fp', kl_fp),
  ('macc_fp', macc_fp), ('pubchemfp', pubchemfp),
  ('fcfp4', fcfp4), ('fcfp6', fcfp6), ('ecfp4', ecfp4), ('ecfp6', ecfp6),
]
for fp_label, fp_values in fingerprint_sets:
  print(fp_label)
  plot_tsne(np.array(fp_values), 'b')

In [None]:
from scipy.spatial.distance import pdist, squareform

def Calculate_tanimoto_similarity(fp:np.array):
  """Return the pairwise similarity matrix (1 - Jaccard distance) of the fingerprints.

  For binary vectors the Tanimoto similarity reduces to the Jaccard similarity,
  which is why the Jaccard distance is used here.

  Args:
    fp: sequence of equal-length fingerprint vectors, one per molecule.

  Returns:
    Square NumPy array whose entry (i, j) is 1 minus the Jaccard distance
    between fingerprints i and j.
  """
  fp_matrix = np.array(fp)
  condensed = pdist(fp_matrix, 'jaccard')
  # squareform expands the condensed distances into a full symmetric matrix.
  return 1 - squareform(condensed)

# 5.3 Random seed selection for ML methods

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Targets: the half_life column of the half-life table.
df = pd.read_csv('/content/eli_duplcate_hf.csv')
y = df['half_life']

# Features: the PubChem fingerprints built earlier.
# NOTE(review): rows of X and y are assumed to be in the same molecule order — confirm.
X = np.array(pubchemfp)

def svr_seed_select(start_point, end_point, points, X, y):
  """Draw random split seeds and return the one giving the lowest SVR validation RMSE.

  Args:
    start_point: lower bound (inclusive) of the candidate seeds.
    end_point: upper bound (exclusive) of the candidate seeds.
    points: number of candidate seeds drawn with np.random.randint.
    X: feature matrix.
    y: target values.

  Returns:
    The candidate seed whose 90/10 train/validation split produced the lowest RMSE.
  """
  svmr=[]
  intr=[]
  for i in np.random.randint(start_point,end_point, size=points):
    intr.append(i)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state= i)
    svm= SVR(max_iter=-1)
    svm.fit(X_train,y_train)
    svm_p= svm.predict(X_val)
    # Take the root of the MSE explicitly: the `squared=False` keyword was deprecated
    # in scikit-learn 1.4 and removed in 1.6.
    svm_rmse= np.sqrt(mean_squared_error(y_val, svm_p))
    svmr.append(svm_rmse)

  ids= np.argmin(np.array(svmr))
  print('best svr results: ', svmr[ids])
  print('best seed: ', intr[ids])

  return intr[ids]


def rf_seed_select(start_point, end_point, points, X, y):
  """Draw random split seeds and return the one giving the lowest random-forest RMSE.

  Args:
    start_point: lower bound (inclusive) of the candidate seeds.
    end_point: upper bound (exclusive) of the candidate seeds.
    points: number of candidate seeds drawn with np.random.randint.
    X: feature matrix.
    y: target values.

  Returns:
    The candidate seed whose 90/10 train/validation split produced the lowest RMSE.
  """
  rfr=[]
  intr=[]
  for i in np.random.randint(start_point,end_point, size=points):
    intr.append(i)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state= i)
    rf= RandomForestRegressor(min_samples_split=0.05, n_jobs= -1)
    rf.fit(X_train,y_train)
    rf_p= rf.predict(X_val)
    # Take the root of the MSE explicitly: the `squared=False` keyword was deprecated
    # in scikit-learn 1.4 and removed in 1.6.
    rf_rmse= np.sqrt(mean_squared_error(y_val, rf_p))
    rfr.append(rf_rmse)

  idr= np.argmin(np.array(rfr))
  print('best rf results: ', rfr[idr])
  print('best seed: ', intr[idr])

  # Index with `idr`, the argmin computed above (the previous `ids` was never defined here).
  return intr[idr]

def xgboost_seed_select(start_point, end_point, points, X, y):
  """Draw random split seeds and return the one giving the lowest XGBoost RMSE.

  Args:
    start_point: lower bound (inclusive) of the candidate seeds.
    end_point: upper bound (exclusive) of the candidate seeds.
    points: number of candidate seeds drawn with np.random.randint.
    X: feature matrix.
    y: target values.

  Returns:
    The candidate seed whose 90/10 train/validation split produced the lowest RMSE.
  """
  xgbr=[]
  intr=[]
  for i in np.random.randint(start_point,end_point, size=points):
    intr.append(i)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state= i)
    reg = xgb.XGBRegressor()
    reg.fit(X_train,y_train)
    xgb_p= reg.predict(X_val)
    # Take the root of the MSE explicitly: the `squared=False` keyword was deprecated
    # in scikit-learn 1.4 and removed in 1.6.
    xgb_rmse= np.sqrt(mean_squared_error(y_val, xgb_p))
    xgbr.append(xgb_rmse)

  idr= np.argmin(np.array(xgbr))
  print('best xgboost results: ', xgbr[idr])
  print('best seed: ', intr[idr])

  # Index with `idr`, the argmin computed above (the previous `ids` was never defined here).
  return intr[idr]




# 5.4.1 Optuna for hyperparameter optimization

In [None]:
import optuna
from sklearn.ensemble import RandomForestRegressor


# 90/10 hold-out split.
# NOTE(review): the search below is fitted on the full X / y, not on X_train — confirm intended.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.1)

clf1 = RandomForestRegressor()

# Random-forest search space handed to OptunaSearchCV.
param_distributions = dict(
    criterion=optuna.distributions.CategoricalDistribution(
        choices=('squared_error','absolute_error','friedman_mse','poisson')),
    n_estimators=optuna.distributions.IntDistribution(10, 100, step=5),
    min_samples_split=optuna.distributions.IntDistribution(2, 10),
    min_samples_leaf=optuna.distributions.IntDistribution(2, 10),
    max_features=optuna.distributions.CategoricalDistribution(choices=('sqrt','log2')),
)

# 5-fold CV scored by R^2; stops after 200 trials or 150 seconds, whichever comes first.
optuna_search = optuna.integration.OptunaSearchCV(
    clf1,
    param_distributions,
    cv=5,
    n_trials=200,
    random_state=42,
    enable_pruning=False,
    n_jobs=-1,
    timeout=150,
    verbose=1,
    scoring='r2',
)
optuna_search.fit(X, y)

print("Best trial:")
trial_rf = optuna_search.study_.best_trial

print("  Value: ", trial_rf.value)
print("  Params: ")
for key, value in trial_rf.params.items():
    print(f"    {key}: {value}")

# Best parameter set, reused when training the final random forest.
best_rf_para = trial_rf.params


In [None]:
import optuna
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


#X = np.array(ecfp6)
#y = df2['half_life']

# 90/10 hold-out split.
# NOTE(review): the search below is fitted on the full X / y, not on X_train — confirm intended.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.1)

clf = SVR()

# SVR search space handed to OptunaSearchCV.
param = dict(
    kernel=optuna.distributions.CategoricalDistribution(choices=('poly','sigmoid','rbf')),
    C=optuna.distributions.FloatDistribution(1e0, 1e3, log=True),
    epsilon=optuna.distributions.FloatDistribution(1e-3, 1e1, log=True),
    degree=optuna.distributions.IntDistribution(3, 6),
    gamma=optuna.distributions.CategoricalDistribution(choices=(1, 0.1, 0.01, 0.001, 0.0001)),
    coef0=optuna.distributions.FloatDistribution(1e-2, 1e0, log=True),
)

# 5-fold CV scored by R^2; stops after 200 trials or 100 seconds, whichever comes first.
optuna_search = optuna.integration.OptunaSearchCV(
    clf,
    param,
    cv=5,
    n_trials=200,
    random_state=42,
    enable_pruning=False,
    n_jobs=-1,
    timeout=100,
    verbose=1,
    scoring='r2',
)
optuna_search.fit(X, y)

print("Best trial:")
trial_svr = optuna_search.study_.best_trial

print("  Value: ", trial_svr.value)
print("  Params: ")
for key, value in trial_svr.params.items():
    print(f"    {key}: {value}")

# Best parameter set, reused when training the final SVR.
best_svr_para = trial_svr.params

In [None]:
import numpy as np
import xgboost as xgb
# 90/10 hold-out split with a fixed seed; `objective` below reads X_train / y_train as globals.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

def objective(trial):
  """Optuna objective: 5-fold cross-validated RMSE of XGBoost with trial-sampled parameters.

  Reads the module-level X_train / y_train. Tree-specific parameters are sampled only
  for the gbtree and dart boosters, and dropout parameters only for dart.

  Args:
    trial: the Optuna trial used to sample parameters and to report to the pruner.

  Returns:
    The mean test RMSE of the last boosting round of xgb.cv.
  """
  dtrain = xgb.DMatrix(X_train, label=y_train)

  param = {
    "verbosity": 0,
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
    "lambda": trial.suggest_float("lambda", 1e-5, 1.0, log=True),
    "alpha": trial.suggest_float("alpha", 1e-5, 1.0, log=True),
  }
  booster = param["booster"]

  # Parameters that only apply to tree-based boosters.
  if booster in ("gbtree", "dart"):
    param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
    param["eta"] = trial.suggest_float("eta", 1e-5, 1.0, log=True)
    param["gamma"] = trial.suggest_float("gamma", 1e-5, 1.0, log=True)
    param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

  # Dropout parameters that only apply to dart.
  if booster == "dart":
    param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
    param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
    param["rate_drop"] = trial.suggest_float("rate_drop", 1e-5, 1.0, log=True)
    param["skip_drop"] = trial.suggest_float("skip_drop", 1e-5, 1.0, log=True)

  pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")
  cv_history = xgb.cv(
    param,
    dtrain,
    nfold=5,
    metrics='rmse',
    num_boost_round=10,
    seed=1348,
    callbacks=[pruning_callback],
  )

  return cv_history["test-rmse-mean"].values[-1]


# Minimise the CV RMSE over 200 trials with a median pruner (5 warm-up steps).
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
study = optuna.create_study(direction="minimize", pruner=pruner)
study.optimize(objective, n_trials=200)

print('Best hyperparameters:', study.best_params)
# Best parameter set, kept for later use.
best_xgb_para = study.best_params
print('Best RMSE:', study.best_value)

# 5.4.2 Grid search in case SVM and RF run too long

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

def ML_RF(X_train, y_train, X_val, y_val):
  """Grid-search a random forest on the training data and score it on the validation data.

  Args:
    X_train, y_train: data the grid search is fitted on (cross-validated, scored by R^2).
    X_val, y_val: data the refitted best estimator is evaluated on.

  Returns:
    Tuple (best cross-validation score, validation RMSE, best parameter dict).
  """
  # NOTE(review): 'n_jobs' is part of the grid as well as a GridSearchCV argument, so the
  # forests and the search both request all cores — confirm this nesting is intended.
  param_grid_rf = {'n_estimators': [100,200,300,400],'min_samples_split':[0.05],'min_samples_leaf':[i for i in range(1,10)], 'max_features':[1,2,3,4,5], 'max_samples':[0.7,0.8,0.9], 'n_jobs':[-1] }
  grid_rf = GridSearchCV(RandomForestRegressor(),param_grid_rf, n_jobs=-1, refit=True, verbose=3, scoring='r2')
  grid_rf.fit(X_train,y_train)
  print(grid_rf.best_params_)
  print(grid_rf.best_score_)
  grid_predictions_rf = grid_rf.predict(X_val)
  # Compute the RMSE once, as the root of the MSE: the `squared=False` keyword was
  # deprecated in scikit-learn 1.4 and removed in 1.6.
  rmse_rf= np.sqrt(mean_squared_error(y_val, grid_predictions_rf))
  print(rmse_rf)

  return grid_rf.best_score_, rmse_rf, grid_rf.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error

def ML_SVR(X_train, y_train, X_val, y_val):
  """Grid-search an SVR on the training data and score it on the validation data.

  Args:
    X_train, y_train: data the grid search is fitted on (cross-validated, scored by R^2).
    X_val, y_val: data the refitted best estimator is evaluated on.

  Returns:
    Tuple (best cross-validation score, validation RMSE, best parameter dict).
  """
  # NOTE(review): the grid includes gamma=0 — confirm the installed scikit-learn accepts it.
  param_grid = {'C': [1,10,100,250,500], 'gamma': [0,0.1,0.05,0.01,0.005,0.001], 'coef0':[0,0.1,0.05,0.01,0.005,0.001], 'epsilon':[0.1,0.01,0.001], 'kernel': ['rbf', 'poly', 'sigmoid'] }
  grid = GridSearchCV(SVR(),param_grid, n_jobs=-1, refit=True, verbose=3, scoring='r2')
  grid.fit(X_train,y_train)
  print(grid.best_params_)
  print(grid.best_score_)
  grid_predictions = grid.predict(X_val)
  # Compute the RMSE once, as the root of the MSE: the `squared=False` keyword was
  # deprecated in scikit-learn 1.4 and removed in 1.6.
  rmse= np.sqrt(mean_squared_error(y_val, grid_predictions))
  print(rmse)

  return grid.best_score_, rmse, grid.best_params_

# 5.5 Preparing dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# Reload the half-life table and read the PubChem fingerprint table used for modelling.
# Note: `fp` is rebound to a DataFrame here.
df = pd.read_csv('/content/eli_duplcate_hf.csv')
fp = pd.read_csv('/content/PubChem2.csv')

# Features: one integer row per molecule; targets: the half_life column.
pubchemfp = cv2arr(fp)
X = np.array(pubchemfp)
y = df['half_life']

# SMILES -> fingerprint row lookup, pairing rows of `df` and `fp` by position.
key = df['smiles']
smi_dic = {smiles: fp_row for smiles, fp_row in zip(key, X)}

# 90/10 hold-out split of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=112, test_size=0.1)

df2 = pd.read_csv('/content/Qrm_MW_TPSA.csv')
df3 = pd.read_csv('/content/chem_group_ana.csv')

# Subset with n3_ring == 0.
no_n3 = df3['n3_ring'] == 0
rmn3_smi = df3.loc[no_n3, 'smiles']
rmn3_fp = np.array([smi_dic[s] for s in rmn3_smi])
rmn3_y = df3.loc[no_n3, 'half_life']

# Subset with n7_ring == 0.
no_n7 = df3['n7_ring'] == 0
rmn7_smi = df3.loc[no_n7, 'smiles']
rmn7_fp = np.array([smi_dic[s] for s in rmn7_smi])
rmn7_y = df3.loc[no_n7, 'half_life']

# Molecules listed in drop_by_q1q3.csv.
df4 = pd.read_csv('/content/drop_by_q1q3.csv')
q1q3_fp = np.array([smi_dic[s] for s in df4['smiles']])
q1q3_y = df4['half_life']

# 90/10 hold-out splits, same seed for every subset.
rmn3_train, rmn3_val, rmn3_y_train, rmn3_y_val = train_test_split(
    rmn3_fp, rmn3_y, random_state=112, test_size=0.1)
rmn7_train, rmn7_val, rmn7_y_train, rmn7_y_val = train_test_split(
    rmn7_fp, rmn7_y, random_state=112, test_size=0.1)
q1q3_train, q1q3_val, q1q3_y_train, q1q3_y_val = train_test_split(
    q1q3_fp, q1q3_y, random_state=112, test_size=0.1)


# Qrm_MW_TPSA.csv subset: fingerprints looked up by SMILES, targets from STANDARD_VALUE_LN.
df5 = pd.read_csv('/content/Qrm_MW_TPSA.csv')
MP_smi = df5['SMILES']
MP_y = df5['STANDARD_VALUE_LN']
MP_fp = np.array([smi_dic[s] for s in MP_smi])
MP_train, MP_val, MP_y_train, MP_y_val = train_test_split(
    MP_fp, MP_y, random_state=112, test_size=0.1)

# rm_mol.csv subset: rows whose n3n7 flag is not 1.
df6 = pd.read_csv('/content/rm_mol.csv')
keep_n3n7 = df6['n3n7'] != 1
n3n7_smi = df6.loc[keep_n3n7, 'smiles']
n3n7_y = df6.loc[keep_n3n7, 'half_life']
n3n7_fp = np.array([smi_dic[s] for s in n3n7_smi])
n3n7_train, n3n7_val, n3n7_y_train, n3n7_y_val = train_test_split(
    n3n7_fp, n3n7_y, random_state=112, test_size=0.1)

# 5.6 ML training and prediction

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

def ML_rf(best_rf_para, X_train, y_train, X_val, y_val):
  """Train a random forest with the given parameters and score it on the validation data.

  Args:
    best_rf_para: keyword arguments for RandomForestRegressor.
    X_train, y_train: training features and targets.
    X_val, y_val: validation features and targets.

  Returns:
    Tuple (validation RMSE, validation R^2). Both are also printed, R^2 first.
  """
  rf= RandomForestRegressor(**best_rf_para)
  rf.fit(X_train,y_train)
  rf_p= rf.predict(X_val)

  r2_sc= r2_score(y_val, rf_p)
  # Root of the MSE computed explicitly: the `squared=False` keyword was deprecated
  # in scikit-learn 1.4 and removed in 1.6.
  rmse_sc= np.sqrt(mean_squared_error(y_val, rf_p))
  print(r2_sc)
  print(rmse_sc)

  return rmse_sc, r2_sc

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

def ML_svm(best_svr_para, X_train, y_train, X_val, y_val):
  """Train an SVR with the given parameters and score it on the validation data.

  Args:
    best_svr_para: keyword arguments for SVR.
    X_train, y_train: training features and targets.
    X_val, y_val: validation features and targets.

  Returns:
    Tuple (validation RMSE, validation R^2). Both are also printed, R^2 first.
  """
  svm= SVR(**best_svr_para)
  svm.fit(X_train,y_train)
  svm_p= svm.predict(X_val)

  r2_sc= r2_score(y_val, svm_p)
  # Root of the MSE computed explicitly: the `squared=False` keyword was deprecated
  # in scikit-learn 1.4 and removed in 1.6.
  rmse_sc= np.sqrt(mean_squared_error(y_val, svm_p))
  print(r2_sc)
  print(rmse_sc)

  return rmse_sc, r2_sc

In [None]:
import xgboost as xgb
def ML_xgb(X_train, y_train, X_val, y_val, params=None):
  """Train an XGBoost regressor and score it on the validation data.

  Args:
    X_train, y_train: training features and targets.
    X_val, y_val: validation features and targets.
    params: optional keyword arguments for xgb.XGBRegressor. When omitted, the
      fixed parameter set below is used; when given, it replaces that set entirely.

  Returns:
    Tuple (validation RMSE, validation R^2). Both are also printed, RMSE first.
  """
  if params is None:
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": 0.008759123511754415,
        "max_depth": 10,
        "subsample": 0.7630443989856173,
        "colsample_bytree": 0.6725557270413615,
        "min_child_weight": 4,
    }

  model = xgb.XGBRegressor(**params)
  model.fit(X_train, y_train, verbose=False)
  predictions = model.predict(X_val)
  # Root of the MSE computed explicitly: the `squared=False` keyword was deprecated
  # in scikit-learn 1.4 and removed in 1.6.
  rmse_sc = np.sqrt(mean_squared_error(y_val, predictions))
  print(rmse_sc)
  r2_sc = r2_score(y_val, predictions)
  print(r2_sc)

  return rmse_sc, r2_sc

In [None]:
# Demonstration of the ML analysis using XGBoost as the example; ML_rf and ML_svm are
# called the same way, with the tuned parameter dict as an extra first argument.
# Each run is preceded by a label so the printed RMSE / R^2 pairs can be told apart.
print('full dataset')
rmse_sc, r2_sc= ML_xgb(X_train, y_train, X_val, y_val)
print('rmn3')
rmse_scn3, r2_sn3= ML_xgb(rmn3_train, rmn3_y_train, rmn3_val, rmn3_y_val)
print('rmn7')
rmse_scn7, r2n7_s= ML_xgb(rmn7_train, rmn7_y_train, rmn7_val, rmn7_y_val)
print('q1q3')
rmse_scq1q3, r2_sq1q3= ML_xgb(q1q3_train, q1q3_y_train, q1q3_val, q1q3_y_val)
print('n3n7')
ML_xgb(n3n7_train, n3n7_y_train, n3n7_val, n3n7_y_val)
print('MW_TPSA')
ML_xgb(MP_train, MP_y_train, MP_val, MP_y_val)