In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, average_precision_score
from modules.utils import load_cv
from modules.eval_funcs import cv_fold_eval
from modules.cleaning import rr_prefix

# Seed passed as random_state to the seeded estimators and searches below
rand_seed = 42

# Mechanism-of-action name -> integer class label (the name's position in the list)
moa_dict = {name: label for label, name in enumerate(
    ['PI3K', 'p38 MAPK', 'RAF', 'AURK', 'CDK', 'EGFR', 'ROCK', 'MEK', 'GSK', 'mTOR'])}

# Load IBP Data:

In [2]:
# Read the IBP profile table from CSV (path is relative to the working directory).
ki_ibp = pd.read_csv('data/ibp/ki_ibp.csv')
# Print (rows, columns), then show the first two rows as the cell output.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which suggests
# the CSV was written with its index - confirm load_cv does not treat it as a feature.
print(ki_ibp.shape)
ki_ibp.head(2)

(635, 4778)


Unnamed: 0,Metadata_Source,Metadata_Plate,Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,Cells_AreaShape_BoundingBoxMinimum_X,Cells_AreaShape_BoundingBoxMinimum_Y,Cells_AreaShape_Center_X,...,smiles,clinical_phase,moa_src,Metadata_JCP2022,Metadata_InChIKey,Metadata_PlateType,blur_score,sat_score,focus_score,comp_score
0,source_3,JCPQC023,G14,3227.817708,5310.328125,589.21875,541.552083,519.484375,471.942708,553.446757,...,Nc1cc(c(cn1)-c1cc(nc(n1)N1CCOCC1)N1CCOCC1)C(F)...,Phase 3,dr_hub,JCP2022_013856,CWHUFRVAEUJCEF-UHFFFAOYSA-N,TARGET2,0.430742,0.453621,0.517562,1.401925
1,source_4,BR00121424,G14,4255.3,7338.3,572.7,554.5,488.42,470.49,530.26,...,Nc1cc(c(cn1)-c1cc(nc(n1)N1CCOCC1)N1CCOCC1)C(F)...,Phase 3,dr_hub,JCP2022_013856,CWHUFRVAEUJCEF-UHFFFAOYSA-N,TARGET2,0.436727,0.144924,0.386009,0.967661


# Load CV Fold Data:

In [4]:
# Load CV split data:
# Builds the per-fold data through the project helper load_cv, which receives the
# split directory, the number of splits, the IBP frame and the MoA label mapping.
# NOTE(review): norm=None is passed here, yet a later section heading describes this
# data as "Minmax Norm" - confirm what load_cv does when norm is None.
cv_splits = 5  # number of folds; also the loop bound of the random-search cells
cv_path = 'data/cv_val_split/'
cv_data = load_cv(cv_path, cv_splits, ki_ibp, moa_dict, norm=None)

In [4]:
# Pickle locations for the spherized / harmonized data (PyCy and Shapley feature selection):
pycy_spher = 'output/MAD_Sphere/MADS_PyCyFS.pkl'
shap_spher = 'output/MAD_Sphere/MADS_ShapFS.pkl'
pycy_harm = 'output/MAD_Harmony/MAD_Harmony_PyCyFS.pkl'
shap_harm = 'output/MAD_Harmony/MAD_Harmony_ShapFS.pkl'

# Prefix handed to the project helper rr_prefix
prefix_to_remove = 'sph_'

# Only the spherized + Shapley file is read in this cell.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open(shap_spher, 'rb') as file:
    loaded_data = rr_prefix(pickle.load(file), prefix_to_remove)

In [8]:
# Re-point shap_spher at the pickle under output/Corrected/ and reload it.
# This rebinds both shap_spher and loaded_data, replacing whatever was loaded before.
shap_spher = 'output/Corrected/MADS_ShapFS.pkl'

# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open(shap_spher, 'rb') as file:
    loaded_data = pickle.load(file)

# Specify the prefix to remove
prefix_to_remove = 'sph_'
loaded_data = rr_prefix(loaded_data, prefix_to_remove)

In [8]:
# Inspect the keys stored for the first entry of loaded_data.
loaded_data[0].keys()

dict_keys(['X_train', 'y_train', 'train_meta', 'X_test', 'y_test', 'test_meta'])

# Load Feature Selection Data:

In [7]:
# Read the saved Shapley feature-selection results table.
# Presumably ordered best-first, since a later cell keeps only its leading rows - TODO confirm.
cv_results = pd.read_csv('output/feat_sel/shap_fs_res.csv')

# Random Forest:

## Baseline Model:

In [4]:
# Baseline random forest: only the seed is set, every other hyperparameter is the library default.
rf_clf = RandomForestClassifier(random_state=rand_seed)

### Minmax Norm, no Feature Selection:

In [5]:
# Evaluate the baseline forest on cv_data with the project helper cv_fold_eval.
# NOTE(review): the heading says min-max normalised, but cv_data was built with norm=None - confirm.
cv_fold_eval(rf_clf, cv_data)

Well-level Results: 52.67% Accuracy | 47.40% F1 Score
Compound-level Results: 70.00% Accuracy | 56.90% F1 Score
Well-level Results: 32.12% Accuracy | 30.61% F1 Score
Compound-level Results: 36.84% Accuracy | 29.19% F1 Score
Well-level Results: 45.13% Accuracy | 44.56% F1 Score
Compound-level Results: 36.84% Accuracy | 37.67% F1 Score
Well-level Results: 38.89% Accuracy | 38.33% F1 Score
Compound-level Results: 42.11% Accuracy | 40.83% F1 Score
Well-level Results: 28.91% Accuracy | 25.74% F1 Score
Compound-level Results: 26.32% Accuracy | 22.86% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 39.54% Accuracy (+/- 8.63%) | 37.33% F1 Score (+/- 8.18%) 
Compound-Level: 42.42% Accuracy (+/- 14.71%) | 37.49% F1 Score (+/- 11.59%) 


### MADS + Shapley FS:

In [9]:
# Evaluate the baseline forest on whatever loaded_data currently holds.
# NOTE(review): the visible load cells read a MADS_ShapFS pickle from two different
# directories - confirm which of them produced the numbers below.
cv_fold_eval(rf_clf, loaded_data)

Well-level Results: 61.74% Accuracy | 59.62% F1 Score
Compound-level Results: 80.00% Accuracy | 79.00% F1 Score
Well-level Results: 39.83% Accuracy | 38.66% F1 Score
Compound-level Results: 50.00% Accuracy | 38.00% F1 Score
Well-level Results: 48.51% Accuracy | 52.24% F1 Score
Compound-level Results: 61.11% Accuracy | 59.83% F1 Score
Well-level Results: 48.28% Accuracy | 42.31% F1 Score
Compound-level Results: 50.00% Accuracy | 41.05% F1 Score
Well-level Results: 41.27% Accuracy | 39.16% F1 Score
Compound-level Results: 42.11% Accuracy | 35.67% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 47.93% Accuracy (+/- 7.76%) | 46.40% F1 Score (+/- 8.22%) 
Compound-Level: 56.64% Accuracy (+/- 13.15%) | 50.71% F1 Score (+/- 16.52%) 


### MADS + Pycy FS:

In [11]:
# Evaluate the baseline forest on whatever loaded_data currently holds.
# NOTE(review): no visible cell loads pycy_spher into loaded_data, and this call is
# textually identical to the other loaded_data evaluations - the result depends on
# which pickle was loaded by hand before this cell ran.
cv_fold_eval(rf_clf, loaded_data)

Well-level Results: 57.39% Accuracy | 51.95% F1 Score
Compound-level Results: 80.00% Accuracy | 77.05% F1 Score
Well-level Results: 34.75% Accuracy | 32.42% F1 Score
Compound-level Results: 38.89% Accuracy | 31.50% F1 Score
Well-level Results: 49.50% Accuracy | 51.53% F1 Score
Compound-level Results: 61.11% Accuracy | 61.50% F1 Score
Well-level Results: 40.52% Accuracy | 34.11% F1 Score
Compound-level Results: 55.56% Accuracy | 44.38% F1 Score
Well-level Results: 34.13% Accuracy | 29.27% F1 Score
Compound-level Results: 42.11% Accuracy | 33.10% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 43.26% Accuracy (+/- 8.97%) | 39.86% F1 Score (+/- 9.83%) 
Compound-Level: 55.53% Accuracy (+/- 14.75%) | 49.50% F1 Score (+/- 17.45%) 


### MADH + Shap FS:

In [13]:
# Evaluate the baseline forest on whatever loaded_data currently holds.
# NOTE(review): no visible cell loads shap_harm into loaded_data (and the visible
# prefix is 'sph_') - the result depends on a manual reload before this cell ran.
cv_fold_eval(rf_clf, loaded_data)

Well-level Results: 57.39% Accuracy | 58.37% F1 Score
Compound-level Results: 70.00% Accuracy | 65.71% F1 Score
Well-level Results: 37.29% Accuracy | 34.49% F1 Score
Compound-level Results: 50.00% Accuracy | 45.86% F1 Score
Well-level Results: 51.49% Accuracy | 54.70% F1 Score
Compound-level Results: 55.56% Accuracy | 51.38% F1 Score
Well-level Results: 47.41% Accuracy | 46.75% F1 Score
Compound-level Results: 55.56% Accuracy | 51.19% F1 Score
Well-level Results: 28.57% Accuracy | 25.96% F1 Score
Compound-level Results: 36.84% Accuracy | 29.05% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 44.43% Accuracy (+/- 10.29%) | 44.05% F1 Score (+/- 12.20%) 
Compound-Level: 53.59% Accuracy (+/- 10.68%) | 48.64% F1 Score (+/- 11.81%) 


### MADH + PyCy FS

In [15]:
# Evaluate the baseline forest on whatever loaded_data currently holds.
# NOTE(review): no visible cell loads pycy_harm into loaded_data - the result
# depends on a manual reload before this cell ran.
cv_fold_eval(rf_clf, loaded_data)

Well-level Results: 43.48% Accuracy | 39.79% F1 Score
Compound-level Results: 50.00% Accuracy | 38.64% F1 Score
Well-level Results: 32.20% Accuracy | 22.69% F1 Score
Compound-level Results: 44.44% Accuracy | 31.08% F1 Score
Well-level Results: 40.59% Accuracy | 36.49% F1 Score
Compound-level Results: 55.56% Accuracy | 51.02% F1 Score
Well-level Results: 31.03% Accuracy | 30.57% F1 Score
Compound-level Results: 38.89% Accuracy | 31.27% F1 Score
Well-level Results: 24.60% Accuracy | 19.65% F1 Score
Compound-level Results: 36.84% Accuracy | 27.50% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 34.38% Accuracy (+/- 6.83%) | 29.84% F1 Score (+/- 7.73%) 
Compound-Level: 45.15% Accuracy (+/- 6.94%) | 35.90% F1 Score (+/- 8.38%) 


## Random Search:

In [24]:
# Tree counts: six evenly spaced values from 16 to 1024, truncated to integers
n_estimators = np.linspace(16, 1024, 6).astype(int).tolist()
# Depth limits: eight evenly spaced values from 2 to 64, truncated to integers
max_depth = np.linspace(2, 64, 8).astype(int).tolist()
# Candidate minimum sample counts needed to split a node
min_samples_split = [2, 6, 8, 12]
# Candidate minimum sample counts per leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

In [25]:
# Assemble the random-search space; keys are RandomForestClassifier argument names
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(random_grid)

{'n_estimators': [16, 217, 419, 620, 822, 1024], 'max_depth': [2, 10, 19, 28, 37, 46, 55, 64], 'min_samples_split': [2, 6, 8, 12], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [26]:
# Run an independent random search on the training split of every fold and print
# the winner; nothing is stored, so the chosen settings must be copied by hand.
for cv in range(cv_splits):
    # Define model and search:
    rand_search_rf = RandomForestClassifier(random_state = rand_seed) 
    
    # Sample 25 parameter combinations (n_iter) and use all available cores (n_jobs=-1).
    # No cv argument is given, so the search uses its default internal splitting.
    # NOTE(review): if rows of one compound land in different internal folds, the
    # "Best CV Accuracy" printed below may be optimistic - confirm.
    rf_random = RandomizedSearchCV(estimator = rand_search_rf, param_distributions = random_grid, 
                               n_iter = 25, verbose=1, random_state=rand_seed, n_jobs = -1)
    
    # Fit Random Search Model:
    rf_random.fit(loaded_data[cv]['X_train'], loaded_data[cv]['y_train'])
    
    print("Best CV Accuracy: ", rf_random.best_score_)
    print("Best Parameters: ", rf_random.best_params_)
    print("--------------------------------------------")

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best CV Accuracy:  0.6376577840112201
Best Parameters:  {'n_estimators': 822, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_depth': 19, 'bootstrap': False}
--------------------------------------------
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best CV Accuracy:  0.6546344959388437
Best Parameters:  {'n_estimators': 620, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 46, 'bootstrap': False}
--------------------------------------------
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best CV Accuracy:  0.6778947368421052
Best Parameters:  {'n_estimators': 822, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 28, 'bootstrap': False}
--------------------------------------------
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best CV Accuracy:  0.6434782608695653
Best Parameters:  {'n_estimators': 822, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_d

### Re-Fit Model with Hyp. Tuned Params:

In [7]:
# Random forest with hand-entered "tuned" hyperparameters, seeded with rand_seed.
# NOTE(review): n_estimators=474 is not one of the values in the printed random_grid -
# confirm which search these settings came from.
hyp_rf_clf = RandomForestClassifier(n_estimators=474, min_samples_split=2,
                                    min_samples_leaf=1, max_depth=55, bootstrap=False,
                                    random_state=rand_seed)

### Minmax + No FS:

In [8]:
# Evaluate the tuned forest on cv_data (built with norm=None, no feature selection applied here).
cv_fold_eval(hyp_rf_clf, cv_data)

Well-level Results: 55.73% Accuracy | 49.11% F1 Score
Compound-level Results: 70.00% Accuracy | 56.90% F1 Score
Well-level Results: 32.85% Accuracy | 29.45% F1 Score
Compound-level Results: 52.63% Accuracy | 37.38% F1 Score
Well-level Results: 45.13% Accuracy | 46.24% F1 Score
Compound-level Results: 36.84% Accuracy | 37.12% F1 Score
Well-level Results: 38.89% Accuracy | 38.53% F1 Score
Compound-level Results: 57.89% Accuracy | 53.71% F1 Score
Well-level Results: 29.69% Accuracy | 28.72% F1 Score
Compound-level Results: 26.32% Accuracy | 24.05% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 40.46% Accuracy (+/- 9.29%) | 38.41% F1 Score (+/- 8.37%) 
Compound-Level: 48.74% Accuracy (+/- 15.46%) | 41.83% F1 Score (+/- 12.06%) 


### MADS + Shap FS:

In [27]:
# Rebinds hyp_rf_clf, replacing the earlier tuned forest. These values equal the best
# parameters printed for the third fold of the random search output.
hyp_rf_clf = RandomForestClassifier(n_estimators=822, min_samples_split=2,
                                    min_samples_leaf=1, max_depth=28, bootstrap=False,
                                    random_state=rand_seed)

In [28]:
# Evaluate the current hyp_rf_clf on whatever loaded_data currently holds.
cv_fold_eval(hyp_rf_clf, loaded_data)

Well-level Results: 66.96% Accuracy | 65.65% F1 Score
Compound-level Results: 85.00% Accuracy | 84.00% F1 Score
Well-level Results: 40.68% Accuracy | 38.74% F1 Score
Compound-level Results: 61.11% Accuracy | 58.67% F1 Score
Well-level Results: 50.50% Accuracy | 54.21% F1 Score
Compound-level Results: 55.56% Accuracy | 51.50% F1 Score
Well-level Results: 47.41% Accuracy | 44.79% F1 Score
Compound-level Results: 50.00% Accuracy | 41.05% F1 Score
Well-level Results: 42.86% Accuracy | 40.27% F1 Score
Compound-level Results: 36.84% Accuracy | 32.86% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 49.68% Accuracy (+/- 9.29%) | 48.73% F1 Score (+/- 10.04%) 
Compound-Level: 57.70% Accuracy (+/- 15.85%) | 53.61% F1 Score (+/- 17.56%) 


## Apply Feature Selection:

In [40]:
# Number of leading rows of the feature-selection table to keep
n_feats = 150
# Positional slice of the 'features' column, converted to a plain list
top_feats = cv_results['features'].iloc[:n_feats].tolist()
# Evaluate the tuned forest on cv_data, passing the selected feature names to the helper
cv_fold_eval(hyp_rf_clf, cv_data, top_feats)

Well-level Results: 58.02% Accuracy | 55.10% F1 Score
Compound-level Results: 75.00% Accuracy | 68.33% F1 Score
Well-level Results: 38.69% Accuracy | 37.48% F1 Score
Compound-level Results: 47.37% Accuracy | 43.86% F1 Score
Well-level Results: 46.02% Accuracy | 47.81% F1 Score
Compound-level Results: 47.37% Accuracy | 46.02% F1 Score
Well-level Results: 49.21% Accuracy | 46.55% F1 Score
Compound-level Results: 68.42% Accuracy | 60.50% F1 Score
Well-level Results: 34.38% Accuracy | 29.66% F1 Score
Compound-level Results: 36.84% Accuracy | 32.50% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 45.26% Accuracy (+/- 8.25%) | 43.32% F1 Score (+/- 8.83%) 
Compound-Level: 55.00% Accuracy (+/- 14.33%) | 50.24% F1 Score (+/- 12.70%) 


# XGBoost:

## Baseline:

In [9]:
# Baseline XGBoost classifier with library defaults.
# NOTE(review): no random_state is passed here, unlike the seeded RF/SVM models - confirm this is intended.
xgb_model = XGBClassifier()

### Minmax and no FS:

In [10]:
# Evaluate the baseline XGBoost model on cv_data (built with norm=None).
cv_fold_eval(xgb_model, cv_data)

Well-level Results: 55.73% Accuracy | 53.03% F1 Score
Compound-level Results: 60.00% Accuracy | 48.90% F1 Score
Well-level Results: 36.50% Accuracy | 32.90% F1 Score
Compound-level Results: 42.11% Accuracy | 32.67% F1 Score
Well-level Results: 46.02% Accuracy | 47.07% F1 Score
Compound-level Results: 52.63% Accuracy | 47.17% F1 Score
Well-level Results: 44.44% Accuracy | 42.82% F1 Score
Compound-level Results: 68.42% Accuracy | 60.38% F1 Score
Well-level Results: 34.38% Accuracy | 32.43% F1 Score
Compound-level Results: 42.11% Accuracy | 35.71% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 43.41% Accuracy (+/- 7.60%) | 41.65% F1 Score (+/- 8.02%) 
Compound-Level: 53.05% Accuracy (+/- 10.24%) | 44.97% F1 Score (+/- 9.95%) 


## Random Search:

In [32]:
# XGBoost random-search space; keys are XGBClassifier argument names.
# The np.arange ranges exclude their stop value.
rs_params = dict(
    max_depth=[3, 5, 6, 10, 15, 20],
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    subsample=np.arange(0.5, 1.0, 0.1),
    colsample_bytree=np.arange(0.4, 1.0, 0.1),
    colsample_bylevel=np.arange(0.4, 1.0, 0.1),
    n_estimators=np.arange(50, 1000, 200),
)

In [12]:
# Display the search space currently bound to rs_params.
rs_params

{'max_depth': [3, 5, 6, 10, 15, 20],
 'learning_rate': [0.01, 0.1, 0.2, 0.3],
 'subsample': array([0.5, 0.6, 0.7, 0.8, 0.9]),
 'colsample_bytree': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
 'colsample_bylevel': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
 'n_estimators': array([ 50, 250, 450, 650, 850])}

In [34]:
# Run an independent random search on the training split of every fold and print
# the winner; nothing is stored, so the chosen settings must be copied by hand.
# rs_params is whatever search space was bound most recently (the name is reused later for the SVM).
for cv in range(cv_splits):
    # Define model and search:
    rand_search_XGB = XGBClassifier(random_state=rand_seed) 
    
    # Sample 50 parameter combinations (n_iter). No n_jobs is passed to the search,
    # and no cv argument, so it uses its default internal splitting.
    xgb_random = RandomizedSearchCV(estimator=rand_search_XGB, param_distributions=rs_params, 
                               n_iter=50, verbose=1, random_state=rand_seed)
    
    # Fit Random Search Model:
    xgb_random.fit(loaded_data[cv]['X_train'], loaded_data[cv]['y_train'])
    
    print("Best CV Accuracy: ", xgb_random.best_score_)
    print("Best Parameters: ", xgb_random.best_params_)
    print("--------------------------------------------")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Accuracy:  0.652805049088359
Best Parameters:  {'subsample': 0.6, 'n_estimators': 650, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.7}
--------------------------------------------
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Accuracy:  0.6415671285236503
Best Parameters:  {'subsample': 0.6, 'n_estimators': 50, 'max_depth': 15, 'learning_rate': 0.1, 'colsample_bytree': 0.6, 'colsample_bylevel': 0.5}
--------------------------------------------
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Accuracy:  0.68
Best Parameters:  {'subsample': 0.5, 'n_estimators': 450, 'max_depth': 15, 'learning_rate': 0.1, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.4}
--------------------------------------------
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Accuracy:  0.6260869565217392
Best Parameters:  {'subsample': 0.5, '

### Re-Fit Model with Hyp. Tuned Params:

In [6]:
# XGBoost model with hand-entered "tuned" hyperparameters (rebinds xgb_model).
# NOTE(review): n_estimators=400 and learning_rate=0.05 are not values in the displayed
# rs_params grid, and no random_state is passed although subsampling is enabled -
# confirm where these settings came from.
xgb_model = XGBClassifier(n_estimators = 400, subsample=0.5, colsample_bytree=0.8, learning_rate = 0.05, 
                          max_depth=15, colsample_bylevel=0.4)

In [36]:
# Evaluate the current xgb_model on cv_data (built with norm=None).
cv_fold_eval(xgb_model, cv_data)

Well-level Results: 56.49% Accuracy | 54.62% F1 Score
Compound-level Results: 70.00% Accuracy | 56.90% F1 Score
Well-level Results: 40.15% Accuracy | 36.12% F1 Score
Compound-level Results: 42.11% Accuracy | 32.11% F1 Score
Well-level Results: 46.02% Accuracy | 45.59% F1 Score
Compound-level Results: 47.37% Accuracy | 46.67% F1 Score
Well-level Results: 41.27% Accuracy | 38.66% F1 Score
Compound-level Results: 63.16% Accuracy | 56.00% F1 Score
Well-level Results: 36.72% Accuracy | 36.75% F1 Score
Compound-level Results: 42.11% Accuracy | 44.00% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 44.13% Accuracy (+/- 6.86%) | 42.35% F1 Score (+/- 7.00%) 
Compound-Level: 52.95% Accuracy (+/- 11.50%) | 47.14% F1 Score (+/- 9.05%) 


### MADS + Shap:

In [7]:
# Evaluate the current xgb_model on loaded_data and keep the helper's return value;
# a later cell reads its 'pred_arr', 'act_arr' and 'proba_arr' entries.
fold_dict = cv_fold_eval(xgb_model, loaded_data)

Well-level Results: 65.22% Accuracy | 63.95% F1 Score
Compound-level Results: 85.00% Accuracy | 84.00% F1 Score
Well-level Results: 45.76% Accuracy | 44.20% F1 Score
Compound-level Results: 55.56% Accuracy | 47.05% F1 Score
Well-level Results: 49.50% Accuracy | 53.51% F1 Score
Compound-level Results: 77.78% Accuracy | 80.00% F1 Score
Well-level Results: 48.28% Accuracy | 46.76% F1 Score
Compound-level Results: 61.11% Accuracy | 54.67% F1 Score
Well-level Results: 44.44% Accuracy | 40.12% F1 Score
Compound-level Results: 47.37% Accuracy | 39.71% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 50.64% Accuracy (+/- 7.50%) | 49.71% F1 Score (+/- 8.35%) 
Compound-Level: 65.36% Accuracy (+/- 13.98%) | 61.09% F1 Score (+/- 17.76%) 


- 96 compounds:

In [9]:
# Same call as the previous evaluation cell; it overwrites fold_dict.
# NOTE(review): the code is identical to the cell above but the output differs, so
# loaded_data must have been changed by hand in between - confirm which data this is.
fold_dict = cv_fold_eval(xgb_model, loaded_data)

Well-level Results: 61.83% Accuracy | 61.36% F1 Score
Compound-level Results: 80.00% Accuracy | 76.33% F1 Score
Well-level Results: 45.99% Accuracy | 40.61% F1 Score
Compound-level Results: 52.63% Accuracy | 50.33% F1 Score
Well-level Results: 51.33% Accuracy | 52.32% F1 Score
Compound-level Results: 63.16% Accuracy | 54.88% F1 Score
Well-level Results: 49.21% Accuracy | 47.18% F1 Score
Compound-level Results: 57.89% Accuracy | 54.57% F1 Score
Well-level Results: 41.41% Accuracy | 37.70% F1 Score
Compound-level Results: 42.11% Accuracy | 33.90% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 49.95% Accuracy (+/- 6.82%) | 47.83% F1 Score (+/- 8.47%) 
Compound-Level: 59.16% Accuracy (+/- 12.53%) | 54.00% F1 Score (+/- 13.54%) 


- The statistics above are crude: each is the mean of the per-fold scores. The metrics below are more precise: the predictions from all test folds are pooled first, and each metric is then computed once on the pooled predictions.

In [25]:
# Pool the results returned by cv_fold_eval before scoring.
# The labels are coerced to arrays so the element-wise comparison used for AUPR
# below stays valid even when they arrive as plain Python lists (list == int
# would collapse to a single False).
pred_arr = np.asarray(fold_dict['pred_arr'])
act_arr = np.asarray(fold_dict['act_arr'])
# Stack the per-fold probability blocks row-wise into one (samples x classes) array.
proba_arr = np.vstack(fold_dict['proba_arr'])

# Calculate performance metrics on the pooled predictions (macro-averaged over classes):
acc = accuracy_score(act_arr, pred_arr)
f1 = f1_score(act_arr, pred_arr, average='macro')
precision = precision_score(act_arr, pred_arr, average='macro', zero_division=0)
recall = recall_score(act_arr, pred_arr, average='macro')
roc_auc = roc_auc_score(act_arr, proba_arr, average='macro', multi_class='ovr')

# Calculate AUPR for each class (one-vs-rest), then average over the classes.
# Column i of proba_arr is scored against label i - assumes labels are 0..K-1 in
# column order; TODO confirm against cv_fold_eval.
aupr_scores = [average_precision_score(act_arr == class_index, proba_arr[:, class_index])
               for class_index in range(proba_arr.shape[1])]
mean_aupr = np.mean(aupr_scores)

# Print metrics:
print('Accuracy: {:.2f}%'.format(acc*100))
print('F1 Score: {:.2f}%'.format(f1*100))
print('Precision: {:.2f}%'.format(precision * 100))
print('Recall: {:.2f}%'.format(recall * 100))
print('ROC AUC: {:.2f}%'.format(roc_auc * 100))
print('AUPR: {:.2f}%'.format(mean_aupr * 100))

Accuracy: 65.59%
F1 Score: 63.99%
Precision: 66.12%
Recall: 64.32%
ROC AUC: 89.02%
AUPR: 68.61%


### MADS + Pycy:

In [6]:
# Evaluate the current xgb_model on whatever loaded_data currently holds.
# NOTE(review): no visible cell loads pycy_spher into loaded_data - the result
# depends on a manual reload before this cell ran.
cv_fold_eval(xgb_model, loaded_data)

Well-level Results: 67.83% Accuracy | 67.63% F1 Score
Compound-level Results: 85.00% Accuracy | 84.67% F1 Score
Well-level Results: 44.07% Accuracy | 42.60% F1 Score
Compound-level Results: 55.56% Accuracy | 53.00% F1 Score
Well-level Results: 54.46% Accuracy | 57.03% F1 Score
Compound-level Results: 66.67% Accuracy | 70.55% F1 Score
Well-level Results: 45.69% Accuracy | 45.23% F1 Score
Compound-level Results: 66.67% Accuracy | 60.05% F1 Score
Well-level Results: 38.89% Accuracy | 33.37% F1 Score
Compound-level Results: 47.37% Accuracy | 35.78% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 50.19% Accuracy (+/- 10.15%) | 49.17% F1 Score (+/- 11.92%) 
Compound-Level: 64.25% Accuracy (+/- 12.67%) | 60.81% F1 Score (+/- 16.44%) 


### MADH + Shap:

In [17]:
# Rebinds xgb_model with a second set of hand-entered hyperparameters; every later
# cell that uses xgb_model sees these values until it is rebound again.
# No random_state is passed although subsampling is enabled.
xgb_model = XGBClassifier(n_estimators = 650, subsample=0.7, colsample_bytree=0.8, 
                          learning_rate = 0.1, max_depth=15, colsample_bylevel=0.7)

In [10]:
# Evaluate the current xgb_model on whatever loaded_data currently holds.
# NOTE(review): no visible cell loads shap_harm into loaded_data - the result
# depends on a manual reload before this cell ran.
cv_fold_eval(xgb_model, loaded_data)

Well-level Results: 68.70% Accuracy | 68.36% F1 Score
Compound-level Results: 75.00% Accuracy | 71.33% F1 Score
Well-level Results: 40.68% Accuracy | 37.69% F1 Score
Compound-level Results: 50.00% Accuracy | 43.00% F1 Score
Well-level Results: 50.50% Accuracy | 51.91% F1 Score
Compound-level Results: 50.00% Accuracy | 50.00% F1 Score
Well-level Results: 48.28% Accuracy | 45.43% F1 Score
Compound-level Results: 66.67% Accuracy | 59.00% F1 Score
Well-level Results: 37.30% Accuracy | 31.54% F1 Score
Compound-level Results: 36.84% Accuracy | 30.02% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 49.09% Accuracy (+/- 10.92%) | 46.99% F1 Score (+/- 12.72%) 
Compound-Level: 55.70% Accuracy (+/- 13.52%) | 50.67% F1 Score (+/- 14.01%) 


### MADH + Pycy:

In [18]:
# Evaluate the current xgb_model on whatever loaded_data currently holds.
# NOTE(review): no visible cell loads pycy_harm into loaded_data - the result
# depends on a manual reload before this cell ran.
cv_fold_eval(xgb_model, loaded_data)

Well-level Results: 60.87% Accuracy | 59.27% F1 Score
Compound-level Results: 80.00% Accuracy | 76.67% F1 Score
Well-level Results: 38.14% Accuracy | 36.98% F1 Score
Compound-level Results: 55.56% Accuracy | 53.67% F1 Score
Well-level Results: 51.49% Accuracy | 54.58% F1 Score
Compound-level Results: 72.22% Accuracy | 74.29% F1 Score
Well-level Results: 43.97% Accuracy | 45.34% F1 Score
Compound-level Results: 55.56% Accuracy | 51.67% F1 Score
Well-level Results: 38.10% Accuracy | 35.42% F1 Score
Compound-level Results: 42.11% Accuracy | 39.11% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 46.51% Accuracy (+/- 8.70%) | 46.32% F1 Score (+/- 9.41%) 
Compound-Level: 61.09% Accuracy (+/- 13.44%) | 59.08% F1 Score (+/- 14.31%) 


# SVM:

## Baseline:

In [11]:
# Baseline SVM: default kernel and regularisation, seeded, with probability
# estimates enabled (probability=True).
svm_clf = svm.SVC(random_state = rand_seed, probability=True)

In [12]:
# Evaluate the baseline SVM on cv_data (built with norm=None).
cv_fold_eval(svm_clf, cv_data)

Well-level Results: 24.43% Accuracy | 14.74% F1 Score
Compound-level Results: 50.00% Accuracy | 43.67% F1 Score
Well-level Results: 24.09% Accuracy | 14.20% F1 Score
Compound-level Results: 36.84% Accuracy | 20.44% F1 Score
Well-level Results: 23.01% Accuracy | 22.83% F1 Score
Compound-level Results: 42.11% Accuracy | 39.52% F1 Score
Well-level Results: 12.70% Accuracy | 7.02% F1 Score
Compound-level Results: 15.79% Accuracy | 14.44% F1 Score
Well-level Results: 16.41% Accuracy | 10.56% F1 Score
Compound-level Results: 21.05% Accuracy | 16.39% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 20.13% Accuracy (+/- 4.72%) | 13.87% F1 Score (+/- 5.27%) 
Compound-Level: 33.16% Accuracy (+/- 12.85%) | 26.89% F1 Score (+/- 12.23%) 


## Random Search:

In [9]:
# SVM random-search space; keys are SVC argument names. Rebinds rs_params.
rs_params = dict(
    C=[0.1, 0.2, 0.5, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['poly', 'rbf', 'linear'],
)

In [10]:
# Run an independent random search on the training split of every fold and print
# the winner; nothing is stored, so the chosen settings must be copied by hand.
# rs_params must be the SVM search space when this cell runs (the name is shared
# with the XGBoost search).
for cv in range(cv_splits):
    # Define model and search. A dedicated name keeps the loop from rebinding the
    # notebook-level `svm_clf`, mirroring rand_search_rf / rand_search_XGB in the
    # other search cells.
    rand_search_svm = svm.SVC(random_state = rand_seed, probability=True)

    # Sample 50 parameter combinations (n_iter) and fit them on all available cores.
    # n_jobs only parallelises the fits; it does not change which candidates are drawn.
    svm_random = RandomizedSearchCV(estimator=rand_search_svm, param_distributions=rs_params,
                               n_iter=50, verbose=1, random_state=rand_seed, n_jobs=-1)

    # Fit Random Search Model on this fold's training split:
    svm_random.fit(loaded_data[cv]['X_train'], loaded_data[cv]['y_train'])

    print("Best CV Accuracy: ", svm_random.best_score_)
    print("Best Parameters: ", svm_random.best_params_)
    print("--------------------------------------------")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Accuracy:  0.5942964001870032
Best Parameters:  {'kernel': 'rbf', 'gamma': 0.001, 'C': 100}
--------------------------------------------
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Accuracy:  0.5780936454849498
Best Parameters:  {'kernel': 'rbf', 'gamma': 0.001, 'C': 100}
--------------------------------------------
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Accuracy:  0.5810526315789474
Best Parameters:  {'kernel': 'rbf', 'gamma': 0.001, 'C': 100}
--------------------------------------------
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Accuracy:  0.5826086956521739
Best Parameters:  {'kernel': 'rbf', 'gamma': 0.001, 'C': 100}
--------------------------------------------
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV Accuracy:  0.6666666666666667
Best Parameters:  {'kernel': 'rbf', 'gamma': 0.001, 'C': 100}
----------

In [11]:
# Rebuild the SVM with the parameters every fold of the random search reported as
# best (rbf kernel, gamma=0.001, C=100), then evaluate it on loaded_data.
svm_clf = svm.SVC(kernel = 'rbf', gamma = 0.001, C = 100, 
                  random_state = rand_seed, probability=True)
cv_fold_eval(svm_clf, loaded_data)

Well-level Results: 57.39% Accuracy | 55.64% F1 Score
Compound-level Results: 80.00% Accuracy | 76.00% F1 Score
Well-level Results: 33.90% Accuracy | 29.66% F1 Score
Compound-level Results: 44.44% Accuracy | 35.00% F1 Score
Well-level Results: 55.45% Accuracy | 55.57% F1 Score
Compound-level Results: 72.22% Accuracy | 67.33% F1 Score
Well-level Results: 40.52% Accuracy | 41.60% F1 Score
Compound-level Results: 50.00% Accuracy | 45.86% F1 Score
Well-level Results: 32.54% Accuracy | 29.74% F1 Score
Compound-level Results: 36.84% Accuracy | 31.86% F1 Score
---------- Cross-validated Mean Metrics ----------
Well-Level: 43.96% Accuracy (+/- 10.54%) | 42.44% F1 Score (+/- 11.59%) 
Compound-Level: 56.70% Accuracy (+/- 16.57%) | 51.21% F1 Score (+/- 17.55%) 


# Training Full XGBoost Model:

In [15]:
# Load full spherized data with Shapley FS:
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('output/MAD_Sphere/MADS_FULL_ShapFS.pkl', 'rb') as file:
    full_sph_data = pickle.load(file)

In [16]:
# Rebinds xgb_model with the first tuned configuration (n_estimators=400,
# learning_rate=0.05) for the final fit. No random_state is passed although
# subsampling is enabled.
xgb_model = XGBClassifier(n_estimators = 400, subsample=0.5, colsample_bytree=0.8, learning_rate = 0.05, 
                          max_depth=15, colsample_bylevel=0.4)

In [19]:
# Fit Model on full data:
xgb_model.fit(full_sph_data['sph_X_data'], full_sph_data['sph_y_data'])

In [21]:
# Persist the fitted model with pickle. open() does not create missing directories,
# so output/XGB/ must already exist.
with open("output/XGB/xgb_full_model.pkl", "wb") as model_file:
    pickle.dump(xgb_model, model_file)