In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression, ElasticNet, SGDClassifier, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, cross_validate, cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


In [2]:
pheno = pd.read_csv("phen.csv", index_col=0)
# pheno.drop(["Unnamed: 0"], axis = 1, inplace=True) # I have no idea why that column even exists...
# pheno = pd.DataFrame(pheno) # why this isn't already in a data frame I don't understand...

geno = pd.read_csv("geno_mat.csv", index_col=0)
cci = pd.read_csv("cci_mat.csv", index_col=0)

In [3]:
seed = 0
X_train_geno, X_test_geno, y_train_geno, y_test_geno = train_test_split(geno, pheno, test_size=0.20, random_state=seed)
X_train_cci, X_test_cci, y_train_cci, y_test_cci = train_test_split(cci, pheno, test_size=0.20, random_state=seed)


# 7. Random Forest

## On Genotype Data

In [12]:
rf_pipeline = Pipeline(steps = [("pca", PCA(n_components=10)),
                                ("rf", RandomForestClassifier())])

In [9]:
rf_scores = cross_validate(rf_pipeline, geno, pheno["label"], cv=5, scoring=["accuracy",
                                                                             "roc_auc",
                                                                             "f1",
                                                                             "balanced_accuracy"])
rf_scores


{'fit_time': array([3.74657202, 4.03531289, 4.00947905, 4.12524533, 3.96632218]),
 'score_time': array([0.08276987, 0.05174184, 0.0525949 , 0.06188798, 0.06103778]),
 'test_accuracy': array([0.73584906, 0.67924528, 0.69811321, 0.72380952, 0.74285714]),
 'test_roc_auc': array([0.68289474, 0.73991228, 0.73048246, 0.73711111, 0.80555556]),
 'test_f1': array([0.8313253 , 0.79761905, 0.80487805, 0.82208589, 0.84210526]),
 'test_balanced_accuracy': array([0.60394737, 0.52412281, 0.56754386, 0.59666667, 0.58      ])}

In [11]:
print("ACC")
print("mean =", np.mean(rf_scores["test_accuracy"]))
print("std =", np.std(rf_scores["test_accuracy"]), "\n")
print("AUC")
print("mean =", np.mean(rf_scores["test_roc_auc"]))
print("std =", np.std(rf_scores["test_roc_auc"]), "\n")
print("F1")
print("mean =", np.mean(rf_scores["test_f1"]))
print("std =", np.std(rf_scores["test_f1"]), "\n")
print("BAS")
print("mean =", np.mean(rf_scores["test_balanced_accuracy"]))
print("std =", np.std(rf_scores["test_balanced_accuracy"]), "\n")

ACC
mean = 0.7159748427672956
std = 0.023862620278679646 

AUC
mean = 0.7391912280701755
std = 0.03912597193467675 

F1
mean = 0.8196027100665603
std = 0.016431964481874384 

BAS
mean = 0.5744561403508772
std = 0.028195738901702662 



In [6]:
grid_search_rf = GridSearchCV(estimator=rf_pipeline,
                              param_grid={"rf__n_estimators": [100, 150, 200, 250, 300],
                                          "rf__criterion": ["gini", "entropy", "log_loss"],
                                          "rf__max_depth": [None, 2, 4, 6, 8, 10]},
                              scoring="balanced_accuracy", # scoring can be "accuracy", "roc_auc", "f1"
                              return_train_score=True, # or else you'll never see it!
                              cv=KFold(n_splits=5, shuffle=True, random_state=0),
                              verbose = 3) # to get a consistent set of settings


In [7]:
grid_search_rf.fit(geno, pheno["label"])
grid_search_rf.best_params_
# PCA = 30

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.558) total time=   5.8s
[CV 2/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.542) total time=   6.3s
[CV 3/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.620) total time=   6.4s
[CV 4/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.527) total time=   5.9s
[CV 5/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.664) total time=   6.0s
[CV 1/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=150;, score=(train=1.000, test=0.544) total time=   6.1s
[CV 2/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=150;, score=(train=1.000, test=0.591) total time=   6.3s
[CV 3/5] END rf__criterion=gini, rf__max_depth=No

[CV 1/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.729, test=0.524) total time=   4.6s
[CV 2/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.662, test=0.544) total time=   4.2s
[CV 3/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.693, test=0.554) total time=   6.2s
[CV 4/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.733, test=0.542) total time=   6.0s
[CV 5/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.716, test=0.595) total time=   6.2s
[CV 1/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=300;, score=(train=0.733, test=0.524) total time=   5.9s
[CV 2/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=300;, score=(train=0.671, test=0.531) total time=   5.9s
[CV 3/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=300;, score=(train=0.697, test=0.568) total time=   6.1s
[CV 4/5]

[CV 3/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=150;, score=(train=0.996, test=0.593) total time=   4.2s
[CV 4/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=150;, score=(train=0.992, test=0.549) total time=   4.4s
[CV 5/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=150;, score=(train=1.000, test=0.649) total time=   4.3s
[CV 1/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=1.000, test=0.542) total time=   4.5s
[CV 2/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=1.000, test=0.573) total time=   4.5s
[CV 3/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=0.996, test=0.593) total time=   4.8s
[CV 4/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=1.000, test=0.579) total time=   4.4s
[CV 5/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=1.000, test=0.650) total time=   4.5s


[CV 3/5] END rf__criterion=entropy, rf__max_depth=2, rf__n_estimators=300;, score=(train=0.553, test=0.514) total time=   4.4s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=2, rf__n_estimators=300;, score=(train=0.553, test=0.500) total time=   4.2s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=2, rf__n_estimators=300;, score=(train=0.540, test=0.560) total time=   3.4s
[CV 1/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.729, test=0.524) total time=   3.5s
[CV 2/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.667, test=0.526) total time=   4.2s
[CV 3/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.686, test=0.520) total time=   4.1s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.737, test=0.554) total time=   4.3s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.706, test=0.621) tot

[CV 3/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=200;, score=(train=0.934, test=0.546) total time=   4.6s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=200;, score=(train=0.970, test=0.537) total time=   4.3s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=200;, score=(train=0.988, test=0.690) total time=   4.7s
[CV 1/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.992, test=0.520) total time=   4.6s
[CV 2/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.955, test=0.593) total time=   4.5s
[CV 3/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.938, test=0.573) total time=   4.5s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.962, test=0.555) total time=   4.5s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.988, test=0.623) tot

[CV 2/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.518, test=0.500) total time=   4.3s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.553, test=0.514) total time=   4.0s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.560, test=0.500) total time=   4.1s
[CV 5/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.544, test=0.574) total time=   4.1s
[CV 1/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.581, test=0.516) total time=   4.3s
[CV 2/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.532, test=0.500) total time=   4.4s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.553, test=0.514) total time=   4.3s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.556, test=0.

[CV 2/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.815, test=0.542) total time=   4.5s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.796, test=0.553) total time=   4.8s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.865, test=0.554) total time=   4.1s
[CV 5/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.880, test=0.629) total time=   4.4s
[CV 1/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.886, test=0.518) total time=   4.3s
[CV 2/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.802, test=0.549) total time=   4.4s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.796, test=0.553) total time=   5.7s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.880, test=0.

{'rf__criterion': 'entropy', 'rf__max_depth': None, 'rf__n_estimators': 250}

## On CCI Data

In [25]:
rf_pipeline2 = Pipeline(steps = [("scale", StandardScaler()),
                                 ("pca", PCA(n_components=20)),
                                 ("rf", RandomForestClassifier())])

In [23]:
rf_scores2 = cross_validate(rf_pipeline2, cci, pheno["label"], cv=5, scoring=["accuracy",
                                                                             "roc_auc",
                                                                             "f1",
                                                                             "balanced_accuracy"])
rf_scores2


{'fit_time': array([6.03069782, 6.00560308, 6.98185396, 7.67937994, 6.78451896]),
 'score_time': array([0.04142594, 0.01317692, 0.02076125, 0.01310611, 0.01388001]),
 'test_accuracy': array([0.74528302, 0.77358491, 0.70754717, 0.75238095, 0.64761905]),
 'test_roc_auc': array([0.70131579, 0.81600877, 0.64692982, 0.69      , 0.55355556]),
 'test_f1': array([0.84393064, 0.85714286, 0.81871345, 0.84883721, 0.77844311]),
 'test_balanced_accuracy': array([0.58026316, 0.64035088, 0.54385965, 0.58666667, 0.48333333])}

In [24]:
print("ACC")
print("mean =", np.mean(rf_scores2["test_accuracy"]))
print("std =", np.std(rf_scores2["test_accuracy"]), "\n")
print("AUC")
print("mean =", np.mean(rf_scores2["test_roc_auc"]))
print("std =", np.std(rf_scores2["test_roc_auc"]), "\n")
print("F1")
print("mean =", np.mean(rf_scores2["test_f1"]))
print("std =", np.std(rf_scores2["test_f1"]), "\n")
print("BAS")
print("mean =", np.mean(rf_scores2["test_balanced_accuracy"]))
print("std =", np.std(rf_scores2["test_balanced_accuracy"]), "\n")

ACC
mean = 0.7252830188679245
std = 0.04430264786735082 

AUC
mean = 0.6815619883040935
std = 0.08499692220291108 

F1
mean = 0.829413453269637
std = 0.028528639292752872 

BAS
mean = 0.5668947368421053
std = 0.05192026939118701 



In [20]:
grid_search_rf2 = GridSearchCV(estimator=rf_pipeline2,
                              param_grid={"rf__n_estimators": [100, 150, 200, 250, 300],
                                          "rf__criterion": ["gini", "entropy", "log_loss"],
                                          "rf__max_depth": [None, 2, 4, 6, 8, 10]},
                              scoring="balanced_accuracy", # scoring can be "accuracy", "roc_auc", "f1"
                              return_train_score=True, # or else you'll never see it!
                              cv=KFold(n_splits=5, shuffle=True, random_state=0),
                              verbose = 3) # to get a consistent set of settings


In [21]:
grid_search_rf2.fit(cci, pheno["label"])
grid_search_rf2.best_params_
# PCA = 40 or 0

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.567) total time=   7.7s
[CV 2/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.545) total time=   7.9s
[CV 3/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.585) total time=   8.2s
[CV 4/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.607) total time=   8.5s
[CV 5/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.589) total time=   8.1s
[CV 1/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=150;, score=(train=1.000, test=0.558) total time=   8.5s
[CV 2/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=150;, score=(train=1.000, test=0.532) total time=   8.2s
[CV 3/5] END rf__criterion=gini, rf__max_depth=No

[CV 1/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.661, test=0.500) total time=   8.5s
[CV 2/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.635, test=0.529) total time=   5.9s
[CV 3/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.650, test=0.527) total time=   8.5s
[CV 4/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.688, test=0.542) total time=   7.2s
[CV 5/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.648, test=0.540) total time=   8.1s
[CV 1/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=300;, score=(train=0.657, test=0.509) total time=   7.2s
[CV 2/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=300;, score=(train=0.640, test=0.536) total time=   6.9s
[CV 3/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=300;, score=(train=0.624, test=0.514) total time=   7.8s
[CV 4/5]

[CV 3/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=150;, score=(train=0.973, test=0.580) total time=   8.8s
[CV 4/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=150;, score=(train=0.974, test=0.543) total time=   8.1s
[CV 5/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=150;, score=(train=0.980, test=0.621) total time=   7.6s
[CV 1/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=0.996, test=0.542) total time=   8.3s
[CV 2/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=0.977, test=0.547) total time=   8.9s
[CV 3/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=0.965, test=0.573) total time=   7.9s
[CV 4/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=0.977, test=0.543) total time=   8.1s
[CV 5/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=0.980, test=0.601) total time=   8.2s


[CV 3/5] END rf__criterion=entropy, rf__max_depth=2, rf__n_estimators=300;, score=(train=0.513, test=0.500) total time=   8.1s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=2, rf__n_estimators=300;, score=(train=0.523, test=0.559) total time=   7.6s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=2, rf__n_estimators=300;, score=(train=0.524, test=0.500) total time=   8.4s
[CV 1/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.661, test=0.509) total time=   7.6s
[CV 2/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.617, test=0.516) total time=   8.0s
[CV 3/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.606, test=0.514) total time=   8.6s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.688, test=0.571) total time=   8.0s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.608, test=0.494) tot

[CV 3/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=200;, score=(train=0.858, test=0.568) total time=   8.2s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=200;, score=(train=0.910, test=0.566) total time=   8.6s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=200;, score=(train=0.900, test=0.535) total time=   8.0s
[CV 1/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.928, test=0.542) total time=   7.9s
[CV 2/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.905, test=0.521) total time=   8.6s
[CV 3/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.889, test=0.568) total time=   8.7s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.902, test=0.560) total time=   8.0s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.916, test=0.581) tot

[CV 2/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.514, test=0.500) total time=   7.1s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.509, test=0.500) total time=   8.1s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.519, test=0.559) total time=   7.7s
[CV 5/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.524, test=0.500) total time=   8.0s
[CV 1/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.530, test=0.500) total time=   8.3s
[CV 2/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.523, test=0.526) total time=   8.1s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.513, test=0.500) total time=   8.3s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.534, test=0.

[CV 2/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.766, test=0.549) total time=   7.3s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.739, test=0.554) total time=   8.5s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.805, test=0.566) total time=   8.5s
[CV 5/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.756, test=0.568) total time=   8.0s
[CV 1/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.788, test=0.533) total time=   8.4s
[CV 2/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.766, test=0.529) total time=   7.6s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.739, test=0.554) total time=   7.5s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.801, test=0.

{'rf__criterion': 'log_loss', 'rf__max_depth': None, 'rf__n_estimators': 100}

## Combined Data

In [4]:
std_scaler = StandardScaler()
pca10 = PCA(n_components=10)
pca20 = PCA(n_components=20)

### Best genotype data

In [5]:
# https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/

# transform data
# Genotype data was unscaled, PCA=10
rf_geno_pca = pca10.fit_transform(geno)
# type(geno) # pandas.core.frame.DataFrame
# type(rf_geno_pca) # numpy.ndarray
# rf_geno_pca.shape
rf_geno_pca_df = pd.DataFrame(rf_geno_pca, columns = ["geno.pca"+str(i+1) for i in range(10)])
rf_geno_pca_df

Unnamed: 0,geno.pca1,geno.pca2,geno.pca3,geno.pca4,geno.pca5,geno.pca6,geno.pca7,geno.pca8,geno.pca9,geno.pca10
0,7.213660,-16.498697,20.561995,8.542861,-18.410131,10.980970,-15.963006,0.958766,5.873796,14.788486
1,-9.989606,-16.825381,-4.357537,-7.872389,6.628885,-10.489426,-16.247969,9.861310,11.842599,-4.627810
2,-13.574061,1.205449,9.556019,14.077944,12.588271,0.332373,-0.378183,2.353160,-9.380940,-0.801807
3,16.818733,-9.220524,1.820089,-7.939193,0.339345,9.024804,2.199721,6.947666,7.619595,-8.736853
4,-5.896401,7.977406,-8.046678,-1.702258,-9.108070,-4.412113,-10.273039,-21.691902,-4.239996,-5.224723
...,...,...,...,...,...,...,...,...,...,...
523,-5.572941,19.278258,6.076185,0.622269,-1.116994,-3.237505,-7.534145,12.118379,-0.964740,3.531347
524,6.138554,-13.706802,21.274312,-1.682970,-0.572887,3.296269,-14.606322,-12.349279,6.504849,-9.649070
525,-30.172745,16.922978,-4.792928,4.912417,-8.512725,-9.823840,-2.041621,-0.393869,0.265076,23.155769
526,-13.608036,-8.462685,-4.777022,14.674497,12.723746,0.871732,-4.112552,-7.505947,-4.410521,1.718525


### Best CCI data

In [6]:
# https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/

# transform data
# CCI data was Standard Scaled, then PCA=20
rf_cci = std_scaler.fit_transform(cci)
# type(geno) # pandas.core.frame.DataFrame
# type(rf_geno_pca) # numpy.ndarray
# rf_geno_pca.shape
rf_cci_pca = pca20.fit_transform(rf_cci)
rf_cci_pca_df = pd.DataFrame(rf_cci_pca, columns = ["cci.pca"+str(i+1) for i in range(20)])
rf_cci_pca_df

Unnamed: 0,cci.pca1,cci.pca2,cci.pca3,cci.pca4,cci.pca5,cci.pca6,cci.pca7,cci.pca8,cci.pca9,cci.pca10,cci.pca11,cci.pca12,cci.pca13,cci.pca14,cci.pca15,cci.pca16,cci.pca17,cci.pca18,cci.pca19,cci.pca20
0,1.335592,-6.789915,1.048987,-1.736927,-2.123739,1.292711,-1.851911,0.061553,-1.822201,-6.834976,2.026637,1.094008,-2.670209,-1.928115,-0.234137,0.455804,0.647842,1.399029,-0.779771,-1.193530
1,-4.945149,1.009884,0.940003,0.254810,-1.636443,0.257140,2.313925,-4.028949,-1.964814,2.967403,5.537388,2.545521,0.859426,-0.285615,-0.170773,1.465872,-0.090454,-0.839110,-0.275114,-0.635978
2,-26.242413,1.078157,1.332191,-0.695361,-1.484651,1.508914,-0.965964,0.440998,2.504548,-1.254498,-1.894674,-0.059860,-0.327768,1.820282,-2.273893,0.694983,-0.583823,-0.273892,1.667046,0.558325
3,-7.666948,-2.292865,-1.422330,-0.656853,1.458433,-4.491205,3.995904,1.429170,-3.780926,2.488691,-0.942146,-1.323653,-1.058496,-1.734732,-1.627571,2.197975,1.604021,4.335921,-2.969251,-0.484425
4,-6.845544,-0.567426,-9.249071,3.928707,8.536751,-2.767837,-1.159049,1.150097,-2.406042,-1.541368,0.773050,-3.441869,-6.040627,4.747461,4.726089,6.062568,10.320156,-0.475070,-2.617314,1.747974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,-8.889763,-0.681525,-5.958176,2.335113,5.751193,-3.238812,1.909692,-0.873490,-1.739273,2.253832,-12.999722,2.414518,1.019538,-1.432324,-2.579924,1.876714,3.665630,1.260499,3.081466,0.306862
524,-13.502814,1.890974,-6.959819,3.123119,6.831769,-3.557247,0.913614,0.537172,-5.281811,-0.604657,-1.444798,-2.028431,-8.670254,3.234522,-2.027313,-2.855602,-1.532878,-0.550825,3.513442,-1.855339
525,-2.195734,-4.504904,4.617262,-2.560146,-5.044816,1.615630,-0.438655,-2.865449,-2.665402,1.626432,0.684614,0.960808,-0.685020,0.514889,0.986220,-0.249347,0.670405,0.146486,0.048182,0.692700
526,6.566224,-6.620025,1.732992,-2.211693,-2.204531,-1.429578,1.718746,-1.339054,-2.575437,-1.943765,0.705298,5.340048,-2.228554,-2.171471,1.240193,-1.154545,0.229168,0.297460,-0.230405,0.448503


### Put the combined data together (run 7)

In [7]:
cdata7 = pd.concat([rf_geno_pca_df, rf_cci_pca_df], axis=1)
seed = 0
X_train, X_test, y_train, y_test = train_test_split(cdata7, pheno, test_size=0.20, random_state=seed)
cdata7.shape

(528, 30)

In [8]:
rf_pipeline3 = Pipeline(steps = [("rf", RandomForestClassifier(n_estimators=100,
                                                              max_depth=None,
                                                              criterion="gini"))])

In [13]:
grid_search_rf3 = GridSearchCV(estimator=rf_pipeline3,
                              param_grid={"rf__n_estimators": [100, 150, 200, 250, 300],
                                          "rf__criterion": ["gini", "entropy", "log_loss"],
                                          "rf__max_depth": [None, 2, 4, 6, 8, 10]},
                              scoring="balanced_accuracy", # scoring can be "accuracy", "roc_auc", "f1"
                              return_train_score=True, # or else you'll never see it!
                              cv=KFold(n_splits=5, shuffle=True, random_state=0),
                              verbose = 3) # to get a consistent set of settings

grid_search_rf3.fit(cdata7, pheno["label"])
grid_search_rf3.best_params_


Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.596) total time=   0.1s
[CV 2/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.580) total time=   0.1s
[CV 3/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.633) total time=   0.1s
[CV 4/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.672) total time=   0.1s
[CV 5/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=100;, score=(train=1.000, test=0.724) total time=   0.1s
[CV 1/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=150;, score=(train=1.000, test=0.542) total time=   0.2s
[CV 2/5] END rf__criterion=gini, rf__max_depth=None, rf__n_estimators=150;, score=(train=1.000, test=0.615) total time=   0.2s
[CV 3/5] END rf__criterion=gini, rf__max_depth=No

[CV 1/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.682, test=0.531) total time=   0.2s
[CV 2/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.685, test=0.538) total time=   0.2s
[CV 3/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.668, test=0.541) total time=   0.2s
[CV 4/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.733, test=0.583) total time=   0.2s
[CV 5/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=250;, score=(train=0.688, test=0.601) total time=   0.2s
[CV 1/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=300;, score=(train=0.678, test=0.531) total time=   0.2s
[CV 2/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=300;, score=(train=0.694, test=0.538) total time=   0.2s
[CV 3/5] END rf__criterion=gini, rf__max_depth=4, rf__n_estimators=300;, score=(train=0.681, test=0.526) total time=   0.2s
[CV 4/5]

[CV 3/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=150;, score=(train=0.996, test=0.627) total time=   0.2s
[CV 4/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=150;, score=(train=0.996, test=0.624) total time=   0.1s
[CV 5/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=150;, score=(train=0.996, test=0.655) total time=   0.2s
[CV 1/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=1.000, test=0.565) total time=   0.2s
[CV 2/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=0.986, test=0.595) total time=   0.2s
[CV 3/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=1.000, test=0.607) total time=   0.2s
[CV 4/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=0.996, test=0.630) total time=   0.2s
[CV 5/5] END rf__criterion=gini, rf__max_depth=10, rf__n_estimators=200;, score=(train=1.000, test=0.656) total time=   0.2s


[CV 3/5] END rf__criterion=entropy, rf__max_depth=2, rf__n_estimators=300;, score=(train=0.513, test=0.500) total time=   0.2s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=2, rf__n_estimators=300;, score=(train=0.541, test=0.529) total time=   0.2s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=2, rf__n_estimators=300;, score=(train=0.532, test=0.520) total time=   0.2s
[CV 1/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.678, test=0.524) total time=   0.1s
[CV 2/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.653, test=0.513) total time=   0.1s
[CV 3/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.650, test=0.520) total time=   0.1s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.711, test=0.529) total time=   0.1s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=100;, score=(train=0.680, test=0.588) tot

[CV 3/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=200;, score=(train=0.965, test=0.594) total time=   0.2s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=200;, score=(train=0.989, test=0.624) total time=   0.2s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=200;, score=(train=0.972, test=0.647) total time=   0.2s
[CV 1/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.992, test=0.549) total time=   0.3s
[CV 2/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.973, test=0.577) total time=   0.3s
[CV 3/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.969, test=0.580) total time=   0.3s
[CV 4/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.989, test=0.654) total time=   0.3s
[CV 5/5] END rf__criterion=entropy, rf__max_depth=8, rf__n_estimators=250;, score=(train=0.968, test=0.629) tot

[CV 3/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.522, test=0.514) total time=   0.1s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.571, test=0.500) total time=   0.1s
[CV 5/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=100;, score=(train=0.540, test=0.514) total time=   0.1s
[CV 1/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.538, test=0.500) total time=   0.1s
[CV 2/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.500, test=0.500) total time=   0.1s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.522, test=0.500) total time=   0.1s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.534, test=0.500) total time=   0.1s
[CV 5/5] END rf__criterion=log_loss, rf__max_depth=2, rf__n_estimators=150;, score=(train=0.528, test=0.

[CV 3/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.836, test=0.553) total time=   0.3s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.887, test=0.612) total time=   0.3s
[CV 5/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=250;, score=(train=0.856, test=0.601) total time=   0.3s
[CV 1/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.903, test=0.549) total time=   0.3s
[CV 2/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.847, test=0.538) total time=   0.3s
[CV 3/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.867, test=0.587) total time=   0.3s
[CV 4/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.902, test=0.589) total time=   0.3s
[CV 5/5] END rf__criterion=log_loss, rf__max_depth=6, rf__n_estimators=300;, score=(train=0.864, test=0.

{'rf__criterion': 'gini', 'rf__max_depth': None, 'rf__n_estimators': 100}

### 5-fold CV (run 7)

In [15]:
rf_scores3 = cross_validate(rf_pipeline3, cdata7, pheno["label"], cv=5, scoring=["accuracy",
                                                                                 "roc_auc",
                                                                                 "f1",
                                                                                 "balanced_accuracy"])
rf_scores3

{'fit_time': array([0.12351823, 0.10113406, 0.10100245, 0.09751892, 0.09828496]),
 'score_time': array([0.00794101, 0.00751686, 0.00743294, 0.00736403, 0.00774813]),
 'test_accuracy': array([0.73584906, 0.78301887, 0.73584906, 0.8       , 0.73333333]),
 'test_roc_auc': array([0.75504386, 0.78157895, 0.74824561, 0.77533333, 0.688     ]),
 'test_f1': array([0.8372093 , 0.86390533, 0.83333333, 0.8742515 , 0.83333333]),
 'test_balanced_accuracy': array([0.57368421, 0.64692982, 0.59385965, 0.67      , 0.58333333])}

In [16]:
print("ACC")
print("mean =", np.mean(rf_scores3["test_accuracy"]))
print("std =", np.std(rf_scores3["test_accuracy"]), "\n")
print("AUC")
print("mean =", np.mean(rf_scores3["test_roc_auc"]))
print("std =", np.std(rf_scores3["test_roc_auc"]), "\n")
print("F1")
print("mean =", np.mean(rf_scores3["test_f1"]))
print("std =", np.std(rf_scores3["test_f1"]), "\n")
print("BAS")
print("mean =", np.mean(rf_scores3["test_balanced_accuracy"]))
print("std =", np.std(rf_scores3["test_balanced_accuracy"]), "\n")

ACC
mean = 0.7576100628930817
std = 0.028209773230631768 

AUC
mean = 0.7496403508771929
std = 0.033199034165142575 

F1
mean = 0.8484065582884046
std = 0.017250828402379705 

BAS
mean = 0.613561403508772
std = 0.037923218666198535 



# So, how well/poorly did that go?

In [33]:
y_pred = cross_val_predict(rf_pipeline3, cdata7, pheno["label"], cv=5)

In [34]:
answers = pheno["label"].to_numpy()

In [35]:
true_pos = 0
false_neg = 0
false_pos = 0
true_neg = 0

for i in range(len(answers)):
    # print(i, answers[i], y_pred[i])
    if (answers[i]==1 and y_pred[i]==1):
        # print("this statement should only print once before the next iteration (1)")
        true_pos += 1
    
    elif (answers[i]==1 and y_pred[i]==0):
        # print("this statement should only print once before the next iteration (2)")
        false_neg += 1
    
    elif (answers[i]==0 and y_pred[i]==1):
        # print("this statement should only print once before the next iteration (3)")
        false_pos += 1
    
    elif (answers[i]==0 and y_pred[i]==0):
        # print("this statement should only print once before the next iteration (4)")
        true_neg += 1
    
    else:
        print("Something went wrong.")
        break
    
print(true_pos, "true positives")
print(false_neg, "false negatives")
print(false_pos, "false positives")
print(true_neg, "true negatives")
print("Sensitivity =", true_pos / (true_pos + false_neg))
print("Specificity =", true_neg / (true_neg + false_pos))
print("PPV =", true_pos / (true_pos + false_pos))
print("NPV =", true_neg / (true_neg + false_neg))

352 true positives
26 false negatives
114 false positives
36 true negatives
Sensitivity = 0.9312169312169312
Specificity = 0.24
PPV = 0.7553648068669528
NPV = 0.5806451612903226


In [39]:
print(classification_report(pheno["label"], y_pred))

              precision    recall  f1-score   support

           0       0.53      0.29      0.38       150
           1       0.76      0.90      0.82       378

    accuracy                           0.73       528
   macro avg       0.65      0.60      0.60       528
weighted avg       0.70      0.73      0.70       528



In [39]:
47/72

0.6527777777777778

# 8. Lasso

In [22]:
pheno["label"].value_counts()

# macro f1 will take averages of both the majority and minority class

1    238
0     82
Name: label, dtype: int64

In [18]:
ls_pipeline = Pipeline(steps = [("ls", Lasso())])

In [19]:
ls_scores = cross_validate(ls_pipeline, geno, pheno["label"], cv=5, scoring=["accuracy",
                                                                             "roc_auc",
                                                                             "f1",
                                                                             "balanced_accuracy"])
ls_scores


Traceback (most recent call last):
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 192, in wrapper
    return func(*args, **kwargs)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 221, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 95, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of binary and continuous targets

Trace

Traceback (most recent call last):
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 192, in wrapper
    return func(*args, **kwargs)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 221, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 95, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of binary and continuous targets

Trace

Traceback (most recent call last):
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 192, in wrapper
    return func(*args, **kwargs)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 221, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/Users/ryanburczak/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 95, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of binary and continuous targets

Trace

{'fit_time': array([0.06732988, 0.06250906, 0.03358412, 0.07156301, 0.13512087]),
 'score_time': array([0.10115719, 0.08077312, 0.05511785, 0.06347418, 0.05929613]),
 'test_accuracy': array([nan, nan, nan, nan, nan]),
 'test_roc_auc': array([0.5, 0.5, 0.5, 0.5, 0.5]),
 'test_f1': array([nan, nan, nan, nan, nan]),
 'test_balanced_accuracy': array([nan, nan, nan, nan, nan])}

In [None]:
print("ACC")
print("mean =", np.mean(rf_scores["test_accuracy"]))
print("std =", np.std(rf_scores["test_accuracy"]), "\n")
print("AUC")
print("mean =", np.mean(rf_scores["test_roc_auc"]))
print("std =", np.std(rf_scores["test_roc_auc"]), "\n")
print("F1")
print("mean =", np.mean(rf_scores["test_f1"]))
print("std =", np.std(rf_scores["test_f1"]), "\n")
print("BAS")
print("mean =", np.mean(rf_scores["test_balanced_accuracy"]))
print("std =", np.std(rf_scores["test_balanced_accuracy"]), "\n")