In [1]:
import numpy as np
seed = 42
np.random.seed(seed)
import pandas as pd
from scipy.stats import uniform, randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import PredefinedSplit, GridSearchCV, RandomizedSearchCV
import xgboost as xgb

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# import yolo outputs DONE
# import mobilenet outputs DONE
# import densenet outputs DONE

In [4]:
# Read the MobileNet output tables for the train / val / test splits.
# Each CSV carries an 'Unnamed: 0' column (presumably an index written out when
# the file was saved — not verifiable from here) that is dropped after loading.
b_mobilenet_train, b_mobilenet_val, b_mobilenet_test = (
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    for csv_path in (
        "/kaggle/input/fork-of-koa-mobilenetv2-2/b_mobilenet_train.csv",
        "/kaggle/input/fork-of-koa-mobilenetv2-2/b_mobilenet_val.csv",
        "/kaggle/input/fork-of-koa-mobilenetv2-2/b_mobilenet_test.csv",
    )
)

In [5]:
# Rename 'FileName' to 'FilePath' in place on each MobileNet frame.
# NOTE(review): presumably this aligns the column name with the other model
# tables so the later merge can match on it — confirm against the CSV headers.
for _frame in (b_mobilenet_train, b_mobilenet_val, b_mobilenet_test):
    _frame.rename(columns={'FileName': 'FilePath'}, inplace=True)

In [6]:
# Read the YOLOv8 output tables for the train / val / test splits, dropping the
# 'Unnamed: 0' column from each one right after loading.
b_yolov8_train, b_yolov8_val, b_yolov8_test = (
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    for csv_path in (
        "/kaggle/input/koa-yolov8-preds/b_yolov8_train.csv",
        "/kaggle/input/koa-yolov8-preds/b_yolov8_val.csv",
        "/kaggle/input/koa-yolov8-preds/b_yolov8_test.csv",
    )
)

In [7]:
# Read the DenseNet output tables for the train / val / test splits, dropping
# the 'Unnamed: 0' column from each one right after loading.
b_densenet_train, b_densenet_val, b_densenet_test = (
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    for csv_path in (
        "/kaggle/input/koa-densenet-preds/b_densenet_train.csv",
        "/kaggle/input/koa-densenet-preds/b_densenet_val.csv",
        "/kaggle/input/koa-densenet-preds/b_densenet_test.csv",
    )
)

In [8]:
# Rename 'FileName' to 'FilePath' in place on each DenseNet frame.
# NOTE(review): presumably this aligns the column name with the other model
# tables so the later merge can match on it — confirm against the CSV headers.
for _frame in (b_densenet_train, b_densenet_val, b_densenet_test):
    _frame.rename(columns={'FileName': 'FilePath'}, inplace=True)

In [9]:
# Combine the three per-model tables of each split into one frame.
# `merge` is called without `on=`, so pandas joins on every column name the two
# frames have in common and keeps only rows found in both (inner join).
# NOTE(review): the shared key columns are not visible in this notebook —
# presumably 'FilePath' (and possibly 'y_true'); confirm before relying on it.
train = (
    b_mobilenet_train
    .merge(b_yolov8_train)
    .merge(b_densenet_train)
)
val = (
    b_mobilenet_val
    .merge(b_yolov8_val)
    .merge(b_densenet_val)
)
test = (
    b_mobilenet_test
    .merge(b_yolov8_test)
    .merge(b_densenet_test)
)

In [10]:
# Columns fed to the meta-classifier and the column holding the label.
# NOTE(review): which base model produced 'm', 'm_0' and 'd' is not shown
# here — confirm against the notebooks that wrote the CSVs.
feature_cols = ['m', 'm_0', 'd']
target_cols = ['y_true']

# Selecting with a list keeps each target as a one-column DataFrame.
X_train, y_train = train[feature_cols], train[target_cols]
X_val, y_val = val[feature_cols], val[target_cols]
X_test, y_test = test[feature_cols], test[target_cols]

In [11]:
# Base estimator handed to the hyper-parameter search; only n_jobs is set here,
# every other XGBClassifier setting is left at the library default.
# NOTE(review): no random_state is passed even though `seed` is defined in the
# first cell — confirm that is intended.
xgb_model = xgb.XGBClassifier(n_jobs = -1)

In [12]:
# Build one stacked dataset plus a fold marker per row: -1 for every training
# row, 0 for every validation row. PredefinedSplit is given that marker list so
# the search uses the existing train/validation partition as its single fold.
n_train_rows, n_val_rows = len(X_train), len(X_val)
split_index = n_train_rows * [-1] + n_val_rows * [0]

X = np.concatenate([X_train, X_val], axis=0)
# The targets are one-column frames, so flatten the stacked result to 1-D.
y = np.concatenate([y_train, y_val], axis=0).ravel()

pds = PredefinedSplit(test_fold=split_index)

In [13]:
# Sampling distributions for the randomized search (scipy.stats frozen objects).
#   randint(low, high)  draws integers from low up to, but excluding, high.
#   uniform(loc, scale) draws floats from the interval [loc, loc + scale].
distributions = dict(
    max_depth=randint(1, 10),
    learning_rate=uniform(0.001, 0.1),
    n_estimators=randint(50, 200),
)

In [14]:
# Randomized hyper-parameter search over `distributions`, evaluated on the
# single predefined train/validation fold. `scoring` and `refit` are not set,
# so the scikit-learn defaults apply.
search_options = dict(
    param_distributions=distributions,
    n_iter=200,
    cv=pds,
    random_state=42,
    n_jobs=-1,
)
clf = RandomizedSearchCV(xgb_model, **search_options)

In [15]:
# Run the search on the stacked train+validation arrays; `pds` (passed as cv)
# decides which of those rows each candidate is trained and scored on.
clf.fit(X,y)


In [16]:
# Display the estimator the search kept for its best-scoring parameter draw.
clf.best_estimator_

In [17]:
# Display the hyper-parameter combination that scored best in the search.
clf.best_params_

{'learning_rate': 0.002439348862975587, 'max_depth': 5, 'n_estimators': 96}

In [20]:
def report_metrics(split_name, model, features, labels):
    """Print accuracy, balanced accuracy and ROC AUC of `model` on one split.

    Parameters
    ----------
    split_name : str
        Label inserted into the "Testing on ... set:" header line.
    model : fitted classifier
        Must provide `predict` and `predict_proba`.
    features, labels
        Inputs and true labels of the split being scored.

    Notes
    -----
    Predictions are computed once and reused for both accuracy metrics.
    Rounding goes through `np.round`, which accepts NumPy scalars and built-in
    floats alike; calling `.round(3)` on the metric result only works when the
    metric happens to return a NumPy scalar.
    """
    predicted = model.predict(features)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    positive_scores = model.predict_proba(features)[:, 1]

    print(f"Testing on {split_name} set:")
    print("Accuracy: ", np.round(accuracy_score(labels, predicted), 3))
    print("Balanced Accuracy: ", np.round(balanced_accuracy_score(labels, predicted), 3))
    print("AUC:", np.round(roc_auc_score(labels, positive_scores), 3))


# NOTE(review): `clf` was fitted on X, which stacks the training and validation
# rows, and `refit` was left at its default. If that default retrains
# best_estimator_ on all of X, the training AND validation figures below are
# in-sample and only the testing split is untouched by fitting.
report_metrics("training", clf.best_estimator_, X_train, y_train)
report_metrics("validation", clf.best_estimator_, X_val, y_val)
report_metrics("testing", clf.best_estimator_, X_test, y_test)

Testing on training set:
Accuracy:  0.949
Balanced Accuracy:  0.941
AUC: 0.986
Testing on validation set:
Accuracy:  0.838
Balanced Accuracy:  0.82
AUC: 0.902
Testing on testing set:
Accuracy:  0.873
Balanced Accuracy:  0.861
AUC: 0.941
