In [24]:
import numpy as np
# Fixed seed for NumPy's legacy global random state, so NumPy-driven randomness
# in this process is repeatable across runs.
seed = 42
np.random.seed(seed)
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score

In [25]:
# Silence every Python warning raised in this process from here on.
# NOTE(review): this is a blanket filter, so it also hides library diagnostics
# (e.g. deprecation or failed-fit warnings) — consider narrowing it to a
# specific category once the notebook is stable.
import warnings
warnings.filterwarnings("ignore")

In [26]:
# import yolo outputs DONE
# import mobilenet outputs DONE
# import densenet outputs DONE

In [27]:
# Load the MobileNet prediction tables (train / val / test) and discard the
# 'Unnamed: 0' column that each CSV carries.
_mobilenet_csvs = (
    "/kaggle/input/fork-of-koa-mobilenetv2/m_mobilenet_train.csv",
    "/kaggle/input/fork-of-koa-mobilenetv2/m_mobilenet_val.csv",
    "/kaggle/input/fork-of-koa-mobilenetv2/m_mobilenet_test.csv",
)
m_mobilenet_train, m_mobilenet_val, m_mobilenet_test = (
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    for csv_path in _mobilenet_csvs
)

In [29]:
# Rename the MobileNet file column from 'FileName' to 'FilePath'.
# Rebinding (rather than inplace=True) keeps the cell free of in-place mutation.
m_mobilenet_train, m_mobilenet_val, m_mobilenet_test = (
    frame.rename(columns={'FileName': 'FilePath'})
    for frame in (m_mobilenet_train, m_mobilenet_val, m_mobilenet_test)
)

In [30]:
# Load the YOLOv8 prediction tables (train / val / test) and discard the
# 'Unnamed: 0' column that each CSV carries.
_yolov8_csvs = (
    "/kaggle/input/koa-yolov8-preds/m_yolov8_train.csv",
    "/kaggle/input/koa-yolov8-preds/m_yolov8_val.csv",
    "/kaggle/input/koa-yolov8-preds/m_yolov8_test.csv",
)
m_yolov8_train, m_yolov8_val, m_yolov8_test = (
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    for csv_path in _yolov8_csvs
)

In [31]:
# Load the DenseNet prediction tables (train / val / test) and discard the
# 'Unnamed: 0' column that each CSV carries.
_densenet_csvs = (
    "/kaggle/input/koa-densenet-preds/m_densenet_train.csv",
    "/kaggle/input/koa-densenet-preds/m_densenet_val.csv",
    "/kaggle/input/koa-densenet-preds/m_densenet_test.csv",
)
m_densenet_train, m_densenet_val, m_densenet_test = (
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    for csv_path in _densenet_csvs
)

In [32]:
# Rename the DenseNet file column from 'FileName' to 'FilePath'.
# Rebinding (rather than inplace=True) keeps the cell free of in-place mutation.
m_densenet_train, m_densenet_val, m_densenet_test = (
    frame.rename(columns={'FileName': 'FilePath'})
    for frame in (m_densenet_train, m_densenet_val, m_densenet_test)
)

In [33]:
# Combine the three models' outputs into one frame per split.
# DataFrame.merge with no `on=` performs an inner join on every column name the
# two frames share, so rows missing from any one model's table are dropped.
# NOTE(review): presumably the shared keys are 'FilePath' and 'y_true' — verify
# against the CSV headers and compare row counts before/after the merge.
train = m_mobilenet_train.merge(m_yolov8_train).merge(m_densenet_train)
val = m_mobilenet_val.merge(m_yolov8_val).merge(m_densenet_val)
test = m_mobilenet_test.merge(m_yolov8_test).merge(m_densenet_test)

In [34]:
# Feature columns fed to the meta-classifier: five 'm_*', five 'y_*' and five
# 'd_*' columns, in this fixed order. Written once so all splits stay in sync.
_feature_cols = [
    'm_0', 'm_1', 'm_2', 'm_3', 'm_4',
    'y_0', 'y_1', 'y_2', 'y_3', 'y_4',
    'd_0', 'd_1', 'd_2', 'd_3', 'd_4',
]
_target_cols = ['y_true']

# Feature matrices (DataFrames with the 15 columns above).
X_train, X_val, X_test = (frame[_feature_cols] for frame in (train, val, test))
# Targets are kept as single-column DataFrames, not Series.
y_train, y_val, y_test = (frame[_target_cols] for frame in (train, val, test))

In [35]:
# Base estimator that the grid search clones and tunes.
# random_state is pinned to the notebook-wide `seed` because the 'saga' and
# 'liblinear' solvers in the search grid shuffle the data; seeding NumPy's
# global state in the parent process is not a reliable way to make parallel
# worker processes reproducible.
logModel = LogisticRegression(n_jobs = -1, random_state = seed)

In [36]:
# Build a single predefined train/validation split for GridSearchCV:
# test_fold == -1 keeps a row in the training portion, test_fold == 0 puts it
# in the (only) validation fold.
_n_train, _n_val = len(X_train), len(X_val)
split_index = [-1] * _n_train + [0] * _n_val

# Stack train on top of val so the row order matches `split_index`.
X = np.concatenate([X_train, X_val])
# Targets are single-column frames; flatten the stacked result to 1-D.
y = np.concatenate([y_train, y_val]).ravel()

pds = PredefinedSplit(split_index)

In [37]:
# Hyper-parameter grid for the LogisticRegression search.
#
# This is a list of sub-grids rather than one cross-product dict because not
# every solver supports every penalty:
#   lbfgs     -> l2, none
#   liblinear -> l1, l2
#   saga      -> l1, l2, elasticnet, none
# A single cross-product grid schedules the unsupported pairs anyway; those
# fits raise and are scored as NaN, wasting time without ever being selectable.
_c_values = 10**np.linspace(-2,2,10)   # 10 log-spaced values from 1e-2 to 1e2
_max_iters = [100, 1000, 2500]

distributions = [
    # l2 is supported by all three solvers.
    {
        'penalty' : ['l2'],
        'C' : _c_values,
        'solver' : ['lbfgs', 'liblinear', 'saga'],
        'max_iter' : _max_iters,
    },
    # l1 is not supported by lbfgs.
    {
        'penalty' : ['l1'],
        'C' : _c_values,
        'solver' : ['liblinear', 'saga'],
        'max_iter' : _max_iters,
    },
    # elasticnet is saga-only and requires l1_ratio; without it every
    # elasticnet fit fails. The end points 0 and 1 are omitted because they
    # duplicate the pure l2 / l1 grids above.
    {
        'penalty' : ['elasticnet'],
        'C' : _c_values,
        'l1_ratio' : [0.25, 0.5, 0.75],
        'solver' : ['saga'],
        'max_iter' : _max_iters,
    },
    # No regularisation: C has no effect, so it is not swept here.
    # NOTE(review): newer scikit-learn releases spell this penalty as None
    # instead of the string 'none' — adjust to the installed version.
    {
        'penalty' : ['none'],
        'solver' : ['lbfgs', 'saga'],
        'max_iter' : _max_iters,
    },
]






In [38]:
# Exhaustive search over `distributions`, evaluated on the single predefined
# validation fold in `pds`. Scoring and refit are left at GridSearchCV's
# defaults; candidate fits are spread across all available cores.
clf = GridSearchCV(
    logModel,
    param_grid=distributions,
    cv=pds,
    n_jobs=-1,
)

In [41]:
# Run the grid search. `y` was already flattened to 1-D when it was built, so
# it is passed through as-is.
clf.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [43]:
# Show the winning estimator from the grid search (rendered as the cell's
# output because it is the last expression).
clf.best_estimator_

In [49]:
# Score the refit best estimator on the training split.
# Predictions are computed once and shared by the label-based metrics; with
# refit enabled, clf.predict and clf.best_estimator_.predict are the same model.
train_pred = clf.best_estimator_.predict(X_train)
train_proba = clf.best_estimator_.predict_proba(X_train)

# Builtin round() is used instead of the `.round(3)` method so the cell works
# whether a metric returns a NumPy scalar or a plain Python float (which has
# no `.round` method).
print("Testing on training set:")
print("Accuracy: ", round(accuracy_score(y_train, train_pred), 3))
print("Balanced Accuracy Score: ", round(balanced_accuracy_score(y_train, train_pred), 3))
print("AUC:", round(roc_auc_score(y_train, train_proba, multi_class='ovr'), 3))

Testing on training set:
Accuracy:  0.852
Balanced Accuracy Score:  0.859
AUC: 0.972


In [50]:
# Score the refit best estimator on the validation split.
# NOTE(review): the search was fitted on train and val stacked together, and
# GridSearchCV refits the winner on everything it was given by default, so
# these rows were presumably seen during the final fit — treat the numbers
# below as optimistic rather than held-out.
val_pred = clf.best_estimator_.predict(X_val)
val_proba = clf.best_estimator_.predict_proba(X_val)

# Builtin round() is used instead of the `.round(3)` method so the cell works
# whether a metric returns a NumPy scalar or a plain Python float (which has
# no `.round` method).
print("Testing on validation set:")
print("Accuracy: ", round(accuracy_score(y_val, val_pred), 3))
print("Balanced Accuracy Score: ", round(balanced_accuracy_score(y_val, val_pred), 3))
print("AUC:", round(roc_auc_score(y_val, val_proba, multi_class='ovr'), 3))

Testing on validation set:
Accuracy:  0.632
Balanced Accuracy Score:  0.637
AUC: 0.854


In [51]:
# Score the refit best estimator on the test split, which takes no part in the
# grid search or the final fit.
# Predictions are computed once and shared by the label-based metrics; with
# refit enabled, clf.predict and clf.best_estimator_.predict are the same model.
test_pred = clf.best_estimator_.predict(X_test)
test_proba = clf.best_estimator_.predict_proba(X_test)

# Builtin round() is used instead of the `.round(3)` method so the cell works
# whether a metric returns a NumPy scalar or a plain Python float (which has
# no `.round` method).
print("Testing on testing set:")
print("Accuracy: ", round(accuracy_score(y_test, test_pred), 3))
print("Balanced Accuracy Score: ", round(balanced_accuracy_score(y_test, test_pred), 3))
print("AUC:", round(roc_auc_score(y_test, test_proba, multi_class='ovr'), 3))

Testing on testing set:
Accuracy:  0.71
Balanced Accuracy Score:  0.714
AUC: 0.91
