# Mount Drive

In [None]:
# Mount Google Drive into the Colab runtime so that files stored on the
# drive (the dataset used below) can be read from /content/drive.
from google.colab import drive

drive.mount(
    "/content/drive",
    force_remount=True,
)

In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict, GridSearchCV, cross_val_score, train_test_split, cross_validate, StratifiedKFold
from sklearn.metrics import confusion_matrix, make_scorer, recall_score, precision_score, f1_score, accuracy_score, roc_auc_score, auc, plot_roc_curve
from sklearn.preprocessing import MinMaxScaler 
from xgboost import XGBClassifier
from imblearn.metrics import geometric_mean_score
from imblearn.pipeline import Pipeline
from scipy.stats import mode
from sklearn.dummy import DummyClassifier
import statistics
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import RandomOverSampler,SMOTE
import matplotlib.pyplot as plt
from imblearn.pipeline import Pipeline
from sklearn.model_selection import permutation_test_score

In [None]:
# Print the installed imbalanced-learn version so the run can be reproduced.
import imblearn

imblearn_version = imblearn.__version__
print('imblearn: {}'.format(imblearn_version))

# Import Dataset

In [None]:
# Read the dataset from the CSV file referenced by DATASET_LOCATION.
df = pd.read_csv(DATASET_LOCATION)

# Separate the binary target column ("mucinous") from the feature columns.
Y = df["mucinous"].copy()
x = df.drop("mucinous", axis=1)  # every column except the target
feature_cols = x.columns

# Min-max scale each feature into [0, 1].
# NOTE(review): the scaler is fitted on all rows before the hold-out split
# below and before any cross-validation, so validation rows influence the
# scaling - confirm this is acceptable for the reported metrics.
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(x)
print(X.shape)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=.2,
    shuffle=True,
    stratify=Y,
    random_state=12,
)

In [None]:
# Record the Python interpreter and scikit-learn versions used for this run.
import sys

print(sys.version)

import sklearn

sklearn_version = sklearn.__version__
print('The scikit-learn version is {}.'.format(sklearn_version))

## Import Strictly Texture Feature Dataset

In [None]:
# Read the texture-only dataset.
# NOTE(review): this cell reads DATASET_LOCATION, the same name the other
# dataset cells use; presumably it is re-pointed at the texture-only CSV
# before this cell is run - confirm.
df = pd.read_csv(DATASET_LOCATION)

# Target vector and feature frame (all columns except "mucinous").
Y = df["mucinous"].copy()
x = df.drop("mucinous", axis=1)
feature_cols = x.columns

# Rescale every feature to the [0, 1] interval (fitted on all rows).
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(x)
print(X.shape)

# Seeded, stratified hold-out split: 80% train / 20% test.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=.2,
    shuffle=True,
    stratify=Y,
    random_state=12,
)

## Import Non-Texture Feature Dataset

In [None]:
# Read the non-texture dataset.
# NOTE(review): DATASET_LOCATION is the same name read by the other dataset
# cells; presumably it points at the non-texture CSV when this cell is run
# - confirm.
df = pd.read_csv(DATASET_LOCATION)

# Pull out the "mucinous" label column; the remaining columns are features.
Y = df["mucinous"].copy()
x = df.drop("mucinous", axis=1)
feature_cols = x.columns

# Scale features to [0, 1]; the scaler sees every row of the dataset.
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(x)
print(X.shape)

# 80/20 stratified split, reproducible through random_state=12.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=.2,
    shuffle=True,
    stratify=Y,
    random_state=12,
)

# Hyperparameter Optimization

Full Feature Set Hyperparameters (5-fold stratified CV): max_depth = 3, n_estimators = 11, scale_pos_weight = 0.25 (the value paired with these settings in the XGBoost cells below)

In [None]:
# Starting point for the scale_pos_weight search.
# The value is meant to be (minority class count) / (majority class count);
# here the ratio is written out literally as 1/1, which evaluates to 1.0.
estimate = 1 / 1
print('Estimate: %.3f' % (estimate,))

In [None]:
# Hyper-parameter Optimization
## Exhaustive grid search over XGBoost settings, scored with 5-fold
## stratified cross-validation on the full (X, Y) arrays.

## The specific hyperparameter values seen throughout the notebook are not
## necessarily the exact ones used to obtain the values in the manuscript.

# make_scorer is used with its defaults, so roc_auc_score receives the
# estimator's predict() output (hard labels) rather than probabilities.
metric = make_scorer(roc_auc_score)

# Candidate scale_pos_weight values: the np.arange sweep plus the
# ratio-based `estimate` from the earlier cell.
weightlist = [*np.arange(.1, .4, 0.05).tolist(), estimate]

# Unseeded shuffle: fold membership changes from run to run.
cv = StratifiedKFold(n_splits=5, shuffle=True)
model = XGBClassifier()

# Depending on available compute time, start with coarse increments for each
# hyperparameter and refine on later runs once the optimum is narrowed down.
param_grid = [
    {
        'n_estimators': list(range(5, 31)),
        'max_depth': list(range(3, 7)),
        'scale_pos_weight': weightlist,
    }
]

grid_search = GridSearchCV(model, param_grid, scoring=metric, cv=cv)
grid_search.fit(X, Y)
best_model = grid_search.best_estimator_
print(grid_search.best_params_)

# Baseline Metrics from Various Models

## Random Forest

In [None]:
## Metrics
# Repeated stratified 5-fold cross-validation of a random forest.
from statistics import mean as mean
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Each list receives one entry per validation fold (500 repeats x 5 folds).
Precisions, Recalls, F1s = [], [], []
G_means, accuracy, AUC = [], [], []
Specificities = []

for repeat_idx in range(500):
  # New unseeded shuffle on every repeat, so the folds differ each time.
  cv = StratifiedKFold(n_splits=5, shuffle=True)
  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]
    X_train_fold_resample, y_train_fold_resample = X[train_fold_index], Y[train_fold_index]

    model = RandomForestClassifier(n_estimators=8,max_depth=9)
    model.fit(X_train_fold_resample,y_train_fold_resample)
    pt = model.predict(X_val_fold)

    # Specificity = TN / (TN + FP), read off the 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
    specificity = tn / (tn+fp)
    Specificities.append(specificity)

    # All scores below are computed from the hard predictions in `pt`
    # (this includes roc_auc_score).
    Precisions.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))
    accuracy.append(accuracy_score(y_val_fold,pt))
    AUC.append(roc_auc_score(y_val_fold,pt))

# Mean and population standard deviation of every metric over all folds.
for summary_template, fold_scores in (
    ('Precision- Mean:  %.3f Standard Deviation: %.3f', Precisions),
    ('Sensitivity/Recall- Mean:  %.3f Standard Deviation: %.3f', Recalls),
    ('Specificity - Mean:  %.3f Standard Deviation: %.3f', Specificities),
    ('F1- Mean:  %.3f Standard Deviation: %.3f', F1s),
    ('G_mean- Mean:  %.3f Standard Deviation: %.3f', G_means),
    ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', accuracy),
    ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', AUC),
):
  print(summary_template % (mean(fold_scores), statistics.pstdev(fold_scores)))

In [None]:
## P - value
# Permutation tests (1000 label permutations each) for the random forest,
# once scored with AUC and once with the geometric mean.
# `cv` is not defined in this cell: the splitter left over from a previously
# executed cell is reused.
model = RandomForestClassifier(n_estimators=8,max_depth=9)
AUC_metric = make_scorer(roc_auc_score)
g_mean_metric = make_scorer(geometric_mean_score)

# permutation_test_score returns (score, permutation_scores, pvalue);
# only the p-value is kept.
auc_permutation_result = permutation_test_score(
    model, X, Y, scoring=AUC_metric, cv=cv, n_permutations=1000
)
pvalue = auc_permutation_result[2]

g_mean_permutation_result = permutation_test_score(
    model, X, Y, scoring=g_mean_metric, cv=cv, n_permutations=1000
)
pvalue2 = g_mean_permutation_result[2]

print(pvalue)
print(pvalue2)

## Logistic Regression

In [None]:
# Repeated stratified 5-fold cross-validation of a class-weighted
# logistic regression (200 repeats).
from sklearn.model_selection import cross_val_score,LeaveOneOut
from statistics import mean as mean
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.linear_model import LogisticRegression


# One value per validation fold is collected in each list.
Precisions, Recalls, F1s = [], [], []
G_means, accuracy, AUC = [], [], []
Specificities = []

for repeat_idx in range(200):
  # Unseeded shuffle: a different partition on each repeat.
  cv = StratifiedKFold(n_splits=5, shuffle=True)
  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]
    X_train_fold,y_train_fold = X[train_fold_index], Y[train_fold_index]

    model = LogisticRegression(class_weight='balanced')
    model.fit(X_train_fold,y_train_fold)
    # predict() already returns class labels, so no manual thresholding
    # is applied to `pt`.
    pt = model.predict(X_val_fold)

    # Specificity = TN / (TN + FP).
    tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
    specificity = tn / (tn+fp)
    Specificities.append(specificity)

    Precisions.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))
    accuracy.append(accuracy_score(y_val_fold,pt))
    # roc_auc_score is fed the hard labels in `pt`.
    AUC.append(roc_auc_score(y_val_fold,pt))

# Report mean and population standard deviation across all folds.
for summary_template, fold_scores in (
    ('Precision- Mean:  %.3f Standard Deviation: %.3f', Precisions),
    ('Sensitivity/Recall- Mean:  %.3f Standard Deviation: %.3f', Recalls),
    ('Specificity - Mean:  %.3f Standard Deviation: %.3f', Specificities),
    ('F1- Mean:  %.3f Standard Deviation: %.3f', F1s),
    ('G_mean- Mean:  %.3f Standard Deviation: %.3f', G_means),
    ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', accuracy),
    ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', AUC),
):
  print(summary_template % (mean(fold_scores), statistics.pstdev(fold_scores)))

In [None]:
## p value
# Permutation tests (1000 permutations each) for the class-weighted logistic
# regression, scored with AUC and with the geometric mean.
# `cv` is taken from whichever earlier cell assigned it last.
model = LogisticRegression(class_weight='balanced')
AUC_metric = make_scorer(roc_auc_score)
g_mean_metric = make_scorer(geometric_mean_score)

# Only the third element of the result (the p-value) is used.
auc_permutation_result = permutation_test_score(
    model, X, Y, scoring=AUC_metric, cv=cv, n_permutations=1000
)
pvalue = auc_permutation_result[2]

g_mean_permutation_result = permutation_test_score(
    model, X, Y, scoring=g_mean_metric, cv=cv, n_permutations=1000
)
pvalue2 = g_mean_permutation_result[2]

print(pvalue)
print(pvalue2)

## SVM

In [None]:
# K-fold
# Repeated stratified 5-fold cross-validation of a default-parameter SVC.
from statistics import mean as mean
from sklearn.metrics import roc_auc_score
from sklearn import svm

# One entry per validation fold (500 repeats x 5 folds) goes into each list.
Precisions = []
Recalls = []
F1s = []
G_means = []
accuracy = []
AUC = []
Specificities = []

for i in range(500):
  # Unseeded shuffle, so each repeat uses a different partition.
  cv = StratifiedKFold(n_splits=5, shuffle=True)
  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_train_fold,y_train_fold = X[train_fold_index], Y[train_fold_index]
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]
    model = svm.SVC()
    # Fit on THIS fold's training partition. The cell previously fitted on
    # X_train_fold_resample / y_train_fold_resample, which it never assigns:
    # those were stale globals left behind by an earlier cell.
    model.fit(X_train_fold,y_train_fold)
    pt = model.predict(X_val_fold)

    # Specificity = TN / (TN + FP).
    tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
    specificity = tn / (tn+fp)
    Specificities.append(specificity)
    Precisions.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))
    accuracy.append(accuracy_score(y_val_fold,pt))
    # roc_auc_score is computed from the hard labels in `pt`.
    AUC.append(roc_auc_score(y_val_fold,pt))

# Mean and population standard deviation of each metric over all folds.
print('Precision- Mean:  %.3f Standard Deviation: %.3f' % (mean(Precisions), statistics.pstdev(Precisions)))
print('Sensitivity/Recall- Mean:  %.3f Standard Deviation: %.3f' % (mean(Recalls), statistics.pstdev(Recalls)))
print('Specificity - Mean:  %.3f Standard Deviation: %.3f' % (mean(Specificities), statistics.pstdev(Specificities)))
print('F1- Mean:  %.3f Standard Deviation: %.3f' % (mean(F1s), statistics.pstdev(F1s)))
print('G_mean- Mean:  %.3f Standard Deviation: %.3f' % (mean(G_means), statistics.pstdev(G_means)))
print('Accuracy- Mean:  %.3f Standard Deviation: %.3f' % (mean(accuracy), statistics.pstdev(accuracy)))
print('AUC Score- Mean:  %.3f Standard Deviation: %.3f' % (mean(AUC), statistics.pstdev(AUC)))

In [None]:
## p value
# Permutation test for the default SVC, scored with the geometric mean.
# Note that only 100 permutations are requested in this cell.
from sklearn import svm

cv = StratifiedKFold(n_splits=5, shuffle=True)
model = svm.SVC()
AUC_metric = make_scorer(roc_auc_score)
g_mean_metric = make_scorer(geometric_mean_score)

# The AUC permutation test is disabled in this cell:
# _, _, pvalue = permutation_test_score(model, X, Y, scoring=AUC_metric, cv=cv, n_permutations=1000)
# print(pvalue)

# Keep only the p-value (third element of the returned tuple).
g_mean_permutation_result = permutation_test_score(
    model, X, Y, scoring=g_mean_metric, cv=cv, n_permutations=100
)
pvalue2 = g_mean_permutation_result[2]
print(pvalue2)

## MLP

### "Wide"

In [None]:
# K-fold
# Repeated stratified 5-fold cross-validation of the "wide" MLP
# (three hidden layers of 512 units).
from statistics import mean as mean
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier

# Per-fold scores; 500 repeats x 5 folds entries end up in each list.
Precisions, Recalls, F1s = [], [], []
G_means, accuracy, AUC = [], [], []
Specificities = []

for repeat_idx in range(500):
  # The fold shuffle is unseeded; only the MLP itself has random_state=1.
  cv = StratifiedKFold(n_splits=5, shuffle=True)
  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]
    X_train_fold_resample,y_train_fold_resample = X[train_fold_index], Y[train_fold_index]

    model = MLPClassifier(hidden_layer_sizes=(512, 512, 512), random_state=1)
    model.fit(X_train_fold_resample,y_train_fold_resample)
    pt = model.predict(X_val_fold)

    # Specificity = TN / (TN + FP).
    tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
    specificity = tn / (tn+fp)
    Specificities.append(specificity)

    Precisions.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))
    accuracy.append(accuracy_score(y_val_fold,pt))
    # AUC here is computed from hard labels, not probabilities.
    AUC.append(roc_auc_score(y_val_fold,pt))

# Summary: mean and population standard deviation over all folds.
for summary_template, fold_scores in (
    ('Precision- Mean:  %.3f Standard Deviation: %.3f', Precisions),
    ('Sensitivity/Recall- Mean:  %.3f Standard Deviation: %.3f', Recalls),
    ('Specificity - Mean:  %.3f Standard Deviation: %.3f', Specificities),
    ('F1- Mean:  %.3f Standard Deviation: %.3f', F1s),
    ('G_mean- Mean:  %.3f Standard Deviation: %.3f', G_means),
    ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', accuracy),
    ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', AUC),
):
  print(summary_template % (mean(fold_scores), statistics.pstdev(fold_scores)))

In [None]:
## p value
# Permutation test (1000 permutations, all cores) for the "wide" MLP,
# scored with the geometric mean.
from sklearn.neural_network import MLPClassifier

# Unshuffled stratified folds are used here.
cv = StratifiedKFold(n_splits=5)

# NOTE(review): solver='adam' and alpha=1e-5 are passed explicitly here,
# while the K-fold cell for the same architecture leaves them at their
# defaults - confirm the difference is intended.
model = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    hidden_layer_sizes=(512, 512, 512),
    random_state=1,
)
AUC_metric = make_scorer(roc_auc_score)
g_mean_metric = make_scorer(geometric_mean_score)

# The AUC permutation test is disabled in this cell:
# _, _, pvalue = permutation_test_score(model, X, Y, scoring=AUC_metric, cv=cv, n_permutations=1000, n_jobs=-1)
# print(pvalue)

# Only the p-value (third tuple element) is kept.
g_mean_permutation_result = permutation_test_score(
    model, X, Y, scoring=g_mean_metric, cv=cv, n_permutations=1000, n_jobs=-1
)
pvalue2 = g_mean_permutation_result[2]
print(pvalue2)

### "Deep"

In [None]:
# K-fold
# Repeated stratified 5-fold cross-validation of the "deep" MLP
# (ten hidden layers of 100 units).
from statistics import mean as mean
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier

# One score per validation fold is appended to each list.
Precisions, Recalls, F1s = [], [], []
G_means, accuracy, AUC = [], [], []
Specificities = []

for repeat_idx in range(500):
  # Fold shuffling is unseeded; the network uses random_state=1.
  cv = StratifiedKFold(n_splits=5, shuffle=True)
  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]
    X_train_fold_resample,y_train_fold_resample = X[train_fold_index], Y[train_fold_index]

    model = MLPClassifier(hidden_layer_sizes=(100,100,100,100,100,100,100,100,100,100), random_state=1)
    model.fit(X_train_fold_resample,y_train_fold_resample)
    pt = model.predict(X_val_fold)

    # Specificity = TN / (TN + FP).
    tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
    specificity = tn / (tn+fp)
    Specificities.append(specificity)

    Precisions.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))
    accuracy.append(accuracy_score(y_val_fold,pt))
    # roc_auc_score receives hard labels.
    AUC.append(roc_auc_score(y_val_fold,pt))

# Mean and population standard deviation per metric.
for summary_template, fold_scores in (
    ('Precision- Mean:  %.3f Standard Deviation: %.3f', Precisions),
    ('Sensitivity/Recall- Mean:  %.3f Standard Deviation: %.3f', Recalls),
    ('Specificity - Mean:  %.3f Standard Deviation: %.3f', Specificities),
    ('F1- Mean:  %.3f Standard Deviation: %.3f', F1s),
    ('G_mean- Mean:  %.3f Standard Deviation: %.3f', G_means),
    ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', accuracy),
    ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', AUC),
):
  print(summary_template % (mean(fold_scores), statistics.pstdev(fold_scores)))

In [None]:
## p value
# Permutation tests (1000 permutations each) for the "deep" MLP, scored with
# AUC and then with the geometric mean.
from sklearn.neural_network import MLPClassifier

# Unshuffled stratified folds.
cv = StratifiedKFold(n_splits=5)

# NOTE(review): no random_state is set here, unlike the K-fold cell for the
# same architecture (random_state=1) - confirm this is intended.
model = MLPClassifier(hidden_layer_sizes=(100,100,100,100,100,100,100,100,100,100))
AUC_metric = make_scorer(roc_auc_score)
g_mean_metric = make_scorer(geometric_mean_score)

# Each p-value is printed as soon as its test finishes.
auc_permutation_result = permutation_test_score(
    model, X, Y, scoring=AUC_metric, cv=cv, n_permutations=1000
)
pvalue = auc_permutation_result[2]
print(pvalue)

g_mean_permutation_result = permutation_test_score(
    model, X, Y, scoring=g_mean_metric, cv=cv, n_permutations=1000
)
pvalue2 = g_mean_permutation_result[2]
print(pvalue2)

### "Middle"

In [None]:
# K-fold
# Repeated stratified 5-fold cross-validation of the "middle" MLP
# (hidden layers 512-256-128-64-64, max_iter=400).
from statistics import mean as mean
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier

# Per-fold scores accumulate here (500 repeats x 5 folds).
Precisions, Recalls, F1s = [], [], []
G_means, accuracy, AUC = [], [], []
Specificities = []

for repeat_idx in range(500):
  # Unseeded shuffle for the folds; the network uses random_state=1.
  cv = StratifiedKFold(n_splits=5, shuffle=True)
  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]
    X_train_fold_resample,y_train_fold_resample = X[train_fold_index], Y[train_fold_index]

    model = MLPClassifier(hidden_layer_sizes=(512, 256, 128, 64, 64), random_state=1, max_iter=400)
    model.fit(X_train_fold_resample,y_train_fold_resample)
    pt = model.predict(X_val_fold)

    # Specificity = TN / (TN + FP).
    tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
    specificity = tn / (tn+fp)
    Specificities.append(specificity)

    Precisions.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))
    accuracy.append(accuracy_score(y_val_fold,pt))
    # Hard labels are passed to roc_auc_score.
    AUC.append(roc_auc_score(y_val_fold,pt))

# Report mean and population standard deviation over all folds.
for summary_template, fold_scores in (
    ('Precision- Mean:  %.3f Standard Deviation: %.3f', Precisions),
    ('Sensitivity/Recall- Mean:  %.3f Standard Deviation: %.3f', Recalls),
    ('Specificity - Mean:  %.3f Standard Deviation: %.3f', Specificities),
    ('F1- Mean:  %.3f Standard Deviation: %.3f', F1s),
    ('G_mean- Mean:  %.3f Standard Deviation: %.3f', G_means),
    ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', accuracy),
    ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', AUC),
):
  print(summary_template % (mean(fold_scores), statistics.pstdev(fold_scores)))

In [None]:
## p value
# Permutation tests (1000 permutations each, all cores) for the "middle" MLP,
# scored with AUC and then with the geometric mean.
from sklearn.neural_network import MLPClassifier
# Unshuffled stratified folds.
cv = StratifiedKFold(n_splits=5)
model = MLPClassifier(hidden_layer_sizes=(512, 256, 128, 64, 64), random_state=1, max_iter=400)
AUC_metric = make_scorer(roc_auc_score)
g_mean_metric = make_scorer(geometric_mean_score)
# permutation_test_score returns (score, permutation_scores, pvalue).
_, _, pvalue = permutation_test_score(model, X, Y, scoring=AUC_metric, cv=cv, n_permutations=1000, n_jobs=-1)
print(pvalue)
# Use the same label vector Y as the AUC test above. This call previously
# passed -Y (negated labels), unlike every other permutation test here.
_, _, pvalue2 = permutation_test_score(model, X, Y, scoring=g_mean_metric, cv=cv, n_permutations=1000, n_jobs=-1)
print(pvalue2)

## kNN

In [None]:
# K-fold
# Repeated stratified 5-fold cross-validation of k-nearest-neighbours
# classifiers for several neighbourhood sizes.
from statistics import mean as mean
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

for j in [3,5,7,9,11]:
  # Fresh accumulators for every value of k.
  Precisions, Recalls, F1s = [], [], []
  G_means, accuracy, AUC = [], [], []
  Specificities = []

  for repeat_idx in range(500):
    # Unseeded shuffle: a new partition on each repeat.
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    for train_fold_index, val_fold_index in cv.split(X,Y):
      X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]
      X_train_fold_resample,y_train_fold_resample = X[train_fold_index], Y[train_fold_index]

      model = KNeighborsClassifier(n_neighbors=j)
      model.fit(X_train_fold_resample,y_train_fold_resample)
      pt = model.predict(X_val_fold)

      # Specificity = TN / (TN + FP).
      tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
      specificity = tn / (tn+fp)
      Specificities.append(specificity)

      Precisions.append(precision_score(y_val_fold,pt))
      Recalls.append(recall_score(y_val_fold,pt))
      F1s.append(f1_score(y_val_fold,pt))
      G_means.append(geometric_mean_score(y_val_fold,pt))
      accuracy.append(accuracy_score(y_val_fold,pt))
      # roc_auc_score is computed from hard labels.
      AUC.append(roc_auc_score(y_val_fold,pt))

  # Summary for this k: mean and population standard deviation over folds.
  print("k = "+ str(j))
  for summary_template, fold_scores in (
      ('Precision- Mean:  %.3f Standard Deviation: %.3f', Precisions),
      ('Sensitivity/Recall- Mean:  %.3f Standard Deviation: %.3f', Recalls),
      ('Specificity - Mean:  %.3f Standard Deviation: %.3f', Specificities),
      ('F1- Mean:  %.3f Standard Deviation: %.3f', F1s),
      ('G_mean- Mean:  %.3f Standard Deviation: %.3f', G_means),
      ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', accuracy),
      ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', AUC),
  ):
    print(summary_template % (mean(fold_scores), statistics.pstdev(fold_scores)))

In [None]:
## p value
# Permutation tests (1000 permutations each) for every k, scored with AUC
# and with the geometric mean. `cv` is not defined in this cell; the splitter
# left over from a previously executed cell is reused.

# The scorers do not depend on k, so they are created once up front.
AUC_metric = make_scorer(roc_auc_score)
g_mean_metric = make_scorer(geometric_mean_score)

for j in [3,5,7,9,11]:
  model = KNeighborsClassifier(n_neighbors=j)

  # Only the p-value (third element of each result) is reported.
  auc_permutation_result = permutation_test_score(
      model, X, Y, scoring=AUC_metric, cv=cv, n_permutations=1000
  )
  pvalue = auc_permutation_result[2]

  g_mean_permutation_result = permutation_test_score(
      model, X, Y, scoring=g_mean_metric, cv=cv, n_permutations=1000
  )
  pvalue2 = g_mean_permutation_result[2]

  print("k = %i, p-value (AUC): %f, p-value (gmean) %f"% (j, pvalue, pvalue2))

## XGBoost

In [None]:
# K-fold
# Repeated stratified 5-fold cross-validation of XGBoost without resampling.
from statistics import mean as mean
from sklearn.metrics import roc_auc_score

# Per-fold scores (500 repeats x 5 folds entries per list).
Precisions, Recalls, Specificities = [], [], []
F1s, G_means = [], []
accuracy, AUC = [], []

for repeat_idx in range(500):
  # Unseeded shuffle, so every repeat draws a new partition.
  cv = StratifiedKFold(n_splits=5, shuffle=True)
  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]
    X_train_fold,y_train_fold = X[train_fold_index], Y[train_fold_index]

    model = XGBClassifier(max_depth=3, n_estimators=25, scale_pos_weight=.2)
    model.fit(X_train_fold,y_train_fold)
    pt = model.predict(X_val_fold)

    # Specificity = TN / (TN + FP), from the 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
    Specificities.append(tn / (tn+fp))

    Precisions.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))
    accuracy.append(accuracy_score(y_val_fold,pt))
    # roc_auc_score is given the hard labels in `pt`.
    AUC.append(roc_auc_score(y_val_fold,pt))

# Mean and population standard deviation of each metric over all folds.
for summary_template, fold_scores in (
    ('Precision - Mean:  %.3f Standard Deviation: %.3f', Precisions),
    ('Sensitivity/Recall - Mean:  %.3f Standard Deviation: %.3f', Recalls),
    ('Specificity - Mean:  %.3f Standard Deviation: %.3f', Specificities),
    ('F1- Mean:  %.3f Standard Deviation: %.3f', F1s),
    ('G_mean- Mean:  %.3f Standard Deviation: %.3f', G_means),
    ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', accuracy),
    ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', AUC),
):
  print(summary_template % (mean(fold_scores), statistics.pstdev(fold_scores)))

## XGBoost with Undersampling

In [None]:
# K-fold
# Seeded 3-fold stratified cross-validation of XGBoost on a dataset that has
# been undersampled with ClusterCentroids.
# Imported here so the cell does not depend on an earlier cell having run.
from statistics import mean as mean

cv1 = StratifiedKFold(n_splits=3, random_state=12, shuffle=True)

# Per-fold scores. Specificities is initialised here so that this cell no
# longer appends to a list left over from a previously executed cell.
Precisions = []
Recalls = []
F1s = []
G_means = []
accuracy = []
Specificities = []

# NOTE(review): undersampling is applied to the whole dataset BEFORE the CV
# split, so the validation folds are drawn from the resampled data rather
# than from original samples - confirm this is the intended protocol.
cc2 = ClusterCentroids(random_state=12)
print(X.shape,Y.shape)
X_under, Y_under = cc2.fit_resample(X,Y)
print(X_under.shape,Y_under.shape)

for train_fold_index, val_fold_index in cv1.split(X_under,Y_under):
  X_train_fold,y_train_fold = X_under[train_fold_index], Y_under[train_fold_index]
  X_val_fold, y_val_fold = X_under[val_fold_index], Y_under[val_fold_index]

  model = XGBClassifier(n_estimators=32, max_depth=3, scale_pos_weight=.2875)
  model.fit(X_train_fold,y_train_fold)
  pt = model.predict(X_val_fold)

  # Specificity = TN / (TN + FP), from the 2x2 confusion matrix.
  tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
  specificity = tn / (tn+fp)
  Specificities.append(specificity)
  Precisions.append(precision_score(y_val_fold,pt))
  Recalls.append(recall_score(y_val_fold,pt))
  F1s.append(f1_score(y_val_fold,pt))
  G_means.append(geometric_mean_score(y_val_fold,pt))
  accuracy.append(accuracy_score(y_val_fold,pt))

# Mean of each metric over the three folds.
print('Precision: ',mean(Precisions))
print('Recall: ',mean(Recalls))
print('Specificity: ',mean(Specificities))
print('F1: ',mean(F1s))
print('G_mean: ',mean(G_means))
print('accuracy: ', mean(accuracy))


## Oversampling for XGBoost

### SMOTE

In [None]:
# K-fold
# Repeated stratified 5-fold cross-validation of XGBoost where SMOTE is
# applied to the training partition of each fold only.
from statistics import mean as mean
from sklearn.metrics import roc_auc_score

# Per-fold scores (500 repeats x 5 folds entries per list).
Precisions, Recalls, F1s = [], [], []
G_means, accuracy, AUC = [], [], []
Specificities = []

for repeat_idx in range(500):
  # Neither the fold shuffle nor SMOTE is seeded.
  cv = StratifiedKFold(n_splits=5, shuffle=True)
  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_train_fold,y_train_fold = X[train_fold_index], Y[train_fold_index]
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]

    # Oversample the training data; the validation fold is left untouched.
    smoter = SMOTE()
    X_train_fold_resample, y_train_fold_resample = smoter.fit_resample(X_train_fold,y_train_fold)

    model = XGBClassifier(max_depth=3, n_estimators=11, scale_pos_weight=.25)
    model.fit(X_train_fold_resample,y_train_fold_resample)
    pt = model.predict(X_val_fold)

    # Specificity = TN / (TN + FP).
    tn, fp, fn, tp = confusion_matrix(y_val_fold, pt).ravel()
    specificity = tn / (tn+fp)
    Specificities.append(specificity)

    Precisions.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))
    accuracy.append(accuracy_score(y_val_fold,pt))
    # roc_auc_score is computed from hard labels.
    AUC.append(roc_auc_score(y_val_fold,pt))

# Mean and population standard deviation; note that only the specificity
# line is rounded to three decimals.
for summary_template, fold_scores in (
    ('Precision- Mean:  %f Standard Deviation: %f', Precisions),
    ('Sensitivity/Recall- Mean:  %f Standard Deviation: %f', Recalls),
    ('Specificity - Mean:  %.3f Standard Deviation: %.3f', Specificities),
    ('F1- Mean:  %f Standard Deviation: %f', F1s),
    ('G_mean- Mean:  %f Standard Deviation: %f', G_means),
    ('Accuracy- Mean:  %f Standard Deviation: %f', accuracy),
    ('AUC Score- Mean:  %f Standard Deviation: %f', AUC),
):
  print(summary_template % (mean(fold_scores), statistics.pstdev(fold_scores)))

### Random Oversampling

In [None]:
# K-fold
# Repeated stratified 3-fold cross-validation of XGBoost where random
# oversampling is applied to the training partition of each fold only.
from statistics import mean as mean
from sklearn.metrics import roc_auc_score

# Per-fold scores (500 repeats x 3 folds entries per list).
# Specificity is not collected in this cell.
Precisions, Recalls, F1s = [], [], []
G_means, accuracy, AUC = [], [], []

for repeat_idx in range(500):
  # Neither the fold shuffle nor the oversampler is seeded.
  cv = StratifiedKFold(n_splits=3, shuffle=True)
  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_train_fold,y_train_fold = X[train_fold_index], Y[train_fold_index]
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]

    # Resample the training data; the validation fold is left untouched.
    ros = RandomOverSampler()
    X_train_fold_resample, y_train_fold_resample = ros.fit_resample(X_train_fold,y_train_fold)

    model = XGBClassifier(n_estimators=11, max_depth=3, scale_pos_weight=.25)
    model.fit(X_train_fold_resample,y_train_fold_resample)
    pt = model.predict(X_val_fold)

    Precisions.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))
    accuracy.append(accuracy_score(y_val_fold,pt))
    # roc_auc_score is computed from hard labels.
    AUC.append(roc_auc_score(y_val_fold,pt))

# Mean and population standard deviation of each metric over all folds.
for summary_template, fold_scores in (
    ('Precision- Mean:  %.3f Standard Deviation: %.3f', Precisions),
    ('Recall- Mean:  %.3f Standard Deviation: %.3f', Recalls),
    ('F1- Mean:  %.3f Standard Deviation: %.3f', F1s),
    ('G_mean- Mean:  %.3f Standard Deviation: %.3f', G_means),
    ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', accuracy),
    ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', AUC),
):
  print(summary_template % (mean(fold_scores), statistics.pstdev(fold_scores)))

# Metrics for Naive Classifiers

## Majority Classifier


In [None]:
# Naive Classifier
## Always predicts class 1, which this notebook labels as the majority
## (mucinous) class.
## Source: https://machinelearningmastery.com/how-to-develop-and-evaluate-naive-classifier-strategies-using-probability/
##         https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

def majority_class(y):
  """Return the first field of ``scipy.stats.mode(y)``."""
  mode_result = mode(y)
  return mode_result[0]

# Constant prediction of 1 for every sample; majority_class() is defined
# above but is not called in this cell.
yhat = [1] * len(Y)
print(yhat)

tn, fp, fn, tp = confusion_matrix(Y, yhat).ravel()


# Metrics are computed once on the full label vector (no cross-validation).
print('F1 : %.3f' % f1_score(Y, yhat))
print('Recall : %.3f' % recall_score(Y,yhat))
print('Precision : %.3f' % precision_score(Y,yhat))
# Specificity = TN / (TN + FP).
print('Specificity : %.3f' % (tn/(tn+fp)))
print('ROC: %.3f' % roc_auc_score(Y, yhat))
print('G-Mean : %.3f' % geometric_mean_score(Y,yhat))
print('accuracy : %.3f' % accuracy_score(Y,yhat))

## Minority Classifier

In [None]:
# Naive classifier that always predicts class 0 (the counterpart of the
# constant-1 classifier).

def majority_class(y):
  """Return the first field of ``scipy.stats.mode(y)``."""
  mode_result = mode(y)
  return mode_result[0]

# Constant prediction of 0 for every sample, hard-coded for this dataset's
# class distribution; majority_class() is not called in this cell.
yhat = [0] * len(Y)
print(yhat)

tn, fp, fn, tp = confusion_matrix(Y, yhat).ravel()

# Metrics are computed once on the full label vector (no cross-validation).
print('F1 : %.3f' % f1_score(Y, yhat))
print('Recall : %.3f' % recall_score(Y,yhat))
print('Precision : %.3f' % precision_score(Y,yhat))
# Specificity = TN / (TN + FP).
print('Specificity : %.3f' % (tn/(tn+fp)))
print('ROC: %.3f' % roc_auc_score(Y, yhat))
print('G-Mean : %.3f' % geometric_mean_score(Y,yhat))
print('accuracy : %.3f' % accuracy_score(Y,yhat))

## Random Guesser

In [None]:
# Baseline: DummyClassifier with the "uniform" strategy, evaluated over
# 1000 independent prediction draws on the full dataset.
from statistics import mean as mean

dummy_clf = DummyClassifier(strategy="uniform")
dummy_clf.fit(X, Y)
# Initial draw; it is overwritten by the first loop iteration below.
y_predicted = dummy_clf.predict(X)

# One score per draw is appended to each list.
f1, rcll, prc = [], [], []
gmean, acc = [], []
spec, roc = [], []

for draw_idx in range(1000):
  y_predicted = dummy_clf.predict(X)

  f1.append(f1_score(Y, y_predicted))
  rcll.append(recall_score(Y,y_predicted))
  prc.append(precision_score(Y,y_predicted))
  gmean.append(geometric_mean_score(Y,y_predicted))
  acc.append(accuracy_score(Y,y_predicted))

  # Specificity = TN / (TN + FP).
  tn, fp, fn, tp = confusion_matrix(Y, y_predicted).ravel()
  spec.append(tn/(tn+fp))
  roc.append(roc_auc_score(Y,y_predicted))

# Mean and population standard deviation of each metric over the draws.
for summary_template, draw_scores in (
    ('Precision - Mean:  %.3f Standard Deviation: %.3f', prc),
    ('Sensitivity/Recall - Mean:  %.3f Standard Deviation: %.3f', rcll),
    ('Specificity - Mean:  %.3f Standard Deviation: %.3f', spec),
    ('F1- Mean:  %.3f Standard Deviation: %.3f', f1),
    ('G_mean- Mean:  %.3f Standard Deviation: %.3f', gmean),
    ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', acc),
    ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', roc),
):
  print(summary_template % (mean(draw_scores), statistics.pstdev(draw_scores)))

## Stratified Guesser

In [None]:
# Baseline: DummyClassifier with the "stratified" strategy, evaluated over
# 1000 independent prediction draws on the full dataset.
from statistics import mean as mean

dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X, Y)

# One score per draw is appended to each list.
f1, rcll, prc = [], [], []
gmean, acc = [], []
spec, roc = [], []

for draw_idx in range(1000):
  y_predicted = dummy_clf.predict(X)

  f1.append(f1_score(Y, y_predicted))
  rcll.append(recall_score(Y,y_predicted))
  prc.append(precision_score(Y,y_predicted))
  gmean.append(geometric_mean_score(Y,y_predicted))
  acc.append(accuracy_score(Y,y_predicted))

  # Specificity = TN / (TN + FP).
  tn, fp, fn, tp = confusion_matrix(Y, y_predicted).ravel()
  spec.append(tn/(tn+fp))
  roc.append(roc_auc_score(Y,y_predicted))

# Mean and population standard deviation of each metric over the draws.
for summary_template, draw_scores in (
    ('Precision - Mean:  %.3f Standard Deviation: %.3f', prc),
    ('Sensitivity/Recall - Mean:  %.3f Standard Deviation: %.3f', rcll),
    ('Specificity - Mean:  %.3f Standard Deviation: %.3f', spec),
    ('F1- Mean:  %.3f Standard Deviation: %.3f', f1),
    ('G_mean- Mean:  %.3f Standard Deviation: %.3f', gmean),
    ('Accuracy- Mean:  %.3f Standard Deviation: %.3f', acc),
    ('AUC Score- Mean:  %.3f Standard Deviation: %.3f', roc),
):
  print(summary_template % (mean(draw_scores), statistics.pstdev(draw_scores)))

# P - Values for Models

In [None]:
# Permutation-test p-values (AUC and G-mean) for the naive baselines and the
# XGBoost models on the full and texture-only feature sets.

# Datasets
## Full Feature Set
df = pd.read_csv(DATASET_LOCATION)
x_full = df.drop("mucinous", axis=1)  # every column except the label
Y_full = df["mucinous"].copy()
scaler = MinMaxScaler(feature_range=(0, 1))
X_full = scaler.fit_transform(x_full)

## Texture-Only Feature Set
# NOTE(review): this reads the same DATASET_LOCATION name as the full set
# above - confirm it points at the texture-only CSV when this part runs.
df = pd.read_csv(DATASET_LOCATION)
x_texture = df.drop("mucinous", axis=1)  # every column except the label
Y_texture = df["mucinous"].copy()
scaler = MinMaxScaler(feature_range=(0, 1))
X_texture = scaler.fit_transform(x_texture)

# Models
## Naive
### Majority, Minority, random, stratified
majority = DummyClassifier(strategy='constant', constant=1) #strategy='most_frequent'
minority = DummyClassifier(strategy='constant', constant=0)
random = DummyClassifier(strategy='uniform', constant=1)
stratified = DummyClassifier(strategy='stratified', constant=1)
random.fit(X_full, Y_full)
stratified.fit(X_full, Y_full)

## ML
### SMOTE Full Feature, SMOTE Texture-Only, XGBoost Full, XGBoost Texture-only
XGBoost = XGBClassifier(n_estimators=11, max_depth=3, scale_pos_weight=.25)
SMOTE_XGBoost = Pipeline([
        # Seeded like the other SMOTE/CV objects in this notebook so the
        # p-values can be reproduced.
        ('sampling', SMOTE(random_state=12)),
        ('classification', XGBoost)
    ])

# Scoring
## Setup
cv = StratifiedKFold(n_splits=5, random_state=12, shuffle=True)
# NOTE(review): make_scorer(roc_auc_score) scores the hard class predictions,
# not predicted probabilities - confirm this is the AUC definition intended.
AUC_metric = make_scorer(roc_auc_score)
g_mean_metric = make_scorer(geometric_mean_score)
p_values_AUC = {}
p_values_g_mean = {}
titles = ["AUC p-value", "G-Mean p-value"]

# (result key, estimator, features, labels) for every model/dataset pair,
# in the order the results are reported.
evaluations = [
    ("majority", majority, X_full, Y_full),
    ("minority", minority, X_full, Y_full),
    ("random", random, X_full, Y_full),
    ("stratified", stratified, X_full, Y_full),
    ("XGBoost_Full", XGBoost, X_full, Y_full),
    ("SMOTE_Full", SMOTE_XGBoost, X_full, Y_full),
    ("XGBoost_Texture", XGBoost, X_texture, Y_texture),
    ("SMOTE_Texture", SMOTE_XGBoost, X_texture, Y_texture),
]

## AUC first, then G-Mean: one permutation test per model/dataset pair
for scorer, p_values in ((AUC_metric, p_values_AUC), (g_mean_metric, p_values_g_mean)):
  for label, estimator, features, labels in evaluations:
    _, _, pvalue = permutation_test_score(estimator, features, labels, scoring=scorer, cv=cv, n_permutations=1000)
    p_values[label] = pvalue

# Output Table
print("AUC")
print(p_values_AUC)
print("G-Mean")
print(p_values_g_mean)

# Plots

## Plot Decision Bounds

In [None]:
#https://pierpaolo28.github.io/Projects/project6.html

from sklearn.decomposition import PCA
from itertools import product

# Project the scaled features onto two principal components so the
# classifier's decision regions can be drawn in a plane.
pca = PCA(n_components=2,svd_solver='full')
X_pca = pca.fit_transform(X)

# Seeded (same seed as the other splits in this notebook) so the figure can
# be reproduced from run to run.
X_reduced, X_test_reduced, Y_Train, Y_Test = train_test_split(
    X_pca, Y, random_state=12, test_size=.2, shuffle=True, stratify=Y)

reduced_data = X_reduced

trainedmodel = XGBClassifier(n_estimators=7, max_depth=3, scale_pos_weight=.25).fit(reduced_data,Y_Train)

# Evaluation grid: the training points' bounding box padded by 0.5, step 0.1.
x_min, x_max = reduced_data[:, 0].min() - .5, reduced_data[:, 0].max() + .5
y_min, y_max = reduced_data[:, 1].min() - .5, reduced_data[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

# Predicted class at every grid node, reshaped back onto the grid.
Z = trainedmodel.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Row positions (not index labels) of each class within the training split.
train_labels = np.asarray(Y_Train)
arg_0 = np.flatnonzero(train_labels == 0)
arg_1 = np.flatnonzero(train_labels == 1)

plt.figure(figsize=(7.5,5))
plt.contourf(xx, yy, Z,cmap=plt.cm.coolwarm, alpha=0.4)
plt.scatter(reduced_data[arg_1, 0], reduced_data[arg_1, 1],
                              s=20, edgecolor='k', marker='^', label='Mucinous', c='purple')
plt.scatter(reduced_data[arg_0, 0], reduced_data[arg_0, 1],
                              s=20, edgecolor='k', c='yellow', label='Non-mucinous')
plt.title('XGBoost - Mucinous Classifier')
plt.legend(loc='upper right')
plt.show()

## SHAP Model Visualization

In [None]:
# Load the dataset from the .csv file
df = pd.read_csv(DATASET_LOCATION)

# Split the "mucinous" label column from the feature columns
Y = df["mucinous"].copy()
x = df.drop(columns="mucinous")
feature_cols = x.columns

# Min-max scale every feature into [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(x)

# Fit the classifier on the complete scaled dataset
model = XGBClassifier(max_depth=3, n_estimators=11, scale_pos_weight=.25).fit(X, Y)

# SHAP values for every row of X, computed with the tree explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Summary plot over all rows; the unscaled frame `x` is passed alongside the
# SHAP values (it carries the column names).
shap.summary_plot(shap_values, x)

In [None]:
# Load the dataset from the .csv file
df = pd.read_csv(DATASET_LOCATION)

# Split the "mucinous" label column from the feature columns
Y = df["mucinous"].copy()
x = df.drop(columns="mucinous")
feature_cols = x.columns

# Min-max scale every feature into [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(x)
print(X.shape)

# Stratified 80/20 split; the model below is nevertheless fitted on all of X.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=12, test_size=.2, shuffle=True, stratify=Y)
#print(X_train.shape, X_test.shape)

# Fit the classifier on the complete scaled dataset
model = XGBClassifier(max_depth=3, n_estimators=8, scale_pos_weight=.25).fit(X, Y)

# SHAP values for every row of X, computed with the tree explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Summary plot over all rows; the unscaled frame `x` is passed alongside the
# SHAP values (it carries the column names).
shap.summary_plot(shap_values, x)



In [None]:
# Load the JavaScript visualization code that SHAP needs inside the notebook
shap.initjs()

# Refit the classifier on the X / Y currently in scope
model = XGBClassifier(max_depth=3, n_estimators=11, scale_pos_weight=.25).fit(X, Y)

# SHAP values for every row of X, computed with the tree explainer
# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Force plot explaining the first row only
# (use matplotlib=True to avoid Javascript)
first_row_shap = shap_values[0]
first_row_features = x.iloc[0]
shap.force_plot(explainer.expected_value, first_row_shap, first_row_features, matplotlib=False)

## Curves

### PR Curves

In [None]:
# Precision-recall curves for XGBoost + SMOTE under repeated stratified
# K-fold cross-validation: one pooled curve per repetition plus one overall
# curve pooled over every repetition.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import matplotlib.pyplot as plt
import numpy
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from numpy import interp
from xgboost import XGBClassifier

FOLDS = 10
REPEATS = 10
BASE_SEED = 12

f, axes = plt.subplots(figsize=(10,10))
results = pd.DataFrame(columns=['training_score', 'test_score'])

# Plain array of labels so the fold indices select rows by position.
y_values = numpy.asarray(Y)

y_realtot = []
y_probatot = []

for j in range(REPEATS):
  # A different seed per repetition: re-using one seeded splitter (and one
  # SMOTE seed) would reproduce the exact same folds, and therefore the exact
  # same curve, on every pass.
  k_fold = StratifiedKFold(n_splits=FOLDS, random_state=BASE_SEED + j, shuffle=True)
  y_real = []
  y_proba = []

  for train_index, test_index in k_fold.split(X, y_values):
    predictor = XGBClassifier(n_estimators=32, max_depth=3, scale_pos_weight=.2875)

    X_train_fold, y_train_fold = X[train_index], y_values[train_index]
    X_val_fold, y_val_fold = X[test_index], y_values[test_index]

    # Oversample the training fold only; the validation fold stays untouched.
    smoter = SMOTE(random_state=BASE_SEED + j)
    X_train_fold_resample, y_train_fold_resample = smoter.fit_resample(X_train_fold, y_train_fold)

    predictor.fit(X_train_fold_resample, y_train_fold_resample)
    pred_proba = predictor.predict_proba(X_val_fold)

    y_real.append(y_val_fold)
    y_proba.append(pred_proba[:,1])

  # Pool the out-of-fold predictions of this repetition into one PR curve and
  # label it with the AUC of that pooled curve.
  y_real = numpy.concatenate(y_real)
  y_proba = numpy.concatenate(y_proba)
  precision, recall, _ = precision_recall_curve(y_real, y_proba)
  lab_foldtot = 'PR %d AUC=%.4f' % (j+1, auc(recall, precision))
  plt.plot(recall, precision, marker='.' ,alpha=0.3, label=lab_foldtot)
  y_realtot.append(y_real)
  y_probatot.append(y_proba)

# All repetitions finished: pool everything into the overall curve.
y_realtot = numpy.concatenate(y_realtot)
y_probatot = numpy.concatenate(y_probatot)
precision, recall, thresholds = precision_recall_curve(y_realtot, y_probatot)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))

plt.plot(recall, precision, marker='.', lw=2,color='red', label=lab)
plt.legend(loc='lower left', fontsize=18)

# NOTE(review): these statistics are taken over the points of the overall
# curve, not across repetitions - confirm that is the spread meant to be shown.
mean_precision = numpy.mean(precision)
mean_recall = numpy.mean(recall)
std_precision = numpy.std(precision)
print ("mean of precision: " )
print (mean_precision )

print ("Std Dev of precision: ")
print ( std_precision )
axes.set_title('%d Independent PR Curves of XGBoost Over %d-Fold Cross Validation' % (REPEATS, FOLDS), fontsize=18)

plt.fill_between(recall, precision + std_precision, precision - std_precision, alpha=0.3, linewidth=0, color='grey')
plt.xlabel("Recall", fontsize=18)
plt.ylabel("Precision", fontsize=18)
plt.ylim((0,1))
plt.xlim((0,1))

# Save before show() so the file is written while the figure is still live.
f.savefig('result.png')
plt.show()

print (precision)
print (recall)
print (thresholds)

### ROC

In [None]:
 ## Mean ROC curves over repeated 5-fold cross-validation (full vs. texture-only feature sets)
 # Source: https://ogrisel.github.io/scikit-learn.org/sklearn-tutorial/auto_examples/plot_roc_crossval.html

# #############################################################################
# Run classifier with cross-validation and plot ROC curves

from sklearn import metrics

df = pd.read_csv('/content/drive/My Drive/CT Analysis/Data Sets/mucinous_processed.csv')

#Creating labels
full_x = df.drop("mucinous", axis=1)  # every column except the label
full_Y = df["mucinous"].copy()

scaler = MinMaxScaler(feature_range=(0, 1))
full_x = scaler.fit_transform(full_x)

df = pd.read_csv('/content/drive/My Drive/CT Analysis/Data Sets/texture_feature_set_mucinous_processed.csv')

#Creating labels
texture_x = df.drop("mucinous", axis=1)  # every column except the label
texture_Y = df["mucinous"].copy()
scaler = MinMaxScaler(feature_range=(0, 1))
texture_x = scaler.fit_transform(texture_x)


def _plot_mean_roc(ax, mean_fpr, tprs, aucs, color, band_color, name, band_alpha=.1):
  """Draw the mean ROC curve of one feature set, with a +/- 1 std. dev. band, on *ax*.

  tprs holds one interpolated TPR array per fitted model (all on the mean_fpr
  grid) and aucs the matching per-model AUC values. Returns the tuple
  (mean_tpr, mean_auc, std_auc).
  """
  mean_tpr = np.mean(tprs, axis=0)
  mean_tpr[-1] = 1.0  # pin the curve's end point to (1, 1)
  mean_auc = metrics.auc(mean_fpr, mean_tpr)
  std_auc = np.std(aucs)
  ax.plot(mean_fpr, mean_tpr, color=color,
          label=r'Mean ROC of %s Feature Set(AUC = %0.2f $\pm$ %0.2f)' % (name, mean_auc, std_auc),
          lw=2, alpha=.8)

  # Shade one standard deviation either side, clipped to the unit interval.
  std_tpr = np.std(tprs, axis=0)
  tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
  tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
  ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color=band_color, alpha=band_alpha,
                  label=r'$\pm$ 1 std. dev.')
  return mean_tpr, mean_auc, std_auc


cv = StratifiedKFold(n_splits=5, shuffle=True)
#classifier = RandomForestClassifier(n_estimators=25,max_depth=20, class_weight='balanced')
plt.rcParams["figure.figsize"] = [14,10]
tprs_full = []
aucs_full = []
mean_fpr_full = np.linspace(0, 1, 100)
tprs_text = []
aucs_text = []
mean_fpr_text = np.linspace(0, 1, 100)
fig_full, full = plt.subplots()
fig_text, text = plt.subplots()
fig_both, both = plt.subplots()

for j in range(500):
  # NOTE(review): the folds are computed on the full feature set and re-used
  # for the texture set, which assumes both CSVs list the same rows in the
  # same order - confirm.
  for i, (train_fold_index, val_fold_index) in enumerate(cv.split(full_x, full_Y)):
    # .iloc: the fold indices are row positions, not index labels.
    X_train_full, y_train_full = full_x[train_fold_index], full_Y.iloc[train_fold_index]
    X_val_full, y_val_full = full_x[val_fold_index], full_Y.iloc[val_fold_index]

    X_train_text, y_train_text = texture_x[train_fold_index], texture_Y.iloc[train_fold_index]
    X_val_text, y_val_text = texture_x[val_fold_index], texture_Y.iloc[val_fold_index]

    classifier_full = XGBClassifier(n_estimators=11, max_depth=3, scale_pos_weight=.25)
    classifier_full.fit(X_train_full,y_train_full)
    classifier_text = XGBClassifier(n_estimators=8, max_depth=3, scale_pos_weight=.25)
    classifier_text.fit(X_train_text,y_train_text)

    # Score each validation fold once and re-use the scores for the ROC curve.
    y_scores_full = classifier_full.predict_proba(X_val_full)[:, 1]
    fpr_full, tpr_full, thresholds_full = metrics.roc_curve(y_val_full, y_scores_full)
    y_scores_text = classifier_text.predict_proba(X_val_text)[:, 1]
    fpr_text, tpr_text, thresholds_text = metrics.roc_curve(y_val_text, y_scores_text)

    # Resample each curve onto the shared FPR grid so the curves can be averaged.
    interp_tpr_full = np.interp(mean_fpr_full, fpr_full, tpr_full)
    interp_tpr_full[0] = 0.0
    tprs_full.append(interp_tpr_full)
    aucs_full.append(metrics.auc(fpr_full, tpr_full))

    interp_tpr_text = np.interp(mean_fpr_text, fpr_text, tpr_text)
    interp_tpr_text[0] = 0.0
    tprs_text.append(interp_tpr_text)
    aucs_text.append(metrics.auc(fpr_text, tpr_text))

# Chance diagonal first, so it leads the legend on every figure.
for ax in (full, text, both):
  ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
          label='Chance', alpha=.8)

### Full Feature Plot
mean_tpr_full, mean_auc_full, std_auc_full = _plot_mean_roc(
    full, mean_fpr_full, tprs_full, aucs_full, 'b', 'blue', 'Full')

### Texture Only Plot
mean_tpr_text, mean_auc_text, std_auc_text = _plot_mean_roc(
    text, mean_fpr_text, tprs_text, aucs_text, 'g', 'green', 'Texture')

### Combined Plot
_plot_mean_roc(both, mean_fpr_full, tprs_full, aucs_full, 'b', 'blue', 'Full')
_plot_mean_roc(both, mean_fpr_text, tprs_text, aucs_text, 'g', 'green', 'Texture', band_alpha=.2)

for ax in (full, text, both):
  ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
         title="Receiver operating characteristic")
  ax.legend(loc="lower right")

plt.show()

## Permutation Testing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score
from imblearn.metrics import geometric_mean_score
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer

# Permutation test of the classifier's G-mean score (test 1 described in
# http://www.jmlr.org/papers/volume11/ojala10a/ojala10a.pdf).

cv = StratifiedKFold(n_splits=5, random_state=12, shuffle=True)
xgb = XGBClassifier(n_estimators=32, max_depth=3, scale_pos_weight=.2875)
metric = make_scorer(geometric_mean_score)
n_classes = len(np.unique(Y))

score, permutation_scores, pvalue = permutation_test_score(
    xgb, X, Y, scoring=metric, cv=cv, n_permutations=1000)

print("Classification score %s (pvalue : %s)" % (score, pvalue))

# Histogram of the scores obtained on permuted labels, with vertical markers
# for the real score and for 1 / n_classes.
plt.figure(figsize=(12,6))
plt.hist(permutation_scores, bins=20, label='Permutation scores',
         edgecolor='black')
ylim = plt.ylim()  # remember the histogram's limits; the markers span them

score_label = 'Classification Score (pvalue %s)' % pvalue
plt.plot([score, score], ylim, '--g', linewidth=3, label=score_label)

luck = 1. / n_classes
plt.plot([luck, luck], ylim, '--k', linewidth=3, label='Luck')

plt.ylim(ylim)
plt.legend()
plt.xlabel('Score')
plt.show()

# Feature Selection

In [None]:
# Labels come from the "mucinous" column of df; features are the columns kept
# in df2.
Y = df["mucinous"].copy()
x1 = df2
feature_cols = x1.columns

# Min-max scale every selected feature into [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
X1 = scaler.fit(x1).transform(x1)
print(X1.shape)

In [None]:
def Important_fetures(mymodel, featuredict, feature_names=None, top_k=30):
    """Accumulate the top-ranked feature importances of a fitted model.

    The features of ``mymodel`` are ranked by ``feature_importances_`` in
    descending order and the first ``top_k`` of them are folded into
    ``featuredict``, which is updated in place.

    Args:
        mymodel: Fitted estimator exposing ``feature_importances_``.
        featuredict: Maps feature name -> ``[times_selected, importance_sum]``.
            A feature seen for the first time is inserted as
            ``[1, importance]``; otherwise the count is incremented and the
            importance is added to the running sum.
        feature_names: Sequence giving the name of the feature at each
            position. Defaults to the module-level ``feature_cols``.
        top_k: Number of highest-ranked features to record per call.

    Returns:
        None.
    """
    import numpy as np

    if feature_names is None:
        # Fall back to the column index set by the data-loading cells.
        feature_names = feature_cols

    # Read the importances from the model that was passed in, not from
    # whichever global `model` happens to exist.
    importances = mymodel.feature_importances_
    top_indices = np.argsort(importances)[::-1][:top_k]

    for idx in top_indices:
        name = feature_names[idx]
        weight = importances[idx]
        if name in featuredict:
            featuredict[name][0] += 1
            featuredict[name][1] += weight
        else:
            featuredict[name] = [1, weight]

In [None]:
# Repeated stratified 5-fold CV: fit XGBoost on every training fold and fold
# each fitted model's top feature importances into `featuredict`.
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,LeaveOneOut
from imblearn.over_sampling import RandomOverSampler,SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.metrics import geometric_mean_score
from statistics import mean
from xgboost import XGBClassifier

featuredict = {}

# `_` instead of `x`: the repetition counter must not overwrite the feature
# DataFrame `x` that other cells still read.
for _ in range(1000):

  # Unseeded on purpose, so each repetition presumably gets its own shuffle.
  cv = StratifiedKFold(n_splits=5, shuffle=True)

  # NOTE(review): these lists restart on every repetition, so only the last
  # repetition's scores remain afterwards - confirm that is intended.
  Precisons = []
  Recalls = []
  F1s = []
  G_means = []

  for train_fold_index, val_fold_index in cv.split(X,Y):
    X_train_fold,y_train_fold = X[train_fold_index], Y[train_fold_index]
    X_val_fold, y_val_fold = X[val_fold_index], Y[val_fold_index]
    model = XGBClassifier(n_estimators=8, max_depth=3, scale_pos_weight=.25)
    model.fit(X_train_fold,y_train_fold)
    pt = model.predict(X_val_fold)

    # Record which features this fitted model ranked highest.
    Important_fetures(model,featuredict)

    Precisons.append(precision_score(y_val_fold,pt))
    Recalls.append(recall_score(y_val_fold,pt))
    F1s.append(f1_score(y_val_fold,pt))
    G_means.append(geometric_mean_score(y_val_fold,pt))

In [None]:
# Rank the features collected in `featuredict` three ways and write each
# ranking to its own CSV file.
import operator
import collections

# Number of fitted models behind `featuredict`: 1000 repetitions x 5 folds.
TOTAL_FITS = 1000 * 5

Avg = {}    # feature -> [mean importance over the fits that selected it, times selected]
Ocurr = {}  # feature -> times selected
Tavg = {}   # feature -> importance sum divided by TOTAL_FITS
for key, (count, importance_sum) in featuredict.items():
  Avg[key] = [importance_sum / count, count]
  Ocurr[key] = count
  Tavg[key] = importance_sum / TOTAL_FITS

# Each ranking is sorted best-first.
AvgRank = sorted(Avg.items(), key=lambda kv: kv[1][0], reverse=True)
OcurrRank = sorted(Ocurr.items(), key=operator.itemgetter(1), reverse=True)
TavRank = sorted(Tavg.items(), key=operator.itemgetter(1), reverse=True)

sortedAvg = {name: [avg_value, count] for name, (avg_value, count) in AvgRank}

# The second column holds selection counts, so it is labelled as such (same
# spelling as the matching column of Avgdf); naming the columns at
# construction also works when the ranking is empty.
Ocuurpd = pd.DataFrame(OcurrRank, columns=['Feature', 'Occurance'])
Avgdf = pd.DataFrame.from_dict(sortedAvg,orient='index',columns=['Avg.Value','Occurance'])
Tavdf = pd.DataFrame.from_dict(TavRank)
Avgdf.to_csv('Average feature Importance.CSV')
Ocuurpd.to_csv('Occurance.CSV')
Tavdf.to_csv('TSC.CSV')

In [None]:
# Keep only the columns of df that appear in the importance ranking `Avg`.
# Any column that is not a key of Avg is dropped in a single call;
# `a` is the number of columns removed.
dropped_columns = [columnName for columnName in df.columns if columnName not in Avg]
a = len(dropped_columns)
df2 = df.drop(columns=dropped_columns)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,LeaveOneOut
from imblearn.over_sampling import RandomOverSampler,SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.metrics import geometric_mean_score
from statistics import mean
from xgboost import XGBClassifier

# Seeded stratified 5-fold evaluation of XGBoost + SMOTE on the selected
# feature matrix X1.
cv = StratifiedKFold(n_splits=5, random_state=12, shuffle=True)

Precisons, Recalls, F1s, G_means, accuracy = [], [], [], [], []

# (score list, metric function) pairs, evaluated in this order on every fold.
fold_metrics = (
  (Precisons, precision_score),
  (Recalls, recall_score),
  (F1s, f1_score),
  (G_means, geometric_mean_score),
  (accuracy, accuracy_score),
)

for train_fold_index, val_fold_index in cv.split(X1,Y):
  X_train_fold, X_val_fold = X1[train_fold_index], X1[val_fold_index]
  y_train_fold, y_val_fold = Y[train_fold_index], Y[val_fold_index]

  # Oversample the training fold only, then fit and predict the held-out fold.
  smoter = SMOTE(random_state=12)
  X_train_fold_resample, y_train_fold_resample = smoter.fit_resample(X_train_fold,y_train_fold)
  model = XGBClassifier(n_estimators=32, max_depth=3, scale_pos_weight=.2875).fit(
      X_train_fold_resample, y_train_fold_resample)
  pt = model.predict(X_val_fold)

  print("confusion_matrix:")
  print(confusion_matrix(y_val_fold,pt))
  for fold_scores, metric_fn in fold_metrics:
    fold_scores.append(metric_fn(y_val_fold, pt))

# Mean of every metric across the folds, followed by the feature ranking.
for heading, fold_scores in (
    ('Precision: ', Precisons),
    ('Recall: ', Recalls),
    ('F1: ', F1s),
    ('G_mean: ', G_means),
    ('Accuracy: ', accuracy),
):
  print(heading, mean(fold_scores))
print(AvgRank)