In [0]:
# This colab has the functions for cleaning, normalizing, and then performing RandomForest, KNN, and SVM on the CHO
# and CIFAR datasets.
# The main method is at the bottom, which prompts the user for a choice of algorithm.

In [0]:
# Mount Google Drive so the dataset files stored there can be read by path.
from google.colab import drive

# Directory inside the Colab VM where the Drive contents appear.
DRIVE_MOUNT_POINT = '/content/drive'

# Prompts for authorization before the mount completes.
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# import packages

import numpy as np
import random
from math import *

# for partitioning into training/test sets
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.datasets import make_classification
from sklearn import svm
# library for loading matlab files into python code
from scipy.io import loadmat

# libaries for evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn import metrics

import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score


def load_datasets(data_dir='/content/drive/My Drive/cse347_project_data',
                  cifar_subdir='cifar-10-batches-mat'):
  """Load and prepare the Cho and CIFAR datasets.

  Cho: ``cho.txt`` is read as a numeric table. Column 0 (gene id) and
  column 1 (ground truth) are dropped from the features, and every row of
  the remaining columns is min-max normalized to [0, 1]. Column 1 is
  returned separately as integer labels.

  CIFAR: the five training batches and the test batch are read from .mat
  files and concatenated (in that order) so callers can do their own K-fold
  partitioning instead of relying on the dataset's predefined split.

  Args:
    data_dir: Directory that contains ``cho.txt``. Defaults to the Drive
      folder the notebook was originally written against.
    cifar_subdir: Sub-directory of ``data_dir`` holding the CIFAR ``.mat``
      files. Pass ``''`` when the batches sit directly in ``data_dir``.

  Returns:
    ``[cho_data, cho_labels, cifarImages, cifarLabels]`` where
    ``cho_data`` is a list of normalized 1-D float arrays (one per row),
    ``cho_labels`` is a list of ints, ``cifarImages`` is a list of the rows
    of each batch's ``'data'`` matrix and ``cifarLabels`` is a list of the
    rows of each batch's ``'labels'`` matrix.
  """
  import os  # local import keeps this block self-contained

  ############ Start with loading in and preparing the Cho dataset. ############
  # ndmin=2 keeps a single-row file two-dimensional so column slicing works.
  cho_array = np.loadtxt(os.path.join(data_dir, 'cho.txt'), ndmin=2)

  # Drop the gene_id (column 0) and the ground truth (column 1). astype()
  # copies, so normalizing does not write back into cho_array.
  features = cho_array[:, 2:].astype(float)

  # Min-max normalize every row into [new_min, new_max].
  new_min = 0
  new_max = 1
  row_min = features.min(axis=1, keepdims=True)
  row_range = features.max(axis=1, keepdims=True) - row_min
  # A constant row has zero range; divide by 1 instead so it maps to new_min
  # rather than producing NaN from 0/0.
  safe_range = np.where(row_range == 0, 1, row_range)
  normalized = ((features - row_min) / safe_range) * (new_max - new_min) + new_min
  new_cho_array = list(normalized)

  # Ground truth is column 1, converted to plain Python ints.
  ground_truth_array = cho_array[:, 1].astype(int).tolist()

  ############ Now, work on loading in and preparing the CIFAR dataset. ############
  # Each row of a batch's 'data' matrix is one image (per the original
  # notes: 32x32x3 colour images that can be reshaped and plotted).
  batch_files = [
      'data_batch_1.mat',
      'data_batch_2.mat',
      'data_batch_3.mat',
      'data_batch_4.mat',
      'data_batch_5.mat',
      'test_batch.mat',
  ]

  # Put all of the batches together so we can run our own KFold.
  cifarImages = []
  cifarLabels = []
  for file_name in batch_files:
    batch = loadmat(os.path.join(data_dir, cifar_subdir, file_name))
    cifarImages.extend(batch['data'])
    cifarLabels.extend(batch['labels'])

  return [new_cho_array, ground_truth_array, cifarImages, cifarLabels]


def Average(lst):
    """Return the arithmetic mean of ``lst``: the sum of its items over its length."""
    total = sum(lst)
    count = len(lst)
    return total / count







##################################################################################################################################
##################################################################################################################################
############################################ RANDOM FOREST FUNCTIONS #############################################################
##################################################################################################################################
##################################################################################################################################


# Class labels used when binarizing targets for the per-class ROC/AUC step.
_CHO_CLASSES = [1, 2, 3, 4, 5]
_CIFAR_CLASSES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


def _cross_validate(make_model, data, labels, classes, kfold_seed=0, n_splits=5):
  """Evaluate a classifier with shuffled K-fold cross-validation.

  For every fold a fresh model is built with ``make_model()``, fitted on the
  training split and used to predict the test split. Three per-fold scores
  are collected:

    * accuracy (``accuracy_score``),
    * AUC: targets and predictions are binarized against ``classes``, one
      ROC/AUC is computed per class, and the per-class values are averaged,
    * recall: the per-class recalls from
      ``precision_recall_fscore_support(average=None)`` are averaged.

  NOTE: the ROC curves are built from the binarized *hard predictions*, not
  from probabilities or decision scores. This matches the numbers the
  notebook has always reported; switching to continuous scores would change
  the reported AUC values.

  Args:
    make_model: Zero-argument callable returning an unfitted estimator that
      provides ``fit`` and ``predict``.
    data: Sequence of feature rows (converted with ``np.array``).
    labels: Sequence of class labels. Column-vector labels are flattened to
      1-D so estimators are not handed a 2-D ``y``.
    classes: Class labels passed to ``label_binarize``.
    kfold_seed: ``random_state`` of the ``KFold`` shuffler.
    n_splits: Number of folds.

  Returns:
    ``[mean accuracy, std accuracy, mean AUC, std AUC, mean recall,
    std recall]`` across the folds.
  """
  X = np.array(data)
  y = np.array(labels).ravel()

  kf = KFold(n_splits=n_splits, random_state=kfold_seed, shuffle=True)

  # One entry per fold.
  fold_accuracy = []
  fold_auc = []
  fold_recall = []

  for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = make_model()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # ACCURACY (on the raw, non-binarized labels)
    fold_accuracy.append(accuracy_score(y_test, predictions))

    # AUC: binarize here, after predicting, then average the per-class areas.
    true_bin = label_binarize(y_test, classes=classes)
    pred_bin = label_binarize(predictions, classes=classes)
    class_auc = []
    for class_col in range(true_bin.shape[1]):
      fpr, tpr, _ = roc_curve(true_bin[:, class_col], pred_bin[:, class_col])
      class_auc.append(auc(fpr, tpr))
    fold_auc.append(Average(class_auc))

    # RECALL (on the raw, non-binarized labels)
    _, recall, _, _ = precision_recall_fscore_support(y_test, predictions, average=None)
    fold_recall.append(Average(recall))

  return [
      Average(fold_accuracy), np.std(fold_accuracy),
      Average(fold_auc), np.std(fold_auc),
      Average(fold_recall), np.std(fold_recall),
  ]


############################################ RANDOM FOREST FUNCTIONS ############################################

def rf_cho(cho_data, cho_labels):
  """5-fold evaluation of a 100-tree random forest on the Cho data.

  Returns [accuracy, accuracy std, AUC, AUC std, recall, recall std].
  """
  return _cross_validate(
      lambda: RandomForestClassifier(n_estimators=100, random_state=42),
      cho_data, cho_labels, _CHO_CLASSES)


def rf_cifar(cifarImages, cifarLabels):
  """5-fold evaluation of a 100-tree random forest on the CIFAR data.

  Returns [accuracy, accuracy std, AUC, AUC std, recall, recall std].
  """
  return _cross_validate(
      lambda: RandomForestClassifier(n_estimators=100, random_state=42),
      cifarImages, cifarLabels, _CIFAR_CLASSES)


###################################################### KNN FUNCTIONS ############################################

def knn_cho(cho_data, cho_labels):
  """5-fold evaluation of a 9-nearest-neighbour classifier on the Cho data.

  The fold shuffler is seeded with 4 here (the other evaluations use 0);
  that value is kept as it was so the reported numbers do not move.

  Returns [accuracy, accuracy std, AUC, AUC std, recall, recall std].
  """
  return _cross_validate(
      lambda: KNeighborsClassifier(n_neighbors=9),
      cho_data, cho_labels, _CHO_CLASSES, kfold_seed=4)


def knn_cifar(cifarImages, cifarLabels):
  """5-fold evaluation of a 5-nearest-neighbour classifier on the CIFAR data.

  Returns [accuracy, accuracy std, AUC, AUC std, recall, recall std].
  """
  return _cross_validate(
      lambda: KNeighborsClassifier(n_neighbors=5),
      cifarImages, cifarLabels, _CIFAR_CLASSES)


###################################################### SVM FUNCTIONS ############################################

def svm_cho(cho_data, cho_labels):
  """5-fold evaluation of an RBF-kernel SVC on the Cho data.

  Predicted labels are used as returned by ``predict``; they are no longer
  passed through ``round()``.

  Returns [accuracy, accuracy std, AUC, AUC std, recall, recall std].
  """
  return _cross_validate(
      lambda: svm.SVC(kernel='rbf', random_state=7),
      cho_data, cho_labels, _CHO_CLASSES)


def svm_cifar(cifarImages, cifarLabels):
  """5-fold evaluation of an RBF-kernel SVC on the CIFAR data.

  Predicted labels are used as returned by ``predict``; they are no longer
  passed through ``round()``.

  Returns [accuracy, accuracy std, AUC, AUC std, recall, recall std].
  """
  return _cross_validate(
      lambda: svm.SVC(kernel='rbf', random_state=7),
      cifarImages, cifarLabels, _CIFAR_CLASSES)


##################################################################################################################################
##################################################################################################################################
############################################ CALLS TO THE DIFFERENT ALGORITHMS ###################################################
##################################################################################################################################
##################################################################################################################################



def random_forest():
  """Load both datasets, evaluate random forest on each, and print the metrics.

  ``load_datasets()`` supplies ``[cho_data, cho_labels, cifarImages,
  cifarLabels]``. ``rf_cho`` and ``rf_cifar`` each return
  ``[accuracy, accuracy std, AUC, AUC std, recall, recall std]``, which is
  printed as labelled lines. The CIFAR section is now headed "CIFAR"
  (it was previously printed under "Cho" headings).
  """
  def report(dataset_name, results):
    # results layout: [accuracy, accuracy std, AUC, AUC std, recall, recall std]
    print()
    print("\n\nThe accuracy of the " + dataset_name + " dataset:")
    print(results[0])
    print("The standard deviation for accuracy:")
    print(results[1])
    print()
    print("The AUC of the " + dataset_name + " dataset:")
    print(results[2])
    print("The standard deviation for AUC:")
    print(results[3])
    print()
    print("The recall of the " + dataset_name + " dataset:")
    print(results[4])
    print("The standard deviation for recall:")
    print(results[5])
    print()

  # The datasets come back already cleaned/normalized as appropriate.
  datasets = load_datasets()
  cho_data = datasets[0]
  cho_labels = datasets[1]
  cifarImages = datasets[2]
  cifarLabels = datasets[3]

  print("Calculating for Cho...")
  report("Cho", rf_cho(cho_data, cho_labels))

  print()
  print("Calculating for CIFAR...")
  print()
  report("CIFAR", rf_cifar(cifarImages, cifarLabels))



def KNN():
  """Load both datasets, evaluate KNN on each, and print the metrics.

  ``load_datasets()`` supplies ``[cho_data, cho_labels, cifarImages,
  cifarLabels]``. ``knn_cho`` and ``knn_cifar`` each return
  ``[accuracy, accuracy std, AUC, AUC std, recall, recall std]``; every
  value is printed on its own line beneath its heading.
  """
  def show(headings, scores):
    # headings[i] is the caption for scores[i]; a blank line follows each
    # (metric, standard deviation) pair.
    print()
    for first in range(0, 6, 2):
      print(headings[first])
      print(scores[first])
      print(headings[first + 1])
      print(scores[first + 1])
      print()

  loaded = load_datasets()
  cho_data, cho_labels = loaded[0], loaded[1]
  cifarImages, cifarLabels = loaded[2], loaded[3]

  print("Calculating for Cho...")
  cho_scores = knn_cho(cho_data, cho_labels)
  show(
      (
          "\n\nThe accuracy of the Cho dataset:",
          "The standard deviation for accuracy:",
          "The AUC of the Cho dataset:",
          "The standard deviation for AUC:",
          "The recall of the Cho dataset:",
          "The standard deviation for recall:",
      ),
      cho_scores,
  )

  print("Calculating for CIFAR...")
  print()
  cifar_scores = knn_cifar(cifarImages, cifarLabels)
  show(
      (
          "\n\nThe accuracy of the CIFAR dataset:",
          "The standard deviation for accuracy: ",
          "The AUC of the CIFAR dataset: ",
          "The standard deviation for AUC:",
          "The recall of the CIFAR dataset: ",
          "The standard deviation for recall:",
      ),
      cifar_scores,
  )



def SVM():
  """Load both datasets, evaluate the SVM on each, and print the metrics.

  ``load_datasets()`` supplies ``[cho_data, cho_labels, cifarImages,
  cifarLabels]``. ``svm_cho`` and ``svm_cifar`` each return
  ``[accuracy, accuracy std, AUC, AUC std, recall, recall std]``, which is
  printed as labelled lines. The CIFAR section is now headed "CIFAR"
  (it was previously printed under "Cho" headings).
  """
  def report(dataset_name, results):
    # results layout: [accuracy, accuracy std, AUC, AUC std, recall, recall std]
    print()
    print("\n\nThe accuracy of the " + dataset_name + " dataset:")
    print(results[0])
    print("The standard deviation for accuracy:")
    print(results[1])
    print()
    print("The AUC of the " + dataset_name + " dataset:")
    print(results[2])
    print("The standard deviation for AUC:")
    print(results[3])
    print()
    print("The recall of the " + dataset_name + " dataset:")
    print(results[4])
    print("The standard deviation for recall:")
    print(results[5])
    print()

  # The datasets come back already cleaned/normalized as appropriate.
  datasets = load_datasets()
  cho_data = datasets[0]
  cho_labels = datasets[1]
  cifarImages = datasets[2]
  cifarLabels = datasets[3]

  print("Calculating for Cho...")
  report("Cho", svm_cho(cho_data, cho_labels))

  print()
  print("Calculating for CIFAR...")
  print()
  report("CIFAR", svm_cifar(cifarImages, cifarLabels))



##################################################################################################################################
##################################################################################################################################
######################################################   MAIN METHOD    ##########################################################
##################################################################################################################################
##################################################################################################################################
                                                    
# ask the user what method they want to use, either:
  # Random Forest
  # KNN
  # SVM

# Menu key -> driver that runs that algorithm on both datasets.
algorithm_by_key = {
  'r': random_forest,
  'k': KNN,
  's': SVM,
}

# Keep prompting until a valid key is entered, then run that algorithm once.
while True:
  method = input("Enter r for random forest or k for KNN or s for SVM:\n")
  # Ignore surrounding whitespace and letter case so "R" or "k " are accepted.
  run_algorithm = algorithm_by_key.get(method.strip().lower())
  if run_algorithm is None:
    print ("That is not an option.")
    continue
  run_algorithm()
  break

Calculating for Cho...



The accuracy of the Cho dataset:
0.7382950382950384
The standard deviation for accuracy:
0.021286093952511305

The AUC of the Cho dataset:
0.8282064465265668
The standard deviation for AUC:
0.012202015198283728

The recall of the Cho dataset:
0.7248322768621958
The standard deviation for recall:
0.019762121229298154

Calculating for CIFAR...








The accuracy of the CIFAR dataset:
0.33875
The standard deviation for accuracy: 
0.004277525245071298

The AUC of the CIFAR dataset: 
0.6327039802311446
The standard deviation for AUC:
0.0015841486437533556

The recall of the CIFAR dataset: 
0.3388772107490549
The standard deviation for recall:
0.0027481521259026284

