<a href="https://colab.research.google.com/github/tachidok/multifractal_breaKHist/blob/main/Multifractal_based_on_slide_ids.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cancer cells classification based on multifractal dimension

# Install non default packages

In [None]:
!pip uninstall scikit-learn -y
!pip install -U scikit-learn
#!pip install -U lazypredict

# Run this when using a local environment

In [None]:
pip install -U imblearn

# Import packages

In [None]:
import os
import sys

# Basic plotting
import numpy as np
#import cv2
import pylab as pl
from matplotlib import pyplot as plt

# Basic data base management and statistical tools
import pandas as pd
#import seaborn as sns

# Classification tools
from sklearn.model_selection import train_test_split

# Classification models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
#from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import LeaveOneOut

# Metrics and tools
from sklearn import preprocessing
# Metrics (accuracy, precision, recall, F_beta, AUC)
from sklearn import metrics
# Get the time for each classifier
from time import time

# Manage unbalance data
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from collections import Counter

# Visualization
from sklearn import tree

# AutoML
#import autosklearn.classification
#from lazypredict.Supervised import LazyClassifier

# Load data

## Load from local device (not used in colab)

In [None]:
#df = pd.read_csv('../data/csvfile_all.csv')
#df = pd.read_csv('../data/csv_files/02_csv_06052021/csvfile_all.csv')
#df = pd.read_csv('../data/csv_files/03_csv_files_bc_p23/csvfile_all.csv')
#df = pd.read_csv('../data/csv_files/04_csv_files_bc_p456/csvfile_all.csv')
#df = pd.read_csv('../data/csv_files/05_csv_files_seg_bc234/csvfile_all.csv')
#df = pd.read_csv('../data/csv_files/06_csv_files_heq_seg_bc234_log/csvfile_all.csv')
#df = pd.read_csv('../data/csv_files/07_csv_files_heq_segthadapt_bc234_log/csvfile_all.csv')

## Load to Google Drive from local file system using Python code (not used in colab)

In [None]:
#from google.colab import files
#uploaded = files.upload()

### Load data

In [None]:
# Alternative: CSV placed directly in the working directory
#df = pd.read_csv('csvfile_all.csv')

# Read the feature table from the local dataset folder
local_csv_path = './dataset/csv_files/06_csv_files_heq_seg_bc_234_log/csvfile_all.csv'
df = pd.read_csv(local_csv_path)

## Load from Google Drive (used in colab)

### Mount Google Drive

In [None]:
# Mount Google Drive under /content/gdrive so the CSV stored there can be read.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime;
# skip this cell when running locally.
from google.colab import drive
drive.mount('/content/gdrive')

### Where are we? (current path)

In [None]:
# Shell escape: print the current working directory of the runtime
!pwd

### Load data from Google Drive

In [None]:
# Read the feature table from the mounted Google Drive folder
gdrive_csv_path = '/content/gdrive/MyDrive/Colab Notebooks/dataset/cancer/csv_files/06_csv_files_heq_seg__bc_234_log/csvfile_all.csv'
df = pd.read_csv(gdrive_csv_path)

# Output dataframe

In [None]:
# Output the data frame (as the last expression of the cell it is rendered by the notebook)
df

# Pre-process data frames
1. Get the number of registers prior to pre-processing
2. Drop 'NaN' entries
3. Drop duplicates
4. Get the final number of registers post pre-processing

In [None]:
# Shape (rows, columns) of the data frame before cleaning
n_registers_prior_preprocessing = df.shape

# Remove rows with 'NaN' entries first, then remove duplicated rows
df = df.dropna().drop_duplicates()

# Optional steps, currently disabled:
#   drop the column named "0" (axis = 1 selects columns)
#df.drop("0", inplace = True, axis = 1)
#   rebuild the index after rows were removed
#df = df.reset_index()

# Shape (rows, columns) of the data frame after cleaning
n_registers_post_preprocessing = df.shape

# Report both shapes
print(f"Number of registers prior pre-processing {n_registers_prior_preprocessing}")
print(f"Number of registers post pre-processing {n_registers_post_preprocessing}")

# Get information about the dataset
* Number of samples for each tumor class or tumor type

* Tumor type

  * A = Adenosis
  * F = Fibroadenoma
  * TA = Tubular Adenoma
  * PT = Phyllodes Tumor

  * DC = Ductal Carcinoma
  * LC = Lobular Carcinoma
  * MC = Mucinous Carcinoma (Colloid)
  * PC = Papillary Carcinoma

* Tumor class

  * B = Benign
  * M = Malignant


In [None]:
# Class balance of the full data set: instances per tumor type and per tumor class
print("Full dataset: Number of instances for each tumor type")
print(df['tumor_type'].value_counts())

print("\nFull dataset: Number of instances for each tumor class")
print(df['tumor_class'].value_counts())

# Create groups based on magnification
groups_by_magnification = df.groupby('magnification')

# One sub-dataset per magnification (get_group raises KeyError if a magnification is absent)
group_40x = groups_by_magnification.get_group(40)
group_100x = groups_by_magnification.get_group(100)
group_200x = groups_by_magnification.get_group(200)
group_400x = groups_by_magnification.get_group(400)

# The same three reports are printed for every magnification, so they are
# produced in a single loop instead of four copy-pasted sections.
for label, magnification_group in (("40x", group_40x), ("100x", group_100x), ("200x", group_200x), ("400x", group_400x)):
  # Describe the sub-dataset
  print(f"\n\n{label} dataset: describe()")
  print(magnification_group['tumor_type'].describe())
  # Number of instances of each tumor type for this magnification
  print(f"\n{label} dataset: Number of instances for each tumor type")
  print(magnification_group['tumor_type'].value_counts())
  # Number of instances of each tumor class for this magnification
  print(f"\n{label} dataset: Number of instances for each tumor class")
  print(magnification_group['tumor_class'].value_counts())

# Plot data as it is!!!

## Print the name of the columns

In [None]:
# Show the column names of the data frame
print(df.columns)

## Plot Image vs fd_canny_100_200

In [None]:
# Scatter plot of the 'fd_canny_100_200' feature against the 'index' column.
# NOTE(review): this needs a column literally named 'index'; the reset_index() call in the
# pre-processing cell is commented out, so confirm that the CSV itself provides that column.
ax1 = df.plot.scatter(x='index', y='fd_canny_100_200')

## Plot Image vs fd_canny_150_250_as5

In [None]:
# Scatter plot of the 'fd_canny_150_250_as5' feature against the 'index' column.
# NOTE(review): requires a column named 'index' in the data frame - confirm it exists.
ax2 = df.plot.scatter(x='index', y='fd_canny_150_250_as5')

## Plot fd_canny_100_200 vs fd_canny_150_250_as5

In [None]:
# Scatter plot of one feature against the other
ax3 = df.plot.scatter(x='fd_canny_100_200', y='fd_canny_150_250_as5')
# Disabled alternative: 'index' against magnification
#df.plot.scatter(x='index',y='magnification')

# Strategy to create the new data frame

1.   Extract the slide ID/patient ID and add it into the data frame
2.   Split observations based on patients IDs/slides IDs
3.   Split the patients sub-groups per magnification
4.   Create as many combinations as possible (memory constraints) by taking one element of each magnification sub-group
5.   For each combination assign a tumor type and tumor class based on the patient IDs/slides IDs labels

## Extract the slide ID/patient ID from the image name and add it as a new column to the data set

In [None]:
# Image names as a numpy array, and how many there are
names_array = df['image_name'].to_numpy()
n_names = len(names_array)

# Each image name is split by '-'; the element at position 2 is the slide id.
# (The element at position 4 is the figure number followed by the image
# extension; it is not extracted at the moment.)
list_slide_ids = [image_name.split("-")[2] for image_name in names_array]

# Store the slide ids as a new column of the data frame
df['slide_id'] = list_slide_ids

# Show the data frame
df

## Drop columns and create a new data frame from groups

1. Drop 'path' and 'image_name' columns
2. Create groups by slide ids, then create subgroups by magnification and combine data into a single observation

 * Each row or observation in the data frame has the following structure

    * Canny feature for 40x - 01
    * Canny feature for 40x - 02
    * Canny feature for 100x - 01
    * Canny feature for 100x - 02
    * Canny feature for 200x - 01
    * Canny feature for 200x - 02
    * Canny feature for 400x - 01
    * Canny feature for 400x - 02
    * Tumor class
    * Tumor type

  The variable `n_max_img_per_group` constrains the number of images taken from each magnification group. This limit is needed because of memory constraints.

In [None]:
# Build one observation for every combination of (40x, 100x, 200x, 400x) images
# that belong to the same slide. itertools.product yields the index tuples in
# the same order as four nested loops (40x outermost, 400x innermost).
# https://www.geeksforgeeks.org/python-itertools-product/
from itertools import product

# Maximum number of images to consider per magnification group (memory limitation).
# The number of rows generated per slide is at most n_max_img_per_group ** 4.
n_max_img_per_group = 10

# Magnifications every slide must provide to build a combined observation
required_magnifications = (40, 100, 200, 400)

# Drop the path of the image and the full image name
reduced_df = df.drop(columns=['path', 'image_name'])
# Create groups by the slide ids
group_df = reduced_df.groupby('slide_id')

# The lists to store the new data (both canny features for each magnification)
list_magnification_40x_canny_01 = []
list_magnification_40x_canny_02 = []
list_magnification_100x_canny_01 = []
list_magnification_100x_canny_02 = []
list_magnification_200x_canny_01 = []
list_magnification_200x_canny_02 = []
list_magnification_400x_canny_01 = []
list_magnification_400x_canny_02 = []
# Also include the tumor class and the tumor type
list_tumor_class = []
list_tumor_type = []

counter = 0

# Loop over the slides
for slide_id, group in group_df:

  print(f"Slide id: {slide_id}")
  print("Patient number: %d" % counter)
  counter += 1

  # Create groups based on magnification
  sub_groups_by_magnification = group.groupby('magnification')

  # A slide without images for one of the magnifications cannot produce a
  # complete observation; report it and continue instead of failing with KeyError.
  missing_magnifications = [mag for mag in required_magnifications if mag not in sub_groups_by_magnification.groups]
  if missing_magnifications:
    print(f"Skipping slide {slide_id}: no images for magnification(s) {missing_magnifications}")
    continue

  # Get the groups by magnification
  group_40x = sub_groups_by_magnification.get_group(40)
  group_100x = sub_groups_by_magnification.get_group(100)
  group_200x = sub_groups_by_magnification.get_group(200)
  group_400x = sub_groups_by_magnification.get_group(400)

  # Numpy representation of the columns with the canny features
  canny_values_01_40x = group_40x['fd_canny_100_200'].to_numpy()
  canny_values_02_40x = group_40x['fd_canny_150_250_as5'].to_numpy()
  canny_values_01_100x = group_100x['fd_canny_100_200'].to_numpy()
  canny_values_02_100x = group_100x['fd_canny_150_250_as5'].to_numpy()
  canny_values_01_200x = group_200x['fd_canny_100_200'].to_numpy()
  canny_values_02_200x = group_200x['fd_canny_150_250_as5'].to_numpy()
  canny_values_01_400x = group_400x['fd_canny_100_200'].to_numpy()
  canny_values_02_400x = group_400x['fd_canny_150_250_as5'].to_numpy()

  # Tumor class and tumor type are taken from the 40x images; they are
  # expected to be the same for all the magnifications of a slide.
  tumor_class_np = group_40x['tumor_class'].to_numpy()
  tumor_type_np = group_40x['tumor_type'].to_numpy()

  # Number of images per magnification
  n_images_40x = len(canny_values_01_40x)
  n_images_100x = len(canny_values_01_100x)
  n_images_200x = len(canny_values_01_200x)
  n_images_400x = len(canny_values_01_400x)

  print(f"Number of images per 40x {n_images_40x}")
  print(f"Number of images per 100x {n_images_100x}")
  print(f"Number of images per 200x {n_images_200x}")
  print(f"Number of images per 400x {n_images_400x}")

  # Index ranges, capped to the maximum number of images per group
  index_ranges = [range(min(n_images, n_max_img_per_group)) for n_images in (n_images_40x, n_images_100x, n_images_200x, n_images_400x)]

  # Each row has the structure
  # [canny_40x, canny_40x, canny_100x, canny_100x, canny_200x, canny_200x, canny_400x, canny_400x, tumor_class, tumor_type]
  for i, j, k, m in product(*index_ranges):
    list_magnification_40x_canny_01.append(canny_values_01_40x[i])
    list_magnification_40x_canny_02.append(canny_values_02_40x[i])
    list_magnification_100x_canny_01.append(canny_values_01_100x[j])
    list_magnification_100x_canny_02.append(canny_values_02_100x[j])
    list_magnification_200x_canny_01.append(canny_values_01_200x[k])
    list_magnification_200x_canny_02.append(canny_values_02_200x[k])
    list_magnification_400x_canny_01.append(canny_values_01_400x[m])
    list_magnification_400x_canny_02.append(canny_values_02_400x[m])

    # Label of the combination, taken from the 40x image in use
    list_tumor_class.append(tumor_class_np[i])
    list_tumor_type.append(tumor_type_np[i])


## Create the new data frame

In [None]:
# Assemble the combined observations into a data frame: one column per
# (feature, magnification) pair followed by the two label columns.
new_df = pd.DataFrame({
    'canny_values_01_40x': list_magnification_40x_canny_01,
    'canny_values_02_40x': list_magnification_40x_canny_02,
    'canny_values_01_100x': list_magnification_100x_canny_01,
    'canny_values_02_100x': list_magnification_100x_canny_02,
    'canny_values_01_200x': list_magnification_200x_canny_01,
    'canny_values_02_200x': list_magnification_200x_canny_02,
    'canny_values_01_400x': list_magnification_400x_canny_01,
    'canny_values_02_400x': list_magnification_400x_canny_02,
    'tumor_class': list_tumor_class,
    'tumor_type': list_tumor_type,
})

new_df

# Get information about the new dataset
* Number of instances per tumor class and tumor type

In [None]:
# Summary of the tumor type column of the combined data set
tumor_type_column = new_df['tumor_type']
print("\n\nNew dataset: describe()")
print(tumor_type_column.describe())

# Instances per tumor type
print("\nNew dataset: Number of instances for each tumor type")
print(tumor_type_column.value_counts())

# Instances per tumor class
tumor_class_column = new_df['tumor_class']
print("\nNew dataset: Number of instances for each tumor class")
print(tumor_class_column.value_counts())

# Save the new data frame to disk (not run due to large file size)

In [None]:
# Save the data frame
#new_df.to_csv(rf'csvfile_new_df_grp_max_20.csv')

# Classification

## Data selection
* (Option 1) Tumor type

  * A = Adenosis
  * F = Fibroadenoma
  * TA = Tubular Adenoma
  * PT = Phyllodes Tumor

  * DC = Ductal Carcinoma
  * LC = Lobular Carcinoma
  * MC = Mucinous Carcinoma (Colloid)
  * PC = Papillary Carcinoma

* (Option 2) Tumor class

  * B = Benign
  * M = Malignant

### Option 1: Remove the tumor class column


In [None]:
# Drop the tumor class column
#new_df = new_df.drop(columns=['tumor_class'])
#new_df

### Option 2: Remove the tumor type column



In [None]:
# Drop the tumor type column (option 2 keeps the tumor class as the target).
# errors='ignore' lets the cell be re-run after the column is already gone
# instead of raising KeyError.
new_df = new_df.drop(columns=['tumor_type'], errors='ignore')
new_df

## Get information again
* Number of tumor type/class in the dataset

In [None]:
# Describe the dataset
#print("\n\nNew dataset: describe()")
#print(new_df['tumor_type'].describe())
# Get the number of instances of each tumor type
#print("\nNew dataset: Number of instances for each tumor type")
#print(new_df['tumor_type'].value_counts())

In [None]:
# Describe the target column; the call is restored so that the
# "describe()" header printed below is followed by its content.
print("\n\nNew dataset: describe()")
print(new_df['tumor_class'].describe())
# Get the number of instances of each tumor class
print("\nNew dataset: Number of instances for each tumor class")
print(new_df['tumor_class'].value_counts())

## Split data into training and testing groups

* X stores all observations (input features)
* y stores the target label of every observation

* X_train stores training group observations
* y_train stores training group target labels

* X_test stores testing group observations
* y_test stores testing group target labels

In [None]:
# Columns used as observations/input values
feature_columns = ['canny_values_01_40x', 'canny_values_02_40x', 'canny_values_01_100x', 'canny_values_02_100x', 'canny_values_01_200x', 'canny_values_02_200x', 'canny_values_01_400x', 'canny_values_02_400x']
X = new_df[feature_columns]

# Target labels (from the "tumor class" or the "tumor type" column)
y = new_df['tumor_class']
#y = new_df['tumor_type']

# Split the data.
# random_state makes the split reproducible between runs, and stratify keeps
# the class proportions of y in both the training and the testing groups.
# NOTE(review): rows built from the same slide share images and can land in
# both groups because new_df carries no slide id; consider keeping 'slide_id'
# and splitting by groups (e.g. GroupShuffleSplit) to avoid optimistic scores.
percentage_for_test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = percentage_for_test_size, shuffle = True, random_state = 0, stratify = y)

print("Full dataset size")
print(X.shape)
print(y.shape)

print("\nTraining dataset size")
print(X_train.shape)
print(y_train.shape)
print("\nTesting dataset size")
print(X_test.shape)
print(y_test.shape)

## Pre-processing

### Scaling (training and testing data)

In [None]:
# The scaler is fitted with the training observations only and then applied
# to both the training and the testing observations.
#scaler = preprocessing.MinMaxScaler().fit(X_train)
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(observations) for observations in (X_train, X_test))

### Correct the training vector shape

In [None]:
# Turn the training labels into a flat 1-D numpy array
y_train = np.asarray(y_train).ravel()

### Manage unbalanced data set

In [None]:
# Class counts before balancing
print("Before unbalance management", Counter(y_train))

# Randomly under-sample the majority class.
# random_state fixes which samples are kept so the results are reproducible.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=0)

# Only the training data is re-sampled; the testing data keeps its original distribution
X_train_balanced, y_train_balanced = undersample.fit_resample(X_train_scaled, y_train)

# Class counts after balancing
print("After unbalance management", Counter(y_train_balanced))

print("\nTraining dataset size")
print(X_train_balanced.shape)
print(y_train_balanced.shape)
print("\nTesting dataset size")
print(X_test_scaled.shape)
print(y_test.shape)

## Compute feature importance
* Use a decision tree to get the importance of each feature on classification
* Use the whole dataset, since here we only want to know the importance of each feature

In [None]:
# Decision tree fitted with the whole (unscaled) dataset to get the feature importance.
# NOTE(review): the name `tree` rebinds the `sklearn.tree` module imported at the top of
# the notebook; it is kept because the next cell reads `tree.feature_importances_`.
tree = DecisionTreeClassifier(random_state=0, max_depth = 10).fit(X, y)
# Accuracy on the same data used for fitting
tree.score(X, y)

In [None]:
# Get a graph with the importance of each feature
%matplotlib inline
pd.Series(tree.feature_importances_, index = X.columns).plot.barh(figsize=(25, 10));

## Proceed to classification (processing)

### Dummy classifier
* Create a dummy classifier to get the worst cases to compare with
* It is expected that the tested classifiers get better results than the dummy one
* Use different strategies for the dummy **classifier**

In [None]:
# Baseline strategies to compare the real classifiers against
dummy_strategies = ('most_frequent', 'stratified', 'prior', 'uniform')
# One report line per strategy: name, accuracy and fit/predict times
report_template = "{:<15}| score = {:.4f} | time = {:,.3f}s/{:,.3f}s"

for strategy in dummy_strategies:
  dummy = DummyClassifier(random_state = 0, strategy = strategy)

  # Time the fit on the balanced training data
  start = time()
  dummy.fit(X_train_balanced, y_train_balanced)
  train_time = time() - start

  # Time the accuracy computation on the testing data
  start = time()
  score = dummy.score(X_test_scaled, y_test)
  predict_time = time() - start

  print(report_template.format(strategy, score, train_time, predict_time))

### Logistic regression

In [None]:
#   ****************Logistic Regression*****************
# Other solvers tried: 'liblinear' and 'lbfgs'
estimator = LogisticRegression(random_state=0, solver='sag', n_jobs=-1)

# Fit (timed) with the scaled and balanced training data
start = time()
estimator.fit(X_train_balanced, y_train_balanced)
train_time = time() - start

# Prediction (timed) for the scaled testing data
start = time()
y_pred = estimator.predict(X_test_scaled)
predict_time = time() - start

# Make sure the predictions are a flat 1-D vector
y_pred = np.ravel(y_pred)

# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy = {:.4f} | time = {:,.3f}s/{:,.3f}s".format(accuracy, train_time, predict_time))

# Confusion matrix with raw counts (rows: true class, columns: predicted class)
cm = metrics.confusion_matrix(y_test, y_pred, labels=estimator.classes_)
print("\nConfusion matrix")
print(cm)

# Confusion matrix plot, normalised over all the samples.
# The axes are created first with the wanted size and handed to the display;
# calling plt.figure(figsize=...) after plotting only opens a new empty figure.
fig_cm, ax_cm = plt.subplots(figsize=(10, 10))
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=estimator.classes_, display_labels=estimator.classes_, normalize='all', cmap='Greys', ax=ax_cm)
ax_cm.set_title("Confusion matrix")
plt.show()

# Classification report
print(metrics.classification_report(y_test, y_pred, digits=4))

# ROC curve, drawn on axes created with the wanted size
fig, ax = plt.subplots(figsize=(10, 10))
metrics.RocCurveDisplay.from_estimator(estimator, X_test_scaled, y_test, alpha=0.8, lw=2, ax=ax)
# Diagonal of a classifier that guesses at random
ax.plot([0,1], [0,1], linestyle="--", lw=2, color = "r", label = "Chance", alpha = 0.5)
# Rebuild the legend so that the diagonal is listed as well
ax.legend(loc="lower right")
ax.set_title("ROC Curve")
plt.show()

# Check the section for multiclass
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
# For displaying ROC curve
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.RocCurveDisplay.html#sklearn.metrics.RocCurveDisplay.from_estimator
# ROC curve with cross-validation
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

# Area Under Curve (AUC)
# Check this https://scikit-learn.org/stable/modules/model_evaluation.html#roc-auc-binary
# "estimator.predict_proba(X_test_scaled)[:, 1]" corresponds to the probability of the class
# with the "greater label", that is why the 1 as the index (binary target only).
y_score = estimator.predict_proba(X_test_scaled)[:, 1]
auc = metrics.roc_auc_score(y_test, y_score)

print("\nArea Under Curve (AUC) = {:.4f}".format(auc))
# Check examples for multiclass
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html#sklearn.metrics.roc_auc_score

#### Cross-validation for logistic regression

In [None]:
# 10-fold cross-validation (timed) of the current `estimator` with the balanced
# training data, scored with the macro-averaged F1.
# Alternatives tried: cv=LeaveOneOut() and
# cv=ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
start = time()
cross_validation_scores = cross_val_score(
    estimator,
    X_train_balanced,
    y_train_balanced,
    cv=10,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
)
cross_validation_time = time() - start

# Summary: mean, standard deviation, elapsed time and the score of every fold
summary_lines = (
    "Mean cross-validation scores (Stratified CV) = {:.4f}".format(np.mean(cross_validation_scores)),
    "Standard deviation for cross-validation scores (Stratified CV) = {:.4f}".format(np.std(cross_validation_scores)),
    "Reported cross-validation time = {:,.3f}s".format(cross_validation_time),
)
print("\n".join(summary_lines))
print("All cross-validation scores:\n", cross_validation_scores)

### Multilayer Perceptron

In [None]:
#   ****************Multi-layer Perceptron*****************
# Other configurations tried: alpha=1; alpha=0.0001 with and without early stopping.
# random_state fixes the weight initialisation and the early-stopping validation
# split, so the results are reproducible like the other seeded estimators.
estimator = MLPClassifier(alpha=0.001, tol=1e-3, solver='adam', verbose=True, early_stopping=True, random_state=0)

# Fit (timed) with the scaled and balanced training data
start = time()
estimator.fit(X_train_balanced, y_train_balanced)
train_time = time() - start

# Prediction (timed) for the scaled testing data
start = time()
y_pred = estimator.predict(X_test_scaled)
predict_time = time() - start

# Make sure the predictions are a flat 1-D vector
y_pred = np.ravel(y_pred)

# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy = {:.4f} | time = {:,.3f}s/{:,.3f}s".format(accuracy, train_time, predict_time))

# Confusion matrix with raw counts (rows: true class, columns: predicted class)
cm = metrics.confusion_matrix(y_test, y_pred, labels=estimator.classes_)
print("\nConfusion matrix")
print(cm)

# Confusion matrix plot, normalised over all the samples.
# fig1/ax1 are created BEFORE plotting and handed to the display; otherwise the
# matrix is drawn on another figure and the file saved from fig1 is empty.
fig1, ax1 = plt.subplots(figsize=(10, 10))
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=estimator.classes_, display_labels=estimator.classes_, normalize='all', cmap='Greys', ax=ax1)
ax1.set_title("Confusion matrix")
# Save before showing the figure
fig1.savefig("MLP.png")
plt.show()

# Classification report
print(metrics.classification_report(y_test, y_pred, digits=4))

# ROC curve, drawn on axes created with the wanted size
fig, ax = plt.subplots(figsize=(10, 10))
metrics.RocCurveDisplay.from_estimator(estimator, X_test_scaled, y_test, alpha=0.8, lw=2, ax=ax)
# Diagonal of a classifier that guesses at random
ax.plot([0,1], [0,1], linestyle="--", lw=2, color = "r", label = "Chance", alpha = 0.5)
# Rebuild the legend so that the diagonal is listed as well
ax.legend(loc="lower right")
ax.set_title("ROC Curve")
plt.show()

# Area Under Curve (AUC)
# Check this https://scikit-learn.org/stable/modules/model_evaluation.html#roc-auc-binary
# "estimator.predict_proba(X_test_scaled)[:, 1]" corresponds to the probability of the class
# with the "greater label", that is why the 1 as the index (binary target only).
y_score = estimator.predict_proba(X_test_scaled)[:, 1]
auc = metrics.roc_auc_score(y_test, y_score)
print("\nArea Under Curve (AUC) = {:.4f}".format(auc))

#### Cross-validation for Multilayer-Perceptron

In [None]:
# 10-fold cross-validation (timed) of the current `estimator` with the balanced
# training data, scored with the macro-averaged F1.
# Alternatives tried: cv=LeaveOneOut() and
# cv=ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
start = time()
cross_validation_scores = cross_val_score(
    estimator,
    X_train_balanced,
    y_train_balanced,
    cv=10,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
)
cross_validation_time = time() - start

# Summary: mean, standard deviation, elapsed time and the score of every fold
summary_lines = (
    "Mean cross-validation scores (Stratified CV) = {:.4f}".format(np.mean(cross_validation_scores)),
    "Standard deviation for cross-validation scores (Stratified CV) = {:.4f}".format(np.std(cross_validation_scores)),
    "Reported cross-validation time = {:,.3f}s".format(cross_validation_time),
)
print("\n".join(summary_lines))
print("All cross-validation scores:\n", cross_validation_scores)

### Naive-Bayes

In [None]:
#   ****************Naive-Bayes*****************
estimator = GaussianNB()

# Fit (timed) with the scaled and balanced training data
start = time()
estimator.fit(X_train_balanced, y_train_balanced)
train_time = time() - start

# Prediction (timed) for the scaled testing data
start = time()
y_pred = estimator.predict(X_test_scaled)
predict_time = time() - start

# Make sure the predictions are a flat 1-D vector
y_pred = np.ravel(y_pred)

# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy = {:.4f} | time = {:,.3f}s/{:,.3f}s".format(accuracy, train_time, predict_time))

# Confusion matrix with raw counts (rows: true class, columns: predicted class)
cm = metrics.confusion_matrix(y_test, y_pred, labels=estimator.classes_)
print("\nConfusion matrix")
print(cm)

# Confusion matrix plot, normalised over all the samples.
# The axes are created first with the wanted size and handed to the display;
# calling plt.figure(figsize=...) after plotting only opens a new empty figure.
fig_cm, ax_cm = plt.subplots(figsize=(10, 10))
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=estimator.classes_, display_labels=estimator.classes_, normalize='all', cmap='Greys', ax=ax_cm)
ax_cm.set_title("Confusion matrix")
plt.show()

# Classification report
print(metrics.classification_report(y_test, y_pred, digits=4))

# ROC curve, drawn on axes created with the wanted size
fig, ax = plt.subplots(figsize=(10, 10))
metrics.RocCurveDisplay.from_estimator(estimator, X_test_scaled, y_test, alpha=0.8, lw=2, ax=ax)
# Diagonal of a classifier that guesses at random
ax.plot([0,1], [0,1], linestyle="--", lw=2, color = "r", label = "Chance", alpha = 0.5)
# Rebuild the legend so that the diagonal is listed as well
ax.legend(loc="lower right")
ax.set_title("ROC Curve")
plt.show()

# Area Under Curve (AUC)
# Check this https://scikit-learn.org/stable/modules/model_evaluation.html#roc-auc-binary
# "estimator.predict_proba(X_test_scaled)[:, 1]" corresponds to the probability of the class
# with the "greater label", that is why the 1 as the index (binary target only).
y_score = estimator.predict_proba(X_test_scaled)[:, 1]
auc = metrics.roc_auc_score(y_test, y_score)
print("\nArea Under Curve (AUC) = {:.4f}".format(auc))

#### Cross-validation for Naive-Bayes

In [None]:
# 10-fold cross-validation (timed) of the current `estimator` with the balanced
# training data, scored with the macro-averaged F1.
# Alternatives tried: cv=LeaveOneOut() and
# cv=ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
start = time()
cross_validation_scores = cross_val_score(
    estimator,
    X_train_balanced,
    y_train_balanced,
    cv=10,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
)
cross_validation_time = time() - start

# Summary: mean, standard deviation, elapsed time and the score of every fold
summary_lines = (
    "Mean cross-validation scores (Stratified CV) = {:.4f}".format(np.mean(cross_validation_scores)),
    "Standard deviation for cross-validation scores (Stratified CV) = {:.4f}".format(np.std(cross_validation_scores)),
    "Reported cross-validation time = {:,.3f}s".format(cross_validation_time),
)
print("\n".join(summary_lines))
print("All cross-validation scores:\n", cross_validation_scores)

### Gaussian Process

In [None]:
#   ****************Gaussian Process classifier*****************
# The notebook-level import of GaussianProcessClassifier is commented out, so import it
# here; otherwise the constructor call below raises NameError.
from sklearn.gaussian_process import GaussianProcessClassifier

estimator = GaussianProcessClassifier(max_iter_predict=10, random_state=0, n_jobs=-1)

# Fit on the balanced training set and time it
start = time()
estimator.fit(X_train_balanced, y_train_balanced)
train_time = time() - start

# Predict on the scaled test set and time it
start = time()
y_pred = estimator.predict(X_test_scaled)
predict_time = time() - start

# Flatten the predictions to 1-D before computing any metric
y_pred = np.ravel(y_pred)

# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)

# Print results (time is reported as fit/predict)
print("Accuracy = {:.4f} | time = {:,.3f}s/{:,.3f}s".format(accuracy, train_time, predict_time))

# Confusion matrix (raw counts, rows/columns ordered as estimator.classes_)
cm = metrics.confusion_matrix(y_test, y_pred, labels=estimator.classes_)
print("\nConfusion matrix")
print(cm)

# Confusion matrix plot, normalized over all samples. The axes are created first so
# that figsize applies to this plot; calling plt.figure() after plotting would only
# open a second, empty figure. `labels` keeps the plot aligned with `cm` above.
fig_cm, ax_cm = plt.subplots(figsize=(10, 10))
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=estimator.classes_, display_labels=estimator.classes_, normalize='all', cmap='Greys', ax=ax_cm)
ax_cm.set_title("Confusion matrix")
plt.show()

# Classification report
print(metrics.classification_report(y_test, y_pred, digits=4))

# ROC curve with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(10, 10))
metrics.RocCurveDisplay.from_estimator(estimator, X_test_scaled, y_test, alpha=0.8, lw=2, ax=ax)
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.5)
ax.set_title("ROC Curve")
# Rebuild the legend so that it also lists the chance line added after the ROC curve
ax.legend(loc="lower right")
plt.show()

# Area Under Curve (AUC)
# See https://scikit-learn.org/stable/modules/model_evaluation.html#roc-auc-binary
# Column 1 of predict_proba is the probability of the class with the "greater label",
# which is the class roc_auc_score treats as positive in the binary case.
y_score = estimator.predict_proba(X_test_scaled)[:, 1]
auc = metrics.roc_auc_score(y_test, y_score)
print("\nArea Under Curve (AUC) = {:.4f}".format(auc))

#### Cross-validation for Gaussian-Process

In [None]:
# 10-fold cross-validation of the Gaussian Process `estimator` on the balanced training
# set, scored with macro-averaged F1. Passing an integer `cv` with a classifier makes
# scikit-learn use stratified folds.
# Alternatives tried previously: cv=LeaveOneOut(), or
# cv=ShuffleSplit(n_splits=5, test_size=0.3, random_state=0).
start = time()
cross_validation_scores = cross_val_score(
    estimator,
    X_train_balanced,
    y_train_balanced,
    scoring='f1_macro',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
cross_validation_time = time() - start

# Summary statistics over the per-fold scores
score_mean = np.mean(cross_validation_scores)
score_std = np.std(cross_validation_scores)

print("Mean cross-validation scores (Stratified CV) = {:.4f}".format(score_mean))
print("Standard deviation for cross-validation scores (Stratified CV) = {:.4f}".format(score_std))
print("Reported cross-validation time = {:,.3f}s".format(cross_validation_time))
print("All cross-validation scores:\n", cross_validation_scores)

### SVM

In [None]:
#   ****************SVM (RBF kernel)*****************
# Other configurations tried previously:
#estimator = SVC(kernel="poly", verbose=True)
#estimator = SVC(kernel="linear", verbose=True)
#estimator = SVC(kernel="linear", gamma=0.1, verbose=True)
#estimator = SVC(kernel="linear", gamma=0.001, verbose=True)
estimator = SVC(kernel="rbf", verbose=True)

# Fit on the balanced training set and time it
start = time()
estimator.fit(X_train_balanced, y_train_balanced)
train_time = time() - start

# Predict on the scaled test set and time it
start = time()
y_pred = estimator.predict(X_test_scaled)
predict_time = time() - start

# Flatten the predictions to 1-D before computing any metric
y_pred = np.ravel(y_pred)

# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)

# Print results (time is reported as fit/predict)
print("Accuracy = {:.4f} | time = {:,.3f}s/{:,.3f}s".format(accuracy, train_time, predict_time))

# Confusion matrix (raw counts, rows/columns ordered as estimator.classes_)
cm = metrics.confusion_matrix(y_test, y_pred, labels=estimator.classes_)
print("\nConfusion matrix")
print(cm)

# Confusion matrix plot, normalized over all samples. The axes are created first so
# that figsize applies to this plot; calling plt.figure() after plotting would only
# open a second, empty figure. `labels` keeps the plot aligned with `cm` above.
fig_cm, ax_cm = plt.subplots(figsize=(10, 10))
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=estimator.classes_, display_labels=estimator.classes_, normalize='all', cmap='Greys', ax=ax_cm)
ax_cm.set_title("Confusion matrix")
plt.show()

# Classification report
print(metrics.classification_report(y_test, y_pred, digits=4))

# ROC curve with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(10, 10))
metrics.RocCurveDisplay.from_estimator(estimator, X_test_scaled, y_test, alpha=0.8, lw=2, ax=ax)
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.5)
ax.set_title("ROC Curve")
# Rebuild the legend so that it also lists the chance line added after the ROC curve
ax.legend(loc="lower right")
plt.show()

# Area Under Curve (AUC)
# See https://scikit-learn.org/stable/modules/model_evaluation.html#roc-auc-binary
# The SVC above is built without probability=True, so predict_proba is not available
# and calling it raises AttributeError. The signed decision_function scores are used
# instead: roc_auc_score accepts non-thresholded decision values, and larger values
# favour the class with the "greater label" (binary problem assumed, as in the other
# classifier cells of this notebook).
y_score = estimator.decision_function(X_test_scaled)
auc = metrics.roc_auc_score(y_test, y_score)
print("\nArea Under Curve (AUC) = {:.4f}".format(auc))

#### Cross-validation for SVM

In [None]:
# 10-fold cross-validation of the SVM `estimator` on the balanced training set, scored
# with macro-averaged F1. Passing an integer `cv` with a classifier makes scikit-learn
# use stratified folds.
# Alternatives tried previously: cv=LeaveOneOut(), or
# cv=ShuffleSplit(n_splits=5, test_size=0.3, random_state=0).
start = time()
cross_validation_scores = cross_val_score(
    estimator,
    X_train_balanced,
    y_train_balanced,
    scoring='f1_macro',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
cross_validation_time = time() - start

# Summary statistics over the per-fold scores
score_mean = np.mean(cross_validation_scores)
score_std = np.std(cross_validation_scores)

print("Mean cross-validation scores (Stratified CV) = {:.4f}".format(score_mean))
print("Standard deviation for cross-validation scores (Stratified CV) = {:.4f}".format(score_std))
print("Reported cross-validation time = {:,.3f}s".format(cross_validation_time))
print("All cross-validation scores:\n", cross_validation_scores)

### Decision tree

In [None]:
#   ****************Decision Tree*****************
# Other configurations tried previously:
#estimator = DecisionTreeClassifier(max_depth=3, random_state=0)
#estimator = DecisionTreeClassifier(random_state=0)
estimator = DecisionTreeClassifier(max_depth=20, random_state=0)

# Fit on the balanced training set and time it
start = time()
estimator.fit(X_train_balanced, y_train_balanced)
train_time = time() - start

# Predict on the scaled test set and time it
start = time()
y_pred = estimator.predict(X_test_scaled)
predict_time = time() - start

# Flatten the predictions to 1-D before computing any metric
y_pred = np.ravel(y_pred)

# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)

# Print results (time is reported as fit/predict)
print("Accuracy = {:.4f} | time = {:,.3f}s/{:,.3f}s".format(accuracy, train_time, predict_time))

# Confusion matrix (raw counts, rows/columns ordered as estimator.classes_)
cm = metrics.confusion_matrix(y_test, y_pred, labels=estimator.classes_)
print("\nConfusion matrix")
print(cm)

# Confusion matrix plot, normalized over all samples. The axes are created first so
# that figsize applies to this plot; calling plt.figure() after plotting would only
# open a second, empty figure. `labels` keeps the plot aligned with `cm` above.
fig_cm, ax_cm = plt.subplots(figsize=(10, 10))
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=estimator.classes_, display_labels=estimator.classes_, normalize='all', cmap='Greys', ax=ax_cm)
ax_cm.set_title("Confusion matrix")
plt.show()

# Classification report
print(metrics.classification_report(y_test, y_pred, digits=4))

# ROC curve with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(10, 10))
metrics.RocCurveDisplay.from_estimator(estimator, X_test_scaled, y_test, alpha=0.8, lw=2, ax=ax)
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.5)
ax.set_title("ROC Curve")
# Rebuild the legend so that it also lists the chance line added after the ROC curve
ax.legend(loc="lower right")
plt.show()

# Area Under Curve (AUC)
# See https://scikit-learn.org/stable/modules/model_evaluation.html#roc-auc-binary
# Column 1 of predict_proba is the probability of the class with the "greater label",
# which is the class roc_auc_score treats as positive in the binary case.
y_score = estimator.predict_proba(X_test_scaled)[:, 1]
auc = metrics.roc_auc_score(y_test, y_score)
print("\nArea Under Curve (AUC) = {:.4f}".format(auc))

In [None]:
#print(f"Classes: {estimator.classes_}")
#print(f"Feature importance: {estimator.feature_importances_}")
#print(f"Max. features: {estimator.max_features}")
#print(f"Number of classes: {estimator.n_classes_}")
#print(f"Number of features: {estimator.n_features_}")
#print(f"Number of outputs: {estimator.n_outputs_}")

#print(f"Depth: {estimator.get_depth()}")
#print(f"Number of leaves: {estimator.get_n_leaves()}")

In [None]:
#tree.plot_tree(estimator)
#plt.show()

#### Cross-validation for Decision Tree

In [None]:
# 10-fold cross-validation of the Decision Tree `estimator` on the balanced training
# set, scored with macro-averaged F1. Passing an integer `cv` with a classifier makes
# scikit-learn use stratified folds.
# Alternatives tried previously: cv=LeaveOneOut(), or
# cv=ShuffleSplit(n_splits=5, test_size=0.3, random_state=0).
start = time()
cross_validation_scores = cross_val_score(
    estimator,
    X_train_balanced,
    y_train_balanced,
    scoring='f1_macro',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
cross_validation_time = time() - start

# Summary statistics over the per-fold scores
score_mean = np.mean(cross_validation_scores)
score_std = np.std(cross_validation_scores)

print("Mean cross-validation scores (Stratified CV) = {:.4f}".format(score_mean))
print("Standard deviation for cross-validation scores (Stratified CV) = {:.4f}".format(score_std))
print("Reported cross-validation time = {:,.3f}s".format(cross_validation_time))
print("All cross-validation scores:\n", cross_validation_scores)

### Random forest

In [None]:
#   ****************Random Forest*****************
# Other configuration tried previously:
#estimator = RandomForestClassifier(n_estimators=10, random_state=0, n_jobs=-1)
estimator = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=0, n_jobs=-1, verbose=2)

# Fit on the balanced training set and time it
start = time()
estimator.fit(X_train_balanced, y_train_balanced)
train_time = time() - start

# Predict on the scaled test set and time it
start = time()
y_pred = estimator.predict(X_test_scaled)
predict_time = time() - start

# Flatten the predictions to 1-D before computing any metric
y_pred = np.ravel(y_pred)

# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)

# Print results (time is reported as fit/predict)
print("Accuracy = {:.4f} | time = {:,.3f}s/{:,.3f}s".format(accuracy, train_time, predict_time))

# Confusion matrix (raw counts, rows/columns ordered as estimator.classes_)
cm = metrics.confusion_matrix(y_test, y_pred, labels=estimator.classes_)
print("\nConfusion matrix")
print(cm)

# Confusion matrix plot, normalized over all samples. The axes are created first so
# that figsize applies to this plot; calling plt.figure() after plotting would only
# open a second, empty figure. `labels` keeps the plot aligned with `cm` above.
fig_cm, ax_cm = plt.subplots(figsize=(10, 10))
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=estimator.classes_, display_labels=estimator.classes_, normalize='all', cmap='Greys', ax=ax_cm)
ax_cm.set_title("Confusion matrix")
plt.show()

# Classification report
print(metrics.classification_report(y_test, y_pred, digits=4))

# ROC curve with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(10, 10))
metrics.RocCurveDisplay.from_estimator(estimator, X_test_scaled, y_test, alpha=0.8, lw=2, ax=ax)
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.5)
ax.set_title("ROC Curve")
# Rebuild the legend so that it also lists the chance line added after the ROC curve
ax.legend(loc="lower right")
plt.show()

# Area Under Curve (AUC)
# See https://scikit-learn.org/stable/modules/model_evaluation.html#roc-auc-binary
# Column 1 of predict_proba is the probability of the class with the "greater label",
# which is the class roc_auc_score treats as positive in the binary case.
y_score = estimator.predict_proba(X_test_scaled)[:, 1]
auc = metrics.roc_auc_score(y_test, y_score)
print("\nArea Under Curve (AUC) = {:.4f}".format(auc))

In [None]:
#print(f"Classes: {rf.classes_}")
#print(f"Feature importance: {rf.feature_importances_}")
#print(f"Max. features: {rf.max_features}")
#print(f"Number of classes: {rf.n_classes_}")
#print(f"Number of features: {rf.n_features_}")
#print(f"Number of outputs: {rf.n_outputs_}")

#print(f"Base estimator: {rf.base_estimator_}")
##print(f"Estimators: {rf.estimators_}")
#for estimator in rf.estimators_:
    #print(f"Estimator: {estimator}")

#### Cross-validation for Random Forest

In [None]:
# 10-fold cross-validation of the Random Forest `estimator` on the balanced training
# set, scored with macro-averaged F1. Passing an integer `cv` with a classifier makes
# scikit-learn use stratified folds.
# Alternatives tried previously: cv=LeaveOneOut(), or
# cv=ShuffleSplit(n_splits=5, test_size=0.3, random_state=0).
start = time()
cross_validation_scores = cross_val_score(
    estimator,
    X_train_balanced,
    y_train_balanced,
    scoring='f1_macro',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
cross_validation_time = time() - start

# Summary statistics over the per-fold scores
score_mean = np.mean(cross_validation_scores)
score_std = np.std(cross_validation_scores)

print("Mean cross-validation scores (Stratified CV) = {:.4f}".format(score_mean))
print("Standard deviation for cross-validation scores (Stratified CV) = {:.4f}".format(score_std))
print("Reported cross-validation time = {:,.3f}s".format(cross_validation_time))
print("All cross-validation scores:\n", cross_validation_scores)

### K Neighbors

In [None]:
#   ****************K Neighbors classifier*****************
# Other configuration tried previously:
#estimator = KNeighborsClassifier(n_neighbors = 3, weights = 'distance', n_jobs = 4)
estimator = KNeighborsClassifier(n_neighbors = 5, weights = 'distance', n_jobs = -1)

# Fit on the balanced training set and time it
start = time()
estimator.fit(X_train_balanced, y_train_balanced)
train_time = time() - start

# Predict on the scaled test set and time it
start = time()
y_pred = estimator.predict(X_test_scaled)
predict_time = time() - start

# Flatten the predictions to 1-D before computing any metric
y_pred = np.ravel(y_pred)

# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)

# Print results (time is reported as fit/predict)
print("Accuracy = {:.4f} | time = {:,.3f}s/{:,.3f}s".format(accuracy, train_time, predict_time))

# Confusion matrix (raw counts, rows/columns ordered as estimator.classes_)
cm = metrics.confusion_matrix(y_test, y_pred, labels=estimator.classes_)
print("\nConfusion matrix")
print(cm)

# Confusion matrix plot, normalized over all samples. The axes are created first so
# that figsize applies to this plot; calling plt.figure() after plotting would only
# open a second, empty figure. `labels` keeps the plot aligned with `cm` above.
fig_cm, ax_cm = plt.subplots(figsize=(10, 10))
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=estimator.classes_, display_labels=estimator.classes_, normalize='all', cmap='Greys', ax=ax_cm)
ax_cm.set_title("Confusion matrix")
plt.show()

# Classification report
print(metrics.classification_report(y_test, y_pred, digits=4))

# ROC curve with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(10, 10))
metrics.RocCurveDisplay.from_estimator(estimator, X_test_scaled, y_test, alpha=0.8, lw=2, ax=ax)
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.5)
ax.set_title("ROC Curve")
# Rebuild the legend so that it also lists the chance line added after the ROC curve
ax.legend(loc="lower right")
plt.show()

# Area Under Curve (AUC)
# See https://scikit-learn.org/stable/modules/model_evaluation.html#roc-auc-binary
# Column 1 of predict_proba is the probability of the class with the "greater label",
# which is the class roc_auc_score treats as positive in the binary case.
y_score = estimator.predict_proba(X_test_scaled)[:, 1]
auc = metrics.roc_auc_score(y_test, y_score)
print("\nArea Under Curve (AUC) = {:.4f}".format(auc))

In [None]:
#print(f"Classes: {knc.classes_}")
#print(f"Effective metric: {knc.effective_metric_}")
#print(f"Effective metric params: {knc.effective_metric_params_}")
#print(f"N. samples fit: {knc.n_samples_fit_}")

#### Cross-validation for KNN

In [None]:
# 10-fold cross-validation of the K Neighbors `estimator` on the balanced training set,
# scored with macro-averaged F1. Passing an integer `cv` with a classifier makes
# scikit-learn use stratified folds.
# Alternatives tried previously: cv=LeaveOneOut(), or
# cv=ShuffleSplit(n_splits=5, test_size=0.3, random_state=0).
start = time()
cross_validation_scores = cross_val_score(
    estimator,
    X_train_balanced,
    y_train_balanced,
    scoring='f1_macro',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
cross_validation_time = time() - start

# Summary statistics over the per-fold scores
score_mean = np.mean(cross_validation_scores)
score_std = np.std(cross_validation_scores)

print("Mean cross-validation scores (Stratified CV) = {:.4f}".format(score_mean))
print("Standard deviation for cross-validation scores (Stratified CV) = {:.4f}".format(score_std))
print("Reported cross-validation time = {:,.3f}s".format(cross_validation_time))
print("All cross-validation scores:\n", cross_validation_scores)

# AutoML with LazyPredict

In [None]:
# The notebook-level import of LazyClassifier is commented out, so import it here;
# otherwise the constructor call below raises NameError. This requires the
# `lazypredict` package to be installed (its pip line at the top of the notebook is
# commented out as well).
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit the LazyClassifier on the balanced training set, evaluating on the scaled test
# set, and time the whole run
start = time()
models, predictions = clf.fit(X_train_balanced, X_test_scaled, y_train_balanced, y_test)
lazy_classifier_time = time() - start

# Print results
print("time = {:,.3f}s".format(lazy_classifier_time))

In [None]:
# Show the `models` object returned by clf.fit(...) in the previous cell.
print(models)

# AutoML with auto-sklearn

In [None]:
#automl = autosklearn.classification.AutoSklearnClassifier(
    #include_estimators=["decision_tree", "k_nearest_neighbors", "random_forest", ], exclude_estimators=None,
    #include_preprocessors=["fast_ica", "pca", "polynomial", "no_preprocessing", "truncatedSVD", ], exclude_preprocessors=None,
    #n_jobs=4)
#automl.fit(X_train, y_train)
#y_pred_auto = automl.predict(X_test)
#print("Accuracy score of AutoML", metrics.accuracy_score(y_test, y_pred_auto))

In [None]:
#automl.cv_results_

In [None]:
#automl.sprint_statistics()

In [None]:
#automl.show_models()

In [None]:
#print(automl.show_models())