### Probabilistic classification: RF & adjust tuning step using 8 BG slides


Thresholding in the context of:

1) Compute the difference p(class) - t(class) for each class **

2) Sort the differences in descending order

3) Compare the max to the threshold; if it is less, move on to compare the next most likely class against its threshold

4) Continue until a probability is equal to or greater than its threshold, then assign that class

5) If no class gets assigned, the cell is labelled as 'ambiguous'

In [1]:
## Importing libraries 

import pandas as pd
import numpy as np 
import random 

from sklearn import preprocessing 
from statistics import mean

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA 

from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedRandomForestClassifier


from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV 
from pprint import pprint 

from sklearn.preprocessing import normalize

import matplotlib.pyplot as plt 
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import calibration_curve

#import ml_insights as mli 

### Importing data 

Require: 
1. input_files.txt - to contain the filenames I want to use. ** currently .csv files

In [2]:
### 1) Importing annotated cells 
# Variables: mylist (annotation file names), inputs (one DataFrame per file)

## obtaining list of files 
with open("D:/Tanrada_classification/imbalance_subcortical_training/annotated/training_slides.txt") as f: 
    mylist= f.read().splitlines()
    
print("Read in: ",len(mylist),"files")

# Column position -> canonical name. The exported headers are inconsistent
# between files, so the columns are renamed by position rather than by label.
column_renames = {
    5: 'Centroid_X',
    6: 'Centroid_Y',
    8: 'Nucleus: Area ¬µm^2',
    9: 'Nucleus: Length ¬µm',
    12: 'Nucleus: Max diameter ¬µm',
    13: 'Nucleus: Min diameter ¬µm',
    14: 'Cell: Area ¬µm^2',
    15: 'Cell: Length ¬µm',
    18: 'Cell: Max diameter ¬µm',
    19: 'Cell: Min diameter ¬µm',
}

## 2) reading in all those files 
inputs = [] 
for i in mylist: 
    dat = pd.read_csv('D:/Tanrada_classification/imbalance_subcortical_training/annotated/'+i,sep=",")
    # Build a fresh label list and assign it back. Writing into
    # `dat.columns.values[...]` mutates the Index's backing array in place,
    # which pandas does not support (an Index is immutable and the array may
    # be read-only or cached).
    new_columns = list(dat.columns)
    for position, name in column_renames.items():
        new_columns[position] = name
    dat.columns = new_columns
    #dat_cleaned = dat.iloc[:,0:61] ## SELECTING ONLY RELELVANT COLUMNS 
    print(i," number of features: ", dat.shape[1])
    inputs.append(dat)

print("Extracted:", len(inputs),"files")
print("Note: inconsistencies come from tau necrosis - will be removed later")
#Example
inputs[1].head()


Read in:  20 files
721708_Globus Pallidus_cell_annotations.csv  number of features:  62
721708_Striatum_cell_annotations.csv  number of features:  62
721708_Subthalamic Nucleus_cell_annotations.csv  number of features:  62
747308_Globus Pallidus_cell_annotations.csv  number of features:  62
747308_Striatum_cell_annotations.csv  number of features:  62
747370_Globus Pallidus_cell_annotations.csv  number of features:  62
747370_Striatum_cell_annotations.csv  number of features:  62
747370_Subthalamic Nucleus_cell_annotations.csv  number of features:  62
747814_Globus Pallidus_cell_annotations.csv  number of features:  62
747814_Striatum_cell_annotations.csv  number of features:  62
747814_Subthalamic Nucleus_cell_annotations.csv  number of features:  62
747820_Globus Pallidus_cell_annotations.csv  number of features:  62
747820_Striatum_cell_annotations.csv  number of features:  62
747820_Subthalamic Nucleus_cell_annotations.csv  number of features:  62
747828_Globus Pallidus_cell_annota

Unnamed: 0,Image,Name,Class,Parent,ROI,Centroid_X,Centroid_Y,Detection probability,Nucleus: Area ¬µm^2,Nucleus: Length ¬µm,...,DAB: Membrane: Median,DAB: Membrane: Min,DAB: Membrane: Max,DAB: Membrane: Std.Dev.,DAB: Cell: Mean,DAB: Cell: Median,DAB: Cell: Min,DAB: Cell: Max,DAB: Cell: Std.Dev.,tau: Necrosis area µm^2
0,721708.svs,Striatum,Ignore,PathAnnotationObject,Polygon,7987.5,1949.5,0.7316,22.9312,17.4807,...,0.0369,-0.0237,0.1842,0.0403,0.0636,0.0533,-0.0334,0.4475,0.0533,10.1613
1,721708.svs,Striatum,Oligo,PathAnnotationObject,Polygon,7983.8,1959.7,0.8573,23.5197,17.4146,...,0.0423,-0.024,0.1807,0.0395,0.0802,0.0553,-0.0642,0.6111,0.0919,21.5369
2,721708.svs,Striatum,Astro,PathAnnotationObject,Polygon,7989.2,1986.0,0.8765,21.6119,16.6321,...,0.0462,-0.034,0.1679,0.0411,0.0581,0.0479,-0.0673,0.3571,0.0508,10.0974
3,721708.svs,Striatum,Ignore,PathAnnotationObject,Polygon,7974.9,1988.0,0.7854,12.4837,13.1234,...,0.0493,-0.0155,0.2983,0.0511,0.07,0.0646,-0.0548,0.2983,0.0504,7.6689
4,721708.svs,Striatum,Neuron,PathAnnotationObject,Polygon,7952.9,1996.9,0.8648,49.3439,25.1681,...,0.035,0.0005,0.3993,0.0522,0.0684,0.0488,-0.0533,0.4499,0.0665,25.9466


In [3]:
# 2.5) Import the neighbouring-cell counts (one file per entry of mylist)

# The neighbour file name is built from the first six characters of each
# annotation file name, so nb_mylist follows the order of mylist.
nb_mylist = [f"{annotation_name[:6]}_all_neighbours.csv" for annotation_name in mylist]
print("Read in:",len(nb_mylist)," NUMBER OF neighbouring cells files")

# Read every neighbour file into its own DataFrame.
nb_inputs = []
for i in nb_mylist:
    dat = pd.read_csv("D:/number_of_neighbours/"+i, sep=",")
    nb_inputs += [dat]

print("Extracted:", len(nb_inputs),"files")

Read in: 20  NUMBER OF neighbouring cells files
Extracted: 20 files


Note: since we use mylist to generate nb_mylist, file order is guaranteed to be the same.

In [4]:
# Print the column count of every neighbour (NNB) file to spot schema differences.
for i in nb_inputs:
    print("Number of features ", len(i.columns))

Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  18
Number of features  13
Number of features  13


In [5]:
## Attach neighbour counts to the annotated cells of each file

# Columns kept from the neighbour files; 'slice_id' / 'Image' are left behind.
nb_columns = ['Centroid_X','Centroid_Y','NN_10_um','NN_20_um','NN_30_um','NN_40_um','NN_50_um'
              ,'NN_60_um','NN_70_um','NN_80_um','NN_90_um','NN_100_um']
merge_keys = ['Centroid_X','Centroid_Y']

inputs_=[]
n=len(inputs)
for i in range(n):
    # every detected cell on the slide, with coordinates renamed to match the annotations
    nb_dat_ = nb_inputs[i].rename(columns={'X':'Centroid_X','Y':'Centroid_Y'})
    nb_dat = nb_dat_[nb_columns]
    print("nb_dat shape:", nb_dat.shape)

    # annotated cells, still without neighbour info
    dat = inputs[i]
    print("dat shape:", dat.shape)

    # keep only cells present in both frames (matched on the centroid coordinates)
    combined = dat.merge(nb_dat,on=merge_keys,how='inner',validate='1:1')
    inputs_.append(combined)

    # the two key columns are shared, hence the -2 on the expected width
    expected_rows = dat.shape[0]
    expected_cols = dat.shape[1]+nb_dat.shape[1]-2
    print("Expected shape:", expected_rows,expected_cols," Resulting shape:",combined.shape)
    print("--------------------------")

print("Succesfully combined nb cell counts to main data")

nb_dat shape: (746312, 12)
dat shape: (89, 62)
Expected shape: 89 72  Resulting shape: (89, 72)
--------------------------
nb_dat shape: (746312, 12)
dat shape: (102, 62)
Expected shape: 102 72  Resulting shape: (102, 72)
--------------------------
nb_dat shape: (746312, 12)
dat shape: (79, 62)
Expected shape: 79 72  Resulting shape: (79, 72)
--------------------------
nb_dat shape: (450168, 12)
dat shape: (75, 62)
Expected shape: 75 72  Resulting shape: (75, 72)
--------------------------
nb_dat shape: (450168, 12)
dat shape: (110, 62)
Expected shape: 110 72  Resulting shape: (110, 72)
--------------------------
nb_dat shape: (789034, 12)
dat shape: (63, 62)
Expected shape: 63 72  Resulting shape: (63, 72)
--------------------------
nb_dat shape: (789034, 12)
dat shape: (103, 62)
Expected shape: 103 72  Resulting shape: (103, 72)
--------------------------
nb_dat shape: (789034, 12)
dat shape: (79, 62)
Expected shape: 79 72  Resulting shape: (79, 72)
--------------------------
nb_dat 

In [6]:
### 2) Importing hema nucleus mean of all detected cells & location coordinates 
# Variables: hema_mylist, hema_inputs 

# The hema file name is built from the first six characters of each annotation
# file name, so hema_mylist follows the order of mylist.
hema_mylist = [i[0:6]+'_hema.csv' for i in mylist]    
print("Read in:",len(hema_mylist),"hema files")    


## 4) reading in all those files 
hema_inputs = [] 
for i in hema_mylist: 
    dat = pd.read_csv('D:/Tanrada_classification/hema_novel/'+i,sep=",")
    # Fix the naming inconsistency of the first two columns by assigning a new
    # label list. Writing into `dat.columns.values[...]` mutates the Index's
    # backing array in place, which pandas does not support.
    new_columns = list(dat.columns)
    new_columns[0] = 'Centroid_X'
    new_columns[1] = 'Centroid_Y'
    dat.columns = new_columns
    hema_inputs.append(dat)

print("Extracted:",len(hema_inputs),"hema files")  

#Example 
hema_inputs[2].head()


Read in: 20 hema files
Extracted: 20 hema files


Unnamed: 0,Centroid_X,Centroid_Y,Hematoxylin: Nucleus: Mean
0,7828.8,554.15,0.5298
1,7849.7,575.58,0.5649
2,7383.8,579.98,0.2996
3,7371.3,586.91,0.3383
4,7360.5,594.38,0.468


In [7]:
# Compare the six-character prefixes of the three file lists; equal lists
# mean the files are in the same order in mylist, nb_mylist and hema_mylist.
x = [name[:6] for name in mylist]
x_nb = [name[:6] for name in nb_mylist]
x_h = [name[:6] for name in hema_mylist]
print("mylist & nb_list matched?:", x==x_nb)
print("mylist & hema_list matched?:",x==x_h)

mylist & nb_list matched?: True
mylist & hema_list matched?: True


### Normalising hematoxylin per brain slide & discard top 1%

In [8]:
### 1) Collect, per slide, the detected cells whose normalised hema exceeds 1
# Variables: hema_to_remove, hema_inputs
hema_to_remove = []
for h in hema_inputs:
    hema = h['Hematoxylin: Nucleus: Mean']
    # Dividing by the 99th percentile puts the top 1% of cells above 1.
    threshold = hema.quantile(0.99)
    hema_norm = hema/threshold
    h2 = h.assign(**{'Hematoxylin: Nucleus: Mean': hema_norm})
    h2 = h2.loc[hema_norm>1] # these are the cells to discard (cells <=1 are kept)
    hema_to_remove.append(h2)

for i, flagged in enumerate(hema_to_remove):
    print(i, " No. of cells with normalised Hema >1:",len(flagged), "from", len(hema_inputs[i]),"detected cells")

0  No. of cells with normalised Hema >1: 7455 from 746312 detected cells
1  No. of cells with normalised Hema >1: 7455 from 746312 detected cells
2  No. of cells with normalised Hema >1: 7455 from 746312 detected cells
3  No. of cells with normalised Hema >1: 4499 from 450168 detected cells
4  No. of cells with normalised Hema >1: 4499 from 450168 detected cells
5  No. of cells with normalised Hema >1: 7888 from 789034 detected cells
6  No. of cells with normalised Hema >1: 7888 from 789034 detected cells
7  No. of cells with normalised Hema >1: 7888 from 789034 detected cells
8  No. of cells with normalised Hema >1: 6935 from 694839 detected cells
9  No. of cells with normalised Hema >1: 6935 from 694839 detected cells
10  No. of cells with normalised Hema >1: 6935 from 694839 detected cells
11  No. of cells with normalised Hema >1: 7917 from 792783 detected cells
12  No. of cells with normalised Hema >1: 7917 from 792783 detected cells
13  No. of cells with normalised Hema >1: 7917 f

In [9]:
## 2) Discarding annotated cells if they fit the criteria above 
#Variables: cleaned_inputs, removed  

cleaned_inputs = []
removed = [] 
for n in range(0,(len(inputs_))): # looping through annotated & hema (detected) slides in parallel
    
    i = inputs_[n] #annotated cells 
    h = hema_to_remove[n] #cells we need to remove, may or may not contain annotated cells 
    
    #Find cells that exist in both 'i' & 'h' = cells we want to remove 
    to_remove = i.merge(h,on=['Centroid_X','Centroid_Y'],how='inner',validate='1:1')
    
    #Flag every annotated cell as 'left_only' (not in 'h') or 'both' (in 'h')
    to_retain = i.merge(h,on=['Centroid_X','Centroid_Y'],how='left',indicator=True,validate='1:1')
    
    #Extract cells we want to retain.
    #The left 1:1 merge yields one row per row of 'i', in the same order, so the
    #mask is applied by position; this avoids depending on 'i' and 'to_retain'
    #happening to share the same index labels.
    keep_mask = (to_retain['_merge']=='left_only').to_numpy()
    retained = i[keep_mask]
    
    cleaned_inputs.append(retained)
    removed.append(to_remove)
    print(mylist[n],":",i.shape[0]-retained.shape[0],"cells removed")


721708_Globus Pallidus_cell_annotations.csv : 0 cells removed
721708_Striatum_cell_annotations.csv : 0 cells removed
721708_Subthalamic Nucleus_cell_annotations.csv : 1 cells removed
747308_Globus Pallidus_cell_annotations.csv : 1 cells removed
747308_Striatum_cell_annotations.csv : 0 cells removed
747370_Globus Pallidus_cell_annotations.csv : 0 cells removed
747370_Striatum_cell_annotations.csv : 0 cells removed
747370_Subthalamic Nucleus_cell_annotations.csv : 1 cells removed
747814_Globus Pallidus_cell_annotations.csv : 0 cells removed
747814_Striatum_cell_annotations.csv : 4 cells removed
747814_Subthalamic Nucleus_cell_annotations.csv : 0 cells removed
747820_Globus Pallidus_cell_annotations.csv : 0 cells removed
747820_Striatum_cell_annotations.csv : 0 cells removed
747820_Subthalamic Nucleus_cell_annotations.csv : 0 cells removed
747828_Globus Pallidus_cell_annotations.csv : 0 cells removed
747828_Striatum_cell_annotations.csv : 2 cells removed
755497_Globus Pallidus_cell_annota

### Checking features

In [10]:
# Shape and column names of the first cleaned slide, as a quick sanity check.
print(cleaned_inputs[0].shape)
cleaned_inputs[0].columns.tolist()

(89, 72)


['Image',
 'Name',
 'Class',
 'Parent',
 'ROI',
 'Centroid_X',
 'Centroid_Y',
 'Detection probability',
 'Nucleus: Area ¬µm^2',
 'Nucleus: Length ¬µm',
 'Nucleus: Circularity',
 'Nucleus: Solidity',
 'Nucleus: Max diameter ¬µm',
 'Nucleus: Min diameter ¬µm',
 'Cell: Area ¬µm^2',
 'Cell: Length ¬µm',
 'Cell: Circularity',
 'Cell: Solidity',
 'Cell: Max diameter ¬µm',
 'Cell: Min diameter ¬µm',
 'Nucleus/Cell area ratio',
 'Hematoxylin: Nucleus: Mean',
 'Hematoxylin: Nucleus: Median',
 'Hematoxylin: Nucleus: Min',
 'Hematoxylin: Nucleus: Max',
 'Hematoxylin: Nucleus: Std.Dev.',
 'Hematoxylin: Cytoplasm: Mean',
 'Hematoxylin: Cytoplasm: Median',
 'Hematoxylin: Cytoplasm: Min',
 'Hematoxylin: Cytoplasm: Max',
 'Hematoxylin: Cytoplasm: Std.Dev.',
 'Hematoxylin: Membrane: Mean',
 'Hematoxylin: Membrane: Median',
 'Hematoxylin: Membrane: Min',
 'Hematoxylin: Membrane: Max',
 'Hematoxylin: Membrane: Std.Dev.',
 'Hematoxylin: Cell: Mean',
 'Hematoxylin: Cell: Median',
 'Hematoxylin: Cell: Min',

### Removing DAB & tau necrosis & Image_name

In [11]:
## Removing DAB & tau necrosis ** making sure same dimension
cleaned_inputs_ = []
for i in cleaned_inputs:
    # To remove all DAB features
    to_drop1 = list(i.filter(regex='DAB'))
    dat1 = i[i.columns.drop(to_drop1)]
    # To remove tau necrosis features
    to_drop2 = list(dat1.filter(regex='tau'))
    dat2= dat1[dat1.columns.drop(to_drop2)]
    # To remove Smoothed ****
    to_drop3 = list(dat2.filter(regex='Smoothed'))
    dat3= dat2[dat2.columns.drop(to_drop3)]
    # Remove Image_name when present. The membership test must use the same
    # spelling as the column being dropped: the earlier lower-case
    # 'image_name' check could never match, so the drop never ran.
    if ('Image_name' in list(dat3.columns)):
        dat =dat3.drop(columns=['Image_name'])
    else:
        dat=dat3
    
    cleaned_inputs_.append(dat)
    print("Initial n_features:",i.shape[1], ", After removal:", dat.shape[1])

Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 71 , After removal: 51
Initial n_features: 71 , After removal: 51
Initial n_features: 72 , After removal: 51
Initial n_features: 72 , After removal: 51


### Putting the slides together 

In [12]:
## Variables: labelled_orig (all slides stacked), dat (working copy without Parent/ROI)

# 1) Stack the per-slide frames row-wise into one table.
labelled_orig = pd.concat(cleaned_inputs_, axis=0)
print(labelled_orig.shape)

# 2) Drop the columns that are not needed; unlike other scripts, 'Name' is kept here.
dat = labelled_orig.drop(['Parent','ROI'], axis=1)
dat.head()


(1694, 51)


Unnamed: 0,Image,Name,Class,Centroid_X,Centroid_Y,Detection probability,Nucleus: Area ¬µm^2,Nucleus: Length ¬µm,Nucleus: Circularity,Nucleus: Solidity,...,NN_10_um,NN_20_um,NN_30_um,NN_40_um,NN_50_um,NN_60_um,NN_70_um,NN_80_um,NN_90_um,NN_100_um
0,721708.svs,Globus Pallidus,Oligo,12022.2,8715.8,0.8575,13.1628,13.2907,0.9364,1.0,...,1,1,2,7,10,15,22,30,37,50
1,721708.svs,Globus Pallidus,Oligo,12029.8,8722.0,0.8796,15.2611,14.0081,0.9773,1.0,...,1,1,1,4,10,17,24,34,38,49
2,721708.svs,Globus Pallidus,Oligo,10048.5,9974.4,0.867,16.7321,14.7662,0.9643,1.0,...,1,3,5,7,11,14,25,35,52,65
3,721708.svs,Globus Pallidus,Oligo,10029.8,9984.5,0.8827,18.6383,15.5263,0.9716,1.0,...,1,1,4,6,9,17,26,36,47,63
4,721708.svs,Globus Pallidus,Oligo,10031.0,9993.5,0.8682,21.4955,16.7517,0.9626,1.0,...,1,1,4,6,11,18,24,35,49,64


In [13]:
# Show every column name of the combined table as a plain list.
list(dat.columns)

['Image',
 'Name',
 'Class',
 'Centroid_X',
 'Centroid_Y',
 'Detection probability',
 'Nucleus: Area ¬µm^2',
 'Nucleus: Length ¬µm',
 'Nucleus: Circularity',
 'Nucleus: Solidity',
 'Nucleus: Max diameter ¬µm',
 'Nucleus: Min diameter ¬µm',
 'Cell: Area ¬µm^2',
 'Cell: Length ¬µm',
 'Cell: Circularity',
 'Cell: Solidity',
 'Cell: Max diameter ¬µm',
 'Cell: Min diameter ¬µm',
 'Nucleus/Cell area ratio',
 'Hematoxylin: Nucleus: Mean',
 'Hematoxylin: Nucleus: Median',
 'Hematoxylin: Nucleus: Min',
 'Hematoxylin: Nucleus: Max',
 'Hematoxylin: Nucleus: Std.Dev.',
 'Hematoxylin: Cytoplasm: Mean',
 'Hematoxylin: Cytoplasm: Median',
 'Hematoxylin: Cytoplasm: Min',
 'Hematoxylin: Cytoplasm: Max',
 'Hematoxylin: Cytoplasm: Std.Dev.',
 'Hematoxylin: Membrane: Mean',
 'Hematoxylin: Membrane: Median',
 'Hematoxylin: Membrane: Min',
 'Hematoxylin: Membrane: Max',
 'Hematoxylin: Membrane: Std.Dev.',
 'Hematoxylin: Cell: Mean',
 'Hematoxylin: Cell: Median',
 'Hematoxylin: Cell: Min',
 'Hematoxylin: Cel

### Extracting relevant cell classes

In [14]:
# 1) Number of annotated cells in total and per class
class_counts = dat['Class'].value_counts()
print("Total",class_counts.sum(),"cells")
class_counts


Total 1694 cells


Oligo              783
Endo               289
Neuron             200
Astro              187
Ignore             142
Epi                 28
fragmented          27
tuftedastrocyte     13
Ambiguous           12
ambiguous           10
Tumor                2
neuronaltau          1
Name: Class, dtype: int64

***Note that we did not use 'ambiguous cells' & I have removed 'tufted astrocyte' & 'neuronal tau' for now***

In [15]:
# 2) Keep only the relevant cell classes (Using stardist_error instead of ignore_new)
orig = dat.copy()
relevant_classes = ['Oligo', 'Neuron', 'Astro', 'Epi', 'Ignore', 'fragmented', 'Endo']
dat__ = dat[dat['Class'].isin(relevant_classes)]
# concatenating the slides left repeated index labels; renumber the rows from 0
dat__ = dat__.reset_index(drop=True)

In [16]:
# 3) Relabel 'Epi' cells as 'Endo'; every other label is left untouched
dat_ = dat__.copy()
classes = dat__['Class']
formatted_classes = [{'Epi': 'Endo'}.get(label, label) for label in classes]
dat_['Class'] = formatted_classes

In [17]:
# 4) Checking results from 3): 'Epi' should no longer appear and its cells
#    should now be counted under 'Endo'.
dat_['Class'].value_counts()

Oligo         783
Endo          317
Neuron        200
Astro         187
Ignore        142
fragmented     27
Name: Class, dtype: int64

In [18]:
# 6) Merge 'Ignore', 'Endo' & 'fragmented' into a single class called 'Others'
class_ = dat_['Class']
y = ['Others' if i in ('Endo', 'Ignore', 'fragmented') else i for i in class_]
# NOTE(review): `dat` is an alias of `dat_`, not a copy, so the assignment
# below also rewrites dat_['Class'] - confirm that this is intended.
dat = dat_
dat['Class'] = y
print(dat['Class'].value_counts().sum())
dat['Class'].value_counts()


1656


Oligo     783
Others    486
Neuron    200
Astro     187
Name: Class, dtype: int64

In [19]:
# Keep an independent snapshot of the table after class grouping.
orig_dat = dat.copy()

In [20]:
# With cv=10: class counts divided by the number of folds
dat['Class'].value_counts().div(10)

Oligo     78.3
Others    48.6
Neuron    20.0
Astro     18.7
Name: Class, dtype: float64

# Training the model

### Checking for any NA in the data

In [21]:
#checking for NAN 
## NEW 
# Total count of missing values over all columns; any count above zero means
# the table contains NA. (Comparing the count with 1 only reported True when
# exactly one value was missing.)
print("Any NA in the data?: ",dat.isnull().sum().sum()>0)

# Optional remedy if the check above reports True:
#dat = dat.dropna()

Any NA in the data?:  False


### Create train, test sets 

In [22]:
# Columns still present before the label and metadata columns are split off.
dat.columns

Index(['Image', 'Name', 'Class', 'Centroid_X', 'Centroid_Y',
       'Detection probability', 'Nucleus: Area ¬µm^2', 'Nucleus: Length ¬µm',
       'Nucleus: Circularity', 'Nucleus: Solidity',
       'Nucleus: Max diameter ¬µm', 'Nucleus: Min diameter ¬µm',
       'Cell: Area ¬µm^2', 'Cell: Length ¬µm', 'Cell: Circularity',
       'Cell: Solidity', 'Cell: Max diameter ¬µm', 'Cell: Min diameter ¬µm',
       'Nucleus/Cell area ratio', 'Hematoxylin: Nucleus: Mean',
       'Hematoxylin: Nucleus: Median', 'Hematoxylin: Nucleus: Min',
       'Hematoxylin: Nucleus: Max', 'Hematoxylin: Nucleus: Std.Dev.',
       'Hematoxylin: Cytoplasm: Mean', 'Hematoxylin: Cytoplasm: Median',
       'Hematoxylin: Cytoplasm: Min', 'Hematoxylin: Cytoplasm: Max',
       'Hematoxylin: Cytoplasm: Std.Dev.', 'Hematoxylin: Membrane: Mean',
       'Hematoxylin: Membrane: Median', 'Hematoxylin: Membrane: Min',
       'Hematoxylin: Membrane: Max', 'Hematoxylin: Membrane: Std.Dev.',
       'Hematoxylin: Cell: Mean', 'Hema

In [23]:
# The whole dataset is used for training; test data will be provided later by Sanne
y_train = dat['Class']

# X_train_l keeps the slide/region/coordinate columns alongside the features,
# X_train holds the feature columns only.
X_train_l = dat.drop(columns='Class') #,'Image_name','Image_x', 'Image_y'
metadata_columns = ['Image','Name','Centroid_X','Centroid_Y']
X_train = X_train_l.drop(columns=metadata_columns)
print('training data shape:',X_train.shape)

training data shape: (1656, 44)


In [24]:
# Preview the first rows of the feature matrix.
X_train.head()

Unnamed: 0,Detection probability,Nucleus: Area ¬µm^2,Nucleus: Length ¬µm,Nucleus: Circularity,Nucleus: Solidity,Nucleus: Max diameter ¬µm,Nucleus: Min diameter ¬µm,Cell: Area ¬µm^2,Cell: Length ¬µm,Cell: Circularity,...,NN_10_um,NN_20_um,NN_30_um,NN_40_um,NN_50_um,NN_60_um,NN_70_um,NN_80_um,NN_90_um,NN_100_um
0,0.8575,13.1628,13.2907,0.9364,1.0,4.915,3.4944,103.1323,37.8404,0.9051,...,1,1,2,7,10,15,22,30,37,50
1,0.8796,15.2611,14.0081,0.9773,1.0,4.7992,4.1628,120.9785,40.6755,0.9189,...,1,1,1,4,10,17,24,34,38,49
2,0.867,16.7321,14.7662,0.9643,1.0,5.0758,4.3099,100.9326,38.6687,0.8482,...,1,3,5,7,11,14,25,35,52,65
3,0.8827,18.6383,15.5263,0.9716,1.0,5.3159,4.4388,124.4651,42.9339,0.8485,...,1,1,4,6,9,17,26,36,47,63
4,0.8682,21.4955,16.7517,0.9626,1.0,5.703,5.0028,136.2816,44.2523,0.8745,...,1,1,4,6,11,18,24,35,49,64


In [25]:
# Names of the features the model will be trained on.
X_train.columns

Index(['Detection probability', 'Nucleus: Area ¬µm^2', 'Nucleus: Length ¬µm',
       'Nucleus: Circularity', 'Nucleus: Solidity',
       'Nucleus: Max diameter ¬µm', 'Nucleus: Min diameter ¬µm',
       'Cell: Area ¬µm^2', 'Cell: Length ¬µm', 'Cell: Circularity',
       'Cell: Solidity', 'Cell: Max diameter ¬µm', 'Cell: Min diameter ¬µm',
       'Nucleus/Cell area ratio', 'Hematoxylin: Nucleus: Mean',
       'Hematoxylin: Nucleus: Median', 'Hematoxylin: Nucleus: Min',
       'Hematoxylin: Nucleus: Max', 'Hematoxylin: Nucleus: Std.Dev.',
       'Hematoxylin: Cytoplasm: Mean', 'Hematoxylin: Cytoplasm: Median',
       'Hematoxylin: Cytoplasm: Min', 'Hematoxylin: Cytoplasm: Max',
       'Hematoxylin: Cytoplasm: Std.Dev.', 'Hematoxylin: Membrane: Mean',
       'Hematoxylin: Membrane: Median', 'Hematoxylin: Membrane: Min',
       'Hematoxylin: Membrane: Max', 'Hematoxylin: Membrane: Std.Dev.',
       'Hematoxylin: Cell: Mean', 'Hematoxylin: Cell: Median',
       'Hematoxylin: Cell: Min', 'Hem

### My own functions 

In [26]:
## Functions for custom classification metrics 

## Accuracy per class 
def _normalized_confusion(clf,X,y):
    """Return the row-normalised confusion matrix of ``clf`` evaluated on ``(X, y)``.

    Rows are the true class and columns the predicted class, both in the fixed
    order Astro, Neuron, Oligo, Others. ``normalize='true'`` divides every row
    by its total, so each entry is a fraction of the cells of that true class.
    """
    y_pred = clf.predict(X)
    return confusion_matrix(y,y_pred,labels=["Astro","Neuron","Oligo","Others"],normalize='true')


# Every function below takes (clf, X, y): a fitted classifier, the features to
# predict on and the true labels. Each returns one entry of the matrix above.
# NOTE(review): presumably these are passed as scoring callables - confirm at the call site.

def astro_acc(clf,X,y): 
    """Fraction of true Astro cells predicted as Astro."""
    return _normalized_confusion(clf,X,y)[0][0]

def neuron_acc(clf,X,y): 
    """Fraction of true Neuron cells predicted as Neuron."""
    return _normalized_confusion(clf,X,y)[1][1]

def oligo_acc(clf,X,y): 
    """Fraction of true Oligo cells predicted as Oligo."""
    return _normalized_confusion(clf,X,y)[2][2]

def others_acc(clf,X,y): 
    """Fraction of true Others cells predicted as Others."""
    return _normalized_confusion(clf,X,y)[3][3]



## Confusion per class: 

## Astro
def A_as_N(clf,X,y): 
    """Fraction of true Astro cells wrongly predicted as Neuron."""
    return _normalized_confusion(clf,X,y)[0][1]

def A_as_O(clf,X,y): 
    """Fraction of true Astro cells wrongly predicted as Oligo."""
    return _normalized_confusion(clf,X,y)[0][2]


def A_as_Others(clf,X,y): 
    """Fraction of true Astro cells wrongly predicted as Others."""
    return _normalized_confusion(clf,X,y)[0][3]

##Neurons 

def N_as_A(clf,X,y): 
    """Fraction of true Neuron cells wrongly predicted as Astro."""
    return _normalized_confusion(clf,X,y)[1][0]

def N_as_O(clf,X,y): 
    """Fraction of true Neuron cells wrongly predicted as Oligo."""
    return _normalized_confusion(clf,X,y)[1][2]

def N_as_Others(clf,X,y): 
    """Fraction of true Neuron cells wrongly predicted as Others."""
    return _normalized_confusion(clf,X,y)[1][3]


## Oligo 

def O_as_A(clf,X,y): 
    """Fraction of true Oligo cells wrongly predicted as Astro."""
    return _normalized_confusion(clf,X,y)[2][0]

def O_as_N(clf,X,y): 
    """Fraction of true Oligo cells wrongly predicted as Neuron."""
    return _normalized_confusion(clf,X,y)[2][1]

def O_as_Others(clf,X,y): 
    """Fraction of true Oligo cells wrongly predicted as Others."""
    return _normalized_confusion(clf,X,y)[2][3]

## Others 

def Others_as_A(clf,X,y): 
    """Fraction of true Others cells wrongly predicted as Astro."""
    return _normalized_confusion(clf,X,y)[3][0]

def Others_as_N(clf,X,y): 
    """Fraction of true Others cells wrongly predicted as Neuron."""
    return _normalized_confusion(clf,X,y)[3][1]

def Others_as_O(clf,X,y): 
    """Fraction of true Others cells wrongly predicted as Oligo."""
    return _normalized_confusion(clf,X,y)[3][2]


In [27]:
## Functions for custom classification metrics: RAW VALUES 


## Confusion per class: 

## Astro
def _raw_confusion(clf,X,y):
    """Return the confusion matrix of ``clf`` on ``(X, y)`` as raw cell counts.

    Rows are the true class and columns the predicted class, both in the fixed
    order Astro, Neuron, Oligo, Others. No normalisation is applied, so every
    entry is a number of cells, not a percentage.
    """
    y_pred = clf.predict(X)
    return confusion_matrix(y,y_pred,labels=["Astro","Neuron","Oligo","Others"])


# Every function below takes (clf, X, y): a fitted classifier, the features to
# predict on and the true labels. Each returns one off-diagonal count.

def A_as_N_r(clf,X,y): 
    """Number of true Astro cells wrongly predicted as Neuron."""
    return _raw_confusion(clf,X,y)[0][1]

def A_as_O_r(clf,X,y): 
    """Number of true Astro cells wrongly predicted as Oligo."""
    return _raw_confusion(clf,X,y)[0][2]


def A_as_Others_r(clf,X,y): 
    """Number of true Astro cells wrongly predicted as Others."""
    return _raw_confusion(clf,X,y)[0][3]

##Neurons 

def N_as_A_r(clf,X,y): 
    """Number of true Neuron cells wrongly predicted as Astro."""
    return _raw_confusion(clf,X,y)[1][0]

def N_as_O_r(clf,X,y): 
    """Number of true Neuron cells wrongly predicted as Oligo."""
    return _raw_confusion(clf,X,y)[1][2]

def N_as_Others_r(clf,X,y): 
    """Number of true Neuron cells wrongly predicted as Others."""
    return _raw_confusion(clf,X,y)[1][3]


## Oligo 

def O_as_A_r(clf,X,y): 
    """Number of true Oligo cells wrongly predicted as Astro."""
    return _raw_confusion(clf,X,y)[2][0]

def O_as_N_r(clf,X,y): 
    """Number of true Oligo cells wrongly predicted as Neuron."""
    return _raw_confusion(clf,X,y)[2][1]

def O_as_Others_r(clf,X,y): 
    """Number of true Oligo cells wrongly predicted as Others."""
    return _raw_confusion(clf,X,y)[2][3]

## Others 

def Others_as_A_r(clf,X,y): 
    """Number of true Others cells wrongly predicted as Astro."""
    return _raw_confusion(clf,X,y)[3][0]

def Others_as_N_r(clf,X,y): 
    """Number of true Others cells wrongly predicted as Neuron."""
    return _raw_confusion(clf,X,y)[3][1]

def Others_as_O_r(clf,X,y): 
    """Number of true Others cells wrongly predicted as Oligo."""
    return _raw_confusion(clf,X,y)[3][2]


In [28]:
### Precision-recall score
def precision_recall_auc(clf,X,y):
    """Macro-averaged area under the one-vs-rest precision-recall curves.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``.
    X : features to predict on.
    y : true class names.

    Returns
    -------
    The unweighted mean of the per-class PR-AUC values.

    Notes
    -----
    The number of classes is taken from the distinct values in ``y``.
    NOTE(review): column ``k`` of ``predict_proba`` is assumed to belong to the
    class that ``name_to_numeric_classes`` encodes as ``k`` - confirm against
    that helper, which is defined elsewhere.
    """
    class_probabilities = clf.predict_proba(X)
    numeric_truth = name_to_numeric_classes(y)
    class_count = len(set(y))

    per_class_auc = []
    for class_idx in range(class_count):
        # one-vs-rest: class_idx is the positive label, all other classes are negative
        precision, recall, _ = precision_recall_curve(
            numeric_truth, class_probabilities[:,class_idx], pos_label=class_idx)
        # recall on the x axis, precision on the y axis
        per_class_auc.append(auc(recall, precision))

    # 'macro' combination: every class carries the same weight
    return mean(per_class_auc)
        

In [29]:
# ADDITIONAL FUNCTIONS

#FUNCTIONS

# Get all class-specific thresholds from best params

def get_threshold(best_params):
    """Collect the first element stored under every key of ``best_params``.

    Returns a list with one entry per key, in the mapping's iteration order.
    NOTE(review): presumably each key is a class and element 0 is its tuned
    threshold - confirm against the code that builds ``best_params``.
    """
    return [best_params[key][0] for key in best_params]

#Thresholding method 5: using ratio, ambiguous cells are: 
# 1) When predicted probabilities < thresholds, i.e. diff = -ve (lower end)
# 2) When more than 1 class-specific threshold is passed, i.e. more than 1 positive diff scores (upper end)

def threshold_list_of_classes_5(y_pred_prob,best_params):
    """Assign a class to each cell only when exactly one class clears its threshold.

    For every row of ``y_pred_prob`` (the class probabilities of one cell) the
    relative margin ``(p - t) / t`` is computed against the class-specific
    thresholds taken from ``best_params``. A cell gets the class with the
    largest margin when exactly one margin is >= 0. When none or several
    margins are >= 0 the cell gets code 4, which the original comments call
    'ambiguous'.

    Parameters
    ----------
    y_pred_prob : iterable of per-cell probability rows, one value per class.
    best_params : mapping handed to ``get_threshold`` to obtain the thresholds.

    Returns
    -------
    Whatever ``numeric_to_name_classes`` returns for the list of numeric codes.
    NOTE(review): presumably the class names, with 4 mapped to 'ambiguous' -
    that helper is defined elsewhere, so confirm there.
    """
    # The thresholds depend only on best_params, so they are extracted once
    # instead of once per cell. As an array they also work with rows that are
    # plain lists.
    thresholds = np.asarray(get_threshold(best_params))

    thresholded_classes=[]
    for i in y_pred_prob: #for each cell (one probability per class)
        
        # Relative margin of every class over its own threshold
        differences = (i-thresholds)/thresholds
        
        #Count number of positive or equal (0) differences 
        count = np.count_nonzero(differences>=0)
        
        if (count==1): #only assign class when 1 class passes the threshold 
            pred_class = np.argmax(differences)
        else: #Otherwise, label as ambiguous (when more than 1 class passes, or when no class passes)
            pred_class=4

        #putting prediction in a list
        thresholded_classes.append(pred_class)

    thresholded_classes_ = numeric_to_name_classes(thresholded_classes)
    return thresholded_classes_
             
#Thresholding method 4  using ratio instead of raw difference
def threshold_list_of_classes_4 (y_pred_prob,best_params):
    """Thresholding method 4: largest relative margin, if any margin is positive.

    The per-class relative margin is ``(p - t) / t``. When at least one
    margin is strictly positive the class with the largest margin is
    assigned; otherwise the cell is coded 4 ('Ambiguous' after conversion by
    ``numeric_to_name_classes``).

    Parameters
    ----------
    y_pred_prob : iterable
        Per-cell class probabilities (assumes NumPy rows ordered like the
        thresholds -- TODO confirm).
    best_params : dict
        Per-class records whose first element is the threshold.

    Returns
    -------
    list of str
        Class name (or 'Ambiguous') for every cell.
    """
    # Thresholds are the same for every cell; fetch them once.
    class_thresholds = get_threshold(best_params)

    numeric_labels = []
    for cell_probs in y_pred_prob:
        rel_margin = (cell_probs - class_thresholds) / class_thresholds
        if np.any(rel_margin > 0):
            chosen = np.argmax(rel_margin)
        else:
            chosen = 4  # no class exceeded its threshold
        numeric_labels.append(chosen)

    return numeric_to_name_classes(numeric_labels)


#Thresholding method 3)

def threshold_list_of_classes_3 (y_pred_prob,best_params):
    """Thresholding method 3: largest raw margin, if any margin is positive.

    Same decision rule as method 4 but using the raw difference ``p - t``
    instead of the relative one. Cells where no difference is strictly
    positive are coded 4 ('Ambiguous' after conversion by
    ``numeric_to_name_classes``).

    Parameters
    ----------
    y_pred_prob : iterable
        Per-cell class probabilities (assumes NumPy rows ordered like the
        thresholds -- TODO confirm).
    best_params : dict
        Per-class records whose first element is the threshold.

    Returns
    -------
    list of str
        Class name (or 'Ambiguous') for every cell.
    """
    # Thresholds are the same for every cell; fetch them once.
    class_thresholds = get_threshold(best_params)

    numeric_labels = []
    for cell_probs in y_pred_prob:
        margin = cell_probs - class_thresholds
        if np.any(margin > 0):
            chosen = np.argmax(margin)
        else:
            chosen = 4  # no class exceeded its threshold
        numeric_labels.append(chosen)

    return numeric_to_name_classes(numeric_labels)

#Thresholding method 2)
def threshold_list_of_classes_2(y_pred_prob,best_params):
    """Thresholding method 2: always assign the class with the largest margin.

    Each cell is given the class that maximises ``p - t``; no cell is ever
    labelled ambiguous by this method.

    Parameters
    ----------
    y_pred_prob : iterable
        Per-cell class probabilities (assumes NumPy rows ordered like the
        thresholds -- TODO confirm).
    best_params : dict
        Per-class records whose first element is the threshold.

    Returns
    -------
    list of str
        Class name for every cell.
    """
    # Thresholds are the same for every cell; fetch them once.
    class_thresholds = get_threshold(best_params)

    numeric_labels = [
        np.argmax(cell_probs - class_thresholds) for cell_probs in y_pred_prob
    ]
    return numeric_to_name_classes(numeric_labels)


# To convert numeric classes to its corresponding name classes 
def numeric_to_name_classes(numeric_classes):
    """Translate numeric class codes into their class-name strings.

    Codes: 0 -> 'Astro', 1 -> 'Neuron', 2 -> 'Oligo', 3 -> 'Others',
    4 -> 'Ambiguous'.

    Parameters
    ----------
    numeric_classes : iterable
        Numeric class codes (Python or NumPy integers).

    Returns
    -------
    list of str
        The class name for every code, in input order.

    Raises
    ------
    ValueError
        If a code is not one of 0-4. The previous implementation only
        printed a warning and then appended whatever name was left over from
        the preceding item (or failed with NameError on the first item), so
        an unknown code could silently produce a wrong label.
    """
    code_to_name = {0: 'Astro', 1: 'Neuron', 2: 'Oligo', 3: 'Others', 4: 'Ambiguous'}

    output = []
    for code in numeric_classes:
        try:
            output.append(code_to_name[code])
        except (KeyError, TypeError):
            # TypeError covers unhashable items, which can never be valid codes.
            raise ValueError(
                'SOMETHING IS WRONG: unknown numeric class %r (expected 0-4)' % (code,)
            ) from None
    return output

# To convert name classes to its corresponding numeric classes: 
def name_to_numeric_classes(name_classes):
    """Translate class-name strings into their numeric codes.

    Names: 'Astro' -> 0, 'Neuron' -> 1, 'Oligo' -> 2, 'Others' -> 3,
    'Ambiguous' -> 4.

    Conversion stops at the first unrecognised name: a warning is printed
    and only the codes converted so far are returned, so the result can be
    shorter than the input.

    Parameters
    ----------
    name_classes : iterable
        Class names to convert.

    Returns
    -------
    list of int
        Numeric codes for the names preceding the first unknown one.
    """
    name_code_pairs = (
        ('Astro', 0),
        ('Neuron', 1),
        ('Oligo', 2),
        ('Others', 3),
        ('Ambiguous', 4),
    )

    output = []
    for name in name_classes:
        # First matching pair wins; None marks "no known name matched".
        code = next((num for label, num in name_code_pairs if name == label), None)
        if code is None:
            print('SOMETHING IS WRONG')
            break
        output.append(code)
    return output
    
# Create roc curve for each class in multi-classification problem

def multiclass_roc_curves(n_class,test_y_numeric,predy):
    """Compute a one-vs-rest ROC curve for each class.

    Parameters
    ----------
    n_class : int
        Number of classes; classes are taken to be ``0 .. n_class - 1``.
    test_y_numeric : sequence
        True labels as numeric class codes.
    predy : array
        Predicted scores indexed as ``predy[:, k]`` for class ``k``.

    Returns
    -------
    tuple of dict
        ``(fpr, tpr, thresh)``, each keyed by class index and holding the
        corresponding output of ``roc_curve`` with that class as positive.
    """
    fpr, tpr, thresh = {}, {}, {}

    for cls in range(n_class):
        # Class ``cls`` is the positive label, its probability column the score.
        curve = roc_curve(test_y_numeric, predy[:, cls], pos_label=cls)
        fpr[cls], tpr[cls], thresh[cls] = curve

    return fpr, tpr, thresh

#Find the best position on the roc curve for multi-classification problem
def best_param_gmean(n_class,fpr,tpr,thresh):
    """Pick, per class, the ROC operating point with the highest G-mean.

    The G-mean at each point is ``sqrt(tpr * (1 - fpr))``.

    Parameters
    ----------
    n_class : int
        Number of classes; keys ``0 .. n_class - 1`` are read from the dicts.
    fpr, tpr, thresh : dict
        Per-class arrays of false-positive rates, true-positive rates and
        thresholds (same length within a class).

    Returns
    -------
    dict
        ``{class_index: (threshold, g_mean, fpr, tpr)}`` at the best point.
    """
    best_params = {}
    for cls in range(n_class):
        sensitivity = tpr[cls]
        fall_out = fpr[cls]
        gmeans = np.sqrt(sensitivity * (1 - fall_out))
        best = np.argmax(gmeans)  # first index of the maximum G-mean
        best_params[cls] = (
            thresh[cls][best],
            gmeans[best],
            fall_out[best],
            sensitivity[best],
        )
    return best_params

def multiclass_PR_curves(n_class,test_y_numeric,predy):
    """Compute a one-vs-rest precision-recall curve for each class.

    Parameters
    ----------
    n_class : int
        Number of classes; classes are taken to be ``0 .. n_class - 1``.
    test_y_numeric : sequence
        True labels as numeric class codes.
    predy : array
        Predicted scores indexed as ``predy[:, k]`` for class ``k``.

    Returns
    -------
    tuple of dict
        ``(precision, recall, thresh)``, each keyed by class index and
        holding the corresponding output of ``precision_recall_curve`` with
        that class as positive.
    """
    precision, recall, thresh = {}, {}, {}

    for cls in range(n_class):
        # Class ``cls`` is the positive label, its probability column the score.
        curve = precision_recall_curve(test_y_numeric, predy[:, cls], pos_label=cls)
        precision[cls], recall[cls], thresh[cls] = curve

    return precision, recall, thresh

#Find the best position on the precision-recall curve (highest F-score) for multi-classification problem
def best_param_f_score(n_class,precision,recall,thresh):
    """Pick, per class, the precision-recall point with the highest F-score.

    The F-score at each point is ``2 * p * r / (p + r)``, defined as 0 where
    ``p + r`` is 0.

    Parameters
    ----------
    n_class : int
        Number of classes; keys ``0 .. n_class - 1`` are read from the dicts.
    precision, recall, thresh : dict
        Per-class arrays of precision, recall and thresholds.

    Returns
    -------
    dict
        ``{class_index: (threshold, f_score, precision, recall)}`` at the
        best point.
    """
    # NOTE(review): sklearn's precision_recall_curve returns one more
    # precision/recall point than thresholds. If the arg-max ever landed on
    # that final point, the threshold lookup below would raise IndexError --
    # confirm this cannot happen for the inputs used.
    best_params = {}
    for cls in range(n_class):
        prec = precision[cls]
        rec = recall[cls]
        numerator = 2 * prec * rec
        denominator = prec + rec
        # Safe division: positions where precision + recall == 0 stay at 0.
        f_scores = np.divide(
            numerator,
            denominator,
            out=np.zeros_like(numerator),
            where=denominator != 0,
        )
        best = np.argmax(f_scores)  # first index of the maximum F-score
        best_params[cls] = (thresh[cls][best], f_scores[best], prec[best], rec[best])
    return best_params

#Thresholding function
def prob_thresholding(y_pred_prob,y_pred,threshold):
    """Replace low-confidence predictions with 'Ambiguous'.

    Parameters
    ----------
    y_pred_prob : sequence
        Per-sample class probabilities, indexable by position.
    y_pred : sequence
        Predicted label per sample, indexable by position.
    threshold : float
        A sample whose highest class probability is strictly below this
        value is labelled 'Ambiguous'.

    Returns
    -------
    list
        ``y_pred[k]`` where the top probability reaches the threshold,
        otherwise 'Ambiguous'.
    """
    return [
        'Ambiguous' if max(y_pred_prob[k]) < threshold else y_pred[k]
        for k in range(0, len(y_pred_prob))
    ]

#Removing ambiguous class from thresholded class & y_predict
def remove_amb_class(t_class,y_test):
    """Drop samples whose thresholded label is 'Ambiguous'.

    Parameters
    ----------
    t_class : sequence
        Thresholded predictions, one per sample.
    y_test : pandas object supporting ``.iloc``
        True labels in the same positional order as ``t_class``.

    Returns
    -------
    tuple
        ``(y_pred_no_amb, y_test_no_amb)``: the non-ambiguous predictions as
        a Series (indexed by their original position in ``t_class``) and the
        rows of ``y_test`` at those same positions.
    """
    predictions = pd.Series(t_class)

    # The Series has a default integer index, so the surviving index values
    # are the positions of the non-ambiguous samples.
    kept_predictions = predictions[predictions != 'Ambiguous']
    kept_truth = y_test.iloc[kept_predictions.index]

    return (kept_predictions, kept_truth)

### Hyperparameter tuning - random forest

In [30]:
# Pipeline used for hyperparameter tuning, all steps left at their defaults:
#   1. 'normalizer' - min-max scaling of the features
#   2. 'selector'   - recursive feature elimination driven by a linear SVM
#   3. 'clf'        - balanced random forest classifier
pipeline = Pipeline([
    ('normalizer',MinMaxScaler()),
    ('selector',RFE(SVC(kernel='linear'))), 
    ('clf',BalancedRandomForestClassifier())
])
#pipeline.set_params(clf=RandomForestClassifier())
# Display the configured steps as the cell output.
pipeline.steps

[('normalizer', MinMaxScaler()),
 ('selector', RFE(estimator=SVC(kernel='linear'))),
 ('clf', BalancedRandomForestClassifier())]

In [31]:
# Candidate cost-complexity pruning values: 7 evenly spaced points in [0, 0.03],
# stored as plain Python floats.
ccp_alphas = np.linspace(start=0, stop=0.03, num=7).tolist()
ccp_alphas

[0.0, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03]

# Number of cross-validation folds.
# NOTE(review): the visible cells hard-code 10 (StratifiedKFold(n_splits=10) and
# the commented-out RandomizedSearchCV) instead of reading `cv` - confirm it is used.
cv=10

In [32]:
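# ### Hyper parameters to tune
# NOTE: the randomized search below is kept commented out (100 candidates x 10 folds = 1000 fits).
# The best parameters it reported are shown in the saved output under this cell and are
# hard-coded into the pipeline of the manual cross-validation cell further down.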

# #Number of trees in random forest 
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# #Number of features to consider at every split
# max_features = ['auto', 'sqrt']

# #Maximum number of levels in tree 
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)

# #Minimum number of samples required to split an internal node 
# min_samples_split = [2, 5, 10]

# #Minimum number of samples required at each leaf node 
# min_samples_leaf = [1, 2, 4]

# #Method for selecting samples for training each tree 
# bootstrap = [True, False]

# #sampling strategy
# sampling_strategy=['auto','all','not majority','majority']

# #ccp_alphas 
# ccp_alphas = [float(x) for x in np.linspace(start=0, stop=0.03, num=7) ]#[float(x) for x in np.linspace(start=0, stop=1, num=10) ]

# ## Create the random grid 
# random_grid = {'selector__n_features_to_select':[30,32,34,36,38,40],
#                 'clf__n_estimators': n_estimators,
#                'clf__max_features': max_features,
#                'clf__max_depth': max_depth,
#                'clf__min_samples_split': min_samples_split,
#                'clf__min_samples_leaf': min_samples_leaf,
#               'clf__bootstrap': bootstrap,
#               'clf__random_state':[42],
#                'clf__sampling_strategy':sampling_strategy, 
#                'clf__ccp_alpha':ccp_alphas
#              # 'clf__class_weight':['balanced']
#               } # newly added
# #pprint(random_grid)

# rf_random = RandomizedSearchCV(pipeline,
#                              param_distributions=random_grid, 
#                              n_iter=100,
#                              cv=10,
#                              verbose=2,
#                             random_state=42,
#                             n_jobs=-1,
#                               refit='PR_AUC', # use this metric to evaluate performance of parameters 
#                       scoring={'PR_AUC':precision_recall_auc,
#                           'roc_auc_ovr_weighted':'roc_auc_ovr_weighted',
#                             'roc_auc_ovo':'roc_auc_ovo',
#                               'balanced_accuracy':'balanced_accuracy',
#                                'f1_weighted':'f1_weighted',
#                                'Astro_accuracy': astro_acc,
#                                'Neuron_accuracy':neuron_acc,
#                                'Oligo_accuracy':oligo_acc,
#                                'Others_accuracy':others_acc,
#                                'A_as_N':A_as_N,
#                                'A_as_O':A_as_O,
#                                'A_as_Others':A_as_Others,
#                                'N_as_A':N_as_A,
#                                'N_as_O':N_as_O,
#                                'N_as_Others':N_as_Others,
#                                'O_as_A':O_as_A,
#                                'O_as_N':O_as_N,
#                                'O_as_Others':O_as_Others,
#                                'Others_as_A':Others_as_A,
#                                'Others_as_N':Others_as_N,
#                                'Others_as_O':Others_as_O
#                               })

# rf_random.fit(X_train,y_train)

# print(rf_random.best_score_)
# print(rf_random.best_params_)


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


KeyboardInterrupt: 

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
0.852294027848721
{'selector__n_features_to_select': 30, 'clf__sampling_strategy': 'not majority', 'clf__random_state': 42, 'clf__n_estimators': 600, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 1, 'clf__max_features': 'sqrt', 'clf__max_depth': 60, 'clf__ccp_alpha': 0.0, 'clf__bootstrap': True}

In [None]:
# # Digging into more details 
# print("PR-AUC:",
#      rf_random.cv_results_['mean_test_PR_AUC'][rf_random.best_index_]*100)
# print("ROC-AUC:",
#      rf_random.cv_results_['mean_test_roc_auc_ovr_weighted'][rf_random.best_index_]*100)
# print("ROC-AUC:",
#      rf_random.cv_results_['mean_test_roc_auc_ovo'][rf_random.best_index_]*100)

# print("Balanced accuracy:",
#       rf_random.cv_results_['mean_test_balanced_accuracy'][rf_random.best_index_]*100)

# print("F1_weighted:",
#       rf_random.cv_results_['mean_test_f1_weighted'][rf_random.best_index_]*100)

# print("Astrocyte accuracy:",
#       rf_random.cv_results_['mean_test_Astro_accuracy'][rf_random.best_index_]*100)

# print("Neuron accuracy:",
#       rf_random.cv_results_['mean_test_Neuron_accuracy'][rf_random.best_index_]*100)

# print("Oligo accuracy:",
#       rf_random.cv_results_['mean_test_Oligo_accuracy'][rf_random.best_index_]*100)

# print("Others accuracy:",
#       rf_random.cv_results_['mean_test_Others_accuracy'][rf_random.best_index_]*100)


# print("Classified A as N:",
#       rf_random.cv_results_['mean_test_A_as_N'][rf_random.best_index_]*100)

# print("Classified A as O:",
#       rf_random.cv_results_['mean_test_A_as_O'][rf_random.best_index_]*100)

# print("Classified A as Others:",
#       rf_random.cv_results_['mean_test_A_as_Others'][rf_random.best_index_]*100)

# print("Classified N as A:",
#       rf_random.cv_results_['mean_test_N_as_A'][rf_random.best_index_]*100)

# print("Classified N as O:",
#       rf_random.cv_results_['mean_test_N_as_O'][rf_random.best_index_]*100)

# print("Classified N as Others:",
#       rf_random.cv_results_['mean_test_N_as_Others'][rf_random.best_index_]*100)

# print("Classified O as A:",
#       rf_random.cv_results_['mean_test_O_as_A'][rf_random.best_index_]*100)

# print("Classified O as N:",
#       rf_random.cv_results_['mean_test_O_as_N'][rf_random.best_index_]*100)

# print("Classified O as Others:",
#       rf_random.cv_results_['mean_test_O_as_Others'][rf_random.best_index_]*100)


# print("Classified Others as A:",
#       rf_random.cv_results_['mean_test_Others_as_A'][rf_random.best_index_]*100)

# print("Classified Others as N:",
#       rf_random.cv_results_['mean_test_Others_as_N'][rf_random.best_index_]*100)

# print("Classified Others as O:",
#       rf_random.cv_results_['mean_test_Others_as_O'][rf_random.best_index_]*100)
                                                       

PR-AUC: 85.2294027848721
ROC-AUC: 95.75814823565217
ROC-AUC: 94.30629270300071
Balanced accuracy: 76.4833961098148
F1_weighted: 81.00526513749949
Astrocyte accuracy: 61.929824561403514
Neuron accuracy: 82.0
Oligo accuracy: 92.70529049010061
Others accuracy: 69.2984693877551
Classified A as N: 16.608187134502923
Classified A as O: 12.923976608187132
Classified A as Others: 8.538011695906432
Classified N as A: 13.0
Classified N as O: 0.5
Classified N as Others: 4.5
Classified O as A: 3.586497890295358
Classified O as N: 0.25478740668614086
Classified O as Others: 3.4534242129178843
Classified Others as A: 8.205782312925171
Classified Others as N: 6.164965986394557
Classified Others as O: 16.33078231292517

## Manual cross validation, using PR curves

In [33]:
### manual cross-validation method 
# Only the training split is used here; the folds below are carved out of it.
x= X_train
y=y_train
## Setting up cross-validation 
skf = StratifiedKFold(n_splits=10) # shuffling = False, no need to set random_state
skf.get_n_splits(x,y) # using only training data
print(skf)

#for train_index, test_index in skf.split(x,y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    #print("Train_size:", train_index.size, "Test_size:", test_index.size)
    
## Training 
# Per-fold accumulators: the cross-validation loop appends one entry per fold.
# Naming: a "t_" prefix / "_t" suffix marks results obtained after class-specific
# thresholding (ambiguous cells removed for the metrics).
# NOTE(review): the "_c" lists are never filled by the visible loop; presumably
# they were meant for a calibrated classifier - confirm or remove.

# Balanced accuracy per fold
accuracies = []
accuracies_c = []

t_accuracies= []
t_accuracies_c= []

# classification_report dictionaries per fold
reports = []
reports_c = []

t_reports= []
t_reports_c= []

# Confusion matrices per fold
confusion_matrices = []
confusion_matrices_c = [] 

t_confusion_matrices=[]
t_confusion_matrices_c=[]

#train_features =[] 
#train_n_features=[]
# Predicted labels per fold (default predict() output)
y_preds = []
y_preds_c = []

# Predicted labels per fold after thresholding
y_preds_t =[]
y_preds_t_c =[]

# Predicted class probabilities per fold
y_prob_preds = []
y_prob_preds_c = []

# Held-out labels / rows per fold, kept for later inspection
y_cv_test=[]
x_cv_test=[]

# ROC AUC per fold
roc_auc_scores=[]
roc_auc_scores_c=[]

# Log loss per fold
log_losses=[]
log_losses_c=[]

#brier_scores=[]
#brier_scores_c=[]

# Per-fold dictionaries of best per-class thresholds
best_parameters=[]
best_parameters_c=[]

for train_index, test_index in skf.split(x,y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    #print("Train_size:", train_index.size, "Test_size:", test_index.size)

    # skf.split yields POSITIONAL indices, so both features and labels are
    # sliced with .iloc. (Plain y[train_index] is label-based on a Series and
    # only matches positions when the index happens to be 0..n-1.)
    x_train_, x_test_ = x.iloc[train_index], x.iloc[test_index]
    y_train_, y_test_ = y.iloc[train_index], y.iloc[test_index]
    x_test_l = X_train_l.iloc[test_index]
    #print("n of each cell types at test:\n", y_test_.value_counts())

    ## 1) Create classifier
    # Hyperparameters are hard-coded from the randomized-search result:
    # {'selector__n_features_to_select': 30, 'clf__sampling_strategy': 'not majority',
    #  'clf__random_state': 42, 'clf__n_estimators': 600, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 1,
    #  'clf__max_features': 'sqrt', 'clf__max_depth': 60, 'clf__ccp_alpha': 0.0, 'clf__bootstrap': True}
    # An earlier candidate, kept for reference:
    # {'selector__n_features_to_select': 32, 'clf__sampling_strategy': 'not majority', 'clf__random_state': 42,
    #  'clf__n_estimators': 1800, 'clf__min_samples_split': 2, 'clf__min_samples_leaf': 2,
    #  'clf__max_features': 'auto', 'clf__max_depth': 40, 'clf__ccp_alpha': 0.005, 'clf__bootstrap': True}
    pipeline_final=Pipeline([
    ('normalizer',MinMaxScaler()),
    ('selector',RFE(SVC(kernel='linear'),n_features_to_select=30)), 
    ('clf', BalancedRandomForestClassifier(sampling_strategy= 'not majority', random_state= 42,
                                           n_estimators= 600, min_samples_split=10, min_samples_leaf= 1,
                                           max_features= 'sqrt', max_depth= 60, ccp_alpha= 0.0, bootstrap= True))
                                           ### MAKE SURE THESE ARE CORRECT 
                
])

    # 2) Train the (non-calibrated) pipeline on this fold's training part
    pipeline_final.fit(x_train_,y_train_)

    # 3) Get class probability predictions for the held-out part
    y_prob_predict = pipeline_final.predict_proba(x_test_)

    # 4.1) For thresholding: convert y_test_ from name classes to numeric classes
    y_test_numeric = name_to_numeric_classes(y_test_)

    # 4.2) For thresholding: one-vs-rest precision-recall curve for each class
    precision,recall,thresh = multiclass_PR_curves(4,y_test_numeric,y_prob_predict)

    # 4.3) For thresholding: per class, keep the PR point (threshold, F-score,
    # precision, recall) with the highest F-score
    # NOTE(review): the thresholds are tuned on the same held-out fold they are
    # then applied to in 4.4, so the thresholded metrics are likely optimistic.
    best_params_ = best_param_f_score(4,precision,recall,thresh)
    best_parameters.append(best_params_)

    # 4.4) For thresholding: apply the class-specific thresholds to get crisp
    # labels (cells may come out as 'Ambiguous')
    t_class = threshold_list_of_classes_5(y_prob_predict,best_params_)

    # 5) Get class labels from the pipeline's default predict()
    y_predict = pipeline_final.predict(x_test_)

    # 6) Put predictions (labels & probabilities & thresholded labels) in the corresponding list
    y_preds.append(y_predict)

    y_prob_preds.append(y_prob_predict)

    y_preds_t.append(t_class)

    y_cv_test.append(y_test_) # for visualisation purposes later on
    x_cv_test.append(x_test_l)

    # 7) Remove 'ambiguous class' from t_class & y_test_ - for accuracy calculation
    (y_predict_no_amb,y_test_no_amb) = remove_amb_class(t_class,y_test_)

    # 8) Calculate and put Performance metric (balanced accuracy) per fold into a list
    accuracies.append(balanced_accuracy_score(y_test_,y_predict)) ## using BALANCED ACC.

    t_accuracies.append(balanced_accuracy_score(y_test_no_amb,y_predict_no_amb))

    #8.1) Compute classification reports
    reports.append(classification_report(y_test_,y_predict,output_dict=True)) 
  #  reports_c.append(classification_report(y_test_,y_predict_c,output_dict=True)) 

    t_reports.append(classification_report(y_test_no_amb,y_predict_no_amb,output_dict=True))

    # 9) Calculate and put ROC AUC scores per fold into a list 
    roc_auc_scores.append(roc_auc_score(y_test_,y_prob_predict,multi_class='ovr',average='weighted'))

    #9.1) Calculate and put log loss per fold into a list
    log_losses.append(log_loss(y_test_,y_prob_predict))

    # 10) Create confusion matrices for default & thresholded results per fold then put in a list 
    cm = confusion_matrix(y_test_,y_predict, labels=["Astro","Neuron","Oligo","Others"]) #,normalize='true'

    cm_t = confusion_matrix(y_test_no_amb,y_predict_no_amb, labels=["Astro","Neuron","Oligo","Others"])#,normalize='true'

    confusion_matrices.append(cm)

    t_confusion_matrices.append(cm_t)

# Fold-averaged probabilistic metrics (computed on the un-thresholded probabilities)
print('mean ROC AUC:',mean(roc_auc_scores))
print('--------------------------------')
print('mean log loss:',mean(log_losses))
print('--------------------------------')

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
mean ROC AUC: 0.9575814823565217
--------------------------------
mean log loss: 0.4988873125242989
--------------------------------


Extracting information from best parameters

In [34]:
# Each entry of `best_parameters` is one fold's dict
# {class_index: (threshold, score, ...)} with class indices
# 0 = Astro, 1 = Neuron, 2 = Oligo, 3 = Others.

# Thresholds: element 0 of each class record, one value per fold
astro_t = [fold[0][0] for fold in best_parameters]
neuron_t = [fold[1][0] for fold in best_parameters]
oligo_t = [fold[2][0] for fold in best_parameters]
others_t = [fold[3][0] for fold in best_parameters]

# Best scores: element 1 of each class record, one value per fold.
# The "_gm" suffix is historical: the cross-validation loop fills
# `best_parameters` via best_param_f_score, so these hold F-scores.
astro_gm = [fold[0][1] for fold in best_parameters]
neuron_gm = [fold[1][1] for fold in best_parameters]
oligo_gm = [fold[2][1] for fold in best_parameters]
others_gm = [fold[3][1] for fold in best_parameters]

In [35]:
# Fold-averaged threshold and best score per class. The second number on each
# line is the mean of that class's per-fold best score (second element of its
# best-parameter record), printed under the label 'mean f1_macro'.
print("NON CALIBRATED")
print('mean astro threshold:', mean(astro_t), ', mean f1_macro:',mean(astro_gm))
print('mean neuron threshold:', mean(neuron_t), ', mean f1_macro:',mean(neuron_gm))
print('mean oligo threshold:', mean(oligo_t), ', mean f1_macro:',mean(oligo_gm))
print('mean others threshold:', mean(others_t), ', mean f1_macro:',mean(others_gm))
# Sum of the four mean thresholds. Each threshold is chosen independently from
# its own one-vs-rest curve, so nothing constrains them to sum to 1 (unlike the
# class probabilities of a single cell).
print(mean(astro_t)+mean(neuron_t)+mean(oligo_t)+mean(others_t)) #is it okay that it is above 1? 

NON CALIBRATED
mean astro threshold: 0.3973044856910375 , mean f1_macro: 0.6694727377445115
mean neuron threshold: 0.45273629257841896 , mean f1_macro: 0.8547011614208093
mean oligo threshold: 0.5795161596615945 , mean f1_macro: 0.9204556173272136
mean others threshold: 0.24605117846048574 , mean f1_macro: 0.8408519357073003
1.6756081163915366


**NO Thresholding:** Non-calibrated

In [36]:
# Summary over the 10 folds WITHOUT thresholding: fold-averaged scores plus the
# confusion matrix pooled across folds.
print('with no thresholding:', mean(accuracies) * 100)
for _avg_key, _avg_label in (('macro avg', 'Macro avg F1 '), ('weighted avg', 'Weighted avg F1 ')):
    print(_avg_label, mean([rep[_avg_key]['f1-score'] for rep in reports]) * 100)

print("--------------------------")
# Pooled counts, then each row (true class) rescaled to percentages.
C = sum(confusion_matrices)
final_cm = C.astype('float') / C.sum(axis=1, keepdims=True) * 100
print(C)
print(final_cm)
print("--------------------------")
# Diagonal of the row-normalised matrix = share of each true class predicted correctly.
for _pos, _cls in enumerate(("Astro", "Neuron", "Oligo", "Others")):
    print(_cls + " accuracy", final_cm[_pos][_pos])
print("--------------------------")
# F1-score per class:
for _cls in ('Astro', 'Neuron', 'Oligo', 'Others'):
    print(_cls + ' f1-score ', mean([rep[_cls]['f1-score'] for rep in reports]) * 100)
print("--------------------------")
for _metric, _metric_label in (('precision', 'Macro avg precision'), ('recall', 'Macro avg recall ')):
    print(_metric_label, mean([rep['macro avg'][_metric] for rep in reports]) * 100)

with no thresholding: 76.4833961098148
Macro avg F1  75.59362112648908
Weighted avg F1  81.00526513749949
--------------------------
[[116  31  24  16]
 [ 26 164   1   9]
 [ 28   2 726  27]
 [ 40  30  79 337]]
[[62.03208556 16.57754011 12.8342246   8.55614973]
 [13.         82.          0.5         4.5       ]
 [ 3.57598978  0.25542784 92.72030651  3.44827586]
 [ 8.23045267  6.17283951 16.25514403 69.34156379]]
--------------------------
Astro accuracy 62.03208556149733
Neuron accuracy 82.0
Oligo accuracy 92.72030651340997
Others accuracy 69.34156378600824
--------------------------
Astro f1-score  58.945445973757394
Neuron f1-score  76.7495487617384
Oligo f1-score  90.03034845776652
Others f1-score  76.64914131269401
--------------------------
Macro avg precision 76.35357580147054
Macro avg recall  76.4833961098148


**Thresholding:** Non-calibrated

In [37]:
#0.5
# Summary over the 10 folds WITH thresholding (ambiguous cells already removed
# from the per-fold reports and confusion matrices).
print('with thresholding (non-calibrated) ACC :', mean(t_accuracies) * 100)
for _avg_key, _avg_label in (('macro avg', 'Macro avg F1 '), ('weighted avg', 'Weighted avg F1 ')):
    print(_avg_label, mean([rep[_avg_key]['f1-score'] for rep in t_reports]) * 100)
print("--------------------------")
# Pooled counts, then each row (true class) rescaled to percentages.
C_t = sum(t_confusion_matrices)
final_cm_t = C_t.astype('float') / C_t.sum(axis=1, keepdims=True) * 100
print(C_t)
print(final_cm_t)
print("--------------------------")
# Diagonal of the row-normalised matrix = share of each true class predicted correctly.
for _pos, _cls in enumerate(("Astro", "Neuron", "Oligo", "Others")):
    print(_cls + " accuracy", final_cm_t[_pos][_pos])
print('------------------------------')
# F1-score, precision and recall per class, averaged over folds:
for _cls in ('Astro', 'Neuron', 'Oligo', 'Others'):
    for _metric in ('f1-score', 'precision', 'recall'):
        print(_cls + ' ' + _metric + ' ', mean([rep[_cls][_metric] for rep in t_reports]) * 100)
    print("--------------------------")
for _metric, _metric_label in (('precision', 'Macro avg precision'), ('recall', 'Macro avg recall ')):
    print(_metric_label, mean([rep['macro avg'][_metric] for rep in t_reports]) * 100)

# Checking on disagreements 
# Compare the thresholded labels with the default predict() labels, cell by cell.

# Stack the per-fold outputs into single frames with a fresh 0..n-1 index so
# that rows line up across the frames.
thresholded_preds = pd.concat([pd.DataFrame(i) for i in y_preds_t])
thresholded_preds = thresholded_preds.rename(columns={0:'t_Class'})
thresholded_preds = thresholded_preds.reset_index(drop=True)


preds = pd.concat([pd.DataFrame(i) for i in y_preds])
preds = preds.rename(columns={0:'Class'})
preds = preds.reset_index(drop=True)

truth = pd.concat([pd.DataFrame(i) for i in y_cv_test])
truth = truth.rename(columns={'Class':'Truth'})
truth = truth.reset_index(drop=True)

# x_truth = pd.concat([pd.DataFrame(i[['Image','Centroid_X','Centroid_Y']]) for i in x_cv_test])

#Combine absolute prediction to thresholded prediction

#get predicted probabilities
p_probs=pd.concat([pd.DataFrame(i) for i in y_prob_preds])
p_probs= p_probs.rename(columns={0:'Astro',1:'Neuron',2:'Oligo',3:'Others'})
p_probs = p_probs.reset_index(drop=True)

results = thresholded_preds.copy()
results.loc[:,'Class']=preds
results.loc[:,'Truth'] = truth
results.loc[:,'Astro'] = p_probs['Astro']
results.loc[:,'Neuron'] = p_probs['Neuron']
results.loc[:,'Oligo'] = p_probs['Oligo']
results.loc[:,'Others'] = p_probs['Others']
# results.loc[:,'Image'] = x_truth['Image']
# results.loc[:,'Centroid_X'] = x_truth['Centroid_X']
# results.loc[:,'Centroid_Y'] = x_truth['Centroid_Y']


#Calculate agreement between the two 
results.loc[:,'agreement'] = (results['t_Class']==results['Class'])*1
agreements = results['agreement'].value_counts()
# value_counts() only lists outcomes that occur, so use .get(..., 0): indexing
# agreements[0] / agreements[1] directly raises KeyError when every cell agrees
# (or every cell disagrees).
n_agree = int(agreements.get(1, 0))
n_disagree = int(agreements.get(0, 0))
n_compared = n_agree + n_disagree
pct_agree = n_agree / n_compared * 100 if n_compared else float('nan')
pct_disagree = n_disagree / n_compared * 100 if n_compared else float('nan')
# The percentage is printed as plain values; it used to be wrapped in a tuple
# and came out as "(83.7..., '%')".
print('Agreement: ',n_agree,'/',n_compared,'=> ',pct_agree,'%')
print('Disagreement: ',n_disagree,'/',n_compared,'=> ',pct_disagree,'%')
print('------------------------------')
# Of those disagreed, what are they? (those with prob < 0.5)
print('Of the disagreements, what are they?')
disagreed = results[results['agreement']==0]
# Default-predicted class of the cells where the two labellings differ.
disagreed['Class'].value_counts()

with thresholding (non-calibrated) ACC : 87.16553912430388
Macro avg F1  86.49567115435103
Weighted avg F1  89.8817067045254
--------------------------
[[116  13   5  15]
 [ 15 163   1   3]
 [ 17   2 683  26]
 [ 11  10  32 367]]
[[77.85234899  8.72483221  3.3557047  10.06711409]
 [ 8.24175824 89.56043956  0.54945055  1.64835165]
 [ 2.33516484  0.27472527 93.81868132  3.57142857]
 [ 2.61904762  2.38095238  7.61904762 87.38095238]]
--------------------------
Astro accuracy 77.85234899328859
Neuron accuracy 89.56043956043956
Oligo accuracy 93.81868131868131
Others accuracy 87.38095238095238
------------------------------
Astro f1-score  75.35297468269604
Astro precision  75.50998537066648
Astro recall  77.89017273576097
--------------------------
Neuron f1-score  88.2370073689147
Neuron precision  87.19364689915908
Neuron recall  90.03801169590643
--------------------------
Oligo f1-score  94.20731751311571
Oligo precision  94.85318412080602
Oligo recall  93.7205978214672
----------------

Oligo     109
Astro      66
Neuron     47
Others     36
Name: Class, dtype: int64

In [38]:
# Number of cells per thresholded label (includes 'Ambiguous' when present).
thresholded_preds['t_Class'].value_counts()

Oligo        721
Others       411
Neuron       188
Ambiguous    177
Astro        159
Name: t_Class, dtype: int64

In [39]:
# Preview the first rows of the combined results table.
results.head()

Unnamed: 0,t_Class,Class,Truth,Astro,Neuron,Oligo,Others,agreement
0,Oligo,Oligo,Oligo,0.003373,0.0,0.867352,0.129275,1
1,Oligo,Oligo,Oligo,0.000764,0.0,0.987805,0.011431,1
2,Oligo,Oligo,Oligo,0.0,0.0,0.991724,0.008276,1
3,Oligo,Oligo,Oligo,0.000278,0.0,0.999484,0.000238,1
4,Oligo,Oligo,Oligo,0.054565,0.0,0.917978,0.027457,1


In [40]:
# Rows whose thresholded label differs from the ground truth
# (rows labelled 'Ambiguous' are part of this selection).
results.loc[results['t_Class'].ne(results['Truth'])]

Unnamed: 0,t_Class,Class,Truth,Astro,Neuron,Oligo,Others,agreement
8,Others,Others,Oligo,0.008141,0.002809,0.358683,0.630367,1
12,Others,Others,Oligo,0.259244,0.045184,0.221733,0.473839,1
23,Ambiguous,Astro,Astro,0.535422,0.182759,0.084736,0.197083,0
34,Neuron,Neuron,Oligo,0.354454,0.491473,0.075477,0.078596,1
49,Others,Others,Astro,0.052310,0.009289,0.293663,0.644738,1
...,...,...,...,...,...,...,...,...
1642,Neuron,Neuron,Astro,0.023917,0.974504,0.000167,0.001412,1
1644,Neuron,Neuron,Astro,0.037368,0.958042,0.000919,0.003671,1
1650,Astro,Astro,Oligo,0.905754,0.025336,0.063158,0.005752,1
1652,Others,Astro,Astro,0.458539,0.268606,0.038744,0.234111,0


In [41]:
# Stack the per-split test frames and renumber the rows from 0.
x_test = pd.concat(x_cv_test).reset_index(drop=True)
# Keep only the slide, region and centroid columns for locating each cell.
x_test_subset = x_test.loc[:, ['Image', 'Name', 'Centroid_X', 'Centroid_Y']]

In [42]:
# Working frame kept separate from `results`. reset_index() already returns a
# new DataFrame, so a preceding copy() would only be overwritten.
results_ = results.reset_index(drop=True)

In [43]:
# join() matches rows on the index, so both frames must describe the same cells
# in the same order; a row-count mismatch would silently leave NaNs behind.
assert len(results) == len(x_test_subset), \
    'results and x_test_subset must have the same number of rows'
# Rebuild from `results` each time so this cell can be re-run: joining onto an
# already-joined results_ raises because the column names overlap.
results_ = results.reset_index(drop=True).join(x_test_subset)

In [44]:
# Preview results_ with the Image / Name / Centroid columns attached.
results_.head()

Unnamed: 0,t_Class,Class,Truth,Astro,Neuron,Oligo,Others,agreement,Image,Name,Centroid_X,Centroid_Y
0,Oligo,Oligo,Oligo,0.003373,0.0,0.867352,0.129275,1,721708.svs,Globus Pallidus,12022.2,8715.8
1,Oligo,Oligo,Oligo,0.000764,0.0,0.987805,0.011431,1,721708.svs,Globus Pallidus,12029.8,8722.0
2,Oligo,Oligo,Oligo,0.0,0.0,0.991724,0.008276,1,721708.svs,Globus Pallidus,10048.5,9974.4
3,Oligo,Oligo,Oligo,0.000278,0.0,0.999484,0.000238,1,721708.svs,Globus Pallidus,10029.8,9984.5
4,Oligo,Oligo,Oligo,0.054565,0.0,0.917978,0.027457,1,721708.svs,Globus Pallidus,10031.0,9993.5


In [45]:
# Number of cells per slide image in results_.
results_['Image'].value_counts()

747814.svs    275
721708.svs    257
747370.svs    244
747820.svs    223
771885.svs    203
747308.svs    182
747828.svs    149
755497.svs    123
Name: Image, dtype: int64

In [46]:
# Cells whose thresholded label does not equal the ground truth;
# rows labelled 'Ambiguous' are included in this subset.
incorrect = results_.loc[results_['t_Class'].ne(results_['Truth'])]
incorrect.head()

Unnamed: 0,t_Class,Class,Truth,Astro,Neuron,Oligo,Others,agreement,Image,Name,Centroid_X,Centroid_Y
8,Others,Others,Oligo,0.008141,0.002809,0.358683,0.630367,1,721708.svs,Globus Pallidus,12000.0,10008.3
12,Others,Others,Oligo,0.259244,0.045184,0.221733,0.473839,1,721708.svs,Globus Pallidus,11999.4,10042.9
23,Ambiguous,Astro,Astro,0.535422,0.182759,0.084736,0.197083,0,721708.svs,Globus Pallidus,10857.7,10904.9
34,Neuron,Neuron,Oligo,0.354454,0.491473,0.075477,0.078596,1,721708.svs,Globus Pallidus,12103.7,11999.8
49,Others,Others,Astro,0.05231,0.009289,0.293663,0.644738,1,721708.svs,Globus Pallidus,11764.1,12906.0


In [47]:
# True class of the cells in `incorrect`.
incorrect['Truth'].value_counts()

Others    119
Oligo     100
Astro      71
Neuron     37
Name: Truth, dtype: int64

In [48]:
# Region ('Name') of the cells in `incorrect`.
incorrect['Name'].value_counts()

Striatum               142
Globus Pallidus        107
Subthalamic Nucleus     78
Name: Name, dtype: int64

In [49]:
# Region ('Name') counts over every cell in results_.
results_['Name'].value_counts()

Striatum               776
Globus Pallidus        580
Subthalamic Nucleus    300
Name: Name, dtype: int64

Confusion matrices:

In [50]:
# Globus Pallidus: Truth (rows) vs thresholded label (columns), each row scaled
# to percentages. Only pairs where both labels appear in `labels` are counted,
# so cells labelled 'Ambiguous' are left out before normalisation.
GP_results = results_.loc[results_['Name'].eq('Globus Pallidus')]
GP_cm = 100 * confusion_matrix(
    y_true=GP_results['Truth'],
    y_pred=GP_results['t_Class'],
    labels=["Astro", "Neuron", "Oligo", "Others"],
    normalize='true',
)
GP_cm

array([[77.08333333, 10.41666667,  0.        , 12.5       ],
       [ 4.16666667, 87.5       ,  0.        ,  8.33333333],
       [ 1.38408304,  0.34602076, 95.5017301 ,  2.76816609],
       [ 3.79746835,  0.63291139,  7.59493671, 87.97468354]])

In [51]:
# Globus Pallidus, restricted to Astro / Neuron / Oligo: pairs involving
# 'Others' or 'Ambiguous' are not counted, and each row is scaled to percentages.
GP_results = results_.loc[results_['Name'].eq('Globus Pallidus')]
GP_cm = 100 * confusion_matrix(
    y_true=GP_results['Truth'],
    y_pred=GP_results['t_Class'],
    labels=["Astro", "Neuron", "Oligo"],
    normalize='true',
)
GP_cm

array([[88.0952381 , 11.9047619 ,  0.        ],
       [ 4.54545455, 95.45454545,  0.        ],
       [ 1.42348754,  0.35587189, 98.22064057]])

In [52]:
# Striatum: Truth (rows) vs thresholded label (columns), each row scaled to
# percentages. Only pairs where both labels appear in `labels` are counted,
# so cells labelled 'Ambiguous' are left out before normalisation.
STR_results = results_.loc[results_['Name'].eq('Striatum')]
STR_cm = 100 * confusion_matrix(
    y_true=STR_results['Truth'],
    y_pred=STR_results['t_Class'],
    labels=["Astro", "Neuron", "Oligo", "Others"],
    normalize='true',
)
STR_cm

array([[82.05128205,  7.69230769,  2.56410256,  7.69230769],
       [ 7.35294118, 91.91176471,  0.        ,  0.73529412],
       [ 2.56410256,  0.        , 92.94871795,  4.48717949],
       [ 1.66666667,  4.44444444,  7.77777778, 86.11111111]])

In [53]:
# Striatum, restricted to Astro / Neuron / Oligo: pairs involving 'Others' or
# 'Ambiguous' are not counted, and each row is scaled to percentages.
STR_results = results_.loc[results_['Name'].eq('Striatum')]
STR_cm = 100 * confusion_matrix(
    y_true=STR_results['Truth'],
    y_pred=STR_results['t_Class'],
    labels=["Astro", "Neuron", "Oligo"],
    normalize='true',
)
STR_cm

array([[88.88888889,  8.33333333,  2.77777778],
       [ 7.40740741, 92.59259259,  0.        ],
       [ 2.68456376,  0.        , 97.31543624]])

In [54]:
# Subthalamic Nucleus: Truth (rows) vs thresholded label (columns), each row
# scaled to percentages. Only pairs where both labels appear in `labels` are
# counted, so cells labelled 'Ambiguous' are left out before normalisation.
STN_results = results_.loc[results_['Name'].eq('Subthalamic Nucleus')]
STN_cm = 100 * confusion_matrix(
    y_true=STN_results['Truth'],
    y_pred=STN_results['t_Class'],
    labels=["Astro", "Neuron", "Oligo", "Others"],
    normalize='true',
)
STN_cm

array([[65.2173913 ,  8.69565217, 13.04347826, 13.04347826],
       [18.18181818, 77.27272727,  4.54545455,  0.        ],
       [ 3.93700787,  0.78740157, 92.12598425,  3.1496063 ],
       [ 2.43902439,  1.2195122 ,  7.31707317, 89.02439024]])

In [59]:
# Subthalamic Nucleus: raw counts of Truth (rows) vs thresholded label (columns).
# Pairs with a label outside `labels` (e.g. 'Ambiguous') are not counted.
STN_results = results_.loc[results_['Name'].eq('Subthalamic Nucleus')]
STN_cm = confusion_matrix(
    y_true=STN_results['Truth'],
    y_pred=STN_results['t_Class'],
    labels=["Astro", "Neuron", "Oligo", "Others"],
)
STN_cm

array([[ 15,   2,   3,   3],
       [  4,  17,   1,   0],
       [  5,   1, 117,   4],
       [  2,   1,   6,  73]], dtype=int64)

In [62]:
# Thresholded labels within the Subthalamic Nucleus subset.
STN_results['t_Class'].value_counts()

Oligo        127
Others        80
Ambiguous     46
Astro         26
Neuron        21
Name: t_Class, dtype: int64

In [63]:
# True class counts within the Subthalamic Nucleus subset.
STN_results['Truth'].value_counts()

Oligo     137
Others    101
Astro      36
Neuron     26
Name: Truth, dtype: int64

In [55]:
# Subthalamic Nucleus, restricted to Astro / Neuron / Oligo: pairs involving
# 'Others' or 'Ambiguous' are not counted, and each row is scaled to percentages.
STN_results = results_.loc[results_['Name'].eq('Subthalamic Nucleus')]
STN_cm = 100 * confusion_matrix(
    y_true=STN_results['Truth'],
    y_pred=STN_results['t_Class'],
    labels=["Astro", "Neuron", "Oligo"],
    normalize='true',
)
STN_cm

array([[75.        , 10.        , 15.        ],
       [18.18181818, 77.27272727,  4.54545455],
       [ 4.06504065,  0.81300813, 95.12195122]])

In [56]:
# incorrect_as_file = incorrect[['Image','Truth','Centroid_X','Centroid_Y','t_Class']]
# path_ = 'D:/Tanrada_classification/imbalance_cortical_training/cortical_full_slide_predictions/Probabilistic_classification/Model8_occipital_classifier/cell_inspection/incorrect.txt'
# incorrect_as_file.to_csv(path_, sep='\t',index=False)

In [57]:
# Cells whose thresholded label equals the ground truth.
correct = results_.loc[results_['t_Class'].eq(results_['Truth'])]
correct.head()

Unnamed: 0,t_Class,Class,Truth,Astro,Neuron,Oligo,Others,agreement,Image,Name,Centroid_X,Centroid_Y
0,Oligo,Oligo,Oligo,0.003373,0.0,0.867352,0.129275,1,721708.svs,Globus Pallidus,12022.2,8715.8
1,Oligo,Oligo,Oligo,0.000764,0.0,0.987805,0.011431,1,721708.svs,Globus Pallidus,12029.8,8722.0
2,Oligo,Oligo,Oligo,0.0,0.0,0.991724,0.008276,1,721708.svs,Globus Pallidus,10048.5,9974.4
3,Oligo,Oligo,Oligo,0.000278,0.0,0.999484,0.000238,1,721708.svs,Globus Pallidus,10029.8,9984.5
4,Oligo,Oligo,Oligo,0.054565,0.0,0.917978,0.027457,1,721708.svs,Globus Pallidus,10031.0,9993.5


In [58]:
# correct_as_file = correct[['Image','Truth','Centroid_X','Centroid_Y','t_Class']]
# path_ = 'D:/Tanrada_classification/imbalance_cortical_training/cortical_full_slide_predictions/Probabilistic_classification/Model8_occipital_classifier/cell_inspection/correct.txt'
# correct_as_file.to_csv(path_, sep='\t',index=False)