**Inspect values for DAB thresholding setting**

In [1]:
# load important libraries
import sys
sys.path.insert(0,
                '/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Cell_pipeline/Cell_classification/')
from base import *
from constants import *
import joblib 

In [2]:
import os
import warnings

import numpy as np
import pandas as pd

Idea: 
1. There can be bleed-through between the Hematoxylin and DAB channels.
2. Where there is no tau, highly pigmented hematoxylin can bleed into the DAB channel and produce false-positive tau.
3. We want to study tau-negative cells, so we need to find a suitable DAB threshold.
4. Hematoxylin can also appear highly pigmented because of heavy tau, so we need to look at healthy cells, i.e. use only the control slides.
5. Using the controls, we take the highly pigmented hematoxylin cells, look at their DAB values, and derive the threshold from those.
    - Picking the top 1% of all cells by hematoxylin and looking at their DAB value is not good, because those cells will mainly be artefacts (ink stains).
    - So we use the predicted cells (cells not discarded) and check the 75th percentile of the DAB value among the highly pigmented ones.
    


In [14]:
# functions

# to extract the most hematoxylin-pigmented predicted cells of each slide & check their DAB value
def check_hema_dab(file_list,
                   file_path,
                   percentile):
    """Describe the DAB signal of the darkest-hematoxylin predicted cells, per slide.

    For every file named in column 0 of ``file_list`` the tab-separated table
    is read, rows whose ``Class`` is 'Ambiguous', 'Excluded' or 'Unlabelled'
    are dropped, and of the remaining cells only those whose
    'Hematoxylin: Nucleus: Mean' is at or above the ``percentile``-th
    percentile of that slide are kept (``percentile=90`` keeps the top 10%).

    Parameters
    ----------
    file_list : object whose ``[0]`` item iterates over file names
        e.g. a header-less ``pd.read_csv`` result with one file name per row.
    file_path : str
        Directory containing the files; a trailing separator is optional.
    percentile : float
        Hematoxylin percentile (0-100) used as the lower cut-off.

    Returns
    -------
    describe_list : list of pandas.Series
        ``describe()`` of 'DAB: Nucleus: Mean' for the kept cells, one entry
        per file, in the order of ``file_list[0]``.
    mean_hema : list of float
        Mean 'Hematoxylin: Nucleus: Mean' of the kept cells, same order.
    """
    hema_col = 'Hematoxylin: Nucleus: Mean'
    dab_col = 'DAB: Nucleus: Mean'
    to_exclude = ['Ambiguous','Excluded','Unlabelled']

    describe_list = []
    mean_hema = []
    for file_name in file_list[0]:
        # os.path.join works whether or not file_path ends with a separator;
        # usecols avoids loading measurement columns that are never used here.
        dat = pd.read_csv(os.path.join(file_path, file_name),
                          sep="\t",
                          usecols=[hema_col, dab_col, 'Class'])

        # select only predicted class
        predicted = dat[~dat['Class'].isin(to_exclude)]

        if predicted.empty:
            # A percentile of nothing is undefined. Keep the slide's slot (so the
            # results stay aligned with file_list) but make the gap visible.
            warnings.warn(
                "check_hema_dab: no predicted cells in %r; recording NaN" % (file_name,)
            )
            darkest = predicted
        else:
            # keep only predicted cells at/above the requested hematoxylin percentile
            cutoff = np.percentile(predicted[hema_col], percentile)
            darkest = predicted[predicted[hema_col] >= cutoff]

        mean_hema.append(np.mean(darkest[hema_col]))
        describe_list.append(darkest[dab_col].describe())
    return describe_list, mean_hema

# To summarise info from check_hema_dab
def dab_hema_summary(describe_list, mean_hema,file_list): 
    """Print cross-slide summaries of the per-slide DAB statistics.

    Parameters
    ----------
    describe_list : list
        One mapping per slide exposing the keys 'mean', '75%' and 'max'
        (e.g. the ``describe()`` Series returned by ``check_hema_dab``).
    mean_hema : sequence of float
        Per-slide mean hematoxylin value, in the same order as ``describe_list``.
    file_list : object whose ``[0]`` item iterates over slide file names
        Names must be in the same order as the two sequences above.

    Prints the max / min / mean across slides of each statistic, followed by
    the file name and mean hematoxylin of the most and of the least pigmented
    slide. Returns None.
    """
    slide_mean = [desc['mean'] for desc in describe_list]
    slide_q75 = [desc['75%'] for desc in describe_list]
    slide_max = [desc['max'] for desc in describe_list]

    print('---------Mean DAB across slides-----') # a bit too conservative - will miss out quite a lot of tau negative cells
    print('max of mean: ',np.max(slide_mean))
    print('min of mean: ',np.min(slide_mean))
    print('mean of mean: ',np.mean(slide_mean))
    print('---------75% DAB across slides---------')  # probably a good compromise
    print('max of 75%: ',np.max(slide_q75)) # largest per-slide 75th percentile
    print('min of 75%: ',np.min(slide_q75)) # smallest per-slide 75th percentile
    print('mean of 75%: ',np.mean(slide_q75))
    print('---------Max DAB across slides---------') # these are likely artefacts or some cells have tau ?
    print('max of max: ',np.max(slide_max))
    print('min of max: ',np.min(slide_max))
    print('mean of max: ',np.mean(slide_max))

    # Materialise the names so the lookup below is positional: indexing the
    # Series directly is label-based and breaks when the index is not 0..n-1.
    slide_names = list(file_list[0])

    # Slide with the highest mean hematoxylin among its selected cells
    i_max = int(np.argmax(mean_hema))
    #  & the slide with the lowest
    i_min = int(np.argmin(mean_hema))
    # print slide file name and its mean hematoxylin
    print(slide_names[i_max],np.max(mean_hema))
    print(slide_names[i_min],np.min(mean_hema))

**Cortical slides**

In [12]:
# Folder holding the per-slide prediction files for the cortical region
file_path = 'C:/Users/mokur/OneDrive/Desktop/Digital_path/Cell_pipeline/Predictions/Cortical/'

# Header-less list of control file names: one name per row, stored in column 0
cortical_list = pd.read_csv(
    'C:/Users/mokur/OneDrive/Desktop/Digital_path/Cell_pipeline/Predictions/Cortical/controls.txt',
    header=None,
    sep='\t',
)

In [15]:
# Per cortical control slide: keep the predicted cells at or above the 90th
# hematoxylin percentile (the top 10%) and describe their DAB values.
describe_cortical, cortical_mean_hema = check_hema_dab(
    file_list=cortical_list,
    file_path=file_path,
    percentile=90,
)

# Across all cortical slides, inspect the DAB values of those top-10% cells:
#   mean = per-slide mean
#   75%  = per-slide 75th percentile
#   max  = per-slide maximum
dab_hema_summary(
    describe_list=describe_cortical,
    mean_hema=cortical_mean_hema,
    file_list=cortical_list,
)

---------Mean DAB across slides-----
max of mean:  0.23087401687509415
min of mean:  0.1082975157629256
mean of mean:  0.1519149730015387
---------75% DAB across slides---------
max of 75%:  0.2521
min of 75%:  0.1218
mean of 75%:  0.16913571428571433
---------Max DAB across slides---------
max of max:  1.063
min of max:  0.5204
mean of max:  0.7298047619047617
755883.svs_predictions.txt 0.7452118879011602
755524.svs_predictions.txt 0.540801866330391


**Let's look at occipital slides**

In [6]:
# Folder holding the per-slide prediction files for the occipital region
file_path = 'C:/Users/mokur/OneDrive/Desktop/Digital_path/Cell_pipeline/Predictions/Occipital/'

# Header-less list of control file names: one name per row, stored in column 0
occipital_list = pd.read_csv(
    'C:/Users/mokur/OneDrive/Desktop/Digital_path/Cell_pipeline/Predictions/Occipital/controls.txt',
    header=None,
    sep='\t',
)

In [7]:
# Per occipital control slide: keep the predicted cells at or above the 90th
# hematoxylin percentile (the top 10%) and describe their DAB values.
describe_occipital, occipital_mean_hema = check_hema_dab(
    file_list=occipital_list,
    file_path=file_path,
    percentile=90,
)

# Across all occipital slides, inspect the DAB values of those top-10% cells:
#   mean = per-slide mean
#   75%  = per-slide 75th percentile
#   max  = per-slide maximum
dab_hema_summary(
    describe_list=describe_occipital,
    mean_hema=occipital_mean_hema,
    file_list=occipital_list,
)

---------Mean DAB across slides-----
max of mean:  0.24771807826086956
min of mean:  0.1378020860077022
mean of mean:  0.17335235187948933
---------75% DAB across slides---------
max of 75%:  0.2665
min of 75%:  0.1538
mean of 75%:  0.1902
---------Max DAB across slides---------
max of max:  0.7461
min of max:  0.5962
mean of max:  0.674475
755576.svs_predictions.txt 0.7312075999999998
755508.svs_predictions.txt 0.6671609962591061


**Let's look at BG slides**

In [8]:
# Folder holding the per-slide prediction files for the BG region
file_path = 'C:/Users/mokur/OneDrive/Desktop/Digital_path/Cell_pipeline/Predictions/BG/'

# Header-less list of control file names: one name per row, stored in column 0
bg_list = pd.read_csv(
    'C:/Users/mokur/OneDrive/Desktop/Digital_path/Cell_pipeline/Predictions/BG/controls.txt',
    header=None,
    sep='\t',
)

In [9]:
# Per BG control slide: keep the predicted cells at or above the 90th
# hematoxylin percentile (the top 10%) and describe their DAB values.
describe_bg, bg_mean_hema = check_hema_dab(
    file_list=bg_list,
    file_path=file_path,
    percentile=90,
)

# Across all BG slides, inspect the DAB values of those top-10% cells:
#   mean = per-slide mean
#   75%  = per-slide 75th percentile
#   max  = per-slide maximum
dab_hema_summary(
    describe_list=describe_bg,
    mean_hema=bg_mean_hema,
    file_list=bg_list,
)

---------Mean DAB across slides-----
max of mean:  0.16214867534096253
min of mean:  0.1430056157439618
mean of mean:  0.1506628395827621
---------75% DAB across slides---------
max of 75%:  0.1761
min of 75%:  0.1547
mean of 75%:  0.16326000000000002
---------Max DAB across slides---------
max of max:  0.9329
min of max:  0.6732
mean of max:  0.8290200000000001
755497.svs_predictions.txt 0.7190593196425772
755511.svs_predictions.txt 0.643299681940165
