## Example script: applying pre-trained tau classifiers to basal ganglia slides

**Import relevant modules**

In [1]:
import sys
# Put the local Tau_classification folder first on the module search path so
# that the `base`, `constants` and `tau_classification` modules below resolve.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
sys.path.insert(0,
                '/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/')

# NOTE(review): later cells use `pd` and `extracted_features` without importing
# them explicitly; presumably they come from these wildcard imports — confirm.
from base import *
from constants import *
from tau_classification import * 
# joblib is used to load the serialized pre-trained models.
import joblib

### Load pre-trained models

**Screening classifier**

In [2]:
# Folder and file name of the serialized screening classifier
# (used later to separate non-tau from tau detections).
path = "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/Pre-trained/Models/"
filename = "screening_classifier_updated3.sav"

# Deserialize the stored model object.
screening_model = joblib.load(f"{path}{filename}")

In [3]:
# Sanity checks on the loaded screening model:
# print its pipeline, then preview the first five rows of its
# feature-importance table.
print(screening_model.pipeline)
screening_model.f_importance.head(5)

Pipeline(steps=[('normalizer', MinMaxScaler()),
                ('selector',
                 RFE(estimator=RandomForestClassifier(random_state=42),
                     n_features_to_select=46)),
                ('clf',
                 BalancedRandomForestClassifier(class_weight='balanced',
                                                max_features=1,
                                                min_samples_leaf=2,
                                                n_estimators=600,
                                                random_state=42))])


Unnamed: 0,features,importance
33,ROI: 0.25 µm per pixel: Hematoxylin: Mean,0.043437
34,ROI: 0.25 µm per pixel: Hematoxylin: Median,0.041947
37,ROI: 0.25 µm per pixel: Red: Mean,0.038536
36,ROI: 0.25 µm per pixel: Red: Max,0.036822
19,ROI: 0.25 µm per pixel: DAB: Haralick Sum aver...,0.034232


**Tau classifier for Striatum**

In [4]:
# Folder and file name of the serialized striatum (STR) tau classifier.
path = "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/Pre-trained/Models/"
filename = "str_classifier.sav"

# Deserialize the stored model object.
str_model = joblib.load(f"{path}{filename}")

In [5]:
# Print the parameters stored on the striatum model, then preview the first
# five rows of its feature-importance table.
print(str_model.best_parameters)
str_model.f_importance.head(5)

{0: (0.19564858129927834, 0.8870751195750726, 0.873128875762111, 0.9072192513368984), 1: (0.8264912975852534, 0.9777777777777779, 1.0, 0.96), 2: (0.7613745234867089, 0.991988553025763, 0.9920257276937077, 0.9919779041965537), 3: (0.45007735002598476, 0.9372955624546812, 0.914380764163373, 0.9650000000000001)}


Unnamed: 0,features,importance
0,Area µm^2,0.214311
4,Min diameter µm,0.181223
19,ROI: 0.25 µm per pixel: DAB: Mean,0.168029
20,ROI: 0.25 µm per pixel: DAB: Median,0.113468
16,ROI: 0.25 µm per pixel: DAB: Haralick Sum aver...,0.079985


**Tau classifier for Subthalamic nucleus & globus pallidus**

In [6]:
# Folder and file name of the serialized tau classifier shared by the
# subthalamic nucleus (STN) and globus pallidus (GP).
path = "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/Pre-trained/Models/"
filename = "stn_gp_classifier.sav"

# Deserialize the stored model object.
stn_gp_model = joblib.load(f"{path}{filename}")

In [7]:
# Print the parameters stored on the STN/GP model, then preview the first
# five rows of its feature-importance table.
print(stn_gp_model.best_parameters)
stn_gp_model.f_importance.head(5)

{0: (0.18845399342448405, 0.8946962279425845, 0.8965841716432827, 0.9002459016393443), 1: (0.7187865615771717, 0.9529564106344293, 0.9518181818181819, 0.9588888888888889), 2: (0.774700336310272, 0.9956218336556081, 0.9939498939499222, 0.9973053231655025)}


Unnamed: 0,features,importance
0,Area µm^2,0.510403
4,Min diameter µm,0.19324
21,ROI: 0.25 µm per pixel: DAB: Mean,0.074044
3,Max diameter µm,0.031333
33,Solidity,0.028362


### Putting them together: tau classification pipeline for novel *basal ganglia* slides

In [8]:
# Folder that holds the listing file and the per-slide detection files
# (each file name is appended to this prefix when it is read).
novel_path = '/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Detections/BG/'
# Listing file inside novel_path; it is read line by line, one detection
# file name per line.
novel_filename = 'detections.txt'
# Prefix for the exported "<Image>_predictions.txt" files.
# NOTE(review): this is a Windows-style "C:/..." path while the inputs above
# use "/Users/..." — confirm it is valid on the machine running the notebook.
prediction_path = "C:/Users/mokur/OneDrive/Desktop/Digital_path/Predictions/BG/"

**Reading in files**

In [9]:
# Read the listing file: one detection file name per line.
# Blank or whitespace-only lines are dropped, because each entry is later
# appended to novel_path and passed to pd.read_csv; an empty entry would make
# that call try to read the folder itself and fail.
with open(novel_path + novel_filename) as f:
    mylist = [line for line in f.read().splitlines() if line.strip()]

print("Read in: ", len(mylist), "files")

Read in:  27 files


#### Tau classification

In [10]:
# Maps the 'Parent' annotation name of a detection to its region code.
# Re-segmented annotations ("*_reseg") collapse onto the same region code as
# the original annotation.
code = dict(
    GP='GP',
    STN='STN',
    STR='STR',
    GP_reseg='GP',
    STN_reseg='STN',
    STR_reseg='STR',
)

In [11]:
# Tau classification loop.
#
# For every detection file listed in `mylist`:
#   STEP1  read the tab-separated file, keep the `extracted_features` columns
#          and the cells whose Class is "Unlabelled";
#   STEP2  run the screening classifier (Non-tau vs Tau);
#   STEP3  run the region-specific hallmark classifiers on the Tau cells
#          (STR model for 'STR', STN/GP model for 'STN' and 'GP');
#   STEP4  recombine all cells and export "<Image>_predictions.txt".
#
# `faulty_file` collects one note per slide (or slide region) that could not
# be classified; the notes are printed at the end of the run.

# Bookkeeping columns kept for the export but not fed to the classifiers.
_META_COLUMNS = ["Image",
                 "Name",
                 "Class",
                 "Parent",
                 "ROI",
                 "Centroid_X",
                 "Centroid_Y"]

# Columns written to the per-slide prediction file.
_EXPORT_COLUMNS = ["Image",
                   "Name",
                   "Parent",
                   "Class",
                   "Centroid_X",
                   "Centroid_Y",
                   "Area µm^2"]


def _classify_tau_region(model, region_cells, region_label):
    """Classify the tau-positive cells of one region group.

    Parameters
    ----------
    model : object exposing ``predict(X)`` and, afterwards, ``prediction``
        Region-specific hallmark classifier (``str_model`` / ``stn_gp_model``).
    region_cells : pandas.DataFrame
        Tau-positive cells of the region, including the metadata columns.
    region_label : str
        Label used in the progress messages ("STR" or "STN_GP").

    Returns
    -------
    pandas.DataFrame
        Copy of ``region_cells`` whose "Class" column holds the model's
        prediction. When ``region_cells`` is empty the model is not called
        and an (empty) copy is returned unchanged.
    """
    predicted = region_cells.copy()
    if region_cells.empty:
        # Nothing to classify for this region; do not call the model on an
        # empty feature matrix.
        print("No tau in", region_label, "on this slide; skipping its classifier")
        return predicted

    model.predict(region_cells.drop(columns=_META_COLUMNS))
    predicted["Class"] = model.prediction
    print(region_label + ' predicted slide shape: ', predicted["Class"].value_counts())
    return predicted


n_total = len(mylist)
faulty_file = []
for file_number, dat_file in enumerate(mylist, start=1):

    # Read in novel slide
    print("FILE", dat_file, "Number: ", file_number, "/", n_total)
    print("---------------STEP1: Read in data file -------------------")

    dat_ = pd.read_csv(novel_path + dat_file, sep="\t")

    # Fixing order of the columns (copy so the renaming below does not touch dat_)
    ordered = dat_[extracted_features].copy()

    # Changing column names
    # since these names tend to be inconsistent causing problems.
    # The whole column list is reassigned instead of writing into
    # `columns.values`, which mutates the backing array of an immutable Index.
    renamed_columns = list(ordered.columns)
    renamed_columns[5] = "Centroid_X"
    renamed_columns[6] = "Centroid_Y"
    ordered.columns = renamed_columns

    # only unlabelled cells, with the parent annotation mapped to its region code
    dat = ordered[ordered["Class"] == "Unlabelled"].copy()
    dat["Parent"] = [code[parent] for parent in dat["Parent"]]
    print(set(dat['Parent']))
    print("Read in data file:", dat_file)
    print("Data shape is:", dat.shape)

    # Classifier 1: separating Non-tau from Tau
    print(
        "---------------STEP2: Separating Non-tau from Tau -------------------"
    )
    # 1) Remove NA cells
    dat = dat.dropna()
    if dat.empty:
        # No usable cell is left, so there is nothing to predict or export.
        print("No unlabelled cells left after removing NA rows!")
        faulty_file.append(dat_file + " No unlabelled cells to classify")
        continue

    # Positional lookup: after filtering, the row labelled 0 may no longer exist.
    slide_name = dat["Image"].iloc[0]

    predicted1_slide = dat.copy()

    # 2) Dropping extra info features
    X_unlabelled = dat.drop(columns=_META_COLUMNS)

    # 3) Predictions
    screening_model.predict(X_unlabelled)
    predicted1_slide["Class"] = screening_model.prediction
    print(predicted1_slide["Class"].value_counts())

    # Classifier 2: tau hallmark classification
    print(
        "---------------STEP3: Tau hallmark classification -------------------"
    )
    # Select out 'tau' portion only (ignoring non-tau & ambiguous cells )
    tau_portion = predicted1_slide[predicted1_slide["Class"] == "Tau"]
    if tau_portion.empty:
        print("There is no tau on this slide!")
        faulty_file.append(slide_name + " No tau on the slide")
        continue

    # Split BG into STR and GP & STN
    tau_portion_STR = tau_portion[tau_portion['Parent'] == 'STR']
    print('Tau on STR: ', tau_portion_STR.shape[0])

    tau_portion_STN_GP = tau_portion[tau_portion['Parent'].isin(['STN', 'GP'])]
    print('Tau on STN & GP: ', tau_portion_STN_GP.shape[0])

    # A region without tau is noted, but the slide is still processed so the
    # predictions for the other region are not thrown away.
    if tau_portion_STR.empty:
        faulty_file.append(slide_name + " no tau in STR")
    if tau_portion_STN_GP.empty:
        faulty_file.append(slide_name + " no tau in STN_GP")

    # STR prediction
    predicted2_slide_STR = _classify_tau_region(str_model, tau_portion_STR, "STR")

    # GP&STN prediction
    predicted2_slide_STN_GP = _classify_tau_region(stn_gp_model,
                                                   tau_portion_STN_GP,
                                                   "STN_GP")

    print("---------------STEP4: Data extraction & export -------------------")
    # 1) Combining predicted cells & excluded cells (prior to prediction)

    # get non-tau portion
    non_tau_portion = predicted1_slide[predicted1_slide["Class"] != "Tau"]

    # put everything together: nontau, tau in STR, GP & STN
    # (empty parts are left out; at least one tau part is non-empty here)
    slide_parts = [non_tau_portion, predicted2_slide_STR, predicted2_slide_STN_GP]
    total_pred = pd.concat([part for part in slide_parts if not part.empty])

    print("No loss of cells? ", predicted1_slide.shape[0] == total_pred.shape[0])

    output_visualise = total_pred[_EXPORT_COLUMNS]
    path_ = (prediction_path +
             output_visualise.iloc[0, 0] +
             "_predictions.txt")
    output_visualise.to_csv(path_, sep="\t", index=False)

    print("Exported prediction of : ", dat_file)
    print("---------------------------------------------------")

# Surface the collected notes so they are not lost in the per-slide log.
if faulty_file:
    print("Slides needing attention:")
    for note in faulty_file:
        print("  ", note)
print("Well done, no error!")

FILE 703488.svs Detections.txt Number:  1 / 27
---------------STEP1: Read in data file -------------------
{'GP', 'STN', 'STR'}
Read in data file: 703488.svs Detections.txt
Data shape is: (127406, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    80218
Tau        47188
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Tau on STR:  7782
Tau on STN & GP:  39406
STR predicted slide shape:  Others       5890
CB           1505
TA            213
Ambiguous     167
NFT             7
Name: Class, dtype: int64
STN_GP predicted slide shape:  Others       35646
CB            3457
Ambiguous      236
NFT             67
Name: Class, dtype: int64
---------------STEP4: Data extraction & export -------------------
No loss of cells?  True
Exported prediction of :  703488.svs Detections.txt
---------------------------------------------------
FILE 721708.svs Detections.txt Number:  2 / 27
---------------STEP1: Read in data fil