## Example script: applying the pre-trained tau classifiers to cortical regions

**Read in relevant files**

In [1]:
import sys
sys.path.insert(0,
                '/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/')

from base import *
from constants import *
from tau_classification import *
import joblib

### Load pre-trained models

**Screening classifier**

In [2]:
# Folder and file name of the saved screening classifier
# (classifier 1 of the pipeline: separates Non-tau from Tau detections).
path = "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/Pre-trained/Models/"
filename = "screening_classifier_updated3.sav"
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
screening_model = joblib.load(f"{path}{filename}")

In [3]:
# Some checks on the loaded screening classifier.
# Print the pipeline object stored on the model so its steps and
# hyper-parameters can be compared with what is expected.
print(screening_model.pipeline)
# Last expression of the cell, so the notebook renders it: the first rows of
# the model's feature-importance table.
screening_model.f_importance.head()

Pipeline(steps=[('normalizer', MinMaxScaler()),
                ('selector',
                 RFE(estimator=RandomForestClassifier(random_state=42),
                     n_features_to_select=46)),
                ('clf',
                 BalancedRandomForestClassifier(class_weight='balanced',
                                                max_features=1,
                                                min_samples_leaf=2,
                                                n_estimators=600,
                                                random_state=42))])


Unnamed: 0,features,importance
33,ROI: 0.25 µm per pixel: Hematoxylin: Mean,0.043437
34,ROI: 0.25 µm per pixel: Hematoxylin: Median,0.041947
37,ROI: 0.25 µm per pixel: Red: Mean,0.038536
36,ROI: 0.25 µm per pixel: Red: Max,0.036822
19,ROI: 0.25 µm per pixel: DAB: Haralick Sum aver...,0.034232


**Tau classifier for cortical regions**

In [4]:
# Folder and file name of the saved cortical tau classifier
# (classifier 2 of the pipeline: assigns a tau hallmark class to Tau detections).
path = "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/Pre-trained/Models/"
filename = "cortical_classifier_updated3.sav"
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
cortical_model = joblib.load(f"{path}{filename}")

In [5]:
# Some checks on the loaded cortical classifier.
# `best_parameters` prints as a dict keyed 0-3 holding a 4-tuple of floats each.
# NOTE(review): presumably one entry per tau class with a threshold and scores;
# confirm against tau_classification before relying on this reading.
print(cortical_model.best_parameters)
# Last expression of the cell, so the notebook renders it: the first rows of
# the model's feature-importance table.
cortical_model.f_importance.head()

{0: (0.20178264912940796, 0.9461222841732907, 0.9392187191317515, 0.9545454545454545), 1: (0.6822153285884559, 0.9098161065987153, 0.9409706959706959, 0.8903846153846153), 2: (0.7611313612515783, 0.9922817367544002, 0.9921489382138381, 0.9924504542672882), 3: (0.5047927492593813, 0.9566895472267282, 0.9541859991859992, 0.960923076923077)}


Unnamed: 0,features,importance
4,Min diameter µm,0.129815
0,Area µm^2,0.129211
2,Length µm,0.091414
3,Max diameter µm,0.067562
20,ROI: 0.25 µm per pixel: DAB: Median,0.049342


### Putting them together: tau classification pipeline for novel *cortical* slides

In [6]:
# Locations used by the prediction run on novel cortical slides.
# Each folder string ends with "/" because file names are appended to it directly.

# Folder holding the tab-separated detection tables, one per slide.
novel_path = "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Detections/cortical/"
# Text file inside `novel_path` that lists one detection file name per line.
novel_filename = "detections.txt"
# Folder that receives one "<image>_predictions.txt" file per processed slide.
prediction_path = "C:/Users/mokur/OneDrive/Desktop/Digital_path/Predictions/Cortical/"

**Reading in files**

In [7]:
# Read the listing file: one detection file name per line.
with open(novel_path + novel_filename) as f:
    # Keep names exactly as written (they may contain spaces), but drop blank or
    # whitespace-only lines (e.g. a trailing empty line) so they are not later
    # treated as file names and passed to pd.read_csv.
    mylist = [line for line in f.read().splitlines() if line.strip()]

print("Read in: ", len(mylist), "files")

Read in:  222 files


#### Tau classification

In [8]:
# Run the two-stage tau classification on every detection file in `mylist`:
#   STEP1 read the slide's detection table and keep the unlabelled cells,
#   STEP2 screening classifier: Non-tau vs Tau,
#   STEP3 cortical classifier: tau hallmark class for the Tau cells,
#   STEP4 export Image / Class / centroid / area for all cells of the slide.
# Slides that cannot be processed are skipped and recorded in `faulty_file`.
n_total = len(mylist)
faulty_file = []

# Bookkeeping columns that are not passed to the classifiers; they are
# dropped before each prediction call.
info_columns = ["Image",
                "Name",
                "Class",
                "Parent",
                "ROI",
                "Centroid_X",
                "Centroid_Y"]

for i, dat_file in enumerate(mylist):

    # Read in novel slide
    print("FILE", dat_file, "Number: ", i + 1, "/", n_total)
    print("---------------STEP1: Read in data file -------------------")

    dat_ = pd.read_csv(novel_path + dat_file, sep="\t")

    # Fixing order of the columns
    ordered = dat_[extracted_features]

    # Changing column names
    # since these names tend to be inconsistent causing problems.
    # A fresh label list is assigned instead of writing into
    # `columns.values`: an Index is meant to be immutable, and mutating its
    # backing array can leave pandas' label lookups out of date.
    new_columns = list(ordered.columns)
    new_columns[5] = "Centroid_X"
    new_columns[6] = "Centroid_Y"
    ordered.columns = new_columns

    dat = ordered[ordered["Class"] == "Unlabelled"]  # only unlabelled cells
    print("Read in data file:", dat_file)
    print("Data shape is:", dat.shape)

    # Classifier 1: separating Non-tau from Tau
    print(
        "---------------STEP2: Separating Non-tau from Tau -------------------"
    )
    # 1) Remove NA cells
    dat = dat.dropna()

    # Nothing to classify: record the slide and move on instead of handing
    # an empty table to the classifier.
    if dat.shape[0] == 0:
        print("There are no unlabelled cells on this slide!")
        faulty_file.append(dat_file + " No unlabelled cells on the slide")
        continue

    predicted1_slide = dat.copy()

    # 2) Dropping extra info features
    X_unlabelled = dat.drop(columns=info_columns)

    # 3) Predictions (the model stores its result on `.prediction`)
    screening_model.predict(X_unlabelled)
    predicted1_slide["Class"] = screening_model.prediction
    print(predicted1_slide["Class"].value_counts())

    # Classifier 2: tau hallmark classification
    print(
        "---------------STEP3: Tau hallmark classification -------------------"
    )
    # Select out 'tau' portion only (ignoring non-tau & ambiguous cells )
    tau_portion = predicted1_slide[predicted1_slide["Class"] == "Tau"]
    if tau_portion.shape[0] == 0:
        print("There is no tau on this slide!")
        # `.iloc[0]` takes the first remaining row by position. A plain `[0]`
        # looks up index *label* 0, which no longer exists once that row has
        # been filtered out above, and would raise KeyError here.
        faulty_file.append(dat["Image"].iloc[0] + " No tau on the slide")
        continue
    tau_portion_X = tau_portion.drop(columns=info_columns)
    predicted2_slide = tau_portion.copy()
    # Separates out detections that are not tau
    non_tau_portion = predicted1_slide[predicted1_slide["Class"] != "Tau"]

    # Make predictions on Tau objects
    cortical_model.predict(tau_portion_X)
    predicted2_slide["Class"] = cortical_model.prediction
    print(predicted2_slide["Class"].value_counts())

    # Extracting data out
    print("---------------STEP4: Data extraction & export -------------------")

    # 1) Combining predicted cells & excluded cells (prior to prediction)
    total_pred = pd.concat([non_tau_portion,
                            predicted2_slide])
    print("No loss of cells? ",
          predicted1_slide.shape[0] == total_pred.shape[0])

    output_visualise = total_pred[["Image",
                                   "Class",
                                   "Centroid_X",
                                   "Centroid_Y",
                                   "Area µm^2"]]
    # Output file is named after the image name found in the first row.
    path_ = (
            prediction_path +
            output_visualise.iloc[0, 0] +
            "_predictions.txt"
            )
    output_visualise.to_csv(path_, sep="\t", index=False)

    print("Exported prediction of : ", dat_file)
    print("---------------------------------------------------")
print("Well done, no error!")
# Report the slides that were skipped so they are not silently missing
# from the prediction folder.
if faulty_file:
    print("Skipped slides:", len(faulty_file))
    print(*faulty_file, sep="\n")

FILE 703471.svs Detections.txt Number:  1 / 222
---------------STEP1: Read in data file -------------------
Read in data file: 703471.svs Detections.txt
Data shape is: (41670, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    35077
Tau         6593
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       4576
CB           1707
Ambiguous     137
NFT            96
TA             77
Name: Class, dtype: int64
---------------STEP4: Data extraction & export -------------------
No loss of cells?  True
Exported prediction of :  703471.svs Detections.txt
---------------------------------------------------
FILE 703472.svs Detections.txt Number:  2 / 222
---------------STEP1: Read in data file -------------------
Read in data file: 703472.svs Detections.txt
Data shape is: (80333, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    79235
Tau         1098
Name: Class, dtype: int64