## Example script: training tau classifier for cortical regions

**Import the project modules and dependencies**

In [8]:
import os
import sys

import joblib

# The project modules live outside the notebook folder, so their directory
# must be on sys.path before the local imports below.
sys.path.insert(0,
                '/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/')

from base import *
from constants import *
from tau_classification import * 

### Data preparation

**Screening classifier**

In [10]:
# Location of the labelled training table for the screening classifier
# (classes Tau / Non_tau, see the class counts printed below).
path = (
    "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/"
    "Tau_pipeline/Tau_classification/Untrained/Training_data/screening_classifier/"
)
filename = "training.txt"

# Wrap the training file in a tau database object and prepare it for the
# screening classifier; classifier1_prep() is expected to populate the
# c1_* attributes read below and by the training cell.
s_data = TauDataBase(path=path, filename=filename)
s_data.classifier1_prep()

# Sanity check: class balance first, then the shape of the training matrix
print(s_data.c1_data['Class'].value_counts(),
      s_data.c1_X_train.shape,
      sep="\n")

Non_tau    12006
Tau         9827
Name: Class, dtype: int64
(21833, 54)


**Tau classifier for cortical regions**

In [2]:
# Location of the labelled training table for the cortical tau classifier
# (classes Others / CB / TA / NFT, see the class counts printed below).
path = (
    "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/"
    "Tau_pipeline/Tau_classification/Untrained/Training_data/cortical_regions/"
)
filename = "training.txt"

# Wrap the training file in a tau database object and prepare it for the
# cortical tau classifier; classifier2_prep() is expected to populate the
# c2_* attributes read below and by the training cell.
cortical_data = TauDataBase(path=path, filename=filename)
cortical_data.classifier2_prep()

# Sanity check: class balance first, then the shape of the training matrix
print(cortical_data.c2_data['Class'].value_counts(),
      cortical_data.c2_X_train.shape,
      sep="\n")

Others    2913
CB         661
TA         254
NFT        126
Name: Class, dtype: int64
(3954, 54)


### Initialising & training the classifiers

**Screening classifier**

In [11]:
# Build the screening classifier from the shared hyperparameter set
# (screening_classifier_hyperparams comes from the star imports above).
screening_model = ScreeningClassifier(
    hyperparameters=screening_classifier_hyperparams,
)

# Display the assembled pipeline as the cell output
screening_model.pipeline

Pipeline(steps=[('normalizer', MinMaxScaler()),
                ('selector',
                 RFE(estimator=RandomForestClassifier(random_state=42),
                     n_features_to_select=46)),
                ('clf',
                 BalancedRandomForestClassifier(class_weight='balanced',
                                                max_features=1,
                                                min_samples_leaf=2,
                                                n_estimators=600,
                                                random_state=42))])

In [12]:
# Fit the screening classifier on the training features / labels prepared
# by s_data.classifier1_prep().
screening_model.train(
    X=s_data.c1_X_train,
    Y=s_data.c1_Y_train,
)

In [13]:
# Show the first rows of the screening model's feature-importance table
# (columns "features" and "importance" in the saved output).
screening_model.f_importance.head()

Unnamed: 0,features,importance
33,ROI: 0.25 µm per pixel: Hematoxylin: Mean,0.043437
34,ROI: 0.25 µm per pixel: Hematoxylin: Median,0.041947
37,ROI: 0.25 µm per pixel: Red: Mean,0.038536
36,ROI: 0.25 µm per pixel: Red: Max,0.036822
19,ROI: 0.25 µm per pixel: DAB: Haralick Sum aver...,0.034232


In [14]:
# Save the trained screening model with joblib. The relative file name is
# resolved against the notebook's current working directory.
# NOTE(review): joblib files are pickle-based; only load them from trusted sources.
joblib.dump(screening_model, 'screening_classifier_updated3.sav')

['screening_classifier_updated.sav']

**Tau classifier for cortical regions**

In [3]:
# Build the cortical tau classifier from the shared hyperparameter set
# (cortical_classifier_hyperparams comes from the star imports above).
cortical_model = TauClassifier(
    hyperparameters=cortical_classifier_hyperparams,
)

# Display the assembled pipeline as the cell output
cortical_model.pipeline

Pipeline(steps=[('normalizer', MinMaxScaler()),
                ('selector',
                 RFE(estimator=RandomForestClassifier(random_state=42),
                     n_features_to_select=40)),
                ('clf',
                 BalancedRandomForestClassifier(class_weight='balanced',
                                                max_depth=10, max_features=0.2,
                                                max_samples=0.75,
                                                n_estimators=800,
                                                random_state=42,
                                                sampling_strategy='not '
                                                                  'majority'))])

In [4]:
# Fit the cortical tau classifier on the training features / labels prepared
# by cortical_data.classifier2_prep().
cortical_model.train(
    X=cortical_data.c2_X_train,
    Y=cortical_data.c2_Y_train,
)

In [5]:
# Inspect the values stored on the model after training: in the saved output
# this is a dict with keys 0-3, each mapping to a 4-tuple of floats.
# NOTE(review): the meaning of the tuple fields is defined by TauClassifier,
# presumably a per-class threshold followed by metric scores; confirm.
cortical_model.best_parameters

{0: (0.20178264912940796,
  0.9461222841732907,
  0.9392187191317515,
  0.9545454545454545),
 1: (0.6822153285884559,
  0.9098161065987153,
  0.9409706959706959,
  0.8903846153846153),
 2: (0.7611313612515783,
  0.9922817367544002,
  0.9921489382138381,
  0.9924504542672882),
 3: (0.5047927492593813,
  0.9566895472267282,
  0.9541859991859992,
  0.960923076923077)}

In [6]:
# Show the first rows of the cortical model's feature-importance table
# (columns "features" and "importance" in the saved output).
cortical_model.f_importance.head()

Unnamed: 0,features,importance
4,Min diameter µm,0.129815
0,Area µm^2,0.129211
2,Length µm,0.091414
3,Max diameter µm,0.067562
20,ROI: 0.25 µm per pixel: DAB: Median,0.049342


In [7]:
# Save the trained cortical model with joblib. The relative file name is
# resolved against the notebook's current working directory.
# NOTE(review): joblib files are pickle-based; only load them from trusted sources.
joblib.dump(cortical_model, 'cortical_classifier_updated3.sav')

['cortical_classifier_updated3.sav']

### Putting them together: tau classification pipeline for novel *cortical* slides

In [11]:
# Folder holding the detection exports of the novel cortical slides, and the
# index file inside it that lists one detection file name per line.
novel_path = '/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Detections/cortical/'
novel_filename = 'detections.txt'

# Output folder for the per-slide prediction files.
# - Forward slashes are used because "\U" inside a normal string literal starts
#   a unicode escape, so "C:\Users\..." is a SyntaxError in Python 3; Windows
#   accepts "/" as a path separator.
# - The trailing "/" matches novel_path, so joining a file name onto this
#   folder (by concatenation or os.path.join) yields "<folder>/<file>" rather
#   than a file name fused with the folder name.
prediction_path = "C:/Users/mokur/OneDrive/Desktop/Digital_path/Predictions_cortical_update3/"

**Reading in the list of detection files**

In [12]:
# Read the index file: one detection-file name per line.
with open(novel_path + novel_filename) as f:
    # Skip blank lines (e.g. a trailing empty line) so they are not later
    # treated as file names and passed to pd.read_csv.
    mylist = [line for line in f.read().splitlines() if line.strip()]

print("Read in: ", len(mylist), "files")

Read in:  62 files


#### Tau classification

In [13]:
# Two-stage prediction over every detection file listed in `mylist`:
#   STEP1 read the slide's detections and keep the unlabelled, complete rows
#   STEP2 screening classifier: Tau vs Non_tau
#   STEP3 cortical tau classifier on the Tau rows only
#   STEP4 recombine and export one "<image>_predictions.txt" per slide
# Slides that cannot be processed are recorded in `faulty_file` and skipped.

# Bookkeeping columns that are not classifier features
meta_columns = ["Image",
                "Name",
                "Class",
                "Parent",
                "ROI",
                "Centroid_X",
                "Centroid_Y"]

# Make sure the output folder exists before the first export
os.makedirs(prediction_path, exist_ok=True)

n_total = len(mylist)
faulty_file = []  # one "<slide> <reason>" entry per skipped slide
for i, dat_file in enumerate(mylist):

    # Read in novel slide
    print("FILE", dat_file, "Number: ", i + 1, "/", n_total)
    print("---------------STEP1: Read in data file -------------------")

    dat_ = pd.read_csv(novel_path + dat_file, sep="\t")

    # Fixing order of the columns
    ordered = dat_[extracted_features].copy()

    # Changing column names
    # since these names tend to be inconsistent causing problems.
    # The names are replaced by position through a fresh column list, because
    # a pandas Index is immutable and writing into `columns.values` bypasses it.
    column_names = list(ordered.columns)
    column_names[5] = "Centroid_X"
    column_names[6] = "Centroid_Y"
    ordered.columns = column_names

    dat = ordered[ordered["Class"] == "Unlabelled"]  # only unlabelled cells
    print("Read in data file:", dat_file)
    print("Data shape is:", dat.shape)

    # Classifier 1: separating Non-tau from Tau
    print(
        "---------------STEP2: Separating Non-tau from Tau -------------------"
    )
    # 1) Remove NA cells
    dat = dat.dropna()
    if dat.empty:
        # Nothing left to classify; skip the slide instead of handing an
        # empty matrix to the classifier.
        print("There are no usable unlabelled detections on this slide!")
        faulty_file.append(dat_file + " No unlabelled detections on the slide")
        continue

    predicted1_slide = dat.copy()

    # 2) Dropping extra info features
    X_unlabelled = dat.drop(columns=meta_columns)

    # 3) Predictions
    screening_model.predict(X_unlabelled)
    predicted1_slide["Class"] = screening_model.prediction
    print(predicted1_slide["Class"].value_counts())

    # Classifier 2: tau hallmark classification
    print(
        "---------------STEP3: Tau hallmark classification -------------------"
    )
    # Select out 'tau' portion only (ignoring non-tau & ambiguous cells )
    tau_portion = predicted1_slide[predicted1_slide["Class"] == "Tau"]
    if tau_portion.shape[0] == 0:
        print("There is no tau on this slide!")
        # Positional access: after filtering, the row labelled 0 may be gone,
        # so a label lookup `dat["Image"][0]` can raise KeyError.
        faulty_file.append(dat["Image"].iloc[0] + " No tau on the slide")
        continue
    tau_portion_X = tau_portion.drop(columns=meta_columns)
    predicted2_slide = tau_portion.copy()
    # Separates out detections that are not tau
    non_tau_portion = predicted1_slide[predicted1_slide["Class"] != "Tau"]

    # Make predictions on Tau objects
    cortical_model.predict(tau_portion_X)
    predicted2_slide["Class"] = cortical_model.prediction
    print(predicted2_slide["Class"].value_counts())

    # Extracting data out
    print("---------------STEP4: Data extraction & export -------------------")

    # 1) Combining predicted cells & excluded cells (prior to prediction)
    total_pred = pd.concat([non_tau_portion,
                            predicted2_slide])
    print("No loss of cells? ",
          predicted1_slide.shape[0] == total_pred.shape[0])

    output_visualise = total_pred[["Image",
                                   "Class",
                                   "Centroid_X",
                                   "Centroid_Y",
                                   "Area µm^2"]]
    # 2) One output file per slide, named after the slide's image, written
    #    inside the prediction folder (joined as a path, not concatenated).
    path_ = os.path.join(
        prediction_path,
        str(output_visualise.iloc[0, 0]) + "_predictions.txt",
    )
    output_visualise.to_csv(path_, sep="\t", index=False)

    print("Exported prediction of : ", dat_file)
    print("---------------------------------------------------")

# Only report a clean run when no slide was skipped
if faulty_file:
    print("Finished, but", len(faulty_file), "file(s) were skipped:")
    for skipped in faulty_file:
        print("  -", skipped)
else:
    print("Well done, no error!")

FILE 703471.svs Detections.txt Number:  1 / 62
---------------STEP1: Read in data file -------------------
Read in data file: 703471.svs Detections.txt
Data shape is: (41670, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    34189
Tau         7481
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       5192
CB           1783
Ambiguous     194
NFT           186
TA            126
Name: Class, dtype: int64
---------------STEP4: Data extraction & export -------------------
No loss of cells?  True
Exported prediction of :  703471.svs Detections.txt
---------------------------------------------------
FILE 703483.svs Detections.txt Number:  2 / 62
---------------STEP1: Read in data file -------------------
Read in data file: 703483.svs Detections.txt
Data shape is: (32686, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Tau        17692
Non_tau    14994
Name: Class, dtype: int64
-

Read in data file: 747327.svs Detections.txt
Data shape is: (37601, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    25868
Tau        11733
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       11090
TA             243
CB             207
NFT             97
Ambiguous       96
Name: Class, dtype: int64
---------------STEP4: Data extraction & export -------------------
No loss of cells?  True
Exported prediction of :  747327.svs Detections.txt
---------------------------------------------------
FILE 747339.svs Detections.txt Number:  14 / 62
---------------STEP1: Read in data file -------------------
Read in data file: 747339.svs Detections.txt
Data shape is: (24166, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    23384
Tau          782
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       704
CB            5

Read in data file: 755565.svs Detections.txt
Data shape is: (33753, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    25346
Tau         8407
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       7076
CB            540
TA            477
NFT           163
Ambiguous     151
Name: Class, dtype: int64
---------------STEP4: Data extraction & export -------------------
No loss of cells?  True
Exported prediction of :  755565.svs Detections.txt
---------------------------------------------------
FILE 755566.svs Detections.txt Number:  26 / 62
---------------STEP1: Read in data file -------------------
Read in data file: 755566.svs Detections.txt
Data shape is: (26581, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    14201
Tau        12380
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       10607
TA             747

Read in data file: 760090.svs Detections.txt
Data shape is: (9135, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    8680
Tau         455
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       360
CB            74
NFT            9
Ambiguous      9
TA             3
Name: Class, dtype: int64
---------------STEP4: Data extraction & export -------------------
No loss of cells?  True
Exported prediction of :  760090.svs Detections.txt
---------------------------------------------------
FILE 760091.svs Detections.txt Number:  38 / 62
---------------STEP1: Read in data file -------------------
Read in data file: 760091.svs Detections.txt
Data shape is: (27689, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    25654
Tau         2035
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       1444
CB            456
Ambiguous

Read in data file: 771793.svs Detections.txt
Data shape is: (9368, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    7736
Tau        1632
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       1226
CB            236
NFT            80
TA             46
Ambiguous      44
Name: Class, dtype: int64
---------------STEP4: Data extraction & export -------------------
No loss of cells?  True
Exported prediction of :  771793.svs Detections.txt
---------------------------------------------------
FILE 771805.svs Detections.txt Number:  50 / 62
---------------STEP1: Read in data file -------------------
Read in data file: 771805.svs Detections.txt
Data shape is: (16215, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    13306
Tau         2909
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       1930
CB            418
NFT 

Read in data file: 771864.svs Detections.txt
Data shape is: (15061, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    9136
Tau        5925
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       4890
CB            660
TA            162
Ambiguous     145
NFT            68
Name: Class, dtype: int64
---------------STEP4: Data extraction & export -------------------
No loss of cells?  True
Exported prediction of :  771864.svs Detections.txt
---------------------------------------------------
FILE 772701.svs Detections.txt Number:  62 / 62
---------------STEP1: Read in data file -------------------
Read in data file: 772701.svs Detections.txt
Data shape is: (16038, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Non_tau    14268
Tau         1770
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       1579
CB            100
NFT