## Example script: training tau classifier for dentate nucleus regions

**Import the tau classification modules**

In [6]:
import sys
sys.path.insert(0,
                '/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/')

from base import *
from constants import *
from tau_classification import * 

### Data preparation

**Screening classifier**

In [2]:
# Folder and file name of the screening-classifier training set
path = "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/Training_data/screening_classifier/"
filename = "training.txt"

# Build the tau database object, then prepare the classifier-1 (screening) inputs
s_data = TauDataBase(path=path, filename=filename)
s_data.classifier1_prep()

# Sanity check: per-class counts and the shape of the training matrix
print(s_data.c1_data['Class'].value_counts())
print(s_data.c1_X_train.shape)

Tau        10185
Non_tau     9963
Name: Class, dtype: int64
(20148, 54)


**Tau classifier for dentate nucleus**

In [7]:
# Folder and file name of the dentate nucleus (DN) training set
path = "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Tau_pipeline/Tau_classification/Untrained/Training_data/DN/"
filename = "training.txt"

# Build the tau database object, then prepare the classifier-2 (tau hallmark) inputs
dn_data = TauDataBase(path=path, filename=filename)
dn_data.classifier2_prep()

# Sanity check: per-class counts and the shape of the training matrix
print(dn_data.c2_data['Class'].value_counts())
print(dn_data.c2_X_train.shape)

Others    1805
NFT        234
CB         147
Name: Class, dtype: int64
(2186, 54)


### Initialising & training the classifiers

**Screening classifier**

In [4]:
# Build the screening classifier (Non_tau vs Tau).
# `screening_classifier_hyperparams` comes from the star imports at the top
# (presumably from `constants` - confirm).
screening_model = ScreeningClassifier(hyperparameters=screening_classifier_hyperparams)
# Last expression of the cell: displays the pipeline held by the model
screening_model.pipeline

Pipeline(steps=[('normalizer', MinMaxScaler()),
                ('selector',
                 RFE(estimator=RandomForestClassifier(random_state=42),
                     n_features_to_select=44)),
                ('clf',
                 BalancedRandomForestClassifier(class_weight='balanced',
                                                max_features=1,
                                                max_samples=0.75,
                                                min_samples_leaf=2,
                                                n_estimators=300,
                                                random_state=42,
                                                sampling_strategy='not '
                                                                  'majority'))])

In [5]:
# Fit the screening model on the prepared classifier-1 training set
screening_model.train(X=s_data.c1_X_train, Y=s_data.c1_Y_train)

In [6]:
# Show the first rows of the screening model's feature-importance table
# (columns: features, importance).
screening_model.f_importance.head()

Unnamed: 0,features,importance
20,ROI: 0.25 µm per pixel: DAB: Max,0.053625
16,ROI: 0.25 µm per pixel: DAB: Haralick Sum aver...,0.04833
26,ROI: 0.25 µm per pixel: Green: Mean,0.041939
21,ROI: 0.25 µm per pixel: DAB: Mean,0.041595
35,ROI: 0.25 µm per pixel: Red: Mean,0.040228


**Tau classifier for dentate nucleus**

In [10]:
# Build the tau hallmark classifier used for dentate nucleus slides.
# `dn_classifier_hyperparams` comes from the star imports at the top
# (presumably from `constants` - confirm).
dn_model = TauClassifierNoTA(hyperparameters=dn_classifier_hyperparams)
# Last expression of the cell: displays the pipeline held by the model
dn_model.pipeline

Pipeline(steps=[('normalizer', MinMaxScaler()),
                ('selector',
                 RFE(estimator=RandomForestClassifier(random_state=42),
                     n_features_to_select=34)),
                ('clf',
                 BalancedRandomForestClassifier(class_weight='balanced',
                                                max_features=0.2,
                                                random_state=42,
                                                sampling_strategy='not '
                                                                  'majority'))])

In [11]:
# Fit the dentate nucleus model on the prepared classifier-2 training set
dn_model.train(X=dn_data.c2_X_train, Y=dn_data.c2_Y_train)

In [12]:
# Display the parameters stored on the model after training: a dict keyed 0, 1, 2
# whose values are 4-tuples of floats.
# NOTE(review): presumably one entry per class holding a threshold followed by
# scores - confirm against `tau_classification`.
dn_model.best_parameters

{0: (0.409, 0.9144010858386309, 0.9031204850361197, 0.9314285714285713),
 1: (0.422, 0.9811932262266609, 0.9714598662207358, 0.9916666666666668),
 2: (0.6599999999999999,
  0.9944717736676886,
  0.9934360958126922,
  0.9955770411295273)}

In [13]:
# Show the first rows of the dentate nucleus model's feature-importance table
# (columns: features, importance).
dn_model.f_importance.head()

Unnamed: 0,features,importance
0,Area µm^2,0.216869
4,Min diameter µm,0.207962
3,Max diameter µm,0.058139
2,Length µm,0.056409
19,ROI: 0.25 µm per pixel: DAB: Haralick Sum entr...,0.040231


In [14]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import joblib
# Persist the trained dentate nucleus model; the relative file name resolves against
# the current working directory. joblib files are pickle-based, so only reload this
# file from a trusted source.
# NOTE(review): `screening_model` is not saved in the visible cells - confirm that is intended.
joblib.dump(dn_model, 'dn_classifier.sav')

['dn_classifier.sav']

### Putting them together: tau classification pipeline for novel *dentate nucleus* slides

In [15]:
# Shared root of this analysis run. Every directory keeps its trailing "/" because
# later cells build file paths by plain string concatenation.
_run_root = '/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/'

# Folder holding the detection files and the list file that names them
novel_path = _run_root + 'Detections/DN/'
novel_filename = 'detections.txt'

# Folder the per-slide prediction files are written to
prediction_path = _run_root + 'Predictions_new/DN/'

**Reading in the list of detection files**

In [16]:
# The list file names one detection file per line.
# Blank / whitespace-only lines (e.g. a trailing empty line) are skipped so they do
# not turn into empty file names that would break the read step of the
# classification loop. Non-blank lines are kept exactly as written.
with open(novel_path + novel_filename) as f:
    mylist = [line for line in f.read().splitlines() if line.strip()]

print("Read in: ", len(mylist), "files")

#### Tau classification

In [18]:
# Tau classification of every detection file listed in `mylist`.
#
# For each file:
#   STEP1  read the tab-separated detections and keep the "Unlabelled" cells
#   STEP2  screening classifier: Non-tau vs Tau
#   STEP3  tau hallmark classifier on the cells predicted as "Tau"
#   STEP4  merge both sets of predictions and export them to `prediction_path`
#
# Files that cannot be classified are skipped and recorded in `faulty_file`,
# which is reported once the loop has finished.

# Metadata columns that are not model features; removed before every prediction.
info_columns = ["Image",
                "Name",
                "Class",
                "Parent",
                "ROI",
                "Centroid_X",
                "Centroid_Y"]

n_total = len(mylist)
faulty_file = []
for i, dat_file in enumerate(mylist):

    # Read in novel slide
    print("FILE", dat_file, "Number: ", i + 1, "/", n_total)
    print("---------------STEP1: Read in data file -------------------")

    dat_ = pd.read_csv(novel_path + dat_file, sep="\t")

    # Fixing order of the columns (copy so the raw frame is never touched)
    ordered = dat_[extracted_features].copy()

    # Changing column names
    # since these names tend to be inconsistent causing problems.
    # A fresh column list is assigned instead of writing into
    # `columns.values`, which mutates the Index buffer behind pandas' back.
    new_columns = list(ordered.columns)
    new_columns[5] = "Centroid_X"
    new_columns[6] = "Centroid_Y"
    ordered.columns = new_columns

    dat = ordered[ordered["Class"] == "Unlabelled"]  # only unlabelled cells
    print("Read in data file:", dat_file)
    print("Data shape is:", dat.shape)

    # Classifier 1: separating Non-tau from Tau
    print(
        "---------------STEP2: Separating Non-tau from Tau -------------------"
    )
    # 1) Remove NA cells
    dat = dat.dropna()

    # Nothing left to classify: record the file and move on rather than
    # handing an empty frame to the classifier.
    if dat.shape[0] == 0:
        print("There are no usable unlabelled cells on this slide!")
        faulty_file.append(dat_file + " No unlabelled cells on the slide")
        continue

    predicted1_slide = dat.copy()

    # 2) Dropping extra info features
    X_unlabelled = dat.drop(columns=info_columns)

    # 3) Predictions
    screening_model.predict(X_unlabelled)
    predicted1_slide["Class"] = screening_model.prediction
    print(predicted1_slide["Class"].value_counts())

    # Classifier 2: tau hallmark classification
    print(
        "---------------STEP3: Tau hallmark classification -------------------"
    )
    # Select out 'tau' portion only (ignoring non-tau & ambiguous cells )
    tau_portion = predicted1_slide[predicted1_slide["Class"] == "Tau"]
    if tau_portion.shape[0] == 0:
        print("There is no tau on this slide!")
        # Positional access: after filtering/dropna the row labelled 0 may no
        # longer exist, so `dat["Image"][0]` could raise KeyError.
        faulty_file.append(str(dat["Image"].iloc[0]) + " No tau on the slide")
        continue
    tau_portion_X = tau_portion.drop(columns=info_columns)
    predicted2_slide = tau_portion.copy()
    # Separates out detections that are not tau
    non_tau_portion = predicted1_slide[predicted1_slide["Class"] != "Tau"]

    # Make predictions on Tau objects
    dn_model.predict(tau_portion_X)
    predicted2_slide["Class"] = dn_model.prediction
    print(predicted2_slide["Class"].value_counts())

    # Extracting data out
    print("---------------STEP4: Data extraction & export -------------------")

    # 1) Combining predicted cells & excluded cells (prior to prediction)
    total_pred = pd.concat([non_tau_portion,
                            predicted2_slide])
    print("No loss of cells? ",
          predicted1_slide.shape[0] == total_pred.shape[0])

    output_visualise = total_pred[["Image",
                                   "Class",
                                   "Centroid_X",
                                   "Centroid_Y",
                                   "Area µm^2"]]
    # Output file is named after the image recorded in the first row
    path_ = (
            prediction_path +
            output_visualise.iloc[0, 0] +
            "_predictions.txt"
            )
    output_visualise.to_csv(path_, sep="\t", index=False)

    print("Exported prediction of : ", dat_file)
    print("---------------------------------------------------")

# Report skipped files instead of always claiming a clean run
if faulty_file:
    print("Finished, but", len(faulty_file), "file(s) were skipped:")
    for skipped in faulty_file:
        print("  ", skipped)
else:
    print("Well done, no error!")

FILE 721795.svs Detections.txt Number:  1 / 1
---------------STEP1: Read in data file -------------------
Read in data file: 721795.svs Detections.txt
Data shape is: (6006, 61)
---------------STEP2: Separating Non-tau from Tau -------------------
Tau        3911
Non_tau    2095
Name: Class, dtype: int64
---------------STEP3: Tau hallmark classification -------------------
Others       3554
CB            223
NFT            85
Ambiguous      49
Name: Class, dtype: int64
---------------STEP4: Data extraction & export -------------------
No loss of cells?  True
Exported prediction of :  721795.svs Detections.txt
---------------------------------------------------
Well done, no error!
