### Occipital classifier on novel slides

In [9]:
# load important libraries
import sys
sys.path.insert(0,
                '/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Cell_pipeline/Cell_classification/')
from base import *
from constants import *
import joblib 

Load the occipital (cortical) cell classifier

In [10]:
# Folder holding the serialized models and the file name of the occipital classifier.
path = "/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Cell_pipeline/Cell_classification/Models/"
filename = "occipital_cell_classifier.sav"

# joblib.load unpickles the file, so only point this at model files from a trusted source.
model = joblib.load(f"{path}{filename}")

In [11]:
# Display the `pipeline` attribute of the loaded model object.
model.pipeline

Pipeline(steps=[('normalizer', MinMaxScaler()),
                ('selector',
                 RFE(estimator=RandomForestClassifier(random_state=42),
                     n_features_to_select=28)),
                ('clf',
                 BalancedRandomForestClassifier(class_weight='balanced',
                                                max_depth=15, max_features=0.4,
                                                max_samples=0.25,
                                                min_samples_leaf=4,
                                                min_samples_split=10,
                                                n_estimators=200,
                                                random_state=42))])

In [12]:
# Display the `best_parameters` attribute of the loaded model object.
# NOTE(review): the meaning of the integer keys and of the four floats stored per key
# is not visible in this notebook — confirm against the model class definition.
model.best_parameters

{0: (0.46755140108518506,
  0.64622061572346,
  0.5524263118410125,
  0.809090909090909),
 1: (0.35800301310120014,
  0.8404223852097499,
  0.8257465256800274,
  0.8658687943262411),
 2: (0.38978088189514104,
  0.8427582788906303,
  0.8476453077212108,
  0.848989898989899),
 3: (0.3327053512581415,
  0.833597139312819,
  0.8379277250873924,
  0.8324248777078965)}

In [13]:
# Preview the first rows of the model's `f_importance` table (feature name and importance).
model.f_importance.head()

Unnamed: 0,features,importance
1,Nucleus: Area µm^2,0.132467
4,Nucleus: Max diameter µm,0.124573
0,Detection probability,0.117742
2,Nucleus: Length µm,0.117511
5,Nucleus: Min diameter µm,0.079836


Import files to make predictions on

In [14]:
# Data files: the metadata list holds one detection file name per line.
with open("C:/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Cell_pipeline/Metadata/occipital_stardist.txt") as f:
    # Strip surrounding whitespace and skip blank lines so that e.g. a trailing empty
    # line in the list does not become an empty file name that only fails later,
    # inside the processing loop.
    mylist = [entry.strip() for entry in f.read().splitlines() if entry.strip()]

print("Read in:",len(mylist)," files")

# NB (neighbour) files: the first 6 characters of each data file name are reused
# as the prefix of the matching '<prefix>_all_neighbours.csv' file.
nb_mylist = [name[:6]+'_all_neighbours.csv' for name in mylist]

# No separate hema file list is built: the processing loop takes the
# 'Hematoxylin: Nucleus: Mean' column directly from each detection file.

print("Generated filename list for nb:", len(nb_mylist), "nb files")
print("Equal number of files?:", (len(mylist)==len(nb_mylist)))

Read in: 40  files
Generated filename list for nb: 40 nb files
Equal number of files?: True


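Slide 747384 could not be located. The loop index at that point is shown below, and the file lists are then trimmed so that processing resumes from the following entry.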

In [18]:
# Show the current value of the loop index `i`.
# NOTE(review): `i` is not assigned by any earlier cell in this notebook; it appears to be
# left over from a previous, interrupted run of the processing loop further down.
# On a fresh kernel (Restart & Run All) this cell would raise NameError — confirm intent.
i

18

In [19]:
# Drop the first 19 entries of both lists (kept in step with each other) so that the
# processing loop starts from index 19 of the original file list.
# NOTE(review): the names are rebound, so running this cell again trims another 19
# entries; re-run the file-list cell first if this cell needs to be repeated.
mylist, nb_mylist = mylist[19:], nb_mylist[19:]

Process new files to make predictions

In [21]:
# Per-slide prediction loop.
# For every detection file in `mylist` (and the matching neighbour file in `nb_mylist`):
#   1. read the detection table and the neighbour table and merge them on the centroid,
#   2. split off cells flagged by the hematoxylin check (labelled 'Excluded'),
#   3. keep grey-matter cells without missing values and run the classifier on them,
#   4. stitch all parts back together, attach the DAB columns and export one text file.
# Slides without any 'Grey_matter' cell are skipped and collected in `faulty_file`.

# Input / output locations are the same for every slide, so they are defined once.
DETECTION_DIR = 'C:/Users/mokur/OneDrive - University of Cambridge/Attachments/Jan2023/Cell_pipeline/detection_stardist/'
NEIGHBOUR_DIR = 'E:/number_of_neighbours/'
PREDICTION_DIR = 'C:/Users/mokur/OneDrive/Desktop/Digital_path/Cell_pipeline/Predictions/Occipital/'

n_total = len(mylist)
faulty_file = []  # slides skipped because they contain no 'Grey_matter' cells
for i in range(0,n_total):

    # Read in unlabelled/unannotated file
    dat_file = mylist[i]
    print("FILE", dat_file, "Number: ", i+1,"/",n_total)
    print("---------------STEP1: DATA FILE-------------------")

    dat_orig = pd.read_csv(DETECTION_DIR+dat_file,sep="\t")
    # Rename the 6th and 7th columns by assigning a fresh column list. Writing into
    # `columns.values` mutates the Index's backing array in place, which pandas does
    # not support and which can leave label lookups stale.
    renamed_columns = dat_orig.columns.to_list()
    renamed_columns[5] = "Centroid_X"
    renamed_columns[6] = "Centroid_Y"
    dat_orig.columns = renamed_columns

    dat_ = dat_orig[cell_extracted_features]
    dab_df = dat_orig[dab_features]
    dat = dat_.copy()
    # Whole-column assignment: every cell starts as 'Unlabelled', whatever dtype an
    # existing 'Class' column may have had.
    dat['Class'] = 'Unlabelled'
    print("Read in data file:", dat_file)
    print("Data shape is:", dat.shape)
    print("----------------------------------------------------------------------------")

    # Importing NB cells
    print("---------------STEP2: NB FILE-------------------")
    nb_dat_file = nb_mylist[i]
    nb_dat_ = pd.read_csv(NEIGHBOUR_DIR+nb_dat_file,sep=",")
    nb_dat_ = nb_dat_.rename(columns={'X':'Centroid_X','Y':'Centroid_Y'})
    nb_dat = nb_dat_[nnb_extracted_features]

    print("Read in nb file:", nb_dat_file)
    print("nb_dat shape is:", nb_dat.shape)
    print("-----------------------------------------")

    ## Merge NB info to main data ***** From now on: Use 'combined'
    # validate='1:1' makes pandas raise if a centroid occurs more than once on either side.
    combined = dat.merge(nb_dat.drop_duplicates(),on=['Centroid_X','Centroid_Y'],how='inner',validate='1:1')
    print("Successfully combine nb cell counts to main data")
    print('Expected & Observed row matched?: ', dat.shape[0]==combined.shape[0])
    print('Expected & Observed columns matched?: ', (dat.shape[1]+nb_dat.shape[1]-2)==combined.shape[1])
    print("----------------------------------------------------------------------------")


    # Hema values come from the detection file itself (no separate hema file)
    print("---------------STEP3: HEMA FILE-------------------")
    hema_dat = dat[['Centroid_X','Centroid_Y','Hematoxylin: Nucleus: Mean']]
    print("hema_dat shape is:", hema_dat.shape)
    print("----------------------------------------------------------------------------")

    ############################  Normalising hema files ############################
    print("---------------STEP4: HEMA NORM-------------------")

    # 1) Find cells need removing
    # NOTE(review): helper is not defined in this notebook (presumably from `base`);
    # its exact criterion should be confirmed there.
    hema_to_remove = find_hema_to_remove_slide(hema_dat)
    print("No. of cells with normalised Hema >1:",
          len(hema_to_remove),
            "from", len(hema_dat),"detected cells")

    # 2) Remove hema from the slides
    retained, removed_, remove_log = remove_cell_hema_slide(combined,
                           hema_to_remove)

    print(dat_file,":",
           'To be removed = actually removed?',remove_log,
           ' &',removed_.shape[0],"cells removed")

    # label those cells we have removed as having Class = 'Excluded'
    removed = removed_.copy()
    removed['Class'] = 'Excluded'

    print("----------------------------------------------------------------------------")

    ############################ Checking for NAs & Other bits
    print("---------------STEP5: CHECKING NA & REGIONS-------------------")

    # 1) Selecting only GREY MATTER PORTION (since thresholding won't make sense in WM)
    is_grey_matter = retained['Name']=='Grey_matter'
    if not is_grey_matter.any():
        # Positional access: the row labelled 0 may have been removed above, so a
        # label lookup (`[0]`) could raise KeyError. Fall back to the file name if
        # nothing was retained at all.
        slide_name = retained['Image'].iloc[0] if len(retained) else dat_file
        faulty_file.append(slide_name)
        print("Skipping", dat_file, ": no 'Grey_matter' cells to predict on")
        continue

    retained2 = retained[is_grey_matter]  # only has GM now
    retained_not_GM = retained[~is_grey_matter] # Class will be Unlabelled


    # 2) Remove NA cells
    retained3 = retained2.dropna()
    NA_proportion = retained2[retained2.isna().any(axis=1)]  # GM rows with any missing value
    print("Shape of data, ready for prediction", retained3.shape)


    print("----------------------------------------------------------------------------")

    ############################ Prediction
    print("---------------STEP7: PREDICTIONS-------------------")

    # Create new variable for retained to add Class predictions to
    retained_final = retained3.copy()

    # Dropping extra info features
    X_unlabelled = retained3.drop(columns=['Image',
                                           'Name',
                                           'Class',
                                           'Parent',
                                           'ROI',
                                           'Centroid_X',
                                           'Centroid_Y'])
    print('X_unlabelled shape: ', X_unlabelled.shape)


    # 1) Perform prediction on the novel slide
    # The result is read back from `model.prediction` on the next line.
    model.predict(X_unlabelled)
    retained_final.loc[:,'Class'] = model.prediction
    print(retained_final['Class'].value_counts())

    print("----------------------------------------------------------------------------")
    ############################ Extracting data out  ############################
    print("---------------STEP8: DATA EXTRACTION & EXPORT-------------------")

    # 1) Combining predicted cells & excluded cells (prior to prediction)
    output_slide = pd.concat([removed, # from hema
                              retained_not_GM, # non GM
                              NA_proportion, # GM with NA
                              retained_final]) # predicted GM portion

    # 2) Add DAB information
    output_slide_dab = output_slide.merge(dab_df,on=['Centroid_X','Centroid_Y'])

    # 3) Checking input data == output data
    print("Input data == output data?:", combined.shape[0]==output_slide_dab.shape[0])

    # 4) Checking if there are NA values in predicted portion after combining data
    pred_complete=output_slide_dab[output_slide_dab['Class']!='Unlabelled'].isna().sum().sum()
    print("No NAN values in predicted portion?: ", 0==pred_complete)

    # 5) Exporting relevant information
    # NOTE(review): the file name uses the first cell of the first column — presumably
    # the 'Image' column; confirm the column order of `cell_extracted_features`.
    path = PREDICTION_DIR + output_slide_dab.iloc[0,0]+'_predictions.txt'
    output_slide_dab.to_csv(path, sep='\t',index=False)
    print("Exported prediction of : ",dat_file)
    print("----------------------------------------------------------------------------")

# Report skipped slides so they are not hidden behind the closing message.
if faulty_file:
    print("Slides skipped (no Grey_matter cells):", faulty_file)
print("**********ALL DONE! YAY, no error!***********")

FILE 747380_all.txt Number:  1 / 21
---------------STEP1: DATA FILE-------------------
Read in data file: 747380_all.txt
Data shape is: (288176, 41)
----------------------------------------------------------------------------
---------------STEP2: NB FILE-------------------
Read in nb file: 747380_all_neighbours.csv
nb_dat shape is: (288176, 12)
-----------------------------------------
Successfully combine nb cell counts to main data
Expected & Observed row matched?:  True
Expected & Observed columns matched?:  True
----------------------------------------------------------------------------
---------------STEP3: HEMA FILE-------------------
hema_dat shape is: (288176, 3)
----------------------------------------------------------------------------
---------------STEP4: HEMA NORM-------------------
No. of cells with normalised Hema >1: 2882 from 288176 detected cells
747380_all.txt : To be removed = actually removed? True  & 2882 cells removed
------------------------------------------