## Validate and Test Random Forest Classifier

In [None]:
# load third-party Python modules
import javabridge
import bioformats as bf
import skimage
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import os
import sys
import h5py
# Extend the module search path three directory levels up -- presumably so
# that the project-local imports in later cells resolve (verify against the
# repository layout).
sys.path.append('../../..')

# Start the Java VM with the Bio-Formats JARs on its class path.
javabridge.start_vm(class_path=bf.JARS)

In [None]:
# Load the training labels, the feature matrix and the feature names from
# the 'final' group of the HDF5 training set. The context manager closes the
# file even when one of the reads raises; `np.asarray` copies each dataset
# into memory, so nothing depends on the file staying open afterwards.
with h5py.File('trainset.h5', 'r') as hf:
    ytrain = np.asarray(hf.get('final/ytrain'))
    Xtrain = np.asarray(hf.get('final/Xtrain'))
    columns = np.asarray(hf.get('final/columns'))

# Feature matrix as a DataFrame whose columns carry the stored feature names.
Xtrain_df = pd.DataFrame(Xtrain, columns=columns)

In [None]:
from base.utils import load_imgstack
from segment.tools import read_bbox
from sklearn.preprocessing import label_binarize

def get_train_instance(path, fname, pad=0):
    """Load one image together with the bounding boxes stored next to it.

    ``<fname>.png`` is read through ``load_imgstack`` and ``<fname>.csv``
    through ``pandas.read_csv`` (both located in ``path``). The table and
    the row/column extent of the image are then handed to ``read_bbox``.

    Parameters
    ----------
    path : str
        Directory holding the image and its CSV file.
    fname : str
        Name shared by both files, without extension.
    pad : int, optional
        Forwarded unchanged to ``read_bbox``.

    Returns
    -------
    tuple
        ``(img, bbox)``: the squeezed image array and whatever
        ``read_bbox`` returns for it.
    """
    stem = os.path.join(path, fname)
    img = np.squeeze(load_imgstack(fname=stem + ".png", verbose=False))
    table = pd.read_csv(stem + ".csv")
    # The squeezed image has to be three-dimensional; the last axis length
    # is not needed here.
    n_rows, n_cols, _ = img.shape
    return img, read_bbox(df=table, rmax=n_rows, cmax=n_cols, pad=pad)

def make_labels(arr, label=1):
    """Pair every element of ``arr`` with a constant class label.

    Parameters
    ----------
    arr : numpy.ndarray
        Values to label. For a 1-D array of length ``n`` the result has
        shape ``(n, 2)``.
    label : int, optional
        Label written next to every element (default 1).

    Returns
    -------
    numpy.ndarray
        ``arr`` stacked on top of an equally shaped array filled with
        ``label``, transposed so that (for 1-D input) column 0 holds the
        values and column 1 the label.
    """
    # `np.int` was only an alias of the builtin `int`; the alias was
    # deprecated in NumPy 1.20 and removed in 1.24, so use `int` directly.
    return np.vstack((arr, label * np.ones(arr.shape, dtype=int))).T

In [None]:
from plotly.offline import init_notebook_mode
# Set up plotly's offline mode for use inside this notebook.
init_notebook_mode(connected=True)

In [None]:
# Directory from which the test images ('<name>.png') and their per-image
# tables ('<name>.csv') are read.
path = '../../data/AML_trainset/test'

In [None]:
# Extend the module search path four directory levels up -- presumably where
# the `bioimg` package lives (verify against the repository layout).
sys.path.append('../../../../')
from bioimg.classify import ImgX, IncrementalClassifier

In [None]:
# Configure the classifier wrapper with the training data loaded earlier.
clf_incr = IncrementalClassifier()
# Binarize the training labels over the three classes 0, 1 and 2.
clf_incr.ytrain = label_binarize(ytrain, classes=range(3))
clf_incr.Xtrain = Xtrain_df
# Class names; the evaluation cells encode apoptotic as 0, viable as 1 and
# everything else as 2, i.e. the same order as this list.
clf_incr.classes = ['apoptotic', 'viable', 'other']

In [None]:
# Create the underlying classifier, then train it. The calls are chained, so
# `set_classifier` is expected to return the wrapper itself.
clf_incr.set_classifier().train_classifier()

In [None]:
# Base names of all annotated test images: every file in `path` whose name
# ends in '.csv', with that extension removed.
#  * `str.endswith` avoids picking up names that merely contain '.csv'
#    (for example 'a.csv.bak').
#  * Slicing strips only the trailing extension, whereas `str.replace`
#    would remove every occurrence of '.csv' in the name.
#  * `os.listdir` returns entries in arbitrary order, so the list is sorted
#    to make the image order (and therefore `imgs[0]`) reproducible.
imgs = sorted(f[:-len('.csv')] for f in os.listdir(path) if f.endswith('.csv'))

In [None]:
# Number of test images found in `path`.
len(imgs)

In [None]:
# Row and column extent handed to `read_bbox` as `rmax` / `cmax`; the same
# 720 x 720 value is applied to every test image.
rmax, cmax = (720,720)

In [None]:
# Ground-truth annotations of the test set. Later cells read the columns
# 'filename', 'class', 'ymin', 'xmin', 'ymax' and 'xmax' from this table.
test_gtruth = pd.read_csv('../../data/AML_trainset/test_labels.csv')

In [None]:
from collections import namedtuple
# Axis-aligned rectangle described by its extent along x and y.
Box = namedtuple('Box', ['xmin', 'xmax', 'ymin', 'ymax'])


def area_overlap(a, b):
    """Return the area shared by two axis-aligned rectangles.

    Parameters
    ----------
    a, b : Box
        Any objects exposing ``xmin``, ``xmax``, ``ymin`` and ``ymax``.

    Returns
    -------
    number or None
        Width times height of the intersection. Rectangles that merely
        touch yield 0; rectangles that do not intersect yield ``None``.
    """
    width = min(a.xmax, b.xmax) - max(a.xmin, b.xmin)
    height = min(a.ymax, b.ymax) - max(a.ymin, b.ymin)
    if not (width >= 0 and height >= 0):
        return None
    return width * height

In [None]:
# Use the first test image as a worked example.
i = 0
# Image plus the boxes read from its CSV table with pad=20.
img, bbox = get_train_instance(path=path, fname=imgs[i], pad=20)

# Ground-truth boxes annotated for the same image, read without padding.
# The image extent comes from the shared `rmax` / `cmax` values rather than
# a repeated literal 720, matching how the unpadded detections are read.
gt_rows = test_gtruth[test_gtruth.filename == imgs[i] + '.png']
bbox_gt = read_bbox(df=gt_rows,
                    columns=['ymin', 'xmin', 'ymax', 'xmax'],
                    rmax=rmax,
                    cmax=cmax, pad=0)

In [None]:
# Initialize the 'ImgX' class with the example image and its padded boxes;
# `n_chan` is given three names, presumably one per image channel.
imgx = ImgX(img=img, bbox=bbox,
            n_chan=['Lysosomal', 'Calcein', 'Hoechst'])
# Attach the image to the classifier so the next cell predicts on it.
clf_incr.imgx = imgx

In [None]:
# Predict on the attached image; the labels are read from `imgx.y` further
# down (presumably filled in by this call -- confirm in `bioimg.classify`).
clf_incr.generate_predictions()
# Class probabilities for the same feature rows. The columns are used below
# in the order apoptotic, viable, other.
probs = clf_incr.clf.predict_proba(imgx.data)

In [None]:
# Re-read the table of the example image and extract its boxes without
# padding; these are the boxes compared against the ground truth.
df = pd.read_csv(os.path.join(path, imgs[i] + ".csv"))
bbox_nopad = read_bbox(df=df, rmax=rmax, cmax=cmax, pad=0)

In [None]:
# Ground-truth classes of the example image, encoded as integers
# (0 = apoptotic, 1 = viable). An explicit object-dtype copy is taken so the
# relabelling never writes into the table's own (possibly read-only) buffer.
y_gt = np.array(test_gtruth[test_gtruth.filename == imgs[i] + '.png']['class'],
                dtype=object)
y_gt[y_gt == 'apoptotic AML'] = 0
y_gt[y_gt == 'viable AML'] = 1

bbox_cols = ['bbox-0', 'bbox-1', 'bbox-2', 'bbox-3']

# Collect every (ground-truth box, detected box) pair that overlaps by more
# than 200 area units, remembering the class and the overlap.
df_list = []
for bt, y in zip(bbox_gt, y_gt):
    gt_box = Box(*bt)
    # The padded boxes are not used here; zipping with them only keeps the
    # original pairing (and truncation) of the two box lists.
    for _padded, b_nopad in zip(bbox, bbox_nopad):
        A_common = area_overlap(gt_box, Box(*b_nopad))
        # `area_overlap` returns None for disjoint boxes, and None cannot be
        # ordered against a number in Python 3.
        if A_common is not None and A_common > 200:
            # A box array is unpacked into Box(xmin, xmax, ymin, ymax), which
            # corresponds to the table columns bbox-1, bbox-3, bbox-0, bbox-2.
            label_df = pd.DataFrame(data=b_nopad[None, ...],
                                    columns=['bbox-1', 'bbox-3', 'bbox-0', 'bbox-2'])
            label_df['y'] = y
            label_df['Acom'] = A_common
            df_list.append(label_df)

# NOTE(review): when nothing overlaps, `img_df` is not (re)defined and the
# following cell fails or reuses a stale table -- confirm the intended
# handling (e.g. labelling every detection as class 2).
if df_list:
    # Keep, for each detected box, the single match with the largest overlap.
    # A stable sort followed by `drop_duplicates(keep='last')` yields exactly
    # one row per box even on ties and does not rely on `groupby().apply`
    # passing the grouping columns through.
    df_unique = (pd.concat(df_list)
                 .sort_values('Acom', kind='mergesort')
                 .drop_duplicates(subset=bbox_cols, keep='last'))
    img_df = pd.merge(left=df, right=df_unique, on=bbox_cols, how='left')
    img_df = img_df[bbox_cols + ['y']]
    # Detections without a ground-truth match become class 2 ('other').
    img_df = img_df.fillna(2)
    img_df = img_df.astype(int)

In [None]:
# Add the predicted label and the three class probabilities to the table.
# This relies on `imgx.y` and `probs` having one entry per row of `img_df`.
img_df['ypred'] = imgx.y
img_df['p_apoptotic'] = probs[:,0]
img_df['p_viable'] = probs[:,1]
img_df['p_other'] = probs[:,2]
# Display the resulting table.
img_df

In [None]:
from extra.viz import plotly_predictions
from plotly.offline import iplot

In [None]:
# Prepare an interactive view of the predictions for the example image. The
# two results are meant to be passed to `iplot` as `layout` and `data` (see
# the commented-out call in the next cell).
layout, feats = plotly_predictions(img,bb=bbox,
                   ypred=img_df['ypred'].values,
                   labels=['apoptotic AML',
                          'viable AML', 'other'])

In [None]:
#iplot(dict(data=feats, layout=layout))

In [None]:
BBOX_COLS = ['bbox-0', 'bbox-1', 'bbox-2', 'bbox-3']


def _label_detections(df, bbox, bbox_nopad, bbox_gt, y_gt, min_overlap=200):
    """Assign a ground-truth class to every detected box of one image.

    Parameters
    ----------
    df : pandas.DataFrame
        Detection table of the image; must contain the ``bbox-0`` ...
        ``bbox-3`` columns.
    bbox, bbox_nopad : sequence of numpy.ndarray
        Padded and unpadded detected boxes. Only the unpadded ones are
        compared; the padded list merely bounds the pairing via ``zip``.
    bbox_gt : sequence
        Ground-truth boxes, each unpackable into ``Box``.
    y_gt : sequence
        Class code of every ground-truth box.
    min_overlap : number, optional
        A pair counts as a match when its shared area exceeds this value.

    Returns
    -------
    pandas.DataFrame or None
        One row per row of ``df`` with the box columns and ``y`` (class of
        the best-overlapping ground-truth box, 2 when unmatched), or
        ``None`` when no pair overlaps enough.
    """
    matches = []
    for bt, y in zip(bbox_gt, y_gt):
        gt_box = Box(*bt)
        for _padded, b_nopad in zip(bbox, bbox_nopad):
            overlap = area_overlap(gt_box, Box(*b_nopad))
            # `area_overlap` returns None for disjoint boxes, and None cannot
            # be ordered against a number in Python 3.
            if overlap is not None and overlap > min_overlap:
                # A box array is unpacked into Box(xmin, xmax, ymin, ymax),
                # i.e. the table columns bbox-1, bbox-3, bbox-0, bbox-2.
                row = pd.DataFrame(data=b_nopad[None, ...],
                                   columns=['bbox-1', 'bbox-3', 'bbox-0', 'bbox-2'])
                row['y'] = y
                row['Acom'] = overlap
                matches.append(row)
    if not matches:
        return None
    # Keep the single best match per detected box. A stable sort followed by
    # `drop_duplicates(keep='last')` yields exactly one row per box even on
    # ties and does not rely on `groupby().apply` passing the grouping
    # columns through.
    best = (pd.concat(matches)
            .sort_values('Acom', kind='mergesort')
            .drop_duplicates(subset=BBOX_COLS, keep='last'))
    labelled = pd.merge(left=df, right=best, on=BBOX_COLS, how='left')
    labelled = labelled[BBOX_COLS + ['y']]
    # Detections without a ground-truth match become class 2 ('other').
    return labelled.fillna(2).astype(int)


# Predict every test image and collect, per detected box, the ground-truth
# class, the predicted class and the three class probabilities.
pred_df = []
for i, fname in enumerate(imgs):
    img, bbox = get_train_instance(path=path, fname=fname, pad=20)
    gt_rows = test_gtruth[test_gtruth.filename == fname + '.png']
    # Shared image extent instead of a repeated literal 720.
    bbox_gt = read_bbox(df=gt_rows,
                        columns=['ymin', 'xmin', 'ymax', 'xmax'],
                        rmax=rmax,
                        cmax=cmax, pad=0)
    # initialize 'ImgX' class
    imgx = ImgX(img=img, bbox=bbox,
                n_chan=['Lysosomal', 'Calcein', 'Hoechst'])
    clf_incr.imgx = imgx

    clf_incr.generate_predictions()
    probs = clf_incr.clf.predict_proba(imgx.data)
    df = pd.read_csv(os.path.join(path, fname + ".csv"))
    bbox_nopad = read_bbox(df=df, rmax=rmax, cmax=cmax, pad=0)

    # Integer class codes (0 = apoptotic, 1 = viable) on an explicit
    # object-dtype copy, so the relabelling never writes into the table's
    # own (possibly read-only) buffer.
    y_gt = np.array(gt_rows['class'], dtype=object)
    y_gt[y_gt == 'apoptotic AML'] = 0
    y_gt[y_gt == 'viable AML'] = 1

    labelled = _label_detections(df, bbox, bbox_nopad, bbox_gt, y_gt)
    if labelled is None:
        # NOTE(review): images without any sufficiently overlapping box are
        # left out of the evaluation, as before -- confirm this is intended.
        continue
    img_df = labelled
    img_df['ypred'] = imgx.y
    img_df['p_apoptotic'] = probs[:, 0]
    img_df['p_viable'] = probs[:, 1]
    img_df['p_other'] = probs[:, 2]
    pred_df.append(img_df)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Stack the per-image tables into a single evaluation table.
pred_test = pd.concat(pred_df)

In [None]:
# Preview the first rows of the evaluation table.
pred_test.head()

In [None]:
# Confusion matrix of ground-truth labels ('y') against predicted labels
# ('ypred') over all test detections.
confusion_matrix(y_pred=pred_test['ypred'], y_true=pred_test['y'])

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import label_binarize

In [None]:
# Binarize the true labels over the classes 0, 1 and 2, one column per
# class. `classes` is passed by keyword, as in the training cell: current
# scikit-learn releases accept it only as a keyword argument.
y_test = label_binarize(pred_test['y'].values, classes=[0, 1, 2])
# Predicted class probabilities, in the same class order as the columns of
# `y_test`.
probas_ = pred_test[['p_apoptotic', 'p_viable', 'p_other']].values

In [None]:
# Per-class ROC and precision-recall statistics, keyed by class index. Each
# class is scored from its own column of `y_test` and `probas_`.
fpr, tpr, roc_auc = {}, {}, {}
precision, recall, avprec = {}, {}, {}
for i in range(3):
    truth_i = y_test[:, i]
    score_i = probas_[:, i]
    # ROC curve and the area under it.
    fpr[i], tpr[i], _ = roc_curve(truth_i, score_i)
    roc_auc[i] = auc(fpr[i], tpr[i])
    # Precision-recall curve and its average precision.
    precision[i], recall[i], _ = precision_recall_curve(truth_i, score_i)
    avprec[i] = average_precision_score(truth_i, score_i)

In [None]:
import matplotlib
# Global font settings for all figures below. 'normal' is not a font family
# known to matplotlib (it triggers "findfont: Font family ['normal'] not
# found" warnings and a fallback), so the generic 'sans-serif' family is
# requested explicitly.
font = {'family': 'sans-serif',
        'size': 14}

matplotlib.rc('font', **font)

In [None]:
from itertools import cycle
# One colour per class. The cycle is shared with the ROC plot; pairing it
# as the *second* argument of `zip` advances it by exactly three colours per
# plot, so every figure starts from the first colour again.
colors = cycle(['#27496d','#63b7af', '#dae1e7'])
class_names = ['Apoptotic AML', 'Viable AML', 'Other']

plt.figure(figsize=(7, 7))

# Background: iso-F1 curves for F1 = 0.5, 0.6, ..., 0.9.
f_scores = np.linspace(0.5, 0.9, num=5)
for f_score in f_scores:
    x = np.linspace(0.01, 1)
    y_ = f_score * x / (2 * x - f_score)
    # Only the non-negative part of each curve is drawn.
    visible = y_ >= 0
    plt.plot(x[visible], y_[visible], color='gray', alpha=0.2)
    # Label the curve slightly above its value at sample index 45.
    f1_text = 'F1={0:0.1f}'.format(f_score)
    plt.annotate(f1_text, xy=(0.9, y_[45] + 0.02))

# Foreground: one precision-recall curve per class.
for (i, class_name), color in zip(enumerate(class_names), colors):
    legend_text = '{0} (AP = {1:0.2f})'.format(class_name, avprec[i])
    plt.plot(recall[i], precision[i], color=color, lw=4, label=legend_text)

plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall Random Forest')
plt.legend(loc="lower left")
plt.savefig('PR-randomforest.pdf')

In [None]:
# ROC curves, one per class, using the colours and class names defined for
# the precision-recall plot.
plt.figure(figsize=(7, 7))
for (i, class_name), color in zip(enumerate(class_names), colors):
    legend_text = '{0} (AUCROC = {1:0.2f})'.format(class_name, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=4, label=legend_text)

# Dashed diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Random forest')
plt.legend(loc="lower right")
plt.savefig('ROC-randomforest.pdf')