In [1]:
#import os
#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
#os.environ["CUDA_VISIBLE_DEVICES"] = ""

import math
import os

import matplotlib.pyplot as plt
import numpy as np
from scipy import interp
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve, precision_recall_curve, auc, roc_auc_score, precision_recall_fscore_support, f1_score, accuracy_score, cohen_kappa_score
from tabulate import tabulate

from MAPScorer import MAPScorer
from myLSTMKfold import MyLSTM
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Directory the plotting cells save their ROC and precision/recall PDFs into.
outputPlotDir = "plots/lstmKfold"
# savefig does not create missing directories, so make sure the target exists up front.
os.makedirs(outputPlotDir, exist_ok=True)

In [3]:
# Instantiate the project-local MyLSTM wrapper and invoke its data and model loaders.
# Later cells read lstm.stratifications, lstm.model[fold][task] and the
# lstm.getXTest / getXTestFT / getYTest accessors, so this cell has to run first.
lstm = MyLSTM()
lstm.loadData()
lstm.loadModels()

Load models


In [None]:
# Tasks to evaluate. The ones listed in ftTasks are fed two inputs and reuse the
# 'sede12' targets, so 'sede12' has to appear before them in `tasks`.
tasks = ['sede1', 'sede12', 'sede2ft3', 'morfo1', 'morfo2']
ftTasks = ['sede2ft3']

# Result containers, all indexed as [fold][task]:
#   yp  - predict_proba output
#   ycn - predict_classes output
#   yc  - ycn expanded to indicator rows with the same shape as yt
#   ytn - index of the first non-zero entry of each yt row
#   yt  - targets returned by lstm.getYTest
yp, ycn, yc, ytn, yt = {}, {}, {}, {}, {}

for fold in range(lstm.stratifications):
    print("============ fold {}".format(fold))
    for container in (yp, ycn, yc, ytn, yt):
        container[fold] = {}

    for task in tasks:
        print("-------- task {}".format(task))
        XTest = lstm.getXTest(fold, task)
        usesFT = task in ftTasks

        # FT tasks get an extra input placed in front of the regular one.
        modelInput = [lstm.getXTestFT(fold, task), XTest] if usesFT else XTest
        model = lstm.model[fold][task]
        yp[fold][task] = model.predict_proba(modelInput)
        ycn[fold][task] = model.predict_classes(modelInput)

        if usesFT:
            # Reuse the targets already stored for 'sede12' in this fold.
            yt[fold][task] = yt[fold]['sede12']
            ytn[fold][task] = ytn[fold]['sede12']
        else:
            yt[fold][task] = lstm.getYTest(fold, task)
            # Collapse each target row to the position of its first non-zero entry.
            labels = np.zeros_like(ycn[fold][task])
            for row, target in enumerate(yt[fold][task]):
                labels[row] = np.nonzero(target)[0][0]
            ytn[fold][task] = labels

        # Expand the predicted class indices into indicator rows shaped like yt.
        oneHot = np.zeros_like(yt[fold][task])
        for row, cls in enumerate(ycn[fold][task]):
            oneHot[row][cls] = 1
        yc[fold][task] = oneHot
   

-------- task sede1
-------- task sede12
-------- task sede2ft3
-------- task morfo1
-------- task sede1
-------- task sede12
-------- task morfo2
-------- task sede1
-------- task sede12
-------- task morfo2
-------- task sede1
-------- task sede2ft3
-------- task morfo1
-------- task morfo2
-------- task sede1
-------- task sede1
-------- task morfo2
-------- task sede1
-------- task sede2ft3
-------- task morfo1
-------- task sede1
-------- task morfo2
-------- task sede1
-------- task sede2ft3
-------- task morfo1
-------- task sede1
-------- task sede12
-------- task sede2ft3
-------- task morfo1

In [None]:
mapScorer = MAPScorer()
table = [["task", "average", "meanAvgPrec", "accuracy", "kappa", "precision", "recall", "f1score"]]
na = 'N/A'
averagings = ['micro', 'macro', 'weighted']
metrics = {}


def _foldSummary(scores):
    """Summarise per-fold scores as {'exp': mean, 'sd': population standard deviation}.

    np.std computes the same quantity as sqrt(E[x^2] - E[x]^2) but cannot end up
    taking the square root of a slightly negative number caused by rounding.
    """
    scores = np.asarray(scores, dtype=float)
    return {'exp': float(scores.mean()), 'sd': float(scores.std())}


def _cell(summary):
    """Render a summary as the "<mean>~<sd>" string used in the results table."""
    return str(summary['exp']) + "~" + str(summary['sd'])


for task in tasks:
    print("======= task {}".format(task))
    # Blank spacer row between tasks, one cell per header column.
    table.append([" "] * len(table[0]))

    # Per-fold raw scores, grouped by averaging mode (na = metrics without one).
    perFold = {na: {'meanAvgPrec': [], 'accuracy': [], 'kappa': []}}
    for avg in averagings:
        perFold[avg] = {'precision': [], 'recall': [], 'f1score': []}

    for fold in range(lstm.stratifications):
        print("--- fold {}".format(fold))
        # Every result dict is indexed [fold][task].
        foldTrue, foldPred = yt[fold][task], yc[fold][task]
        perFold[na]['meanAvgPrec'].append(mapScorer.score(foldTrue, yp[fold][task]))
        perFold[na]['accuracy'].append(accuracy_score(foldTrue, foldPred))
        perFold[na]['kappa'].append(cohen_kappa_score(ytn[fold][task], ycn[fold][task]))

        for avg in averagings:
            foldPrec, foldRec, foldF1, _ = precision_recall_fscore_support(foldTrue, foldPred, average=avg)
            perFold[avg]['precision'].append(foldPrec)
            perFold[avg]['recall'].append(foldRec)
            perFold[avg]['f1score'].append(foldF1)

    # metrics[task][averaging][metric] = {'exp': mean over folds, 'sd': std over folds}
    metrics[task] = {
        avg: {name: _foldSummary(values) for name, values in scores.items()}
        for avg, scores in perFold.items()
    }

    table.append([
        task, na,
        _cell(metrics[task][na]['meanAvgPrec']),
        _cell(metrics[task][na]['accuracy']),
        _cell(metrics[task][na]['kappa']),
        na, na, na,
    ])
    for avg in averagings:
        table.append([
            task, avg, na, na, na,
            _cell(metrics[task][avg]['precision']),
            _cell(metrics[task][avg]['recall']),
            _cell(metrics[task][avg]['f1score']),
        ])

print(tabulate(table))
        

In [None]:
def _calculateMicroMacroCurve(curveFunction, yt, yp):
    """Compute per-class, micro-averaged and macro-averaged curves and their areas.

    Parameters
    ----------
    curveFunction : callable
        Called as ``curveFunction(y_true, y_score)`` on 1-D arrays; must return
        an ``(abscissa, ordinate)`` pair of 1-D arrays.
    yt, yp : 2-D array-like, or dict mapping fold -> 2-D array-like
        Targets and scores with one column per class. When dicts are given, the
        per-fold arrays are concatenated along the first axis (folds taken in
        sorted key order, the same keys for ``yt`` and ``yp``) and the curves
        are computed on the pooled rows.

    Returns
    -------
    (abscissa, ordinate, area) : tuple of dict
        Each dict is keyed by class index plus ``"micro"`` and ``"macro"``.

    Raises
    ------
    ValueError
        If every per-class curve contains NaN, so no macro average exists.
    """
    if isinstance(yt, dict):
        folds = sorted(yt)
        y_true = np.concatenate([np.asarray(yt[f]) for f in folds], axis=0)
        y_score = np.concatenate([np.asarray(yp[f]) for f in folds], axis=0)
    else:
        y_true, y_score = np.asarray(yt), np.asarray(yp)

    n_classes = y_true.shape[1]
    abscissa, ordinate, area = {}, {}, {}

    # One curve per class column.
    for c in range(n_classes):
        abscissa[c], ordinate[c] = curveFunction(y_true[:, c], y_score[:, c])
        area[c] = auc(abscissa[c], ordinate[c])

    # Micro average: treat every (row, class) cell as one binary decision.
    abscissa["micro"], ordinate["micro"] = curveFunction(y_true.ravel(), y_score.ravel())
    area["micro"] = auc(abscissa["micro"], ordinate["micro"])

    # Macro average: only classes whose curve is free of NaN take part.
    represented = [
        c for c in range(n_classes)
        if not (np.isnan(abscissa[c]).any() or np.isnan(ordinate[c]).any())
    ]
    if not represented:
        raise ValueError("no class produced a NaN-free curve; cannot compute the macro average")

    # Evaluate every participating curve on the union of their abscissae, then average.
    grid = np.unique(np.concatenate([abscissa[c] for c in represented]))
    mean_ordinate = np.zeros_like(grid, dtype=float)
    for c in represented:
        curve = interp1d(abscissa[c], ordinate[c])
        mean_ordinate += curve(grid)
    mean_ordinate /= len(represented)

    abscissa["macro"] = grid
    ordinate["macro"] = mean_ordinate
    area["macro"] = auc(abscissa["macro"], ordinate["macro"])

    return (abscissa, ordinate, area)

In [None]:
def _prCurve(y_true, y_score):
    """Adapter returning precision_recall_curve's output as (recall, precision)."""
    precisionValues, recallValues, _ = precision_recall_curve(y_true, y_score)
    return recallValues, precisionValues


def _rocCurve(y_true, y_score):
    """Adapter returning roc_curve's output as (false positive rate, true positive rate)."""
    fprValues, tprValues, _ = roc_curve(y_true, y_score)
    return fprValues, tprValues


# Per-task curve data: abscissa, ordinate and area dictionaries for P/R and ROC.
rec, pre, pr_auc = {}, {}, {}
fpr, tpr, roc_auc = {}, {}, {}
for task in tasks:
    # yt and yp are indexed [fold][task]; pass one array per fold for this task.
    ytFolds = {fold: yt[fold][task] for fold in range(lstm.stratifications)}
    ypFolds = {fold: yp[fold][task] for fold in range(lstm.stratifications)}
    rec[task], pre[task], pr_auc[task] = _calculateMicroMacroCurve(_prCurve, ytFolds, ypFolds)
    fpr[task], tpr[task], roc_auc[task] = _calculateMicroMacroCurve(_rocCurve, ytFolds, ypFolds)

In [None]:
# Plot-title prefix for each task identifier.
titles = dict(
    # sede* tasks
    sede1='site',
    sede2='subsite',
    sede12='full site',
    sede2ft='subsite st',
    sede2ft2='subsite st',
    sede2ft3='full site st',
    # morfo* tasks
    morfo1='type',
    morfo2='behaviour',
    morfo12='type/behaviour',
    morfo1ft4='type st',
    morfo1ft5='type st',
    morfo1ft6='type st',
)

In [None]:
# One ROC figure per task: micro- and macro-averaged curves drawn over the dashed
# diagonal, saved as <outputPlotDir>/roc-<task>.pdf and then displayed.
for task in tasks:
    fig, ax = plt.subplots()
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])

    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(titles[task]+' Receiver operating characteristic')
    ax.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
    ax.plot(fpr[task]['micro'], tpr[task]['micro'], color='blue', lw=2, label='ROC micro (area = %0.4f)' % roc_auc[task]['micro'])
    ax.plot(fpr[task]['macro'], tpr[task]['macro'], color='red', lw=2, label='ROC macro (area = %0.4f)' % roc_auc[task]['macro'])
    ax.legend(loc="lower right")
    fig.savefig(outputPlotDir+"/roc-"+task+".pdf", bbox_inches='tight')
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations.
    plt.close(fig)

In [None]:
# One precision/recall figure per task: micro- and macro-averaged curves,
# saved as <outputPlotDir>/pr-<task>.pdf and then displayed.
for task in tasks:
    fig, ax = plt.subplots()
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])

    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(titles[task]+' Precision Recall curve')
    ax.plot(rec[task]['micro'], pre[task]['micro'], color='blue', lw=2, label='P/R micro (area = %0.4f)' % pr_auc[task]['micro'])
    ax.plot(rec[task]['macro'], pre[task]['macro'], color='red', lw=2, label='P/R macro (area = %0.4f)' % pr_auc[task]['macro'])
    ax.legend(loc="lower left")
    fig.savefig(outputPlotDir+"/pr-"+task+".pdf", bbox_inches='tight')
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations.
    plt.close(fig)