# Interactive widgets to analyze trained models and plot learning and ROC curves
Sept 3, 2019


## Steps:
- For a subset of models, read all data
- Store it in a summary dictionary
- Read from the dictionary for a specific model
- Plot the learning curves and ROC curves, and print the model summary

In [1]:
import os
import pickle
import subprocess as sp
import time

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ipywidgets import interact, interact_manual,fixed, SelectMultiple

In [2]:
## M-L modules
# import tensorflow.keras
# from tensorflow.keras import layers, models, optimizers, callbacks  # or tensorflow.keras as keras
# import tensorflow as tf
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc, roc_auc_score
from tensorflow.keras.models import load_model


In [3]:
%matplotlib widget

## Modules

In [4]:
def f_get_data(filename, idx=None):
    '''
    Function to get data from hdf5 files into images, labels and weights.

    Arguments:
        filename : path to an hdf5 file containing an 'all_events' group with
                   the datasets 'hist', 'y' and 'weight'.
        idx      : optional upper bound of the slice read from each dataset.
                   None (default) reads every entry.

    Returns:
        dict with keys 'images', 'labels' and 'weights'. 'images' is the 'hist'
        dataset with one extra trailing axis, 'weights' is log(weight + 1).

    Raises:
        SystemError if the file cannot be opened.
    '''
    try:
        # Open read-only: nothing is written here, and an explicit mode avoids
        # depending on the default mode of the installed h5py version.
        hf = h5py.File(filename, 'r')
    except Exception as e:
        print(e)
        print("Name of file",filename)
        raise SystemError("Could not open hdf5 file: {0}".format(filename)) from e

    # Slicing a dataset reads it into memory, so the file can be closed
    # as soon as the three arrays have been extracted.
    with hf:
        events = hf['all_events']
        images = np.expand_dims(events['hist'][:idx], -1)
        labels = events['y'][:idx]
        weights = np.log(events['weight'][:idx] + 1)

    return {'images': images, 'labels': labels, 'weights': weights}


def f_plot_learning(history):
    '''
    Plot learning curves: accuracy (top panel) and loss (bottom panel),
    each for the training and the validation set.

    Arguments:
        history : mapping with the per-epoch sequences stored under the keys
                  'acc', 'val_acc', 'loss' and 'val_loss'.
    '''
    fig = plt.figure()
    num_epochs = len(history['acc'])
    epoch_ticks = np.arange(0, num_epochs, 2)  # One tick every second epoch

    # (panel position, training key, validation key, y-axis label)
    panels = [
        (1, 'acc', 'val_acc', 'Accuracy'),
        (2, 'loss', 'val_loss', 'Loss'),
    ]

    for position, train_key, val_key, ylabel in panels:
        ax = fig.add_subplot(2, 1, position)
        ax.plot(history[train_key], label='Train', marker='o')
        ax.plot(history[val_key], label='Validation', marker='*')
        ax.set_ylabel(ylabel)
        ax.set_xticks(epoch_ticks)
        # A legend on every panel, so the Train/Validation labels are visible on both
        ax.legend(loc='best')

    # Only the bottom panel (the last one created) carries the x-axis label
    ax.set_xlabel('Epoch')



def f_plot_roc_curve(fpr,tpr):
    '''
    Draw a roc curve on a new figure, with a logarithmic x-axis.

    The (fpr, tpr) points are drawn both as scattered markers and as a
    connecting line; the y=x diagonal is added for comparison.
    '''
    fpr_window = [10**-7, 1.0]
    tpr_window = [0, 1.0]
    diagonal = np.linspace(0, 1, num=500)

    plt.figure()
    plt.scatter(fpr, tpr)
    # semilogx draws the line and puts the x-axis on a log scale
    plt.semilogx(fpr, tpr)

    # Zoom
    plt.xlim(fpr_window)
    plt.ylim(tpr_window)

    # y=x line for comparison
    plt.plot(diagonal, diagonal)

## Read stored model

In [5]:
def f_read_stored_data(model_save_dir,model_name):
    '''
    Read model, history and predictions stored for one trained model.

    Arguments:
        model_save_dir : directory holding the stored files
                         (with or without a trailing path separator).
        model_name     : name used in the file names 'model_<name>.h5',
                         'history_<name>.pickle' and 'ypred_<name>.test'.

    Returns:
        (model, history, y_pred) : the loaded model, the unpickled history
        object and the predictions read with np.loadtxt.
    '''
    # os.path.join builds the right path whether or not model_save_dir
    # ends with a separator; plain concatenation needs the trailing '/'.
    fname_model = os.path.join(model_save_dir, 'model_{0}.h5'.format(model_name))
    fname_history = os.path.join(model_save_dir, 'history_{0}.pickle'.format(model_name))
    fname_ypred = os.path.join(model_save_dir, 'ypred_{0}.test'.format(model_name))

    # Load model and history
    model = load_model(fname_model)
    # NOTE: unpickling can execute arbitrary code; only read history files from a trusted source.
    with open(fname_history, 'rb') as f:
        history = pickle.load(f)

    # Load the stored predictions instead of re-running model.predict
    y_pred = np.loadtxt(fname_ypred)

    return model, history, y_pred

In [12]:

## Since reading data takes a bit of time, we first read a subset of models and then analyze them

def f_real_all_data(model_save_dir,model_name_list,
                    data_dir='/global/project/projectdirs/dasrepo/vpa/atlas_cnn/data/RPVSusyData/'):
    '''
    Read the stored data of a list of models into a summary dictionary.

    Arguments:
        model_save_dir  : directory with the stored models, histories and predictions.
        model_name_list : iterable of model names to read.
        data_dir        : directory containing the test data file 'val.h5'.

    Returns:
        dict mapping each model name to a dict with the keys 'name', 'model',
        'history', 'y_pred' and 'test_data'. The 'test_data' entry is the same
        object for every model, since all of them use the same file.
    '''
    model_names = list(model_name_list)  # Accept any iterable, including one-shot generators

    dict_summary = {}
    if not model_names:
        # Nothing requested: skip the expensive read of the test data
        return dict_summary

    #### Test_data
    # The file does not depend on the model, so it is read once and shared
    # instead of being re-read for every model.
    test_data_dict = f_get_data(os.path.join(data_dir, 'val.h5'))

    for model_name in model_names:
        model, history, y_pred = f_read_stored_data(model_save_dir, model_name)
        dict_summary[model_name] = {
            'name': model_name,
            'model': model,
            'history': history,
            'y_pred': y_pred,
            'test_data': test_data_dict,
        }

    return dict_summary


def f_analyze_model(model_name,dict_summary,learning_curve=True,plot_roc=True,summary=False):
    '''
    Analyze one model stored in the summary dictionary.

    Prints the shapes of the test images, test labels, predictions and test
    weights, then optionally plots the learning curves, plots the roc curve
    and prints the model summary.

    Arguments:
        model_name     : key of the model in dict_summary.
        dict_summary   : dict whose entries hold the keys 'model', 'history',
                         'test_data' and 'y_pred' (as built by f_real_all_data).
        learning_curve : plot the learning curves if True.
        plot_roc       : compute and plot the roc curve if True.
        summary        : print the model summary if True.
    '''
    ### Pick up data stored in summary dictionary
    dict1 = dict_summary[model_name]
    model, history, y_pred = dict1['model'], dict1['history'], dict1['y_pred']
    test_data_dict = dict1['test_data']

    test_x, test_y, test_wts = test_data_dict['images'], test_data_dict['labels'], test_data_dict['weights']
    print(test_x.shape,test_y.shape,y_pred.shape,test_wts.shape)

    ## Plot learning curves
    if learning_curve:
        f_plot_learning(history)

    ## Plot roc curve
    if plot_roc:
        # The roc curve is only computed when it is going to be plotted
        fpr, tpr, _ = roc_curve(test_y, y_pred, sample_weight=test_wts)
        f_plot_roc_curve(fpr, tpr)

    ## Model summary
    if summary:
        model.summary()

        
def f_compare_rocs(model_name,dict_summary):
    '''
    Compare the roc curves of several models on a single figure.

    Arguments:
        model_name   : a single model name, or an iterable of model names
                       (keys of dict_summary).
        dict_summary : dict whose entries hold the keys 'test_data' and 'y_pred'
                       (as built by f_real_all_data).
    '''
    # A bare string would be iterated character by character
    # ('10' would be read as models '1' and '0'), so wrap it in a list.
    if isinstance(model_name, str):
        model_names = [model_name]
    else:
        model_names = list(model_name)

    plt.figure()

    for model_num in model_names:
        ### Pick up data stored in summary dictionary
        dict1 = dict_summary[model_num]
        test_data_dict, y_pred = dict1['test_data'], dict1['y_pred']
        test_y, test_wts = test_data_dict['labels'], test_data_dict['weights']

        ## roc curve
        fpr, tpr, _ = roc_curve(test_y, y_pred, sample_weight=test_wts)

        ## Plot roc curve
        plt.scatter(fpr, tpr, label='model: {0}'.format(model_num))
        # Invisible line (empty linestyle, no marker): puts the x-axis on a log scale
        plt.semilogx(fpr, tpr, linestyle='')

    # One legend for the whole figure; skipped when nothing was plotted
    if model_names:
        plt.legend(loc='best')




### First store data for a subset of models

In [13]:
# Directory with the stored models, histories and predictions of this set of runs
model_save_dir = '/global/project/projectdirs/dasrepo/vpa/atlas_cnn/results/2_runs_Sept13_modified/'
# model_save_dir='saved_data/2_runs_Sept13_modified/'

# Names of the models to read: 0 to 13, 20 and 30
# model_sublist=['1','2','3','4','5','20','30']
model_sublist = [str(num) for num in list(range(14)) + [20, 30]]

# Read everything once into the summary dictionary
dict_summary = f_real_all_data(model_save_dir, model_sublist)

W0916 08:48:39.077708 46912496621568 deprecation.py:506] From /global/homes/v/vpa/.conda/envs/v_py3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [15]:
# List the model names that were read into the summary dictionary
print(dict_summary.keys())
# dir(dict_summary.keys())
# #print(dict_summary)


dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '20', '30'])


### Generate plots and summary 
Read from dictionary **dict_summary**

In [16]:
# Direct (non-interactive) call for model '1' with the default options:
# learning curves and roc curve are plotted, the model summary is not printed
f_analyze_model('1',dict_summary)

(137471, 64, 64, 1) (137471,) (137471,) (137471,)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [17]:
# Widget for f_analyze_model: model_name is chosen from model_sublist,
# while fixed() passes dict_summary through without creating a widget for it
interact_manual(f_analyze_model,dict_summary=fixed(dict_summary),model_name=model_sublist )


interactive(children=(Dropdown(description='model_name', options=('0', '1', '2', '3', '4', '5', '6', '7', '8',…

<function __main__.f_analyze_model(model_name, dict_summary, learning_curve=True, plot_roc=True, summary=False)>

In [18]:
# f_compare_rocs(('1','2'),dict_summary)
# Widget for f_compare_rocs: SelectMultiple lets several models be picked at once,
# while fixed() passes dict_summary through without creating a widget for it
interact_manual(f_compare_rocs,model_name=SelectMultiple(options=model_sublist),dict_summary=fixed(dict_summary))


interactive(children=(SelectMultiple(description='model_name', options=('0', '1', '2', '3', '4', '5', '6', '7'…

<function __main__.f_compare_rocs(model_name, dict_summary)>

In [None]:

# f_compare_rocs(('1','2'),dict_summary)