# Code to view the ATLAS 2D data 
August 9, 2019


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import h5py

import subprocess as sp
import pickle
import yaml

In [2]:
## M-L modules
import tensorflow.keras
from tensorflow.keras import layers, models, optimizers, callbacks  # or tensorflow.keras as keras
import tensorflow as tf
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc, roc_auc_score
from tensorflow.keras.models import load_model



In [3]:
%matplotlib widget

## Modules

In [4]:

def load_config(config_file):
    with open(config_file) as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)
    return config


def f_get_data(filename):
    '''
    Function to get data from hdf5 files into images, labels and weights.
    '''
    try: 
        hf = h5py.File(filename)

    except:
        print(e)
        print("Name of file",filename)
        raise SystemError

    idx=50000
    images = np.expand_dims(hf['all_events']['hist'][:idx], -1)
    labels = hf['all_events']['y'][:idx]
    weights = hf['all_events']['weight'][:idx]
    weights = np.log(weights+1)

    keys=['images','labels','weights']
    values_dict=dict(zip(keys,[images,labels,weights]))

    return values_dict



In [49]:

def f_define_model(config_dict,name='1'):
    '''
    Function that defines the model and compiles it.
    '''
    ### Extract info from the config_dict
    shape=config_dict['model']['input_shape']
    learn_rate=config_dict['optimizer']['lr']
    loss_fn=config_dict['training']['loss']
    metrics=config_dict['training']['metrics']
    dropout=config_dict['model']['dropout']
    
    inputs = layers.Input(shape=shape)
    h = inputs
    
    # Choose model
    
    if name=='1':
        # Convolutional layers
        conv_sizes=[10,10,10]
        conv_args = dict(kernel_size=(3, 3), activation='relu', padding='same')
        for conv_size in conv_sizes:
            h = layers.Conv2D(conv_size, **conv_args)(h)
            h = layers.MaxPooling2D(pool_size=(2, 2))(h)
            h = layers.Dropout(dropout)(h)
        h = layers.Flatten()(h)

        # Fully connected  layers
        h = layers.Dense(64, activation='relu')(h)
        h = layers.Dropout(dropout)(h)

        # Ouptut layer
        outputs = layers.Dense(1, activation='sigmoid')(h)
        
        model = models.Model(inputs, outputs)
        #### change loss function for non-resnet models since 'sparse_categorical_crossentropy' throws up an error.
        opt=optimizers.Adam(lr=learn_rate)
    
    model.compile(optimizer=opt, loss=loss_fn, metrics=metrics)
    #print("model %s"%name)
    #model.summary()

    return model


def f_train_model(model,inpx,inpy,model_weights,num_epochs=5,batch_size=64):
    '''
    Train model. Returns just history.history
    '''
    cv_fraction=0.33 # Fraction of data for cross validation
    
    history=model.fit(x=inpx, y=inpy,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=1,
                    callbacks = [callbacks.EarlyStopping(monitor='val_loss', min_delta=0,patience=20, verbose=1),
                                 callbacks.ModelCheckpoint(model_weights, save_best_only=True, monitor='val_loss', mode='min') ],
                    validation_split=cv_fraction,
                    shuffle=True
                )
    
    print("Number of parameters",model.count_params())
    
    return history.history


def f_plot_learning(history):
    '''Plot learning curves : Accuracy and Validation'''
    fig=plt.figure()
    # Plot training & validation accuracy values
    fig.add_subplot(2,1,1)
    xlim=len(history['acc'])
    
    plt.plot(history['acc'],label='Train',marker='o')
    plt.plot(history['val_acc'],label='Validation',marker='*')
#     plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xticks(np.arange(0,xlim,2))
    
    # Plot loss values
    fig.add_subplot(2,1,2)
    plt.plot(history['loss'],label='Train',marker='o')
    plt.plot(history['val_loss'],label='Validation',marker='*')
#     plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.xticks(np.arange(0,xlim,2))

    plt.legend(loc='best')



def f_plot_roc_curve(fpr,tpr):
    '''
    Module for roc plot and printing AUC
    '''
    plt.figure()
    # plt.plot(fpr,tpr)
    plt.scatter(fpr,tpr)
    plt.semilogx(fpr, tpr)
  # Zooms
    plt.xlim([10**-7,1.0])
    plt.ylim([0,1.0])
    # y=x line for comparison
    x=np.linspace(0,1,num=500)
    plt.plot(x,x)
#     plt.xscale('log')
#     plt.xlim(1e-10,1e-5)
    plt.show()

    # AUC 
    auc_val = auc(fpr, tpr)
    print("AUC: ",auc_val)
    


In [46]:
config_file='config.yaml'
config_dict=load_config(config_file)
print(config)




{'description': 'CNN CIFAR10', 'data_dir': '/global/project/projectdirs/dasrepo/vpa/atlas_cnn/data/RPVSusyData/', 'output_dir': '/global/project/projectdirs/dasrepo/vpa/atlas_cnn/results/', 'data': {'name': 'cifar10'}, 'model': {'name': 'cnn', 'input_shape': [32, 32, 3], 'n_classes': 10, 'dropout': 0.1}, 'optimizer': {'name': 'Adam', 'lr': 0.001}, 'training': {'batch_size': 64, 'n_epochs': 32, 'lr_warmup_epochs': 0, 'loss': 'categorical_crossentropy', 'metrics': ['accuracy']}}


## Read data

In [26]:
### Extract the training and validation data
data_dir=config['data_dir']
#### Training data
filename=data_dir+'train.h5'
# print(filename)
train_data_dict=f_get_data(filename)

#### Test_data
filename=data_dir+'val.h5'
test_data_dict=f_get_data(filename)



In [27]:
train_x,train_y,train_wts=train_data_dict['images'],train_data_dict['labels'],train_data_dict['weights']

In [28]:
print(train_x.shape,train_y.shape,train_wts.shape)

(50000, 64, 64, 1) (50000,) (50000,)


In [47]:
train_x.shape[1:]
print(config_dict['model']['input_shape'])

[64, 64, 1]


### Define and train model

In [50]:
# print(train_data_dict)
# Compile model
model_name='1'
model_save_dir='saved_data/'
fname_model,fname_history='mdl_{0}_weights.h5'.format(model_name),'history_{0}.pickle'.format(model_name)

model=f_define_model(config_dict,name=model_name)
# print(model)
# Train model
history=f_train_model(model,train_x,train_y,model_weights=fname_model,num_epochs=50)


Train on 33500 samples, validate on 16500 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
 2816/33500 [=>............................] - ETA: 56s - loss: 0.2389 - acc: 0.9080

KeyboardInterrupt: 

### Save model and history

In [None]:
# Save model and history
# model.save(model_save_dir+fname_model) ### Model is saved automatically inside the fit function
with open(model_save_dir+fname_history, 'wb') as f:
        pickle.dump(history, f)


## Read stored model

In [None]:
# Load model and history
model=load_model(model_save_dir+fname_model)
with open(model_save_dir+fname_history,'rb') as f:
    history= pickle.load(f)

In [None]:
model.summary()
# Plot tested model
f_plot_learning(history)

### Test data

In [None]:
test_x,test_y,test_wts=test_data_dict['images'],test_data_dict['labels'],test_data_dict['weights']
print(test_x.shape,test_y.shape,test_wts.shape)

### Predictions and roc curve

In [None]:
# Make predictions
y_pred=model.predict(test_x,verbose=1)

fpr,tpr,threshold=roc_curve(test_y,y_pred,sample_weight=test_wts)
print(fpr.shape,tpr.shape,threshold.shape)
# Plot roc curve
f_plot_roc_curve(fpr,tpr)

In [None]:
data_dir

In [13]:
pwd

'/global/u1/v/vpa/project/jpt_notebooks/atlas_ml/code_vpa/jpt_notebooks'

In [17]:
? optimizers

[0;31mType:[0m        module
[0;31mString form:[0m <module 'tensorflow._api.v1.keras.optimizers' from '/global/homes/v/vpa/.conda/envs/v_py3/lib/python3.6/site-packages/tensorflow/_api/v1/keras/optimizers/__init__.py'>
[0;31mFile:[0m        ~/.conda/envs/v_py3/lib/python3.6/site-packages/tensorflow/_api/v1/keras/optimizers/__init__.py
[0;31mDocstring:[0m   Built-in optimizer classes.


In [34]:
type(optimizers),(optimizers.Adam)

(module, tensorflow.python.keras.optimizers.Adam)