# Code to run CNNs on supernova data
Feb 14, 2020


In [6]:
import os
import pickle
import subprocess as sp

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

In [3]:
## M-L modules
import tensorflow.keras
from tensorflow.keras import layers, models, optimizers, callbacks  # or tensorflow.keras as keras
import tensorflow as tf
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc, roc_auc_score
from tensorflow.keras.models import load_model



In [3]:
%matplotlib widget

## Modules

In [5]:

def load_config(config_file):
    '''
    Parse the YAML file at `config_file` with the safe loader and return the result.
    '''
    with open(config_file) as yaml_stream:
        return yaml.load(yaml_stream, Loader=yaml.SafeLoader)



In [64]:

def f_define_model(config_dict,name='1'):
    '''
    Function that defines the model selected by `name` and compiles it.

    Parameters
    ----------
    config_dict : dict
        Nested configuration. The entries read here are
        config_dict['model']['input_shape'], config_dict['model']['dropout'],
        config_dict['optimizer']['lr'], config_dict['training']['loss'] and
        config_dict['training']['metrics'].
    name : str
        Identifier of the architecture to build. Only '1' is implemented.

    Returns
    -------
    The compiled Keras model.

    Raises
    ------
    ValueError
        If `name` is not one of the implemented architectures.
    '''
    # Reject unknown architectures up front instead of failing later with an
    # unbound `model`/`opt` at the compile step.
    known_models=('1',)
    if name not in known_models:
        raise ValueError("Unknown model name %r; available models: %s"%(name,', '.join(known_models)))

    ### Extract info from the config_dict
    shape=config_dict['model']['input_shape']
    learn_rate=config_dict['optimizer']['lr']
    loss_fn=config_dict['training']['loss']
    metrics=config_dict['training']['metrics']
    dropout=config_dict['model']['dropout']

    inputs = layers.Input(shape=shape)
    h = inputs

    # Choose model
    if name=='1':
        # Convolutional layers: three Conv2D -> MaxPooling2D -> Dropout stages
        conv_sizes=[10,10,10]
        conv_args = dict(kernel_size=(3, 3), activation='relu', padding='same')
        for conv_size in conv_sizes:
            h = layers.Conv2D(conv_size, **conv_args)(h)
            h = layers.MaxPooling2D(pool_size=(2, 2))(h)
            h = layers.Dropout(dropout)(h)
        h = layers.Flatten()(h)

        # Fully connected layers
        h = layers.Dense(64, activation='relu')(h)
        h = layers.Dropout(dropout)(h)

        # Output layer: a single sigmoid unit
        outputs = layers.Dense(1, activation='sigmoid')(h)

        model = models.Model(inputs, outputs)
        # NOTE: the original author reported that 'sparse_categorical_crossentropy'
        # throws an error for this (non-resnet) model; the loss comes from the config.
        # `learning_rate` is the current keyword; `lr` is a deprecated alias.
        opt=optimizers.Adam(learning_rate=learn_rate)

    model.compile(optimizer=opt, loss=loss_fn, metrics=metrics)

    return model


def f_train_model(model,inpx,inpy,model_weights,num_epochs=5,batch_size=64,cv_fraction=0.33,patience=20):
    '''
    Train model. Returns just history.history

    Parameters
    ----------
    model : compiled Keras model; only `fit` and `count_params` are used here.
    inpx, inpy : training inputs and labels, passed to `model.fit` as x and y.
    model_weights : path handed to ModelCheckpoint; with save_best_only=True the
        file is only overwritten when val_loss improves.
    num_epochs : maximum number of epochs.
    batch_size : mini-batch size.
    cv_fraction : fraction of the data held out for validation
        (passed as `validation_split`). This is a single hold-out split, not
        k-fold cross validation.
    patience : number of epochs without val_loss improvement after which
        EarlyStopping ends the training.

    Returns
    -------
    dict : the `history` attribute of the object returned by `model.fit`.
    '''
    stop_early=callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=patience, verbose=1)
    keep_best=callbacks.ModelCheckpoint(model_weights, save_best_only=True, monitor='val_loss', mode='min')

    history=model.fit(x=inpx, y=inpy,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=1,
                    callbacks=[stop_early, keep_best],
                    validation_split=cv_fraction,
                    shuffle=True
                )

    print("Number of parameters",model.count_params())

    return history.history


def f_plot_learning(history):
    '''
    Plot learning curves: accuracy (top panel) and loss (bottom panel),
    each for the training and the validation set.

    Parameters
    ----------
    history : dict
        Per-epoch series as returned by f_train_model. Must contain 'loss',
        'val_loss' and the accuracy series under either 'acc'/'val_acc' or
        'accuracy'/'val_accuracy'.

    Raises
    ------
    KeyError
        If neither accuracy key (or any other required series) is present.
    '''
    # Depending on the Keras version and the metric string given to compile(),
    # the accuracy series is stored under 'acc' or under 'accuracy'.
    acc_key=next((key for key in ('acc','accuracy') if key in history),None)
    if acc_key is None:
        raise KeyError("history has no accuracy entry ('acc' or 'accuracy'); available keys: %s"%sorted(history))
    val_acc_key='val_'+acc_key

    # One tick every second epoch, shared by both panels
    epoch_ticks=np.arange(0,len(history[acc_key]),2)

    fig=plt.figure()

    # Plot training & validation accuracy values
    ax_acc=fig.add_subplot(2,1,1)
    ax_acc.plot(history[acc_key],label='Train',marker='o')
    ax_acc.plot(history[val_acc_key],label='Validation',marker='*')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.set_xticks(epoch_ticks)
    ax_acc.legend(loc='best')

    # Plot loss values
    ax_loss=fig.add_subplot(2,1,2)
    ax_loss.plot(history['loss'],label='Train',marker='o')
    ax_loss.plot(history['val_loss'],label='Validation',marker='*')
    ax_loss.set_ylabel('Loss')
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_xticks(epoch_ticks)
    ax_loss.legend(loc='best')



def f_plot_roc_curve(fpr,tpr):
    '''
    Draw the ROC curve on a log-scaled false-positive axis and print the AUC.

    fpr, tpr : false- and true-positive rates; they are handed unchanged to
    the plotting calls and to `auc`.
    '''
    fpr_range=[10**-7,1.0]
    tpr_range=[0,1.0]

    plt.figure()
    # Individual points first, then the connecting curve with a log x-axis
    plt.scatter(fpr,tpr)
    plt.semilogx(fpr, tpr)
    # Restrict the visible region
    plt.xlim(fpr_range)
    plt.ylim(tpr_range)
    # Diagonal y=x as the reference line
    diagonal=np.linspace(0,1,num=500)
    plt.plot(diagonal,diagonal)
    plt.show()

    # Area under the curve
    area=auc(fpr, tpr)
    print("AUC: ",area)
    


In [72]:
def f_get_data(data_dir):
    '''
    Function to get data from .npy files into images and labels.

    Parameters
    ----------
    data_dir : str or path-like
        Directory containing 'full_x.npy' (images) and 'full_y.npy' (labels).
        A trailing path separator is optional.

    Returns
    -------
    dict with keys 'images' and 'labels' holding the two loaded arrays.

    Raises
    ------
    SystemExit
        If either file cannot be loaded. The underlying error is printed and
        attached as the cause.
    '''
    # os.path.join keeps the path valid whether or not data_dir ends with a separator
    images_path=os.path.join(data_dir,'full_x.npy')
    labels_path=os.path.join(data_dir,'full_y.npy')

    try:
        images=np.load(images_path)
        labels=np.load(labels_path)
    except Exception as e:
        print(e)
        # A message makes the exit status non-zero when run as a script,
        # and `from e` keeps the original traceback available.
        raise SystemExit("Could not load data from %r"%(data_dir,)) from e

    return {'images':images,'labels':labels}

## Read data

In [73]:
config_file='config_cori.yaml'
config_dict=load_config(config_file)

# f_get_data needs the directory that holds full_x.npy and full_y.npy;
# calling it without an argument raises TypeError.
# TODO(review): './' is a placeholder -- point data_dir at the real dataset location.
data_dir='./'
train_data_dict=f_get_data(data_dir)
train_x,train_y=train_data_dict['images'],train_data_dict['labels']
print(train_x.shape,train_y.shape)

# config_dict['model']['input_shape']=[51,51,1]

Proportion of Signal-Background: 49291-50709.
Proportion of Signal: 0.49291
(100000, 51, 51, 1) (100000,)


### Define and train model

In [80]:
# Compile model
model_name='1'
model_save_dir='saved_data/'
fname_model,fname_history='mdl_{0}_weights.h5'.format(model_name),'history_{0}.pickle'.format(model_name)

# The checkpoint and the history pickle are both stored under model_save_dir
os.makedirs(model_save_dir,exist_ok=True)

model=f_define_model(config_dict,name=model_name)

# Train model. The checkpoint is written to model_save_dir+fname_model, the same
# path that load_model(model_save_dir+fname_model) reads back later on.
history=f_train_model(model,train_x,train_y,model_weights=model_save_dir+fname_model,num_epochs=50)


Train on 67000 samples, validate on 33000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Number of parameters 25089


### Save model and history

In [None]:
# Save the training history (the dict returned by f_train_model) as a pickle.
# The model itself is not saved here: the ModelCheckpoint callback inside
# f_train_model writes it during fit.
# model.save(model_save_dir+fname_model) ### Model is saved automatically inside the fit function
with open(model_save_dir+fname_history, 'wb') as f:
        pickle.dump(history, f)


## Read stored model

In [None]:
# Load the stored model and its training history from model_save_dir.
# This rebinds `model` and `history`, replacing any in-memory objects from training.
model=load_model(model_save_dir+fname_model)
# NOTE: pickle.load can execute arbitrary code -- only open history files you created yourself.
with open(model_save_dir+fname_history,'rb') as f:
    history= pickle.load(f)

In [None]:
# Print the layer-by-layer summary of the loaded model
model.summary()
# Plot the learning curves (accuracy and loss per epoch) from the stored history
f_plot_learning(history)

### Test data

In [None]:
# Unpack test images, labels and per-sample weights.
# NOTE(review): `test_data_dict` is not defined in any cell visible in this notebook,
# and f_get_data only returns 'images' and 'labels' (no 'weights'), so this cell
# raises NameError on a fresh kernel -- TODO add the cell that loads the test set.
test_x,test_y,test_wts=test_data_dict['images'],test_data_dict['labels'],test_data_dict['weights']
print(test_x.shape,test_y.shape,test_wts.shape)

### Predictions and roc curve

In [None]:
# Make predictions on the test images
y_pred=model.predict(test_x,verbose=1)

# ROC curve of the predictions against the test labels, weighted per sample by test_wts
fpr,tpr,threshold=roc_curve(test_y,y_pred,sample_weight=test_wts)
print(fpr.shape,tpr.shape,threshold.shape)
# Plot roc curve and print the AUC
f_plot_roc_curve(fpr,tpr)