### Required Modules

In [None]:
import numpy as np 
import pandas as pd
import h5py as h5
import seaborn as sns
import tensorflow as tf
import sys

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
import pickle

In [None]:
# Pull the network inputs and the target outputs out of the HDF5 database
with h5.File('../qlk_jetexp_nn_training_database_minimal.h5', "r") as f:
    # Inputs to the neural network
    inputs = f['input/block0_values'][()]
    input_names = f['input/block0_items'][()]
    # Row number from 0 to len(inputs)
    index_inp = f['input/axis1'][()]

    # Target outputs for the neural network
    outputs = f['output/block0_values'][()]
    output_names = f['output/block0_items'][()]
    # Row number from 0 to len(inputs), with some rows missing
    index_out = f['output/axis1'][()]
        

In [None]:
# Wrap the raw arrays in DataFrames: rows labelled by the stored index,
# columns labelled by the stored item names
df_in = pd.DataFrame(data=inputs, index=index_inp, columns=input_names)
df_out = pd.DataFrame(data=outputs, index=index_out, columns=output_names)

### Load Data 

In [None]:
# Training split: every column except the last is a feature, the last is the target
train_data = pd.read_pickle("/share/rcifdata/jbarr/UKAEAGroupProject/data/train_data.pkl")
X_train = train_data.iloc[:, :-1].to_numpy()
Y_train = train_data.iloc[:, -1].to_numpy()

# Validation split, sliced the same way as the training split
validation_data = pd.read_pickle("/share/rcifdata/jbarr/UKAEAGroupProject/data/validation_data.pkl")
X_val = validation_data.iloc[:, :-1].to_numpy()
Y_val = validation_data.iloc[:, -1].to_numpy()

In [None]:
# Standardise the features; the scaler is fitted on the training features only
# and then applied unchanged to the validation features
scaler = StandardScaler().fit(X_train)
x_train = scaler.transform(X_train)
x_val = scaler.transform(X_val)


### Grid Search: Network Depth and Node Number

In [None]:
# Search space for the grid search: candidate widths for each hidden layer
# and candidate numbers of hidden layers
parameters = dict(
    nodes=[5, 10, 20, 30],
    layers=[2, 3, 4],
)

In [None]:
def grid_search(build_fn, parameters, train_data, val_data):
    '''
    Train and evaluate one model for every combination of network depth and
    per-layer width, and record the results.

    Inputs:
        build_fn: callable ``build_fn(n_layers, nodes)`` returning an
            uncompiled model that exposes ``compile``, ``fit`` and ``evaluate``
        parameters: a dictionary of model parameters with the keys
            'layers' (iterable of hidden-layer counts) and
            'nodes' (candidate widths tried for each hidden layer)
        train_data: tuple ``(x_train, y_train)`` passed to ``model.fit``
        val_data: tuple ``(x_val, y_val)`` passed to ``model.evaluate``

    Returns:
        A dictionary mapping 'trial<k>' to a record with the keys 'layers',
        'nodes', 'history' and 'perfomance' (the value returned by
        ``model.evaluate``). The key 'best_model' holds the record with the
        lowest validation loss; it is only present once some trial produced
        a loss below ``sys.float_info.max``.
    '''
    x_train, y_train = train_data
    x_val, y_val = val_data

    results_dict = {}
    counter = 0
    best_val_loss = sys.float_info.max

    for n_layers in parameters['layers']:
        # Every ordered assignment of a candidate width to each hidden layer:
        # one row per combination, one column per layer
        per_layer_choices = [parameters['nodes']] * n_layers
        combs = np.array(np.meshgrid(*per_layer_choices)).T.reshape(-1, n_layers)

        for node in combs:
            model = build_fn(n_layers, node)

            # `metrics` is passed as a list, the form Keras documents
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

            history = model.fit(x_train, y_train, batch_size=4096, epochs=25)

            evaluate = model.evaluate(x_val, y_val, batch_size=4096)

            trial_dict = {
                'layers': n_layers,
                'nodes': node,
                'history': history,
                'perfomance': evaluate
            }

            # With a metric compiled in, evaluate() yields [loss, metric];
            # model selection uses the loss (index 0), not the accuracy
            val_loss = evaluate[0]
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                results_dict['best_model'] = trial_dict

            results_dict['trial' + str(counter)] = trial_dict
            counter += 1

    return results_dict

In [None]:
def build_classifier(n_layers,nodes):
    '''
    Assemble a feed-forward binary classifier.

    Inputs:
        n_layers: number of hidden layers
        nodes: indexable of hidden-layer widths; nodes[k] is the width of
            hidden layer k

    Returns an uncompiled tf.keras.Sequential model: n_layers ReLU Dense
    layers followed by a single sigmoid unit.
    '''
    # Hidden stack, one ReLU layer per requested depth level
    hidden_stack = [
        tf.keras.layers.Dense(nodes[depth], activation='relu')
        for depth in range(n_layers)
    ]

    # Final classifier layer: one sigmoid unit
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

    return tf.keras.Sequential(hidden_stack + [output_layer])

###  Best Neural Network Classifier

In [None]:
def nn_classifier():
    '''
    Build the fixed classifier: two ReLU hidden layers (30 and 10 units)
    followed by a single sigmoid output unit. The model is returned
    uncompiled.
    '''
    model = tf.keras.Sequential()
    for width in (30, 10):
        model.add(tf.keras.layers.Dense(width, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    return model

In [None]:
# Instantiate the fixed architecture returned by nn_classifier()
model = nn_classifier()

In [None]:
# Adam optimiser with binary cross-entropy loss; accuracy is tracked under the
# name 'acc'. `metrics` is given as a list, the form Keras documents
# (a bare string is rejected by newer Keras releases).
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Train on the scaled training features, reporting loss and accuracy on the
# scaled validation set after every epoch
history = model.fit(
    x_train,
    Y_train,
    validation_data=(x_val, Y_val),
    batch_size=4096,
    epochs=25,
)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training vs. validation accuracy, one point per epoch
plt.figure()
for metric_key, legend_label in (('acc', 'Train acc'), ('val_acc', 'Val acc')):
    plt.plot(history.history[metric_key], 'o', label=legend_label)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

In [None]:
# Training vs. validation loss, one point per epoch
plt.figure()
for metric_key, legend_label in (('loss', 'Train loss'), ('val_loss', 'Val loss')):
    plt.plot(history.history[metric_key], 'o', label=legend_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

### Evaluate Model on Test Set

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# Held-out test split: every column except the last is a feature, the last is the target.
# NOTE(review): unlike the train/validation pickles, this path has no "data/"
# component - confirm the location is intentional.
test_data = pd.read_pickle("/share/rcifdata/jbarr/UKAEAGroupProject/test_data.pkl")
X_test = test_data.iloc[:, :-1].to_numpy()
Y_test = test_data.iloc[:, -1].to_numpy()

In [None]:
# Apply the scaler fitted on the training features to the test features
x_test = scaler.transform(X_test)

In [None]:
# Scores predicted by the trained model for every test sample
predictions = model.predict(x_test)

# False/true positive rates and the thresholds at which they were evaluated
fpr, tpr, thresholds = roc_curve(Y_test, predictions)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the predicted scores against the test targets
auc = roc_auc_score(Y_test, predictions)

In [None]:
# ROC curve of the classifier on the test set
plt.figure()
plt.plot(fpr, tpr)
# Dashed diagonal y = x as the reference line for a random classifier
random_class = np.arange(0,1,0.005)
plt.plot(random_class, random_class, '--')
plt.xlabel('False Positive Rate')
# The y-axis shows tpr, so it is labelled as the true positive rate
plt.ylabel('True Positive Rate')
plt.text(0.8, 0.2, f'auc = {auc: .2f}', fontsize=10)

### Distributions from classifier

In [None]:
# Round the predicted scores to the nearest integer and collapse to a 1-D vector
preds = np.round(predictions).reshape(-1)

In [None]:
# Partition the scaled test samples by the rounded prediction (0 or 1)
no_output = x_test[preds == 0]
yes_output = x_test[preds == 1]
# Sanity check: every test sample landed in exactly one of the two groups
assert len(no_output) + len(yes_output) == len(x_test)

In [None]:
# Feature names: every column label of the training frame except the last
columns = list(train_data.columns[:-1])
print(columns)

In [None]:
# One figure per feature: normalised histograms of the scaled feature for the
# samples predicted 0 and the samples predicted 1
for i, column in enumerate(columns):
    plt.figure()
    for subset, colour, legend_label in (
        (no_output, 'lime', " No output"),
        (yes_output, 'purple', "Output"),
    ):
        plt.hist(subset[:, i], histtype='step', color=colour, label=legend_label, density=True)
    plt.legend()
    plt.xlabel(column)