## Exotic searches at ATLAS with NN Classification

## Marking

You will get marks for completing the different tasks within this notebook:

Any code that you are expected to complete is marked with `## FINISH ME`; such code is not expected to run until you have completed it.


| <p align='left'> Title                         | <p align='left'> Number of marks |
| -------------------------------------  | --- |
| <p align='left'> Workshop Exercise 1                     | <p align='left'> 2 |
| <p align='left'> Workshop Exercise 2                     | <p align='left'> 2 |
| <p align='left'> Workshop Exercise 3                                   | <p align='left'> 1 |
| <p align='left'> Assessment Exercise 1                                     | <p align='left'> 2 |
| <p align='left'> Assessment Exercise 2                               | <p align='left'> 2 |
| <p align='left'> Assessment Exercise 3                       | <p align='left'> 1 |
| <p align='left'> **Total** | <p align='left'> max **10** |

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
#  The next command should not be necessary, but may be used to force plots to appear inline in the notebook (if they're not showing up)
#  %matplotlib inline

### Workshop Exercise 1: Import, clean, and visualize data

In [None]:
import os    # for loading data from a local source

# Location of the input CSV files, and the files that make up each class.
foldername = 'Files/'
bgd_files = ['Diboson.csv', 'Top.csv', 'Zjets.csv']
sig_files = ['ggH1000.csv']
all_files = bgd_files + sig_files

# One DataFrame per input file, appended in the same order as the lists above.
bgd_df = []
sig_df = []

BYTES_PER_MB = 1024 * 1024
for position, filename in enumerate(all_files):
    path = foldername + filename
    size_mb = os.path.getsize(path) / BYTES_PER_MB
    print('Opening file', filename, 'with size', "{:.1f}".format(size_mb), 'MB:')
    frame = pd.read_csv(path, index_col=0)  # the first CSV column is used as the row index
    # all_files lists the background files first, so the position tells us the class
    destination = bgd_df if position < len(bgd_files) else sig_df
    destination.append(frame)
    print('Done')
    

In [None]:
# Sanity check: show the first 5 rows and first 12 columns of the signal
# sample and of the first background sample.
for preview_df in (sig_df[0], bgd_df[0]):
    print(preview_df.iloc[:5, :12])

In [None]:
# Sanity check: print the per-column count of non-missing (non-NA) entries for the signal
# sample and the three background samples, drop rows with missing values, then print the
# counts again so the effect of the cleaning is visible.
print ("# of entries before cleaning S:\n", sig_df[0].count())
print ("# of entries before cleaning B0:\n", bgd_df[0].count())
print ("# of entries before cleaning B1:\n", bgd_df[1].count())
print ("# of entries before cleaning B2:\n", bgd_df[2].count())
# dropna(inplace=True) modifies the DataFrame stored in the list directly (nothing is returned)
sig_df[0].dropna(inplace = True)
    # presumably the three placeholders below are for cleaning bgd_df[0], bgd_df[1] and bgd_df[2],
    # since their counts are printed "after cleaning" further down
    ## FINISH ME
    ## FINISH ME
    ## FINISH ME
print ("# of entries after cleaning S:\n", sig_df[0].count())
print ("# of entries after cleaning B0:\n", bgd_df[0].count())
print ("# of entries after cleaning B1:\n", bgd_df[1].count())
print ("# of entries after cleaning B2:\n", bgd_df[2].count())

In [None]:
import numpy as np
# List of column names to histogram; one figure is produced per entry.
features = ['lep1_pt', ## FINISH ME using the features from Table 1, excluding the Boolean flag isSignal

# Number of bin edges passed to np.linspace (Nbins points give Nbins - 1 bins)
Nbins = 100
for var in features:
    # adopt a common binning scheme for all channels:
    # the bin edges span the [min, max] range of this variable in the signal sample
    # NOTE(review): background values outside the signal range fall outside these bins -- confirm this is intended
    bins_ = np.linspace(min(sig_df[0][var]), max(sig_df[0][var]), Nbins)
    
    # density=True normalises each histogram, so samples of different sizes can be compared by shape
    plt.hist(bgd_df[0][var], histtype='step', density=True, bins=bins_, label='Dibosons', linewidth=2)
    plt.hist(bgd_df[1][var], ## FINISH ME
       ## FINISH ME: get three background datasets and the one signal dataset all in a single plot, by calling plt.hist four times
    
    
    plt.xlabel(var)
    plt.yscale('log')       # logarithmic y-axis
    plt.legend(loc='best')
    plt.show()              # finish this variable's figure before starting the next one

### Workshop Exercise 2: Create the dataset for the classifier; plot correlations in features to see some differences between signal and background

In [24]:
# Name of the label column, kept as a one-element list so that it can be
# concatenated with the list of feature names when selecting columns.
output = ["isSignal"]

# Display names of the two classes (passed to featureplot as c_names and used
# for the confusion-matrix labels).
wtype = ["Background", "Signal"]

In [None]:
# Stack all background samples into a single DataFrame; ignore_index=True
# discards the per-file row labels and renumbers the rows from 0.
totalBgd_df = pd.concat(bgd_df, ignore_index=True)

# Report how many events each class contains.
print("total # of bgd events =", len(totalBgd_df))
print("total # of sig events =", len(sig_df[0]))

In [None]:
# randomise the background samples
# previously, we randomised (shuffled) the data while constructing pytorch dataloaders; this is a hands-on alternative

from sklearn.utils import shuffle

def Randomise(df, random_seed):
    """Return a row-shuffled copy of ``df`` with a freshly numbered index.

    Parameters
    ----------
    df
        The data to shuffle (a pandas DataFrame in this notebook).
    random_seed
        Forwarded to ``sklearn.utils.shuffle`` as ``random_state`` so that the
        permutation is reproducible.

    Returns
    -------
    The shuffled data. The previous index is dropped rather than being kept
    as an extra column.
    """
    shuffled = shuffle(df, random_state=random_seed)
    return shuffled.reset_index(drop=True)

# Random seed, fixed so that the shuffle is reproducible.
Answer_to_all_questions = 42

# Print the first rows before and after shuffling to confirm that the order changed.
print(totalBgd_df.iloc[:5, :12])
totalBgd_df = Randomise(df=totalBgd_df, random_seed=Answer_to_all_questions)
print(totalBgd_df.iloc[:5, :12])

In [None]:
# Create a new dataset with equal numbers of signal and background events.
# Nsig is the number of rows in the signal sample.
Nsig = sig_df[0].shape[0]
# Two-element list: a slice of the (already shuffled) background starting at row 0, and the full signal sample
sigbgd_tmp = [totalBgd_df[0: ## FINISH ME], sig_df[0]]
# merge these two dataframes into one
sigbgd = pd.concat(  ## FINISH ME
# randomise the new sample with equal #s of signal and background
# (Randomise takes the DataFrame and a random seed, and returns the shuffled, re-indexed DataFrame)
sigbgd = Randomise(  ## FINISH ME
# check out the new dataframe: first rows, total size, and number of events per class
print(sigbgd.head(5))
print ("total # of events =",sigbgd.shape[0])
# signal events are the rows where the isSignal column equals 1
print ("# of signal events in new DF =",len(sigbgd[sigbgd.isSignal == 1]))
print ("# of background events in new DF =",  ## FINISH ME )

In [None]:
# Column names used as classifier inputs.
# Note: this rebinds `features`, replacing the longer list used for the histograms above.
features = ['lep1_pt', ## FINISH ME using just the first eight features from Table 1

# reduce to desired features + output
# (`output` is a list, so `features + output` is the list of feature columns followed by the label column)
dataset = sigbgd[features + output]
print (dataset.shape)   # (number of events, number of features + 1)

In [None]:
# Split the reduced DataFrame into two NumPy arrays:
#   data   -- the feature columns listed in `features`
#   target -- the label column(s) listed in `output` (2-D, because `output` is a list)
feature_frame = dataset[features]
label_frame = dataset[output]
data, target = feature_frame.values, label_frame.values
print(data.shape, target.shape)

In [None]:
from dt_utils import featureplot
N_classes = 2  # one class per entry in wtype
featureplot(
    data,
    target,
    N_classes,
    t_names=features,
    c_names=wtype,
)

# One plot is created per pair of features, i.e. (n choose 2) = n! / (2! (n-2)!) plots:
#   9 features -> 36 plots
#   8 features -> 28 plots

### Workshop Exercise 3: Rescale data and convert to PyTorch tensor

In [39]:
# Feature scaling: standardise the inputs so that the ranges of all features are roughly the same.
# Previously we used torchvision transforms while loading datasets to do something similar;
# scikit-learn's StandardScaler is an alternative.
from sklearn import model_selection, preprocessing

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the train/test split
# performed in the next cell -- confirm that this is acceptable for the exercise.
sc = preprocessing.StandardScaler().fit(data)
data = sc.transform(data)

In [None]:
# Random seed for a reproducible split.
Answer_to_all_questions = 42

# Train-test split of the dataset: test_size=0.3 holds out 30% of the events for testing.
# (The conversion to PyTorch tensors follows further down in this cell.)
_split_parts = model_selection.train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=Answer_to_all_questions,
)
train_data, test_data, train_target, test_target = _split_parts

print(train_data.shape, train_target.shape, test_data.shape, test_target.shape)

import torch

def xNumpyToTensor(array):
    """Convert feature data to a 32-bit float PyTorch tensor.

    Parameters
    ----------
    array
        Anything ``np.array`` accepts (NumPy array, nested list, ...).

    Returns
    -------
    torch.Tensor
        A float32 tensor holding a copy of the input values.
    """
    # np.array makes a float32 copy, so the tensor does not share memory with the caller's data
    as_float32 = np.array(array, dtype=np.float32)
    return torch.from_numpy(as_float32).float()

def yNumpyToTensor(array):
    """Convert label data to a 32-bit float PyTorch tensor.

    The values are first cast to integers (any fractional part is discarded)
    and then stored as floats.

    Parameters
    ----------
    array
        A NumPy array of labels; it must provide ``.astype``.

    Returns
    -------
    torch.Tensor
        A float32 tensor with the same shape as ``array``.
    """
    integer_labels = np.array(array.astype(int))
    return torch.from_numpy(integer_labels).float()

# Convert the four NumPy arrays produced by the train/test split into PyTorch tensors.
# Feature arrays go through xNumpyToTensor; presumably the label arrays are meant to go
# through yNumpyToTensor.
train_data_tensor = xNumpyToTensor(train_data)
train_target_tensor = ## FINISH ME
test_data_tensor = ## FINISH ME
test_target_tensor = ## FINISH ME


### Assessment Exercise 1: construct and train the NN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, log_loss

In [45]:
# Number of iterations of the training loop, and width of each hidden layer.
N_epochs, num_nodes = 200, 40

# Network input and output sizes: 8 (or 9) input features, 1 output.
num_inputs, num_outputs = 8, 1

In [None]:
# Dropout layer with drop probability 0.2; this single instance is reused in the model below.
dropout = nn.Dropout(p=0.2)     # see https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html

# Define a PyTorch NN with two hidden layers, each of width num_nodes
# so num_inputs -> num_nodes -> num_nodes -> num_outputs
# with activation function ReLU and dropout after the first two transformations, and a final Sigmoid activation function
# altogether:  8 (or 9) -> 40, ReLU, dropout -> 40, ReLU, dropout -> 1, Sigmoid
# (Think: why a final Sigmoid function?)

def my_model(num_inputs, num_nodes):
    """Build and return the classifier as an ``nn.Sequential``.

    Parameters
    ----------
    num_inputs
        Number of input features (size of the first linear layer's input).
    num_nodes
        Width of the hidden layers.

    Returns
    -------
    The assembled ``nn.Sequential`` model.
    """
    model = nn.Sequential(
		nn.Linear( ## FINISH ME ), nn.ReLU(), dropout,
		nn.Linear( ## FINISH ME ), ## FINISH ME
		nn.Linear( ## FINISH ME ), ## FINISH ME
	)
    return model

In [46]:
# Instantiate the network with the sizes chosen in the hyper-parameter cell.
model = my_model(num_inputs=num_inputs, num_nodes=num_nodes)

In [None]:


# Train the model
loss_fn   = nn.BCELoss()  # binary cross entropy
optimizer = optim.Adam(model.parameters(), lr=0.001)
all_losses = []             # training loss, one entry per epoch
all_valid_accuracies = []   # accuracy on the test split, one entry per epoch

# Training loop: each iteration passes the whole training tensor through the model at once
# (a single full-batch update per epoch; there are no mini-batches here)
for step in range(N_epochs):    
    model.train()           # training mode (dropout active)
    out = model(train_data_tensor)                 # input x and predict based on x
    cost = loss_fn(out,   ## FINISH ME
    optimizer.zero_grad()   # clear the gradients left over from the previous step
    cost.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients 

    # progress is printed every `step_size` epochs
    step_size = 10
    
    loss = cost.item()
    all_losses.append(loss)
    if step % step_size == 0:        
        print(step, cost.data.cpu().numpy())

    # Evaluate on the test split after every epoch; these are the numbers reported as "validation" below
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():   # no gradients are needed for evaluation
        prediction = model(test_data_tensor)  # probabilities
        predicted_labels = (prediction > 0.5).float()  # Convert probabilities to 0 or 1
        actual_labels = test_target_tensor
        # fraction of test events whose predicted label equals the true label
        accuracy = (predicted_labels.eq(actual_labels).sum() / float(actual_labels.nelement())).item()  # Calculate the accuracy
        all_valid_accuracies.append(accuracy)
        if step % step_size == 0:        
            print('Validation accuracy: {:.1f}%'.format(accuracy * 100))
        
    # A CUDA tensor cannot be converted to numpy directly
    # ("RuntimeError: can't convert CUDA tensor to numpy"), so .cpu() is used
    # to move the tensor to host memory first.
    # Earlier variant, kept for reference:
    ####prediction = (model(test_data_tensor).data).float() # probabilities                  
    # pred_y / target_y keep the values from the most recent epoch after the loop ends
    pred_y = prediction.cpu().numpy().squeeze()
    target_y = test_target_tensor.cpu().data.numpy()
    if step % step_size == 0:        
        print ('LOG_LOSS={} '.format(log_loss(target_y, pred_y))) 



#### Now plot the losses (all_losses) and the accuracies (all_valid_accuracies) across all training epochs.

In [None]:
##  FINISH ME


### Assessment Exercise 2: Improve the NN

In [None]:
# Your code here!   ## FINISH ME
# Try to get better than 95% accuracy.



### Assessment Exercise 3: Visualise what's happening

In [None]:
# Plot a confusion matrix

import seaborn as sns
# confusion_matrix is imported from sklearn.metrics in the Assessment Exercise 1 import cell
cm = confusion_matrix( ## FINISH ME )
# NOTE(review): `nn_utils` is not imported in any of the cells shown in this notebook;
# it must be imported before this cell can run (or use the sns.heatmap alternative below).
nn_utils.heatmap(cm, labels=['Predicted', 'True'], 
        classes=[wtype,wtype],
        normalize=True)
# Alternative using seaborn directly:
#sns.heatmap(cm, annot=True)
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
# Receiver operating characteristic (ROC): true-positive rate against false-positive rate
# as the decision threshold on the network output is varied.
# pred_y holds the test-set predictions left over from the last epoch of the training loop.
fpr, tpr, thresholds = roc_curve(test_target, pred_y)

# Compute the "Area Under the Curve" (AUC) from the ROC points
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=2, label='Area under curve: %0.2f' % roc_auc)
# Reference diagonal from (0, 0) to (1, 1), i.e. TPR == FPR (no discrimination).
# The x and y sequences must both be [0, 1]; [0, 0] / [1, 1] would collapse to the single point (0, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.ylabel('True Positive Rate (Signal efficiency)')
plt.xlabel('False Positive Rate (Background efficiency)')
plt.title('ROC curve: Higgs signal vs. SM background')
plt.legend(loc="best")
plt.show()