<a href="https://colab.research.google.com/github/yanncoadou/MLtutorials/blob/main/AstroInfo2021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>AstroInfo 2021 Machine learning hands-on</h1>

# Standard imports and practical functions

In [None]:
# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_circles
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, RocCurveDisplay
try:
  # legacy plotting helpers, removed in scikit-learn 1.2
  from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
except ImportError:
  # keep the names defined so the rest of the cell still runs on recent releases
  plot_confusion_matrix = plot_roc_curve = None

%matplotlib inline
import seaborn as sns # seaborn for nice plots
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
np.random.seed(31415) # set the np random seed for reproducibility

### Function to plot decision contours

In [None]:
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

def my_plot_decision_regions(model, X, y, alpha=1.0, size=25, npts=10000, zoom=0.25, event5=False):
  """Plot the response of a fitted classifier over a 2D feature plane, with the labelled points on top.

  The plane is probed at `npts` points drawn uniformly at random (global NumPy
  random state); the model response at these points is drawn as filled contours
  on 20 levels spanning [0, 1].

  Parameters
  ----------
  model : fitted classifier; `predict_proba` is used when available, `predict` otherwise
  X : array indexed as X[:, 0] and X[:, 1] (the two plotted features)
  y : array of labels aligned with X; entries equal to 0 and to 1 are drawn
  alpha, size : transparency and marker size of the scattered points
  npts : number of random probe points
  zoom : margin added on each side of the data range
  event5 : if True, highlight the row X[4] with a large cross

  Returns nothing; the figure is displayed with plt.show().
  """
  # plotting window: data range widened by `zoom` on each side
  x1min = X[:,0].min() - zoom
  x1max = X[:,0].max() + zoom

  x2min = X[:,1].min() - zoom
  x2max = X[:,1].max() + zoom

  # random probe points inside the window
  x1 = np.random.uniform(x1min, x1max, npts)
  x2 = np.random.uniform(x2min, x2max, npts)
  probe = np.vstack((x1,x2)).T

  if hasattr(model, "predict_proba"):
    z = model.predict_proba(probe)
  else:
    z = model.predict(probe)

  # reduce a 2D output to one value per point: the single column, or the second of two columns
  if len(z.shape) == 2:
    if z.shape[1] == 1:
      z = z.reshape(-1)
    elif z.shape[1] == 2:
      z = z[:,1].reshape(-1)

  fig, ax = plt.subplots()

  # plt.get_cmap replaces matplotlib.cm.get_cmap (deprecated in Matplotlib 3.7, removed in 3.9)
  bottom = plt.get_cmap('Oranges', 128)
  top = plt.get_cmap('Blues_r', 128)

  # diverging map: blue half for low response, orange half for high response
  newcolors = np.vstack((top(np.linspace(0, 1, 128+128)[-128:]),
                        bottom(np.linspace(0, 1, 128+128)[:128])))
  newcmp = ListedColormap(newcolors, name='OrangeBlue')

  # 20 levels over [0, 1], widened by one machine epsilon so that exact 0 and 1 fall inside
  eps = np.finfo(float).eps
  levels = np.linspace(0.0-eps, 1.0+eps, 20, True)

  ax.tricontour(x1, x2, z, levels=levels, linewidths=0.1, colors='k', antialiased=True)
  cntr = ax.tricontourf(x1, x2, z, levels=levels, cmap=newcmp)
  ax.scatter(X[y==0][:,0], X[y==0][:,1], alpha=alpha, s=size, c="#1f77b4", marker="s", edgecolors="k", linewidths=0.5)
  ax.scatter(X[y==1][:,0], X[y==1][:,1], alpha=alpha, s=size, c="#ff7f0e",  marker="^", edgecolors="k", linewidths=0.5)
  if event5: # showing particular swinger event
    ax.scatter(X[4][0], X[4][1], alpha=1, s=size*10, c="lightgreen",  marker="X", edgecolors="k", linewidths=1)
  fig.colorbar(cntr, ax=ax)
  # ax.set(xlim=(x1min, x1max), ylim=(x2min, x2max))

  plt.show()

### Function to plot ROC curve

In [None]:
def my_plot_roc_curve(model, X_test, y_test):
  """Draw the ROC curve of a fitted classifier on a test sample.

  Parameters
  ----------
  model : fitted classifier; `predict_proba` is used when available, `predict` otherwise
  X_test : inputs passed to the model
  y_test : true labels, passed to roc_curve / roc_auc_score

  The legend entry uses the model's class name and the AUC. Returns nothing;
  the figure is displayed with plt.show().
  """
  if hasattr(model, "predict_proba"):
    y_scores = model.predict_proba(X_test)
  else:
    y_scores = model.predict(X_test)

  # reduce a 2D output to one score per event: the single column, or the second of two columns
  if len(y_scores.shape) == 2:
    if y_scores.shape[1] == 1:
      y_scores = y_scores.reshape(-1)
    elif y_scores.shape[1] == 2:
      y_scores = y_scores[:,1].reshape(-1)
  fpr, tpr, _ = roc_curve(y_test, y_scores)
  roc_auc = roc_auc_score(y_test, y_scores)

  # Draw on explicit axes: calling plt.clf() and then letting RocCurveDisplay open
  # its own figure left an extra, empty figure behind.
  fig, ax = plt.subplots()
  # not named `display`, to avoid shadowing IPython's display() inside the function
  roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=model.__class__.__name__)
  roc_display.plot(ax=ax)
  ax.plot([0, 1], [0, 1], color='black', linestyle='--')  # diagonal reference line
  plt.show()

# Defining datasets

In [None]:
# Two noisy ring samples stacked together: rows of X are (x, y) coordinates, y is the class
X1, y1 = make_circles(n_samples=1000, noise=0.1, factor=0.8)
X2, y2 = make_circles(n_samples=1000, noise=0.2, factor=0.2)
X = np.concatenate((X1, X2/2), axis=0)  # second sample shrunk by a factor 2
y = np.concatenate((y1, y2))

# Half of the points for training, the other half for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y);

# Classifier zoo

## Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtc = DecisionTreeClassifier()
display(dtc.get_params())

In [None]:
dtc.fit(X_train, y_train)

In [None]:
from sklearn.tree import plot_tree
plt.figure(figsize=(15,10))
plot_tree(dtc)
plt.show();


In [None]:
accuracy_score(y_test, dtc.predict(X_test))

Access to results:
- `predict` returns the class (0 or 1 if binary classifier)
- `predict_proba` returns the probability of each class



In [None]:
print("predict: \n",dtc.predict(X_test[:5]))
print("predict_proba: \n",dtc.predict_proba(X_test[:5]))

In [None]:
try:
  from mlxtend.plotting import plot_decision_regions
except ImportError as e:
  !pip install mlxtend
  from mlxtend.plotting import plot_decision_regions

In [None]:
# practical but limited contour-plotting function
plot_decision_regions(X_test, y_test, dtc);

In [None]:
# defined at top of notebook
# can use class (0 or 1) or class probability when available
my_plot_decision_regions(dtc, X_test, y_test)

In [None]:
my_plot_roc_curve(dtc, X_test, y_test)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
#abc = AdaBoostClassifier()
# The base learner is passed positionally: its keyword was `base_estimator` up to
# scikit-learn 1.1 and is `estimator` from 1.2 on (old spelling removed in 1.4),
# while the first positional slot is the base learner in all of these versions.
abc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4), n_estimators=100)
display(abc.get_params())

In [None]:
abc.fit(X_train, y_train)

In [None]:
print("predict: \n",abc.predict(X_test[:5]))
print("predict_proba: \n",abc.predict_proba(X_test[:5]))

In [None]:
my_plot_decision_regions(abc, X_test, y_test)
my_plot_roc_curve(abc, X_test, y_test)

## Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbc = GradientBoostingClassifier(n_estimators=400,verbose=1)
display(gbc.get_params())

In [None]:
gbc.fit(X_train, y_train)

In [None]:
print("predict: \n",gbc.predict(X_test[:5]))
print("predict_proba: \n",gbc.predict_proba(X_test[:5]))

In [None]:
my_plot_decision_regions(gbc, X_test, y_test, event5=True)
my_plot_roc_curve(gbc, X_test, y_test)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=400,verbose=1)
display(rfc.get_params())

In [None]:
rfc.fit(X_train, y_train)

In [None]:
print("predict: \n",rfc.predict(X_test[:5]))
print("predict_proba: \n",rfc.predict_proba(X_test[:5]))

In [None]:
my_plot_decision_regions(rfc, X_test, y_test)
my_plot_roc_curve(rfc, X_test, y_test)

## Comparison

In [None]:
# Class-1 score of each tree-based classifier on the test sample
y_preds_dtc, y_preds_abc, y_preds_gbc, y_preds_rfc = (
    clf.predict_proba(X_test)[:,1].reshape(-1) for clf in (dtc, abc, gbc, rfc))

# ROC working points (thresholds are discarded) and area under each curve
(fpr_dtc, tpr_dtc, _), (fpr_abc, tpr_abc, _), (fpr_gbc, tpr_gbc, _), (fpr_rfc, tpr_rfc, _) = (
    roc_curve(y_true=y_test, y_score=scores)
    for scores in (y_preds_dtc, y_preds_abc, y_preds_gbc, y_preds_rfc))
auc_test_dtc, auc_test_abc, auc_test_gbc, auc_test_rfc = (
    roc_auc_score(y_true=y_test, y_score=scores)
    for scores in (y_preds_dtc, y_preds_abc, y_preds_gbc, y_preds_rfc))

# One curve per classifier, labelled with its class name and rounded AUC
for clf, fpr, tpr, auc, colour in (
    (dtc, fpr_dtc, tpr_dtc, auc_test_dtc, 'darkblue'),
    (abc, fpr_abc, tpr_abc, auc_test_abc, 'darkred'),
    (gbc, fpr_gbc, tpr_gbc, auc_test_gbc, 'darkgreen'),
    (rfc, fpr_rfc, tpr_rfc, auc_test_rfc, 'darkorange')):
  plt.plot(fpr, tpr, color=colour, label='{} (AUC  = {})'.format(clf.__class__.__name__, np.round(auc, decimals=2)))

plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right");


## Other classifiers
We have only seen tree-based classifiers from scikit-learn above. There are many more types of classifiers:

1.   implemented in scikit-learn: see the [user's guide](https://scikit-learn.org/stable/user_guide.html) for A LOT of different algorithms
2.   in various other packages:
- for decision trees: [XGBoost](https://xgboost.readthedocs.io/en/stable/), [LightGBM](https://lightgbm.readthedocs.io/en/latest/), [CatBoost](https://catboost.ai/)

- for neural networks: [TensorFlow](https://www.tensorflow.org/), [PyTorch](https://pytorch.org/)


### XGBoost

In [None]:
# preinstalled version 0.9.0 20211119
!pip install xgboost --upgrade # install 1.5.0 20211119

In [None]:
from xgboost import XGBClassifier
# tree_method="hist" is 10 times faster, however less robust against awkwards features (not a bad idea to double check without it)
# can even try tree_method="gpu_hist" if proper GPU installation
# use_label_encoder and eval_metric to silence warning in >1.3.0
xgb = XGBClassifier(tree_method="hist",use_label_encoder=False,eval_metric='logloss')

xgb.fit(X_train, y_train) # note that XGB 1.3.X requires positive weights



In [None]:
print("predict: \n",xgb.predict(X_test[:5]))
print("predict_proba: \n",xgb.predict_proba(X_test[:5]))

In [None]:
# XGBoost class-1 score, ROC points and AUC on the test sample
y_preds_xgb = xgb.predict_proba(X_test)[:,1].reshape(-1)
fpr_xgb, tpr_xgb, _ = roc_curve(y_true=y_test, y_score=y_preds_xgb)
auc_test_xgb = roc_auc_score(y_true=y_test, y_score=y_preds_xgb)

# Overlay on the random-forest curve computed in the comparison cell
for fpr, tpr, colour, name, auc in (
    (fpr_rfc, tpr_rfc, 'darkorange', rfc.__class__.__name__, auc_test_rfc),
    (fpr_xgb, tpr_xgb, 'purple', "XGBoost", auc_test_xgb)):
  plt.plot(fpr, tpr, color=colour, label='{} (AUC  = {})'.format(name, np.round(auc, decimals=2)))

plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right");

### LightGBM

In [None]:
# preinstalled version 2.2.3 20211119
!pip install lightgbm --upgrade # install 3.3.1 20211119
import lightgbm as lgb

In [None]:
gbm = lgb.LGBMClassifier()
gbm.fit(X_train, y_train)

In [None]:
print("predict: \n",gbm.predict(X_test[:5]))
print("predict_proba: \n",gbm.predict_proba(X_test[:5]))

In [None]:
# LightGBM class-1 score, ROC points and AUC on the test sample
y_preds_gbm = gbm.predict_proba(X_test)[:,1].reshape(-1)
fpr_gbm, tpr_gbm, _ = roc_curve(y_true=y_test, y_score=y_preds_gbm)
auc_test_gbm = roc_auc_score(y_true=y_test, y_score=y_preds_gbm)

# Overlay on the random-forest and XGBoost curves computed in earlier cells
for fpr, tpr, colour, name, auc in (
    (fpr_rfc, tpr_rfc, 'darkorange', rfc.__class__.__name__, auc_test_rfc),
    (fpr_xgb, tpr_xgb, 'purple', "XGBoost", auc_test_xgb),
    (fpr_gbm, tpr_gbm, 'darkgreen', "LightGBM", auc_test_gbm)):
  plt.plot(fpr, tpr, color=colour, label='{} (AUC  = {})'.format(name, np.round(auc, decimals=2)))

plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right");

### CatBoost

In [None]:
# not preinstalled 20211119
!pip install catboost # install 1.0.3 20211119
import catboost

In [None]:
cat = catboost.CatBoostClassifier()
cat.fit(X_train, y_train)

In [None]:
print("predict: \n",cat.predict(X_test[:5]))
print("predict_proba: \n",cat.predict_proba(X_test[:5]))

In [None]:
# CatBoost class-1 score, ROC points and AUC on the test sample
y_preds_cat = cat.predict_proba(X_test)[:,1].reshape(-1)
fpr_cat, tpr_cat, _ = roc_curve(y_true=y_test, y_score=y_preds_cat)
auc_test_cat = roc_auc_score(y_true=y_test, y_score=y_preds_cat)

# Overlay on the random-forest, XGBoost and LightGBM curves computed in earlier cells
for fpr, tpr, colour, name, auc in (
    (fpr_rfc, tpr_rfc, 'darkorange', rfc.__class__.__name__, auc_test_rfc),
    (fpr_xgb, tpr_xgb, 'purple', "XGBoost", auc_test_xgb),
    (fpr_gbm, tpr_gbm, 'darkgreen', "LightGBM", auc_test_gbm),
    (fpr_cat, tpr_cat, 'red', "CatBoost", auc_test_cat)):
  plt.plot(fpr, tpr, color=colour, label='{} (AUC  = {})'.format(name, np.round(auc, decimals=2)))

plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right");

# Cosmology application

In [None]:
# Detect whether the notebook runs on Google Colab: the google.colab package can
# only be imported there. Only ImportError is caught, so that any other failure
# (including a keyboard interrupt) is not silently turned into COLAB = False.
try:
  import google.colab
  COLAB = True # if running in COLAB
except ImportError:
  COLAB = False # if not running on COLAB

## Input dataset

Setting up access to data files on Google Drive

### Mount Drive

If you followed pre-AstroInfo instructions, you have already validated access to the folder. If not: before mounting your Google Drive click on [this folder](https://drive.google.com/drive/folders/1PcftgBzBySo1Ync-Wdsp9arTCJ_MfEPE?usp=sharing) and add it to your Google Drive by following these steps:

*   Go to your [Drive](https://drive.google.com)
*   Find shared folder ("Shared with me" link)
*   Right click on it
*   Click Add to My Drive



In [None]:
if COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
  pathinData="/content/drive/My Drive/EDE21/morphology"
else:
  # you have to make sure to get the input files locally
  # files of interest for this test, from EDE21/morphology in the Drive folder reported above:
  #   feature_E_S_large.npy
  #   label_E_S_large.npy
  # pathinData must be the DIRECTORY holding them: the file names are appended when loading
  pathinData="/directory/where/you/stored/the/files"

### Load dataset

In [None]:
import numpy as np
# load the feature vectors from disk (first axis = galaxies, see the print below)
X_ML = np.load(pathinData+'/feature_E_S_large.npy')
# morphological class
Y_ML = np.load(pathinData+'/label_E_S_large.npy')
print ("\nFiles loaded with",X_ML.shape[0], "galaxies ")

If accessing the file properly, you should now see:

`Files loaded with 11489 galaxies`

### Switching from numpy to pandas

In [None]:
import pandas as pd
columns=['colour','mass','SersicIndex', 'VelocityDispersion','AxisRatio']
dfall=pd.DataFrame(X_ML,columns=columns)

### Checking the content

In [None]:
#dumping list of features
dfall.columns

In [None]:
#examining first few galaxies
display(dfall.head())

In [None]:
#examining feature distributions
dfall.describe()

## Event selection

Only keep events with exactly two leptons for this exercise.

Only keep events with positive weight, as many ML tools choke on negative weight.

*Note: This is in principle WRONG, only valid if your positive and negative weight events are statistically similar (could then also take the absolute value of the weight to increase statistics).*


In [None]:
print ("Df shape before selection:", dfall.shape)

fulldata=dfall[ (dfall.lep_n==2) & (dfall.mcWeight > 0)]  

print ("Df shape after selection: ",fulldata.shape)

In [None]:
# Hide label and weights in separate vectors (not discriminating features)
# WARNING : there should be neither selection nor shuffling later on! (otherwise misalignement)
target = fulldata["label"]
weights = fulldata["mcWeight"]

# for simplicity only keep some features
# this is actually making a deep copy from fulldata
data=pd.DataFrame(fulldata, columns=["met_et","met_phi","lep_pt_0","lep_pt_1",'lep_phi_0', 'lep_phi_1'])
#data=pd.DataFrame(fulldata, columns=["met_et","met_phi","lep_pt_0","lep_pt_1",'lep_eta_0', 'lep_eta_1', 'lep_phi_0', 'lep_phi_1','jet_n','jet_pt_0',
#       'jet_pt_1', 'jet_eta_0', 'jet_eta_1', 'jet_phi_0', 'jet_phi_1']

print ("Df shape of dataset to be used:",data.shape)

### Feature engineering

Add more complex variables to the dataset.

*Do this later if time permits.*

In [None]:
# Disabled by default; switch the condition to True to add the engineered feature.
# NOTE(review): needs 'lep_phi_0' and 'lep_phi_1' columns in `data`, which are not among
# the galaxy features used to build dfall — confirm before enabling.
if False: 
    # absolute difference of the two angles, wrapped into [0, pi]
    data["lep_deltaphi"]=np.abs(np.mod(data.lep_phi_1-data.lep_phi_0+3*np.pi,2*np.pi)-np.pi)

    print (data.shape)
    display(data.head())



### Plotting variables

In [None]:
# Column names are read from `data` itself rather than hard-coded, so the cell
# works with whatever feature set was selected above.
feat_main, feat_cut = data.columns[:2]
fig,ax=plt.subplots(1, 2, figsize=(12, 5))
# left: full distribution of the first feature (log scale)
data[feat_main].plot.hist(title=feat_main, log=True, ax=ax[0])
# right: same feature, restricted to entries above the median of the second feature
above_median = data[feat_cut] > data[feat_cut].median()
data.loc[above_median, feat_main].plot.hist(bins=50, title='{} for large {}'.format(feat_main, feat_cut), ax=ax[1]);

In [None]:
# Scatter of the first two features of `data` (names read from the frame, not hard-coded),
# class 0 labelled "B" in blue and class 1 labelled "S" in red on the same axes
feat_x, feat_y = data.columns[:2]
ax=data[target==0].plot.scatter(x=feat_x, y=feat_y,color="b",label="B")
data[target==1].plot.scatter(x=feat_x, y=feat_y,color="r",label="S",alpha=.5,ax=ax);

In [None]:
# First rows among the top 1% of the first feature (column name read from `data`)
lead_feat = data.columns[0]
data[data[lead_feat] > data[lead_feat].quantile(0.99)].head()

In [None]:
ax=data[target==0].hist(weights=weights[target==0],figsize=(15,12),bins=50,color='b',alpha=0.5,density=True,label="B")
ax=ax.flatten()[:data.shape[1]] # to avoid error if holes in the grid of plots (like if 7 or 8 features)
data[target==1].hist(weights=weights[target==1],figsize=(15,12),bins=50,color='r',alpha=0.5,density=True,ax=ax,label="S");


### Features correlation matrix

In [None]:
fig,ax=plt.subplots(1, 2, figsize=(12, 5))

# One annotated heatmap per class: target 0 (background) on the left, target 1 (signal) on the right
for panel, (cls, cls_name) in zip(ax, ((0, "Background"), (1, "Signal"))):
  corrMatrix = data[target==cls].corr()
  panel.set_title(cls_name + " features correlation matrix")
  sns.heatmap(corrMatrix.round(3), ax=panel, annot=True)


## Sample splitting

In [None]:
np.random.seed(31415) # set the random seed (used for the train/test splitting)

from sklearn.model_selection import train_test_split
train_size = 0.75 # fraction of the full sample kept for training+validation (the rest is the test sample)
val_size = 0.2 # fraction of the training+validation part used for validation

# split only train/test
#X_train, X_test, y_train, y_test, weights_train, weights_test = \
#    train_test_split(data, target, weights, train_size=train_size)

#split in train/validation/test
# first split: (train+validation) vs test; second split: train vs validation
X_holdout, X_test, y_holdout, y_test, weights_holdout, weights_test = \
    train_test_split(data, target, weights, train_size=train_size)
X_train, X_val, y_train, y_val, weights_train, weights_val = \
    train_test_split(X_holdout, y_holdout, weights_holdout, train_size=1-val_size)

print("Training sample:  ", X_train.shape)
print("Validation sample:", X_val.shape)
print("Testing sample:   ", X_test.shape)

# total training weight of class 0 and of class 1, computed BEFORE any rescaling
class_weights_train = (weights_train[y_train == 0].sum(), weights_train[y_train == 1].sum())
print ("class_weights_train (B, S):",class_weights_train)

# Rescale the weights in place, one class at a time.
# Not idempotent: running this cell a second time without redoing the split applies the factors again.
for i in range(len(class_weights_train)):
    weights_train[y_train == i] *= max(class_weights_train)/ class_weights_train[i] #equalize number of background and signal event
    weights_test[y_test == i] *= 1/(1-train_size) # increase test weight to compensate for sampling
    weights_val[y_val == i] *= 1/val_size/train_size # increase val weight to compensate for samplings
    
print ("Test:  total weight sig", weights_test[y_test == 1].sum())
print ("Test:  total weight bkg", weights_test[y_test == 0].sum())
print ("Train: total weight sig", weights_train[y_train == 1].sum())
print ("Train: total weight bkg", weights_train[y_train == 0].sum())
print ("Val:   total weight sig", weights_val[y_val == 1].sum())
print ("Val:   total weight bkg", weights_val[y_val == 0].sum())



## Network training

In [None]:
try:
  import tensorflow as tf
except ImportError as e:
  !pip install tensorflow
  import tensorflow as tf
print (tf.__version__)  # preinstalled version 2.6.0 20210824
from tensorflow import keras

In [None]:
tf.random.set_seed(1234) # to have reproducible networks
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)), # 1st hidden layer
  #tf.keras.layers.Dense(128, activation='relu'), # 2nd hidden layer
  tf.keras.layers.Dense(1,activation="sigmoid") # output layer
])

model.compile(loss="binary_crossentropy",
              optimizer="adam",
              #metrics=['accuracy', keras.metrics.AUC(name="auc")]) # if not using event weights
              weighted_metrics=['accuracy', keras.metrics.AUC(name="auc")])

history = model.fit(X_train, y_train.values,
                    epochs=1,
                    #validation_split=0.2,   # to be used with train/test split
                    validation_data=(X_val, y_val, weights_val),
                    batch_size=1024,
                    sample_weight=weights_train.values,
                    callbacks=[keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])

### Standardisation of inputs


In [None]:
from sklearn.preprocessing import StandardScaler

print("Original mean and variance:")
for feature, mean, std in zip(data.columns,X_train.mean(0), X_train.std(0)):
  print("{:9}: {:7.4f} +/- {:7.4f}".format(feature,mean,std))

# Standardize features by removing the mean and scaling to unit variance
# in training sample
scaler = StandardScaler()
# Rebuild DataFrames (same index and columns) around the scaled arrays.
# Writing through ".values[:]" only works when .values happens to be a writable view
# of the frame, which pandas does not guarantee (it is read-only under Copy-on-Write).
X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
# apply to testing/validation sample the transformation calculated on training sample
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
X_val = pd.DataFrame(scaler.transform(X_val), index=X_val.index, columns=X_val.columns)

print("\nStandardised mean and variance:")
for feature, mean, std in zip(data.columns,X_train.mean(0), X_train.std(0)):
  print("{:9}: {:7.4f} +/- {:7.4f}".format(feature,mean,std))

In [None]:
tf.random.set_seed(1234) # to have reproducible networks
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)), # 1st hidden layer
  #tf.keras.layers.Dense(128, activation='relu'), # 2nd hidden layer
  tf.keras.layers.Dense(1,activation="sigmoid") # output layer
])

model.compile(loss="binary_crossentropy",
              optimizer="adam",
              #metrics=['accuracy', keras.metrics.AUC(name="auc")]) # if not using event weights
              weighted_metrics=['accuracy', keras.metrics.AUC(name="auc")])

history = model.fit(X_train, y_train.values,
                    epochs=100,
                    #validation_split=0.2,   # to be used with train/test split
                    validation_data=(X_val, y_val, weights_val),
                    batch_size=1024,
                    sample_weight=weights_train.values,
                    callbacks=[keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])

y_pred_model = model.predict(X_test).reshape(-1)

*Compare the training loss/accuracy/AUC after epoch 1 with that obtained before standardisation.*

In [None]:
density=True   # normalised to 1 (=> probability density function)
#density=False   # normalised to one year at LHC

plt.hist(y_pred_model[y_test == 0],
         color='b', alpha=0.5, 
         bins=30,
         histtype='stepfilled', density=density,
         label='B (test)', weights=weights_test[y_test == 0])
plt.hist(y_pred_model[y_test == 1],
         color='r', alpha=0.5,
         bins=30,
         histtype='stepfilled', density=density,
         label='S (test)', weights=weights_test[y_test == 1])
plt.legend()
plt.title("NN model score");

In [None]:
my_plot_roc_curve(model, X_test, y_test)

### Training monitoring

In [None]:
fig,ax=plt.subplots(1, 2, figsize=(12, 5))
# Left panel: loss; right panel: AUC. Training and validation curves versus epoch.
for panel, metric, caption in zip(ax, ('loss', 'auc'), ('loss', 'AUC')):
  panel.plot(history.history[metric],label="Training " + caption)
  panel.plot(history.history['val_' + metric],label="Validation " + caption)
  panel.set_xlabel("Epoch")
  panel.legend(loc='best')

### Model saving

***Whole-model saving & loading***

You can save an entire model to a directory. It will include:
- the model's architecture/config
- the model's weight values (which were learned during training)
- the model's compilation information (if `compile()` was called)
- the optimizer and its state, if any (this enables you to restart training where you left)


In [None]:
model.save("NNmodel")
!ls -a NNmodel/*

In [None]:
print("Prediction from original model:")
display(model.predict(X_test[:5]))

reloaded_model=keras.models.load_model("NNmodel")
print("Prediction from reloaded model:")
display(reloaded_model.predict(X_test[:5]))
#np.testing.assert_allclose(
#    model.predict(X_test), reloaded_model.predict(X_test)
#)

# further training
reloaded_model.fit(X_train, y_train.values,
                   epochs=5,
                   #validation_split=0.2,   # to be used with train/test split
                   validation_data=(X_val, y_val, weights_val),
                   batch_size=1024,
                   sample_weight=weights_train.values,
                   callbacks=[keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])

*Compare first epoch values with original training.*

***Partial save***

Save a single HDF5 file containing the model's architecture, weights values, and `compile()` information.

Not saved (to be provided separately to resume training):
- external losses & metrics added via `model.add_loss()` & `model.add_metric()`
- computation graph of custom objects

In [None]:
model.save("NNmodel.h5")
!ls -lrt --color
print("\nPrediction from original model:")
display(model.predict(X_test[:5]))

reloaded_model=keras.models.load_model("NNmodel.h5")
print("Prediction from reloaded model:")
display(reloaded_model.predict(X_test[:5]))

***Saving the architecture and weights***

Keeping the model's configuration and training weights in separate files

In [None]:
arch = model.to_json()
with open('NNmodel.json', 'w') as arch_file:
  arch_file.write(arch)
model.save_weights('NNmodel_weights.h5')
!ls -lrt --color

In [None]:
!python -m json.tool NNmodel.json

In [None]:
# Rebuild the architecture from the JSON description, then restore the trained weights
with open('NNmodel.json', 'r') as f:
  reloaded_model = keras.models.model_from_json(f.read())
reloaded_model.summary()

reloaded_model.load_weights("NNmodel_weights.h5")
# The JSON file holds the architecture and the .h5 file the weights: neither carries
# the loss/optimizer, so the model must be compiled again before fit() can be called.
reloaded_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       #metrics=['accuracy', keras.metrics.AUC(name="auc")]) # if not using event weights
                       weighted_metrics=['accuracy', keras.metrics.AUC(name="auc")])
reloaded_model.fit(X_train, y_train.values,
                    epochs=1,
                    #validation_split=0.2,   # to be used with train/test split
                    validation_data=(X_val, y_val, weights_val),
                    batch_size=1024,
                    sample_weight=weights_train.values,
                    callbacks=[keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])


## Physics performance

### Significance

Asimov significance (from [arXiv:1007.1727](https://arxiv.org/abs/1007.1727) eq. 97):

> AMS = $\sqrt{2\left((s+b)\ln\left(1+\frac{s}{b}\right) - s\right)} = \frac{s}{\sqrt{b}}\left(1+\mathcal{O}(s/b)\right)$

In [None]:
from math import sqrt
from math import log
from math import log1p
def amsasimov(s,b):
  """Asimov significance for s signal and b background events.

  Computes sqrt(2*((s+b)*ln(1+s/b) - s)).

  Parameters
  ----------
  s : signal yield
  b : background yield

  Returns
  -------
  The significance as a float, or 0 when s or b is not strictly positive.
  """
  if b<=0 or s<=0:
      return 0
  # log1p(x) stays accurate for s << b, where log(1+x) loses most of its digits
  radicand = 2*((s+b)*log1p(float(s)/b)-s)
  # Round-off can leave a tiny negative value: clamp it, so that sqrt never raises
  # and the function always returns a number (never None).
  return sqrt(max(radicand, 0.0))

In [None]:
# Scan 50 thresholds on the NN score: weighted signal and background yields above each cut
int_pred_test_sig_model = []
int_pred_test_bkg_model = []
for th_cut in np.linspace(0,1,num=50):
  passing = y_pred_model > th_cut
  int_pred_test_sig_model.append(weights_test[(y_test ==1) & passing].sum())
  int_pred_test_bkg_model.append(weights_test[(y_test ==0) & passing].sum())

# Significance at each threshold, and the best one
vamsasimov_model = list(map(amsasimov, int_pred_test_sig_model, int_pred_test_bkg_model))
print("Z: ",np.round(max(vamsasimov_model),decimals=3))


In [None]:
plt.plot(np.linspace(0,1,num=50),vamsasimov_model, label='AMS (Z_max = {})'.format(np.round(max(vamsasimov_model),decimals=3)))

plt.title("Significance")
plt.xlabel("Threshold")
plt.ylabel("Significance")
plt.legend()
#plt.savefig("Significance.pdf")
plt.show()

### Feature importance
Feature importance makes it possible to display the importance of each feature without rerunning the training. It is obtained from internal algorithm quantities, like the cumulative decrease of impurity. Its magnitude is arbitrary. It can be used as a not very reliable indication of which feature is the most discriminant.

Very straightforward with decision trees.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbc = GradientBoostingClassifier(n_estimators=10,verbose=1)
gbc.fit(X_train, y_train, sample_weight=weights_train)

In [None]:
plt.bar(data.columns.values, gbc.feature_importances_)
plt.xticks(rotation=90)
plt.title("Feature importance")
plt.show()

*What about a different tree classifier?*

In [None]:
# preinstalled version 0.9.0 20210824
!pip install xgboost --upgrade # install 1.4.2 20210824
import xgboost as xgb
print(xgb.__version__)

useLGB=False #Could also use LightGBM
# preinstalled version 2.2.3 20210824
if useLGB:
  !pip install lightgbm --upgrade # install 3.2.1 20210824
  import lightgbm as lgb
  print (lgb.__version__)

In [None]:
from xgboost import XGBClassifier
# tree_method="hist" is 10 times faster, however less robust against awkwards features (not a bad idea to double check without it)
# can even try tree_method="gpu_hist" if proper GPU installation
# use_label_encoder and eval_metric to silence warning in 1.3.0
xgb = XGBClassifier(tree_method="hist",use_label_encoder=False,eval_metric='logloss')

xgb.fit(X_train, y_train.values, sample_weight=weights_train.values) # note that XGB 1.3.X requires positive weight



In [None]:
if useLGB:
  gbm = lgb.LGBMClassifier()
  gbm.fit(X_train, y_train.values,sample_weight=weights_train.values)


In [None]:
if useLGB:
  fig,ax=plt.subplots(1, 3, figsize=(18, 5))
else:
  fig,ax=plt.subplots(1, 2, figsize=(12, 5))
ax[0].bar(data.columns.values, xgb.feature_importances_)
ax[0].tick_params(labelrotation=90)
ax[0].set_title("XGBoost feature importance")
ax[1].bar(data.columns.values, gbc.feature_importances_)
ax[1].tick_params(labelrotation=90)
ax[1].set_title("sklearn feature importance");
if useLGB:
  ax[2].bar(data.columns.values, gbm.feature_importances_)
  ax[2].tick_params(labelrotation=90)
  ax[2].set_title("LightGBM feature importance");


### Permutation importance

A better way to show the importance of each feature is Permutation Importance, where each feature in turn is replaced by its value from another event (effectively switching it off by randomising).

Works on any classifier, not just DT-based.

However, report can be misleading in case of highly correlated variables.

Available in [Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html) but without event weights in Colab version (0.22).
   


In [None]:
if False:
  from sklearn.inspection import permutation_importance
  result_xgb = permutation_importance(xgb, X_test, y_test, n_repeats=1, random_state=42, n_jobs=2)
  forest_importances_xgb = pd.Series(result_xgb.importances_mean, index=list(data.columns.values))

  if useLGB:
    result_gbm = permutation_importance(gbm, X_test, y_test, n_repeats=1, random_state=42, n_jobs=2)
    forest_importances_gbm = pd.Series(result_gbm.importances_mean, index=list(data.columns.values))

  if useLGB:
    fig,ax=plt.subplots(1, 2, figsize=(12, 5))
    forest_importances_xgb.plot.bar(ax = ax[0], subplots=True)
    ax[0].set_title("XGBoost permutation importance")
  else:
    fig,ax=plt.subplots()
    forest_importances_xgb.plot.bar()
    ax.set_title("XGBoost permutation importance")
  if useLGB:
    forest_importances_gbm.plot.bar(ax = ax[1], subplots=True)
    ax[1].set_title("LightGBM permutation importance")


Another implementation targeting HEP:

https://github.com/aghoshpub/permutationImportancePhysics 

In particular it allows to : 
   * use event weights
   * display directly the loss in whatever criterion (ROC auc, asimov significance) when the feature is switched off
   * display the feature importance for a specific subset (for example the most signal like)
   * it can even display which feature has the largest impact on systematics


In [None]:
if False:
    !pip install PermutationImportancePhysics
    from permutationimportancephysics.PermutationImportance import PermulationImportance # note the delibrate typo PermuLation
    #XGBoost
    PI_xgb = PermulationImportance(model=xgb, X=X_test.values,y=y_test,weights=weights_test,\
                           n_iterations=1,usePredict_poba=True, scoreFunction="amsasimov", colNames=list(data.columns.values))
    #PI_xgb.dislayResults()
    plott_xgb = PI_xgb.plotBars()

    #LightGBM    
    if useLGB:
      PI_gbm = PermulationImportance(model=gbm, X=X_test.values,y=y_test,weights=weights_test,\
                             n_iterations=1,usePredict_poba=True, scoreFunction="amsasimov", colNames=list(data.columns.values))
      #PI_gbm.dislayResults()
      plott_gbm = PI_gbm.plotBars()

    # also works with Keras NN
    PI_model = PermulationImportance(model=model, X=X_test.values,y=y_test,weights=weights_test,\
                           n_iterations=1,usePredict_poba=False, scoreFunction="amsasimov", colNames=list(data.columns.values))
    #PI_model.dislayResults()
    plott_model = PI_model.plotBars()
    


### Hyperparameter optimisation
Can be done by hand, with [random search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) or [grid search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html).

There are also dedicated packages doing Gaussian process optimisation or 'tree of Parzen estimators' (TPE) (e.g. [hyperopt](https://github.com/hyperopt/hyperopt) or [optuna](https://optuna.org/)).

In [None]:
import scipy.stats as stats
if False:
    from sklearn.model_selection import RandomizedSearchCV

    # specify parameters and distributions to sample from
    param_dist_XGB = {'n_estimators': stats.randint(50, 500), #default 100
                      'learning_rate': stats.uniform(0.1, 0.5)} #def 0.3 
                      #'max_depth': stats.randint(3, 12)} # default 6


    # default CV is 5 fold, reduce to 2 for speed concern
    # default n_iter is 10 sets of parameters, reduce to 5 for speed concern
    gsearch = RandomizedSearchCV(estimator = XGBClassifier(tree_method="hist",use_label_encoder=False,eval_metric='logloss'), 
                        param_distributions = param_dist_XGB, 
                        scoring='roc_auc',n_iter=5,cv=2,verbose=2)
    gsearch.fit(X_train,y_train, sample_weight=weights_train)

    print ("Best parameters: ",gsearch.best_params_)
    print ("Best score (on train dataset CV): ",gsearch.best_score_)


    y_pred_gs = gsearch.predict_proba(X_test)[:,1]
    print("... corresponding score on test dataset: ",roc_auc_score(y_true=y_test, y_score=y_pred_gs, sample_weight=weights_test))
    dfsearch=pd.DataFrame.from_dict(gsearch.cv_results_)
    display(dfsearch)
    dfsearch.plot.scatter("param_n_estimators","mean_test_score")
    #dfsearch.plot.scatter("param_max_depth","mean_test_score")
    dfsearch.plot.scatter("param_learning_rate","mean_test_score")