<a href="https://colab.research.google.com/github/yanncoadou/MLtutorials/blob/main/AstroInfo2021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>AstroInfo 2021 Machine learning hands-on</h1>

# Standard imports and practical functions

In [None]:
# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_circles
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, accuracy_score, roc_auc_score, roc_curve, RocCurveDisplay

%matplotlib inline
import seaborn as sns # seaborn for nice plots
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
np.random.seed(31415) # set the np random seed for reproducibility

### Function to plot decision contours

In [None]:
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

def my_plot_decision_regions(model, X, y, alpha=1.0, size=25, npts=10000, zoom=0.25, event5=False):
  """Draw the response of a trained binary classifier over a 2D feature plane.

  The model is evaluated on `npts` points drawn uniformly at random inside the
  bounding box of `X` (enlarged by `zoom` on every side). Its response is shown
  as filled contours between 0 and 1, with the events of `X` overlaid
  (blue squares for class 0, orange triangles for class 1).

  Parameters
  ----------
  model : fitted classifier exposing `predict_proba` (preferred) or `predict`.
      When `predict_proba` returns two columns, the second one (class 1) is plotted.
  X : array-like of shape (n_events, 2); NumPy arrays and pandas DataFrames
      are both accepted.
  y : array-like of shape (n_events,) holding the class labels 0 and 1.
  alpha, size : transparency and marker size of the overlaid events.
  npts : number of random points at which the model is evaluated.
  zoom : margin added around the range of `X` on both axes.
  event5 : if True, highlight the event at index 4 of `X` with a large cross.

  Returns
  -------
  None. The figure is displayed with `plt.show()`.
  """
  # Positional indexing below needs plain arrays (a DataFrame rejects X[:,0])
  X = np.asarray(X)
  y = np.asarray(y)

  x1min = X[:,0].min() - zoom
  x1max = X[:,0].max() + zoom

  x2min = X[:,1].min() - zoom
  x2max = X[:,1].max() + zoom

  # Random evaluation points, drawn from the global NumPy random state
  x1 = np.random.uniform(x1min, x1max, npts)
  x2 = np.random.uniform(x2min, x2max, npts)
  points = np.vstack((x1,x2)).T

  if hasattr(model, "predict_proba"):
    z = model.predict_proba(points)
  else:
    z = model.predict(points)

  # Reduce a 2D output to a single score per point
  if len(z.shape) == 2:
    if z.shape[1] == 1:
      z = z.reshape(-1)
    elif z.shape[1] == 2:
      z = z[:,1].reshape(-1)

  fig, ax = plt.subplots()

  # plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was
  # deprecated in Matplotlib 3.7 and removed in 3.9
  bottom = plt.get_cmap('Oranges', 128)
  top = plt.get_cmap('Blues_r', 128)

  # Diverging map: upper half of reversed Blues followed by lower half of Oranges
  ramp = np.linspace(0, 1, 128+128)
  newcolors = np.vstack((top(ramp[-128:]),
                        bottom(ramp[:128])))
  newcmp = ListedColormap(newcolors, name='OrangeBlue')

  # Levels slightly wider than [0, 1] so that scores of exactly 0 or 1 are filled
  eps = np.finfo(float).eps
  levels = np.linspace(0.0-eps, 1.0+eps, 20, True)

  ax.tricontour(x1, x2, z, levels=levels, linewidths=0.1, colors='k', antialiased=True)
  cntr = ax.tricontourf(x1, x2, z, levels=levels, cmap=newcmp)
  ax.scatter(X[y==0][:,0], X[y==0][:,1], alpha=alpha, s=size, c="#1f77b4", marker="s", edgecolors="k", linewidths=0.5)
  ax.scatter(X[y==1][:,0], X[y==1][:,1], alpha=alpha, s=size, c="#ff7f0e",  marker="^", edgecolors="k", linewidths=0.5)
  if event5: # showing particular swinger event
    ax.scatter(X[4][0], X[4][1], alpha=1, s=size*10, c="lightgreen",  marker="X", edgecolors="k", linewidths=1)
  fig.colorbar(cntr, ax=ax)

  plt.show()

### Function to plot ROC curve

In [None]:
def my_plot_roc_curve(model, X_test, y_test):
  """Plot the ROC curve of a trained binary classifier on a test sample.

  Parameters
  ----------
  model : fitted classifier exposing `predict_proba` (preferred) or `predict`.
      When `predict_proba` returns two columns, the second one (class 1) is
      used as the score.
  X_test : features of the test sample.
  y_test : true class labels of the test sample.

  Returns
  -------
  None. The figure (ROC curve, AUC in the legend, dashed diagonal for a random
  classifier) is displayed with `plt.show()`.
  """
  if hasattr(model, "predict_proba"):
    y_scores = model.predict_proba(X_test)
  else:
    y_scores = model.predict(X_test)

  # Reduce a 2D output to a single score per event
  if len(y_scores.shape) == 2:
    if y_scores.shape[1] == 1:
      y_scores = y_scores.reshape(-1)
    elif y_scores.shape[1] == 2:
      y_scores = y_scores[:,1].reshape(-1)
  fpr, tpr, _ = roc_curve(y_test, y_scores)
  roc_auc = roc_auc_score(y_test, y_scores)

  # Draw on an explicit axes: calling plt.clf() first would create an empty
  # figure that RocCurveDisplay.plot() does not reuse, leaving a blank
  # "Figure with 0 Axes" in the notebook output.
  fig, ax = plt.subplots()
  # Not named `display`, to avoid shadowing IPython's display() in this scope
  roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=model.__class__.__name__)
  roc_display.plot(ax=ax)
  ax.plot([0, 1], [0, 1], color='black', linestyle='--')
  plt.show()

# Defining datasets

In [None]:
# X = (x,y) coordinates; y = class
X1, y1 = make_circles(n_samples=1000, noise=0.1, factor=0.8)
X2, y2 = make_circles(n_samples=1000, noise=0.2, factor=0.2)
X = np.vstack((X1,X2/2))
y = np.hstack((y1,y2))

# Splitting in train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
sns.scatterplot(x=X[:,0], y=X[:,1], hue=y);

# Classifier zoo

Play with various tree-based algorithms as implemented in scikit-learn.

## Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtc = DecisionTreeClassifier()

In [None]:
display(dtc.get_params())

In [None]:
dtc.fit(X_train, y_train);

In [None]:
from sklearn.tree import plot_tree
plt.figure(figsize=(15,10))
plot_tree(dtc)
plt.show();


---
How often is the prediction of the decision tree correct? Measured with *accuracy*.

Note: MANY other measures of performance, see e.g. what is available in [scikit-learn](https://scikit-learn.org/stable/modules/model_evaluation.html).

In [None]:
print("Accuracy:",accuracy_score(y_test, dtc.predict(X_test)))

---
Access to results:
- `predict` returns the class (0 or 1 if binary classifier)
- `predict_proba` returns the probability of each class (if available)



In [None]:
print("predict: \n",dtc.predict(X_test[:5]))
print("predict_proba: \n",dtc.predict_proba(X_test[:5]))

In [None]:
try:
  from mlxtend.plotting import plot_decision_regions
except ImportError as e:
  !pip install mlxtend
  from mlxtend.plotting import plot_decision_regions

In [None]:
# practical but limited contour-plotting function
plot_decision_regions(X_test, y_test, dtc);

In [None]:
# defined at top of notebook
# can use class (0 or 1) or class probability when available
my_plot_decision_regions(dtc, X_test, y_test)

---
Receiver operating characteristic curve (ROC curve) and area under the curve (AUC).

<center> <img style="display: block; margin-left: auto; margin-right: auto; width: 30%;" alt="ROCcurve" width="30%" src="https://raw.githubusercontent.com/yanncoadou/MLtutorials/main/ROCcurve.png" > </center>


In [None]:
my_plot_roc_curve(dtc, X_test, y_test)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Default configuration would be: abc = AdaBoostClassifier()
# Here: 100 boosted decision trees of depth 4 as weak learners.
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4);
# the positional form works with every version.
abc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4), n_estimators=100)
display(abc.get_params())

In [None]:
abc.fit(X_train, y_train);

In [None]:
print("predict: \n",abc.predict(X_test[:5]))
print("predict_proba: \n",abc.predict_proba(X_test[:5]))

In [None]:
my_plot_decision_regions(abc, X_test, y_test)
my_plot_roc_curve(abc, X_test, y_test)

## Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbc = GradientBoostingClassifier(n_estimators=400,verbose=1)
display(gbc.get_params())

In [None]:
gbc.fit(X_train, y_train)

In [None]:
print("predict: \n",gbc.predict(X_test[:5]))
print("predict_proba: \n",gbc.predict_proba(X_test[:5]))

In [None]:
my_plot_decision_regions(gbc, X_test, y_test, event5=True)
my_plot_roc_curve(gbc, X_test, y_test)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=400,verbose=1)
display(rfc.get_params())

In [None]:
rfc.fit(X_train, y_train)

In [None]:
print("predict: \n",rfc.predict(X_test[:5]))
print("predict_proba: \n",rfc.predict_proba(X_test[:5]))

In [None]:
my_plot_decision_regions(rfc, X_test, y_test)
my_plot_roc_curve(rfc, X_test, y_test)

## Comparison

In [None]:
y_preds_dtc = dtc.predict_proba(X_test)[:,1].reshape(-1)
y_preds_abc = abc.predict_proba(X_test)[:,1].reshape(-1)
y_preds_gbc = gbc.predict_proba(X_test)[:,1].reshape(-1)
y_preds_rfc = rfc.predict_proba(X_test)[:,1].reshape(-1)
fpr_dtc,tpr_dtc,_ = roc_curve(y_true=y_test, y_score=y_preds_dtc)
fpr_abc,tpr_abc,_ = roc_curve(y_true=y_test, y_score=y_preds_abc)
fpr_gbc,tpr_gbc,_ = roc_curve(y_true=y_test, y_score=y_preds_gbc)
fpr_rfc,tpr_rfc,_ = roc_curve(y_true=y_test, y_score=y_preds_rfc)
auc_test_dtc = roc_auc_score(y_true=y_test, y_score=y_preds_dtc)
auc_test_abc = roc_auc_score(y_true=y_test, y_score=y_preds_abc)
auc_test_gbc = roc_auc_score(y_true=y_test, y_score=y_preds_gbc)
auc_test_rfc = roc_auc_score(y_true=y_test, y_score=y_preds_rfc)
plt.plot(fpr_dtc, tpr_dtc, color='darkblue',label='{} (AUC  = {})'.format(dtc.__class__.__name__,np.round(auc_test_dtc,decimals=2)))
plt.plot(fpr_abc, tpr_abc, color='darkred',label='{} (AUC  = {})'.format(abc.__class__.__name__,np.round(auc_test_abc,decimals=2)))
plt.plot(fpr_gbc, tpr_gbc, color='darkgreen',label='{} (AUC  = {})'.format(gbc.__class__.__name__,np.round(auc_test_gbc,decimals=2)))
plt.plot(fpr_rfc, tpr_rfc, color='darkorange',label='{} (AUC  = {})'.format(rfc.__class__.__name__,np.round(auc_test_rfc,decimals=2)))
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right");


## Other classifiers
We have only seen tree-based classifiers from scikit-learn above. There are many more types of classifiers:

1.   implemented in scikit-learn: see the [user's guide](https://scikit-learn.org/stable/user_guide.html) for A LOT of different algorithms
2.   in various other packages:
- for decision trees: [XGBoost](https://xgboost.readthedocs.io/en/stable/), [LightGBM](https://lightgbm.readthedocs.io/en/latest/), [CatBoost](https://catboost.ai/)

- for neural networks: [TensorFlow](https://www.tensorflow.org/), [PyTorch](https://pytorch.org/)


### XGBoost

In [None]:
# preinstalled version 0.9.0 20211119
!pip install xgboost --upgrade # install 1.5.0 20211119

In [None]:
from xgboost import XGBClassifier
# tree_method="hist" is about 10 times faster, but less robust against awkward features
#   (not a bad idea to double-check without it)
# Can even try tree_method="gpu_hist" with a proper GPU installation
# use_label_encoder and eval_metric are set to silence warnings in versions >1.3.0
xgb = XGBClassifier(tree_method="hist",use_label_encoder=False,eval_metric='logloss')

xgb.fit(X_train, y_train) # note that XGB 1.3.X requires positive weights



In [None]:
print("predict: \n",xgb.predict(X_test[:5]))
print("predict_proba: \n",xgb.predict_proba(X_test[:5]))

In [None]:
y_preds_xgb = xgb.predict_proba(X_test)[:,1].reshape(-1)
fpr_xgb,tpr_xgb,_ = roc_curve(y_true=y_test, y_score=y_preds_xgb)
auc_test_xgb = roc_auc_score(y_true=y_test, y_score=y_preds_xgb)
plt.plot(fpr_rfc, tpr_rfc, color='darkorange',label='{} (AUC  = {})'.format(rfc.__class__.__name__,np.round(auc_test_rfc,decimals=2)))
plt.plot(fpr_xgb, tpr_xgb, color='purple',label='{} (AUC  = {})'.format("XGBoost",np.round(auc_test_xgb,decimals=2)))
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right");

### LightGBM

In [None]:
# preinstalled version 2.2.3 20211119
!pip install lightgbm --upgrade # install 3.3.1 20211119
import lightgbm as lgb

In [None]:
gbm = lgb.LGBMClassifier()
gbm.fit(X_train, y_train);

In [None]:
print("predict: \n",gbm.predict(X_test[:5]))
print("predict_proba: \n",gbm.predict_proba(X_test[:5]))

In [None]:
y_preds_gbm = gbm.predict_proba(X_test)[:,1].reshape(-1)
fpr_gbm,tpr_gbm,_ = roc_curve(y_true=y_test, y_score=y_preds_gbm)
auc_test_gbm = roc_auc_score(y_true=y_test, y_score=y_preds_gbm)
plt.plot(fpr_rfc, tpr_rfc, color='darkorange',label='{} (AUC  = {})'.format(rfc.__class__.__name__,np.round(auc_test_rfc,decimals=2)))
plt.plot(fpr_xgb, tpr_xgb, color='purple',label='{} (AUC  = {})'.format("XGBoost",np.round(auc_test_xgb,decimals=2)))
plt.plot(fpr_gbm, tpr_gbm, color='darkgreen',label='{} (AUC  = {})'.format("LightGBM",np.round(auc_test_gbm,decimals=2)))
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right");

### CatBoost

In [None]:
# not preinstalled 20211119
!pip install catboost # install 1.0.3 20211119
import catboost

In [None]:
cat = catboost.CatBoostClassifier()
cat.fit(X_train, y_train)

In [None]:
print("predict: \n",cat.predict(X_test[:5]))
print("predict_proba: \n",cat.predict_proba(X_test[:5]))

In [None]:
y_preds_cat = cat.predict_proba(X_test)[:,1].reshape(-1)
fpr_cat,tpr_cat,_ = roc_curve(y_true=y_test, y_score=y_preds_cat)
auc_test_cat = roc_auc_score(y_true=y_test, y_score=y_preds_cat)
plt.plot(fpr_rfc, tpr_rfc, color='darkorange',label='{} (AUC  = {})'.format(rfc.__class__.__name__,np.round(auc_test_rfc,decimals=2)))
plt.plot(fpr_xgb, tpr_xgb, color='purple',label='{} (AUC  = {})'.format("XGBoost",np.round(auc_test_xgb,decimals=2)))
plt.plot(fpr_gbm, tpr_gbm, color='darkgreen',label='{} (AUC  = {})'.format("LightGBM",np.round(auc_test_gbm,decimals=2)))
plt.plot(fpr_cat, tpr_cat, color='red',label='{} (AUC  = {})'.format("CatBoost",np.round(auc_test_cat,decimals=2)))
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right");

# Cosmology application

In [None]:
# Detect whether the notebook runs on Google Colab by trying to import the
# google.colab package. Only a failed import means "not on Colab"; any other
# exception is left to propagate instead of being silently swallowed.
try:
  import google.colab
  COLAB = True # if running in COLAB
except ImportError:
  COLAB = False # if not running on COLAB

## Input dataset

Setting up access to data files on Google Drive

### Mount Drive

If you followed pre-AstroInfo instructions, you have already validated access to the folder. If not: before mounting your Google Drive click on [this folder](https://drive.google.com/drive/folders/1PcftgBzBySo1Ync-Wdsp9arTCJ_MfEPE?usp=sharing) and add it to your Google Drive by following these steps:

*   Go to your [Drive ](https://drive.google.com)
*   Find shared folder ("Shared with me" link)
*   Right click on it
*   Click Add to My Drive



In [None]:
# pathinData is the DIRECTORY holding the input .npy files: the loading cell
# below appends '/feature_E_S_large.npy' and '/label_E_S_large.npy' to it.
if COLAB:
  from google.colab import drive
  # make the Google Drive content available under /content/drive
  drive.mount('/content/drive')
  pathinData="/content/drive/My Drive/EDE21/morphology"
else:
  # You have to make sure to get the input files locally, from the
  # EDE21/morphology directory of the Drive folder reported above.
  # Files read by the loading cell below:
  #   feature_E_S_large.npy
  #   label_E_S_large.npy
  # NOTE(review): this note previously listed feature_E_S.npy / label_E_S.npy;
  # confirm which file names are actually present in the shared folder.
  # Replace the placeholder below with the directory containing the files
  # (a directory path, not a file name).
  pathinData="/directory/where/you/stored/{feature,label}_E_S.npy"

### Load dataset

In [None]:
import numpy as np
# load the feature vectors (one row per galaxy) from the data directory
X_ML = np.load(pathinData+'/feature_E_S_large.npy')
# load the morphological class label of each galaxy
Y_ML = np.load(pathinData+'/label_E_S_large.npy')
print ("\nFiles loaded with",X_ML.shape[0], "galaxies ")

If accessing the files properly, you should now see:

`Files loaded with 11489 galaxies`

### Switching from numpy to pandas

Just to play a bit with pandas. Input files are in numpy binary format, with no header. Let's fix this.

In [None]:
import pandas as pd
columns=['Colour','Mass','SersicIndex', 'VelocityDispersion','AxisRatio']
dfall=pd.DataFrame(X_ML,columns=columns)

### Checking the content

In [None]:
#dumping list of features
dfall.columns

In [None]:
#examining first few galaxies
display(dfall.head())
display(Y_ML)

In [None]:
#examining feature distributions
dfall.describe()

## Event selection

### Plotting variables

In [None]:
# Mass distribution: all galaxies (log-scale counts, left) and the subset
# passing the colour cut (right).
fig,ax=plt.subplots(1, 2, figsize=(12, 5))
dfall['Mass'].plot.hist(title='$Log(M_*)$', log=True, ax=ax[0])
# The title quotes the threshold actually applied (0.1, the same cut as the
# event selection further down); it previously claimed 0.2.
dfall[dfall.Colour>0.1]['Mass'].plot.hist(bins=np.linspace(8,12,50),title='$Log(M_*)$ for Colour>0.1', ax=ax[1]);

In [None]:
ax=dfall[Y_ML==0].plot.scatter(x='Mass', y='Colour',color="b",label="Morph0")
dfall[Y_ML==1].plot.scatter(x='Mass', y='Colour',color="r",label="Morph1",alpha=.1,ax=ax);

In [None]:
ax=dfall[Y_ML==0].hist(figsize=(15,12),bins=50,color='b',alpha=0.5,density=True,label="Morph0")
ax=ax.flatten()[:dfall.shape[1]] # to avoid error if holes in the grid of plots (like if 7 or 8 features)
dfall[Y_ML==1].hist(figsize=(15,12),bins=50,color='r',alpha=0.5,density=True,ax=ax,label="Morph1");

### Feature engineering

Add more complex variables to the dataset.

In [None]:
# adding new variables
dfall["CrazyVar"]=dfall.Colour * dfall.AxisRatio

print (dfall.shape)
display(dfall.head())

In [None]:
# Select events with Colour > 0.1
print ("DataFrame shape before selection:", dfall.shape)

fulldata=dfall[dfall.Colour > 0.1]
# do not forget to synchronise other arrays!
target=Y_ML[dfall.Colour > 0.1]

print ("DataFrame shape after selection: ",fulldata.shape)

In [None]:
# Keeping only a subset of features
data=pd.DataFrame(fulldata, columns=['Colour','Mass','SersicIndex', 'VelocityDispersion','AxisRatio'])

print ("DataFrame shape of dataset to be used:",data.shape)

### Features correlation matrix

In [None]:
fig,ax=plt.subplots(1, 2, figsize=(12, 5))

corrMatrix = data[target==0].corr()
ax[0].set_title("Morph0 features correlation matrix")
sns.heatmap(corrMatrix.round(3), ax=ax[0], annot=True);

corrMatrix = data[target==1].corr()
ax[1].set_title("Morph1 features correlation matrix")
sns.heatmap(corrMatrix.round(3), ax=ax[1], annot=True);


## Sample splitting

In [None]:
np.random.seed(31415) # set the random seed (used for the train/test splitting)

from sklearn.model_selection import train_test_split
train_size = 0.75 # fraction of sample used for training
val_size = 0.2 # fraction of training sample used for validation

# split only train/test
#X_train, X_test, y_train, y_test, weights_train, weights_test = \
#    train_test_split(data, target, weights, train_size=train_size)

#split in train/validation/test
X_holdout, X_test, y_holdout, y_test = \
    train_test_split(data, target, train_size=train_size)
X_train, X_val, y_train, y_val = \
    train_test_split(X_holdout, y_holdout, train_size=1-val_size)

print("Training sample:  ", X_train.shape)
print("Validation sample:", X_val.shape)
print("Testing sample:   ", X_test.shape)

## ML algorithms training

In [None]:
np.random.seed(31415) # set the random seed

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score # for binary classification if x > 0.5 -> 1 else -> 0
# tree_method="hist" is 10 times faster, however less robust against awkwards features (not a bad idea to double check without it)
# can even try tree_method="gpu_hist" if proper GPU installation
# use_label_encoder and eval_metric to silence warning in 1.3.0
xgb = XGBClassifier(tree_method="hist",use_label_encoder=False,eval_metric='logloss')
# HPO (==Hyper Parameter Optimization), check on the web https://xgboost.readthedocs.io/ for other parameters
#xgb = XGBClassifier(tree_method="hist",use_label_encoder=False,max_depth=10,n_estimators=100) 

import time
starting_time = time.time()

xgb.fit(X_train, y_train)
        
training_time = time.time( ) - starting_time
print("Training time:",training_time)

y_pred_xgb = xgb.predict_proba(X_test)[:,1].ravel()
y_pred_train_xgb = xgb.predict_proba(X_train)[:,1].ravel()
auc_test_xgb = roc_auc_score(y_true=y_test, y_score=y_pred_xgb)
print("AUC test: ",np.round(auc_test_xgb,decimals=3))
print ("AUC train:",np.round(roc_auc_score(y_true=y_train, y_score=y_pred_train_xgb),decimals=3))

### Standardisation of inputs


In [None]:
from sklearn.preprocessing import StandardScaler

# The second number printed is the standard deviation (DataFrame.std), not the variance
print("Original mean and standard deviation:")
for feature, mean, std in zip(data.columns,X_train.mean(0), X_train.std(0)):
  print("{:9}: {:7.4f} +/- {:7.4f}".format(feature,mean,std))

# Standardize features by removing the mean and scaling to unit variance,
# with the transformation computed on the training sample only.
scaler = StandardScaler()
# The scaler returns NumPy arrays: rebuild DataFrames with the original column
# names and index. Writing through ".values[:]" is avoided because it only works
# while ".values" is a writable view of the frame, which is not guaranteed
# (e.g. pandas Copy-on-Write returns a read-only array).
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
# apply to testing/validation sample the transformation calculated on training sample
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

print("\nStandardised mean and standard deviation:")
for feature, mean, std in zip(data.columns,X_train.mean(0), X_train.std(0)):
  print("{:9}: {:7.4f} +/- {:7.4f}".format(feature,mean,std))

In [None]:
np.random.seed(31415) # reset the random seed

# redefine the same classifier
xgb = XGBClassifier(tree_method="hist",use_label_encoder=False,eval_metric='logloss')

starting_time = time.time()

xgb.fit(X_train, y_train)
      
training_time = time.time( ) - starting_time
print("Training time:",training_time)

y_pred_xgb = xgb.predict_proba(X_test)[:,1].ravel()
y_pred_train_xgb = xgb.predict_proba(X_train)[:,1].ravel()
auc_test_xgb = roc_auc_score(y_true=y_test, y_score=y_pred_xgb)
print("AUC test: ",np.round(auc_test_xgb,decimals=3))
print ("AUC train:",np.round(roc_auc_score(y_true=y_train, y_score=y_pred_train_xgb),decimals=3))

Compare performance with previous training

In [None]:
my_plot_roc_curve(xgb, X_test, y_test)

In [None]:
density=True   # normalised to 1 (=> probability density function)
#density=False   # based on test file class balance

plt.hist(y_pred_xgb[y_test == 0],
         color='b', alpha=0.5, 
         bins=30,
         histtype='stepfilled', density=density,
         label='Morph0 (test)')
plt.hist(y_pred_xgb[y_test == 1],
         color='r', alpha=0.5,
         bins=30,
         histtype='stepfilled', density=density,
         label='Morph1 (test)')
plt.legend()
plt.title("XGBoost score");

### Training monitoring

In [None]:
# Samples on which the metrics are evaluated after each boosting iteration:
# 'validation_0' is the training sample, 'validation_1' the validation sample.
eval_set = [(X_train, y_train), (X_val, y_val)]

# previous training command line:
#  xgb.fit(X_train, y_train)

# The metrics are set on the estimator instead of being passed to fit(): the
# `eval_metric` argument of fit() is deprecated since XGBoost 1.6 and has been
# removed in recent releases, where passing it raises a TypeError.
xgb.set_params(eval_metric=["logloss","auc","error"])
xgb.fit(X_train, y_train, eval_set=eval_set)

## Adding early stopping condition
# older XGBoost (argument of fit):
#xgb.fit(X_train, y_train, eval_set=eval_set, early_stopping_rounds=10)
# XGBoost >= 1.6 (estimator parameter):
#xgb.set_params(early_stopping_rounds=10); xgb.fit(X_train, y_train, eval_set=eval_set)


In [None]:
# retrieve performance metrics
results = xgb.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Validation')
ax.legend()
plt.ylabel('Log Loss')
plt.title('XGBoost Log Loss')
plt.show()
# plot classification error
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Validation')
ax.legend()
plt.ylabel('Classification Error')
plt.title('XGBoost Classification Error')
plt.show()
# plot AUC
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['auc'], label='Train')
ax.plot(x_axis, results['validation_1']['auc'], label='Validation')
ax.legend()
plt.ylabel('AUC')
plt.title('XGBoost Area under the curve (AUC)')
plt.show()

### Learning curve
Compute the AUC by varying the number of training events. Validation set remains the same.

In [None]:
train_sizes=[0.01,0.05,0.1,0.2,0.5,0.75,1]
ntrains=[]
val_aucs=[]
train_aucs=[]
times=[]

for train_size in train_sizes:
  ntrain=int(len(X_train)*train_size)
  print("Training with ",ntrain," events")
  ntrains+=[ntrain]
  starting_time = time.time()

  # train using the first ntrain event of the training dataset
  xgb.fit(X_train[:ntrain], y_train[:ntrain])
  training_time = time.time( ) - starting_time
  times+=[training_time]

  # score on validation dataset (always the same)
  y_val_xgb=xgb.predict_proba(X_val)[:,1]
  auc_val_xgb = roc_auc_score(y_true=y_val, y_score=y_val_xgb)
  val_aucs+=[auc_val_xgb]

  # score on the train dataset 
  y_train_xgb=xgb.predict_proba(X_train[:ntrain])[:,1]
  auc_train_xgb = roc_auc_score(y_true=y_train[:ntrain], y_score=y_train_xgb)
  train_aucs+=[auc_train_xgb]

dflearning=pd.DataFrame({"Ntraining":ntrains,
                         "val_auc":val_aucs,
                         "train_auc":train_aucs,
                         "time":times})
display(dflearning)

In [None]:
fig, ax = plt.subplots(1,2, figsize=(12, 5))
ax[0].grid()
ax[0].plot('Ntraining','train_auc',"o-",data=dflearning,label="Train",color="r")
ax[0].plot(dflearning.Ntraining,dflearning.val_auc,"o-",label="Validation",color="b")
ax[0].set_xlabel("Training examples")
ax[0].set_ylabel("AUC")
ax[0].legend()
ax[1].grid()
ax[1].plot('Ntraining','time',"o-",data=dflearning)
ax[1].legend()
ax[1].set_xlabel("Training examples")
ax[1].set_ylabel("Fit time [s]");

Could also use `learning_curve` in sklearn

*Notes*:
* it does not handle event weights
* it does not allow controlling the size of the testing dataset


In [None]:
from sklearn.model_selection import learning_curve
train_sizes,train_scores,test_scores,fit_times,_=learning_curve(
     XGBClassifier(tree_method="hist",use_label_encoder=False,eval_metric='logloss'),
     X_train,y_train,
     train_sizes=[0.01,0.05,0.1,0.2,0.5,0.75,1],                  
     scoring='roc_auc',cv=5,
     return_times=True)

In [None]:
fig, ax = plt.subplots(1,2, figsize=(12, 5))
ax[0].set_title('Learning curves')
ax[0].set_xlabel("Training examples")
ax[0].set_ylabel("AUC")
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
fit_times_mean = np.mean(fit_times, axis=1)
fit_times_std = np.std(fit_times, axis=1)
ax[0].grid()
ax[0].fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.3,
        color="r",
)
ax[0].fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.3,
        color="b",
)
ax[0].plot(train_sizes, train_scores_mean, "o-", color="r", label="Train")
ax[0].plot(train_sizes, test_scores_mean, "o-", color="b", label="Validation")
ax[0].legend(loc="best");

# Plot fit time vs Ntraining
ax[1].grid()
ax[1].plot(train_sizes, fit_times_mean, "o-")
ax[1].fill_between(
        train_sizes,
        fit_times_mean - fit_times_std,
        fit_times_mean + fit_times_std,
        alpha=0.3,
)
ax[1].set_xlabel("Training examples")
ax[1].set_ylabel("Fit time [s]")
ax[1].set_title("Scalability of model");


### Model saving

In [None]:
xgb.save_model("XGBoost.model")
!ls -al

Reload a trained model:

In [None]:
print("Prediction from original model:")
display(xgb.predict_proba(X_test[:5]))

reloaded_model=XGBClassifier()
reloaded_model.load_model("XGBoost.model")
print("Prediction from reloaded model:")
display(reloaded_model.predict_proba(X_test[:5]))

try:
  np.testing.assert_allclose(
      xgb.predict_proba(X_test), reloaded_model.predict_proba(X_test)
  )
  print("Original and reloaded models are identical")
except AssertionError:
  print("Watch out: original and reloaded models are different")

## Physics performance

### Feature importance
Feature importance shows the importance of each feature without rerunning the training. It is obtained from quantities internal to the algorithm, such as the cumulative decrease of impurity, computed *during training*. Its magnitude is arbitrary. It can be used as a not very reliable indication of which features are the most discriminant *for this particular training*.

Very straightforward with decision trees.

In [None]:
xgb = XGBClassifier(tree_method="hist",use_label_encoder=False,eval_metric='logloss')
xgb.fit(X_train, y_train);

In [None]:
plt.bar(data.columns.values, xgb.feature_importances_)
plt.xticks(rotation=45)
plt.title("Feature importance")
plt.show()

*What about a different tree classifier?*

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=10)
gbc.fit(X_train, y_train);

In [None]:
import lightgbm as lgb
gbm = lgb.LGBMClassifier()
gbm.fit(X_train, y_train);

In [None]:
import catboost
cat = catboost.CatBoostClassifier()
cat.fit(X_train, y_train, verbose=False);

In [None]:
fig,ax=plt.subplots(2, 2, figsize=(18, 10))
ax[0,0].bar(data.columns.values, xgb.feature_importances_)
#ax[0,0].tick_params(labelrotation=45)
ax[0,0].set_title("XGBoost feature importance")
ax[0,1].bar(data.columns.values, gbc.feature_importances_)
#ax[0,1].tick_params(labelrotation=90)
ax[0,1].set_title("sklearn feature importance");
ax[1,0].bar(data.columns.values, gbm.feature_importances_)
#ax[1,0].tick_params(labelrotation=90)
ax[1,0].set_title("LightGBM feature importance");
ax[1,1].bar(data.columns.values, cat.feature_importances_)
#ax[1,1].tick_params(labelrotation=90)
ax[1,1].set_title("CatBoost feature importance");


### Permutation importance

A better way to show the importance of each feature is Permutation Importance, where each feature in turn is replaced by its value from another event (effectively switching it off by randomising).

Works on any classifier, not just DT-based. Can be estimated on any sample, not just training set.

However, report can be misleading in case of highly correlated variables.

Available in [Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html).
   


In [None]:
from sklearn.inspection import permutation_importance
result_xgb = permutation_importance(xgb, X_val, y_val, n_repeats=1, random_state=42, n_jobs=2)
forest_importances_xgb = pd.Series(result_xgb.importances_mean, index=list(data.columns.values))

result_gbc = permutation_importance(gbc, X_val, y_val, n_repeats=1, random_state=42, n_jobs=2)
forest_importances_gbc = pd.Series(result_gbc.importances_mean, index=list(data.columns.values))

result_gbm = permutation_importance(gbm, X_val, y_val, n_repeats=1, random_state=42, n_jobs=2)
forest_importances_gbm = pd.Series(result_gbm.importances_mean, index=list(data.columns.values))

result_cat = permutation_importance(cat, X_val, y_val, n_repeats=1, random_state=42, n_jobs=2)
forest_importances_cat = pd.Series(result_cat.importances_mean, index=list(data.columns.values))

fig,ax=plt.subplots(2, 2, figsize=(18, 10))
forest_importances_xgb.plot.bar(ax = ax[0,0], subplots=True)
ax[0,0].set_title("XGBoost permutation importance")
forest_importances_gbc.plot.bar(ax = ax[0,1], subplots=True)
ax[0,1].set_title("sklearn permutation importance")
forest_importances_gbm.plot.bar(ax = ax[1,0], subplots=True)
ax[1,0].set_title("LightGBM permutation importance")
forest_importances_cat.plot.bar(ax = ax[1,1], subplots=True)
ax[1,1].set_title("CatBoost permutation importance");


### Hyperparameter optimisation
Can be done by hand, with [random search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) or [grid search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html).

There are also dedicated packages performing Gaussian process optimisation or 'tree of Parzen estimators' (TPE) optimisation (e.g. [hyperopt](https://github.com/hyperopt/hyperopt) or [optuna](https://optuna.org/)).

In [None]:
import scipy.stats as stats
from sklearn.model_selection import RandomizedSearchCV

# Specify the hyperparameters to scan and the distributions to sample them from.
# scipy conventions: stats.randint(low, high) draws integers in [low, high),
# stats.uniform(loc, scale) draws floats in [loc, loc + scale].
param_dist_XGB = {'n_estimators': stats.randint(10, 500), # integers 10..499, default 100
                  'learning_rate': stats.uniform(0.01, 0.5), # floats in [0.01, 0.51], default 0.3
                  'max_depth': stats.randint(3, 12)} # integers 3..11, default 6

# Default CV is 5-fold; reduced to 2 here for speed.
# n_iter is the number of parameter sets tried (10 is also the default).
# Each parameter set is scored with the cross-validated ROC AUC.
gsearch = RandomizedSearchCV(estimator = XGBClassifier(tree_method="hist",use_label_encoder=False,eval_metric='logloss'), 
                             param_distributions = param_dist_XGB, 
                             scoring='roc_auc',n_iter=10,cv=2,verbose=2)
gsearch.fit(X_train,y_train);

In [None]:
print ("Best parameters: ",gsearch.best_params_)
print ("Best score (on train dataset CV): ",gsearch.best_score_)
# Best model directly accessible if refit=True (default)
y_pred_gs = gsearch.predict_proba(X_test)[:,1]
print("... corresponding score on test dataset: ",roc_auc_score(y_true=y_test, y_score=y_pred_gs))

dfsearch=pd.DataFrame.from_dict(gsearch.cv_results_)
display(dfsearch.head())

fig,ax=plt.subplots(1, 3, figsize=(15, 5))
dfsearch.plot("param_n_estimators","mean_test_score",yerr="std_test_score",linestyle = 'None',marker="o", ax=ax[0])
ax[0].scatter(gsearch.best_params_['n_estimators'],gsearch.best_score_,color='red',marker="*",s=100,zorder=5)
dfsearch.plot("param_learning_rate","mean_test_score",yerr="std_test_score",linestyle = 'None',marker="o", ax=ax[1])
ax[1].scatter(gsearch.best_params_['learning_rate'],gsearch.best_score_,color='red',marker="*",s=100,zorder=5)
dfsearch.plot("param_max_depth","mean_test_score",yerr="std_test_score",linestyle = 'None',marker="o", ax=ax[2])
ax[2].scatter(gsearch.best_params_['max_depth'],gsearch.best_score_,color='red',marker="*",s=100,zorder=5);