In [None]:
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import shuffle
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Read the two gzip-compressed CSV samples: `sig` is later labelled as signal
# and `bkg` as background.
sig, bkg = (
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in ('WpWpJJ_EWK_2017_uptoBveto.gz.csv', 'Wjets_2017_uptoBveto.gz.csv')
)

In [None]:
# Assemble a labelled signal/background dataset and split it into train and test parts.

# Columns excluded from the feature matrix of both samples.
excluded_columns = ['w_nominal', 'lepSF[0]', 'lepUp[0]', 'lepDown[0]', 'puSF[0]', 'puUp[0]',
                    'puDown[0]', 'PFSF[0]', 'PFUp[0]', 'PFDown[0]', 'q2Up[0]', 'q2Down[0]',
                    'w_PDF', 'SF_Fake[0]']

# Each file carries one extra sample-specific column to drop ('entry' vs 'Unnamed: 0').
# Only the first 3842 rows of each sample are kept, so each class contributes at most 3842 rows.
X_sig = sig.drop(columns=['entry'] + excluded_columns).to_numpy()[:3842]
X_bkg = bkg.drop(columns=['Unnamed: 0'] + excluded_columns).to_numpy()[:3842]

# Signal rows are labelled +1 and background rows -1.
y_sig = np.ones(X_sig.shape[0])
y_bkg = -np.ones(X_bkg.shape[0])

# Stack signal on top of background; concatenation needs both samples to have
# the same number of remaining columns.
# NOTE(review): presumably the remaining columns are also in the same order in
# both files -- confirm, since the arrays are combined purely by position.
X = np.concatenate([X_sig, X_bkg])
y = np.concatenate([y_sig, y_bkg])

# Seed the shuffle so the train/test membership, and every score and plot
# derived from it, is identical on each fresh run of the notebook.
X_shuffled, y_shuffled = shuffle(X, y, random_state=0)

# The first 1621 shuffled rows are used for training; every remaining row is test data.
# NOTE(review): with up to 7684 rows in total this trains on the smaller part -- confirm
# that this split is intended.
X_train, X_test = X_shuffled[:1621], X_shuffled[1621:]
y_train, y_test = y_shuffled[:1621], y_shuffled[1621:]

In [None]:
# Train the AdaBoost classifier on 0/1 labels and print its score on the test set.

# These names are aliases, not copies: relabelling through them also rewrites
# y_train and y_test in place.
y_train_ada, y_test_ada = y_train, y_test
for label_array in (y_train_ada, y_test_ada):
    # Map the background label -1 to 0; the signal label 1 already has its final value.
    label_array[label_array == -1.] = 0

bdt = AdaBoostClassifier(n_estimators=100, random_state=0, learning_rate=0.1)
bdt.fit(X_train, y_train_ada)

print(bdt.score(X_test, y_test_ada))

# Histogram of the classifier output on the test set, split by true class.

# Second column of predict_proba: the predicted probability of the class labelled 1 (signal).
twoclass_output = bdt.predict_proba(X_test)[:, 1]

# One common range so both histograms share identical bin edges.
plot_range = (twoclass_output.min(), twoclass_output.max())

fig, ax = plt.subplots()
for class_label, class_name, colour in zip([1., 0.], ['signal', 'background'], ['blue', 'red']):
    # Select with y_test_ada, the 0/1 labels the classifier was scored against,
    # rather than relying on y_test having been relabelled as a side effect.
    ax.hist(twoclass_output[y_test_ada == class_label],
            bins=10,
            range=plot_range,
            facecolor=colour,
            label=class_name,
            alpha=.5,
            edgecolor='k')

# Add 20% headroom above the tallest bar so the legend does not cover the data.
x1, x2, y1, y2 = ax.axis()
ax.axis((x1, x2, y1, y2 * 1.2))
ax.legend(loc='upper right')
ax.set_ylabel('Samples')
ax.set_xlabel('Score')
ax.set_title('Decision Scores')

fig.tight_layout()
fig.subplots_adjust(wspace=0.35)
plt.show()

# Print the input features ordered from most to least important according to the fitted model.

# Feature names: the signal frame's columns after dropping the same columns
# that were excluded when X_sig was built.
columns = sig.drop(columns=['entry','w_nominal','lepSF[0]', 'lepUp[0]', 'lepDown[0]', 'puSF[0]', 'puUp[0]','puDown[0]', 'PFSF[0]', 'PFUp[0]', 'PFDown[0]', 'q2Up[0]', 'q2Down[0]','w_PDF','SF_Fake[0]']).columns
importance = bdt.feature_importances_

# Pair each name with its importance and sort by importance, largest first.
z = sorted(zip(columns, importance), key=lambda pair: pair[1], reverse=True)
for feature_name, feature_score in z:
    print(feature_name, " ---------> ", feature_score)


# ROC curve of the classifier on the test set, drawn as signal efficiency
# versus background rejection.

# NOTE(review): this import belongs with the other imports in the first cell.
from sklearn.metrics import roc_curve, auc

# Rank test events by the predicted probability of the class labelled 1 (signal).
# This is the only score the curve is built from.
fpr, tpr, _ = roc_curve(y_test_ada, bdt.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
# x axis is 1 - false-positive rate (background rejection), y axis is the true-positive rate.
ax.plot(1. - fpr, tpr, color='darkorange',
        lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed reference diagonal from (1, 0) to (0, 1).
ax.plot([1, 0], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Background rejection')
ax.set_ylabel('Signal efficiency')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")
plt.show()