In [None]:
%pylab inline

In [None]:
import numpy
import pandas

In [None]:
# Shared keyword arguments, presumably meant to be passed to the histogram plots below
hist_kw = {'bins': 60, 'normed': True, 'alpha': 0.5}

# Folding

**TODO**

* Check if folding scheme can improve the quality
* Compare schemes with 2, 3, ..., 10 folds. Does the quality improve as the number of folds grows?
* Plot ROC curves for all models, plot the dependence of AUC on the number of folds

In [None]:
# `train_test_split` lives in `sklearn.model_selection` since scikit-learn 0.18 and the
# old `sklearn.cross_validation` module was removed in 0.20, so try the new location
# first and fall back to the old one for legacy installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

# Read the training sample into a DataFrame
data = pandas.read_csv('datasets/training.csv')

In [None]:
# Show which columns are available in the loaded sample
data.columns

In [None]:
# Training features: every column of `data` except the ones listed below.
# A list comprehension is used instead of a set difference so that the feature order
# follows the column order of `data` and is identical on every run (set iteration
# order is arbitrary, which makes feature sub-sampling in the models irreproducible).
excluded_columns = {'id', 'min_ANNmuon', 'mass', 'signal', 'production', 'SPDhits'}
variables = [column for column in data.columns if column not in excluded_columns]

In [None]:
# Divide the data into train and test parts.
# `random_state` is fixed so that the split is the same on every run.
train_index, test_index = train_test_split(range(len(data)), random_state=42)
train = data.iloc[train_index, :]
# New prediction columns are added to `test` later in the notebook, so take an explicit
# copy instead of a slice of `data` (this avoids pandas' SettingWithCopyWarning).
test = data.iloc[test_index, :].copy()

### FoldingClassifier in sklearn-style from rep

In [None]:
from rep.metaml import FoldingClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameters of the base gradient boosting model
gb_params = dict(max_depth=6, learning_rate=0.01, n_estimators=200,
                 min_samples_leaf=50, max_features=8, subsample=0.7)
base_gb = GradientBoostingClassifier(**gb_params)

# Wrap the base model into a folding scheme with 3 folds that uses only `variables`,
# then fit it on the training part with the `signal` column as labels
folding_example = FoldingClassifier(base_gb, features=variables, n_folds=3)
folding_example.fit(train, train.signal.values)

In [None]:
# Train folding schemes for different numbers of folds
from collections import OrderedDict

# Ordered dictionary of models; it should contain all your trained models
# (TODO: fill it in where the `...` placeholder is)
folding_all = OrderedDict()
...

### Compare ROC curves for the training sample (folding predicts each fold of the data with the classifier that was trained without this fold)

In [None]:
# TODO: plot the ROC curve of every model
...

In [None]:
# TODO: plot the dependence between the number of folds and AUC
...

### Do the same for the test sample (folding takes an average of all classifiers)

In [None]:
# TODO: repeat the ROC-curve and folds-vs-AUC plots for the test sample
...

**Note:** In this case we don't need to split the data into train and test parts: thanks to the folding scheme we can train on the whole dataset!

---------

# Blending 
(hierarchy training using the source of the $\tau\to\mu\mu\mu$ decay)

Check on different models if this hierarchy training over a model works better than the model itself.

#### simple training

#### blending training

In [None]:
# Divide train into two stages for hierarchy training.
# `random_state` is fixed so that the split is the same on every run.
train_index1, train_index2 = train_test_split(range(len(train)), random_state=42)
train1 = train.iloc[train_index1, :]
# New prediction columns are added to `train2` later in the notebook, so take an
# explicit copy instead of a slice of `train` (avoids pandas' SettingWithCopyWarning).
train2 = train.iloc[train_index2, :].copy()

In [None]:
# prepare data for each tau source
bck = train1[train1.signal == 0]
productions = {1, 2, 4, 5, 6}

# Select the signal rows once instead of rebuilding the mask on every iteration
signal_events = train1[train1.signal == 1]

# Iterate in sorted order: the iteration order of a set is not guaranteed, and the
# position of each frame in `production_data` must map to a well-defined production
# value (production_data[i] corresponds to sorted(productions)[i]).
production_data = []
for production in sorted(productions):
    production_data.append(signal_events[signal_events.production == production])

In [None]:
# Train one model for each tau source

# TODO: append the trained model for each tau source to `models`
# (the cells below call `predict_proba` on every element of this list)
models = []
...

In [None]:
# Use every first-stage model to predict the second-stage data and the test data;
# each prediction (probability of class 1) becomes an extra feature column
variables_blending = list(variables)
for index, model in enumerate(models):
    feature_name = 'new_{}'.format(index)
    train2[feature_name] = model.predict_proba(train2[variables])[:, 1]
    test[feature_name] = model.predict_proba(test[variables])[:, 1]
    variables_blending.append(feature_name)

In [None]:
# TODO: train the resulting model using both the old features and the new features
...

In [None]:
# TODO: compare the ROC curves of the two schemes (simple and blending)
...

In [None]:
# compute AUC for both schemes
# Each line is printed as a single pre-formatted string so that the output is the same
# under Python 2 (print statement) and Python 3 (print function).
print('Blending {}'.format(roc_auc_score(test.signal.values, blending_probs)))
print('Simple {}'.format(roc_auc_score(test.signal.values, simple_probs)))

**Result:**
Do you get a significant improvement? Conduct the same experiment with another classification model.

---------

# Calibrate blending classifier's output to probabilities 

* Platt regression (logistic regression)
* isotonic regression (monotonic function, optimizes $\sum w_i (y_i - \hat{y}_i)^2$)

Nice comment about the output calibration using two methods http://fastml.com/classifier-calibration-with-platts-scaling-and-isotonic-regression/

In [None]:
# divide test into two parts:
#    the first to calibrate the output of the classifier,
#    the second to test the quality of the calibration
# `random_state` is fixed so that the split is the same on every run.
test_index1, test_index2 = train_test_split(range(len(test)), random_state=42)
test1 = test.iloc[test_index1, :]
test2 = test.iloc[test_index2, :]

### Isotonic regression

In [None]:
from sklearn.isotonic import IsotonicRegression
# TODO: create and fit the isotonic calibrator here
# (presumably on the calibration part `test1` of the test sample -- confirm)
iso_calib = ...

In [None]:
# TODO: fill in the predictions of the blending classifier
blending_probs = ... # predictions for the test sample by blending classifier
# Calibrated output for the predictions selected by `test_index2`
iso_probs = iso_calib.predict(blending_probs[test_index2])

#### Compare histograms for the calibrated output and the initial output

#### Compare isotonic calibrated probabilities and estimated probabilities using bins
* Plot isotonic calibrated output for [0, 1]
* Divide output into several bins, for each bin compute $s_i / (s_i + b_i)$ - estimated probability in bin to be a signal event


In [None]:
def plot_probs(temp_probs, name, bins_cal=20):
    """Plot the per-bin signal fraction against the classifier output (exercise template).

    The interval [0, 1] is divided into ``bins_cal - 1`` equal bins; for every bin the
    summed signal and background labels of `test2` are computed.

    Parameters
    ----------
    temp_probs : array-like
        Predictions to be binned; presumably one value per row of the global `test2`,
        in the same order (the weights are taken from `test2.signal`) -- confirm.
    name : str
        Legend label for the plotted curve.
    bins_cal : int
        Number of bin edges placed on [0, 1].
    """
    bins = linspace(0, 1, bins_cal)
    bins_center = bins[:-1] + (bins[1:] - bins[:-1]) / 2.
    # Only the interior edges are passed, so the indices run from 0 to len(bins_center) - 1
    bins_index = numpy.searchsorted(bins[1:-1], temp_probs)
    # `minlength` keeps both arrays aligned with `bins_center` even when the highest bins
    # are empty (otherwise bincount stops at the largest index that actually occurs).
    n_bins = len(bins_center)
    sig_probs = numpy.bincount(bins_index, weights=test2.signal.values, minlength=n_bins)
    bck_probs = numpy.bincount(bins_index, weights=1-test2.signal.values, minlength=n_bins)
    # TODO: plot the estimated signal probability per bin against `bins_center`
    plot(..., label=name)
    plot([0, 1], [0, 1], label='ideal')

In [None]:
# Uncalibrated predictions selected by `test_index2`
plot_probs(blending_probs[test_index2], 'standard')
# TODO: add the curve for the isotonic-calibrated output
plot(..., label='iso calibrated')
legend(loc='best')

### Platt regression

In [None]:
from sklearn.linear_model import LogisticRegression
from scipy.special import logit
# TODO: create and fit the Platt calibrator here
platt_calib = ...
# NOTE: transform the classifier output from [0, 1] to (-infty, +infty) with `logit`
# before training the Logistic Regression, because of the expression of its loss function

In [None]:
# `logit` maps exactly 0 and 1 to -inf / +inf, which scikit-learn estimators reject as
# input, so move the predictions slightly inside (0, 1) before the transformation.
# NOTE(review): the same clipping should be applied when `platt_calib` is fitted.
eps = 1e-15
clipped_probs = numpy.clip(blending_probs[test_index2], eps, 1 - eps)
platt_probs = platt_calib.predict_proba(logit(clipped_probs).reshape(-1, 1))[:, 1]

#### Compare histograms for the calibrated output and the initial output

#### Compare Platt calibrated probabilities and estimated probabilities using bins
* Plot Platt calibrated output for [0, 1]
* Divide output into several bins, for each bin compute $s_i / (s_i + b_i)$ - estimated probability in bin to be a signal event

In [None]:
# Uncalibrated predictions selected by `test_index2`
plot_probs(blending_probs[test_index2], 'standard')
# TODO: add the curve for the Platt-calibrated output
plot(..., label='platt calibrated')
legend(loc='best')

#### Compute AUC, logloss, MSE for initial and calibrated values. How do metrics vary after the calibration?

In [None]:
from sklearn.metrics import log_loss, mean_squared_error

def compute_metrics_for_calibration(probs_initial, probs, name):
    """Print AUC, log loss and MSE for the initial and for the calibrated predictions.

    All metrics are computed against the `signal` column of the global `test2`.

    Parameters
    ----------
    probs_initial : array-like
        Predictions before calibration; printed with the prefix 'Initial'.
    probs : array-like
        Predictions after calibration; printed with the prefix `name`.
    name : str
        Label of the calibration method, used as prefix for the second group of lines.
    """
    metrics = [
        ('AUC:', roc_auc_score),
        ('Log loss:', log_loss),
        ('MSE:', mean_squared_error),
    ]
    for prefix, predictions in [('Initial', probs_initial), (name, probs)]:
        for metric_label, metric in metrics:
            value = metric(test2.signal, predictions)
            # One pre-formatted string per line keeps the output identical under the
            # Python 2 print statement and the Python 3 print function.
            print('{} {} {}'.format(prefix, metric_label, value))

#### Metrics for isotonic method

In [None]:
# Metrics before and after isotonic calibration, for the predictions selected by `test_index2`
compute_metrics_for_calibration(blending_probs[test_index2], iso_probs, 'Isotonic')

#### Metrics for Platt method

In [None]:
# Metrics before and after Platt calibration, for the predictions selected by `test_index2`
compute_metrics_for_calibration(blending_probs[test_index2], platt_probs, 'Platt')

#### Log loss and MSE become lower (better), while AUC also becomes lower (worse); for the Platt method the AUC stays the same

--------

# Hypotheses metrics

Train any model on training data and compare different metrics on the test data:

* $\frac{s} {\sqrt{(s + b)}}$
* $\frac{s} {\sqrt{(10 + b)}}$
* $\frac{s} {\sqrt{(0.1 + b)}}$
* $\frac{s} {(2.5 + \sqrt{b})}$ - Punzi metric

# U-test

Use U-test to compare different ND pdfs

In [None]:
data_agreement = pandas.read_csv('datasets/check_agreement.csv')

# Rows with signal == 1 from the agreement sample and from the training sample
agreement_signal = data_agreement[data_agreement.signal == 1]
training_signal = data[data.signal == 1]
data_MC = pandas.concat([agreement_signal, training_signal])

# Overwrite `signal` with the origin of each row:
# 0 for rows taken from the agreement sample, 1 for rows taken from the training sample
origin_labels = [0] * len(agreement_signal) + [1] * len(training_signal)
data_MC['signal'] = numpy.array(origin_labels)

In [None]:
# Feature sets for the two classifiers trained in the U-test section
agreement_features = [
    'LifeTime',
    'VertexChi2',
    'DOCAtwo',
]
disagreement_features = [
    'dira',
    'IP',
    'IPSig',
    'IP_p0p2',
    'IP_p1p2',
    'isolationb',
]

In [None]:
# Divide `data_MC` into train and test parts.
# `random_state` is fixed so that the split is the same on every run.
train_MC_index, test_MC_index = train_test_split(range(len(data_MC)), random_state=42)
train_MC = data_MC.iloc[train_MC_index, :]
test_MC = data_MC.iloc[test_MC_index, :]

## U-test function

Code the U-statistic and compute the number of sigmas: $\frac{U - \mathbb{E}U}{\sqrt{\mathbb{V}U}}$

In [None]:
def u_test_compute(labels, probs):
    """Compute the significance of the U-test (exercise stub, to be implemented).

    Expected to return the number of sigmas, (U - E[U]) / sqrt(Var[U]), where U is
    the U-statistic.

    NOTE(review): `labels` and `probs` are presumably the true class labels and the
    classifier predictions for the same events -- confirm when implementing.
    """
    # should return number of sigmas
    ...

## Train the classifier to distinguish two ND pdfs

### Train model on the agreement features

### Train model on the disagreement features

### Compute AUC and U-test sigmas


#### What can we say about the similarity of the two ND pdfs in both examples?