In [None]:
%pylab inline

In [None]:
import numpy
import pandas

In [None]:
# keyword arguments presumably intended for hist(...) calls (not used in the visible cells)
hist_kw = {'bins': 60, 'normed': True, 'alpha': 0.5}

# Stability. Confidence Intervals

Compare the stability of the ROC curve for a simple decision tree and for an ensemble method of your choice. Do their confidence intervals for the ROC curve and for the AUC really differ?

In [None]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
# load the training sample; the path is relative to the notebook's working directory
data = pandas.read_csv('datasets/training.csv')

In [None]:
# display the column names available in the training sample
data.columns

In [None]:
# Training features: every column except identifiers, labels and the mass.
# Filtering data.columns in place keeps the file's column order, so the feature
# list is identical on every run (list(set(...)) yields an arbitrary order).
variables = [column for column in data.columns
             if column not in {'id', 'min_ANNmuon', 'mass', 'signal', 'production'}]

### Define function to compute CL for ROC curve and AUC

In [None]:
from sklearn import clone
def compute_CL(models, x_fpr, iterations=30):
    """Estimate the spread of ROC curves and AUC over repeated random train/test splits.

    For every model the global ``data`` frame is split ``iterations`` times with
    ``train_test_split``. Each round is meant to train a fresh clone of the model
    on the train part and score the test part (that step is the exercise
    placeholder inside the loop). The mean and standard deviation of the AUC
    are printed per model.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator.
    x_fpr : array-like
        FPR grid onto which every ROC curve is linearly interpolated.
    iterations : int
        Number of random splits per model.

    Returns
    -------
    dict
        ``{name: (mean_roc, std_roc)}``: point-wise mean and standard deviation
        of the interpolated TPR values on ``x_fpr``.
    """
    result = {}
    for name, model in models.items():
        aucs = []
        rocs = []
        # repeat training `iterations` times, each time on a different random training sample
        for _ in range(iterations):
            # divide randomly into train - test samples (positional row indices)
            train_ind, test_ind = train_test_split(range(len(data)))
            train_data = data.iloc[train_ind, :]
            test_data = data.iloc[test_ind, :]
            # training and computing fpr and tpr (use clone method to clone model)
            # TODO(exercise): this step has to define `probs`, `fpr` and `tpr` used below
            ...
            # linear interpolation for roc curve
            rocs.append(numpy.interp(x_fpr, fpr, tpr))
            aucs.append(roc_auc_score(test_data.signal.values, probs))
        mean_roc = numpy.mean(rocs, axis=0)
        std_roc = numpy.std(rocs, axis=0)
        # single-argument print(...) gives the same text on Python 2 and Python 3
        print('%s %s %s' % (name, numpy.mean(aucs), numpy.std(aucs)))
        result[name] = (mean_roc, std_roc)
    return result
        
def plot_roc_CL(x_fpr, mean_roc, std_roc, r_xlim=(0, 1), r_ylim=(0, 1)):
    """Draw ``mean_roc`` and the two curves ``mean_roc +/- std_roc`` against ``x_fpr``.

    ``r_xlim`` and ``r_ylim`` are (low, high) pairs applied to the x and y axes,
    which allows zooming into a region of the ROC curve.
    """
    figsize(10, 8)
    curves = (
        (mean_roc, 'mean', 'r'),
        (mean_roc + std_roc, '+', 'b'),
        (mean_roc - std_roc, '-', 'g'),
    )
    for tpr_values, curve_label, curve_color in curves:
        plot(x_fpr, tpr_values, label=curve_label, color=curve_color)
    legend()
    # x axis first, then y axis
    for set_limits, bounds in ((xlim, r_xlim), (ylim, r_ylim)):
        set_limits(bounds[0], bounds[1])

In [None]:
# common FPR grid on which all ROC curves are interpolated: 100 evenly spaced points in [0, 1]
fpr_points = numpy.linspace(0, 1, num=100)

### Confidence ROC curve intervals for simple tree 

In [None]:
from sklearn.tree import DecisionTreeClassifier
# TODO(exercise): replace `...` with the tree hyper-parameters of your choice
simple_tree = DecisionTreeClassifier(...)

### Confidence ROC curve intervals for ensemble model

In [None]:
# ensemble model

In [None]:
# compute results
# TODO(exercise): replace `...` with the ensemble model entry; a later cell reads
# result['GB'], so the ensemble has to be registered under the key 'GB'
result = compute_CL({'tree': simple_tree, ...}, fpr_points)

#### ROC CL for tree

In [None]:
# result['tree'] holds the (mean_roc, std_roc) pair returned by compute_CL
plot_roc_CL(fpr_points, mean_roc=result['tree'][0], std_roc=result['tree'][1])

#### ROC CL for ensemble model

In [None]:
# result['GB'] holds the (mean_roc, std_roc) pair returned by compute_CL
plot_roc_CL(fpr_points, mean_roc=result['GB'][0], std_roc=result['GB'][1])

# Flatness models

Compare flatness models (uGBFL and knnAdaLoss), trained on the mass, with any ensemble model (1) trained on the mass and 2) without it).

* Do they have comparable qualities? 
* What about CvM values? (check on `data_correlation.csv`)
* Do you see non-flatness? (plot for several thresholds the local efficiency in the mass bin)

**Use**:
    
     from hep_ml.gradientboosting import UGradientBoostingClassifier
     from hep_ml.losses import BinFlatnessLossFunction, KnnFlatnessLossFunction, KnnAdaLossFunction

**Note**:

* `UGradientBoostingClassifier` has a parameter `train_features`, from which `mass` (and any other flatness variables) should be absent. It uses a `pandas.DataFrame` as input for `fit`.
* Loss functions have a `uniform_features` parameter; set it to `['mass']`. They also have `uniform_label`, which for us should be the zero label (background).

In [None]:
data = pandas.read_csv('datasets/training.csv')
data_correlation = pandas.read_csv('datasets/check_correlation.csv')
# Features shared with the correlation-check sample, minus identifiers, label, mass and SPDhits.
# Filtering the columns in place keeps the file's column order, so the feature
# list is identical on every run (list(set(...)) yields an arbitrary order).
train_features = [column for column in data_correlation.columns
                  if column not in {'id', 'signal', 'mass', 'SPDhits'}]

In [None]:
# Randomly split the row positions of `data`, then slice out the train and test frames
train_index, test_index = train_test_split(range(len(data)))
train, test = (data.iloc[positions, :] for positions in (train_index, test_index))

### Compute the order of non-correlated model

In [None]:
from utils import compute_cvm
# reference value: compute_cvm for purely random predictions, one per row of data_correlation
compute_cvm(data_correlation.mass.values, numpy.random.random(len(data_correlation)))

In [None]:
def test_model(model, features):
    """Print the model's correlation-with-mass check and its AUC on the test sample.

    The correlation value is whatever ``compute_cvm`` returns for the mass values
    of ``data_correlation`` and the model's class-1 probabilities on that sample.
    The AUC is computed on the global ``test`` frame.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    features : list of column names passed to ``predict_proba``
    """
    model_cvm = model.predict_proba(data_correlation[features])[:, 1]
    model_corr = compute_cvm(data_correlation.mass.values, model_cvm)
    # single-argument print(...) gives the same text on Python 2 and Python 3
    print('Correlation %s' % (model_corr,))
    test_probs = model.predict_proba(test[features])[:, 1]
    print('AUC %s' % (roc_auc_score(test.signal.values, test_probs),))

### Standard model, trained on the mass

### Standard model, trained without mass

### FlatnessLoss with bins approximation for CvM computations

In [None]:
from hep_ml import gradientboosting, losses
from hep_ml.gradientboosting import UGradientBoostingClassifier
# define loss function, here fl_coefficient is flatness coefficient
loss = losses.BinFlatnessLossFunction(
    uniform_features=['mass'],
    uniform_label=0,
    n_bins=20,
    fl_coefficient=5,
)
# `mass` is not in train_features, but the loss still reads it from the frame given to fit
ugb_flatness_loss = UGradientBoostingClassifier(
    loss=loss,
    train_features=train_features,
    subsample=0.5,
    max_features=8,
    min_samples_leaf=50,
    max_depth=6,
)
ugb_flatness_loss.fit(train, train.signal.values)
test_model(ugb_flatness_loss, train_features)

### FlatnessLoss with knn approximation for CvM computations

In [None]:
# define loss function (flatness in mass is required for label 0)
loss = losses.KnnFlatnessLossFunction(
    uniform_features=['mass'],
    uniform_label=0,
    n_neighbours=50,
    fl_coefficient=4,
)
...

### knnAdaLoss

In [None]:
# define loss function
# KnnAdaLossFunction is taken from hep_ml.losses, like the other loss functions in this notebook
loss = losses.KnnAdaLossFunction(uniform_features=['mass'], uniform_label=0, knn=10)
...

## Compare efficiencies

In [None]:
def plot_efficiencies(model, features, mass_bins=20):
    """Plot the selection efficiency in each mass bin of ``data_correlation``.

    Four cuts are drawn, placed at the 20th, 40th, 60th and 80th percentiles of
    the predicted class-1 probability. For each cut the plotted value per bin is
    the fraction of that bin's events with probability above the cut.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    features : list of column names passed to ``predict_proba``
    mass_bins : int
        Number of mass bins; bin edges are equally spaced mass percentiles.
    """
    figsize(8, 5)
    probs = model.predict_proba(data_correlation[features])[:, 1]
    m_b = data_correlation.mass.values
    mass_p = numpy.percentile(m_b, numpy.linspace(0, 100, mass_bins + 1))
    mass_centers = mass_p[:-1] + (mass_p[1:] - mass_p[:-1]) / 2.
    # only the inner edges are searched, so every event gets a bin index in 0 .. mass_bins - 1
    bins_index = numpy.searchsorted(mass_p[1:-1], m_b)
    n_bins = len(mass_centers)
    bin_totals = numpy.bincount(bins_index, minlength=n_bins)
    # a bin can stay empty when many mass values are tied; avoid dividing by zero there
    safe_totals = numpy.maximum(bin_totals, 1)

    for threshold in numpy.percentile(probs, [20, 40, 60, 80]):
        passed_bins = numpy.bincount(bins_index, weights=(probs > threshold) * 1., minlength=n_bins)
        # normalise by the bin population: an efficiency, not a raw count of passing events
        eff_bins = passed_bins / safe_totals
        plot(mass_centers, eff_bins, label=threshold)
    xlabel('mass')
    ylabel('efficiency')
    legend()

In [None]:
# TODO(exercise): replace `...` with a trained model and the list of features it was trained on
plot_efficiencies(...)

## Can you combine several models to improve AUC?

Remember that the correlation should be less than 0.002, as in the Kaggle competition.