## About

In this notebook we prepare a simple solution for the [kaggle challenge on trigger system.](https://inclass.kaggle.com/c/trigger-system)

In [1]:
import pandas
import numpy
from rep.utils import train_test_split_group
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

### Download data

In [2]:
!cd datasets; wget -O training.csv -nc --no-check-certificate https://2016.mlhep.yandex.net/data/training.csv

wget: /root/miniconda/envs/rep_py2/lib/libcrypto.so.1.0.0: no version information available (required by wget)
wget: /root/miniconda/envs/rep_py2/lib/libssl.so.1.0.0: no version information available (required by wget)
File `training.csv' already there; not retrieving.


In [3]:
!cd datasets; wget -O test.csv -nc --no-check-certificate https://2016.mlhep.yandex.net/data/test.csv

wget: /root/miniconda/envs/rep_py2/lib/libcrypto.so.1.0.0: no version information available (required by wget)
wget: /root/miniconda/envs/rep_py2/lib/libssl.so.1.0.0: no version information available (required by wget)
File `test.csv' already there; not retrieving.


### Read training and test files

In [4]:
data = pandas.read_csv('datasets/training.csv')
test = pandas.read_csv('datasets/test.csv')

In [177]:
data.head()

Unnamed: 0,EventID,Label,Mass,Corrected_mass,Pt,Pt_sum,Pt_min,IP_chi2,IP_chi2_sum,Flight_distance,Pseudorapidity,Track_number_PV,Tracks_number,Tracks_number_passed,Vertex_chi2,Weight
0,0,1,2461.369995,3958.329995,5521.529995,5553.419995,2351.519995,116.829995,1457.219995,3004.729995,3.267565,0,2,2,0.000095,2.828167
1,1,1,3361.719998,3853.869998,4022.719998,4362.719998,936.834998,1.512528,259.576998,289.585998,3.461578,0,2,1,1.151478,2.828167
2,1,1,790.218991,5305.539991,4305.589991,4309.419991,891.455991,190.981991,270.212991,252.402991,3.612741,0,2,1,0.179642,2.828167
3,1,1,2631.380000,11106.300000,3147.820000,3840.460000,419.138000,82.668400,243.689000,242.731000,3.459510,1,2,1,2.147980,2.828167
4,1,1,4343.369999,6410.599999,3811.409999,4777.559999,421.900999,6.995589,263.454999,266.133999,3.851009,1,3,1,8.621519,2.828167
5,2,0,1210.270002,6930.040002,3615.460002,3789.090002,307.650002,15.843002,23.200202,17.135202,4.378792,1,2,1,2.685592,1.000000
6,2,0,1387.900006,2018.480006,5021.050006,5068.800006,1599.890006,2.830546,74.858306,173.240006,3.821076,0,2,2,1.191546,1.000000
7,3,1,3046.500007,6764.700007,9374.140007,9749.050007,3590.260007,53.740507,163.604007,397.543007,3.157587,1,2,1,0.015300,6.668962
8,3,1,2743.489990,5837.369990,6512.899990,6967.319990,873.714990,81.895690,355.404990,479.111990,3.143100,0,2,1,0.037786,6.668962
9,3,1,1150.369999,2797.529999,4378.589999,4513.949999,870.681999,29.449299,203.115999,349.560999,3.159029,1,2,0,0.434634,6.668962


### Define training features

Exclude `EventID`, `Label` and `Weight` from the features set

In [6]:
features = list(set(data.columns) - {'EventID', 'Label', 'Weight'})
features

['IP_chi2_sum',
 'Flight_distance',
 'Pt',
 'Tracks_number',
 'Pt_sum',
 'Corrected_mass',
 'Track_number_PV',
 'Pseudorapidity',
 'Tracks_number_passed',
 'Mass',
 'IP_chi2',
 'Pt_min',
 'Vertex_chi2']

### Divide training data into 2 parts
Here `train_test_split_group` function is used to divide into 2 parts to preserve secondary vertices from the same events in the same part of data (training or test). First argument should be events ids.

In [184]:
training_data, validation_data = train_test_split_group(data.EventID, data, random_state=11, train_size=0.66)

### Simple gradient boosting from `sklearn` training

We take all secondary vertices (SVs) for all events and train on them.

In [None]:
#gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, subsample=0.8, random_state=13,
#                                min_samples_leaf=100, max_depth=6, max_features=8)
#gb.fit(training_data[features], training_data.Label)

from sklearn.svm import SVC
gb = SVC(gamma=0.00000001, C=4, probability=True)
gb.fit(training_data[features], training_data.Label)

### Prepare predictions, labels and weights for events (not for SVs!) on the cross-validation sample

In [180]:
def compute_mean(event_ids, values):
    """ fore each event computes average of given values """
    number_of_sv_in_event = numpy.bincount(event_ids)
    return numpy.bincount(event_ids, weights=values) / number_of_sv_in_event

In [181]:
# Example of usage
compute_mean(event_ids=[0, 1, 3, 1], values=[1, 2, 3, 4])

array([  1.,   3.,  nan,   3.])

In [182]:
# predict each SV
proba = gb.predict_proba(validation_data[features])
events_ids = numpy.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = numpy.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_mean(validation_data.EventID, proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

### ROC AUC for events (with weights) on the cross validation sample

In [183]:
roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

0.84339175832102187

## Prepare submission to kaggle

In [None]:
# predict each SV in test sample
kaggle_proba = gb.predict_proba(test[features])

kaggle_ids = numpy.unique(test.EventID)
# compute predictions for events (take the mean value of predictions for SVs forming an event)
kaggle_events_proba = compute_mean(test.EventID, kaggle_proba[:, 1])[kaggle_ids]

In [None]:
from IPython.display import FileLink
def create_solution(ids, proba, filename='baseline.csv'):
    """saves predictions to file and provides a link for downloading """
    pandas.DataFrame({'EventID': ids, 'Label': proba}).to_csv('datasets/{}'.format(filename), index=False)
    return FileLink('datasets/{}'.format(filename))
    
create_solution(kaggle_ids, kaggle_events_proba)