In [8]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [9]:
import pandas
import numpy
import root_numpy
from rep.metaml import FoldingClassifier
from utils import shrink_floats

from utils import compute_labels_and_weights, compute_charges, names_labels_correspondence, labels_names_correspondence
from utils import plot_hist_features, roc_auc_score_one_vs_all
from utils import roc_auc_score_one_vs_all_for_separate_algorithms, compute_cvm_by_particle, plot_flatness_by_particle
import cPickle

## Read the data

In [10]:
# Load the training tree into a DataFrame; shrink_floats is a project helper applied to the frame
# afterwards (presumably downcasting float columns in place to save memory — TODO confirm).
data = pandas.DataFrame(
    data=root_numpy.root2array(filenames='../data/global_train.root', treename='tree')
)
shrink_floats(data)

In [11]:
# Load the test tree the same way as the training one.
data_test = pandas.DataFrame(
    data=root_numpy.root2array(filenames='../data/global_test.root', treename='tree')
)
shrink_floats(data_test)

In [12]:
# Number of rows in the training frame.
len(data)

5999993

In [13]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,VeloCharge,BremPIDe,CaloNeutralPrs,CaloNeutralSpd,InAccBrem,InAccSpd,CaloPrsE,InAccPrs,HcalPIDe,CaloHcalE,...,piplus_OWNPV_XERR,piplus_OWNPV_YERR,piplus_OWNPV_ZERR,piplus_OWNPV_CHI2,piplus_OWNPV_NDOF,piplus_IP_OWNPV,piplus_IPCHI2_OWNPV,nCandidate,totCandidates,EventInSequence
0,1.02809,-999,-999,-999,0,0,-999.0,0,-999.0,-999.0,...,0.0096,0.0096,0.0532,33.816654,83,0.11134,15.622943,16,37,9099
1,1.044008,-999,-999,-999,0,0,-999.0,0,-999.0,-999.0,...,0.0064,0.0063,0.0358,67.432457,187,0.025907,0.510528,38,106,12220
2,0.853933,-999,-999,-999,0,1,155.237808,1,-1.92618,27002.507812,...,0.0108,0.0108,0.0557,29.747982,67,0.136919,5.52092,14,76,8573
3,1.30618,-999,-999,-999,0,1,11.793685,1,0.434916,0.0,...,0.0086,0.0084,0.0453,45.295311,97,0.572526,2.692502,25,50,13449
4,0.969101,-999,-999,-999,0,1,113.548508,1,1.788384,0.0,...,0.0115,0.011,0.0773,33.635342,71,0.101532,4.014179,24,86,1379


#### Add the signal column (class labels 0 to 5), weights (to balance the classes) and charges (to check the asymmetry)

In [14]:
# Derive class label, balancing weight and charge from the MC truth particle type (training sample).
train_mc_types = data.MCParticleType.values
data['Signal'], data['Weight'] = compute_labels_and_weights(train_mc_types)
data['Charge'] = compute_charges(train_mc_types)

In [15]:
# Derive class label, balancing weight and charge from the MC truth particle type (test sample).
test_mc_types = data_test.MCParticleType.values
data_test['Signal'], data_test['Weight'] = compute_labels_and_weights(test_mc_types)
data_test['Charge'] = compute_charges(test_mc_types)

#### Check the initial charge asymmetry in the data

In [16]:
print "positive tracks\t", numpy.mean(data.Charge > 0)
print "negative tracks\t", numpy.mean(data.Charge < 0)
print "GHOST tracks\t", numpy.mean(data.Charge == 0)

positive tracks	0.421249824791
negative tracks	0.412084147432
GHOST tracks	0.166666027777


In [17]:
print "positive tracks\t", numpy.mean(data_test.Charge > 0)
print "negative tracks\t", numpy.mean(data_test.Charge < 0)
print "GHOST tracks\t", numpy.mean(data_test.Charge == 0)

positive tracks	0.421374868958
negative tracks	0.411958519931
GHOST tracks	0.166666611111


#### weights

In [18]:
# Distinct values of the balancing weights assigned above.
numpy.unique(data.Weight)

array([ 0.99989284,  0.99989484,  0.99989784,  0.99990384,  0.99990484,
        0.99990584])

#### number of tracks for each type

In [19]:
for key, val in names_labels_correspondence.items():
    print '{:10} \t'.format(key), sum(data.Signal == val)

Ghost      	999995
Electron   	1000004
Muon       	1000001
Pion       	1000006
Kaon       	999993
Proton     	999994


# Read features

In [20]:
import json
with open('./features.json', 'r') as f:
    features = json.load(f)
features_original = concatenate(features.values())
print len(features_original)

70


# DLLs experiments

In [50]:
# `sklearn.cross_validation` was removed from later scikit-learn releases.  The legacy module is
# still tried first so the behaviour in the environment this notebook was written for is
# untouched; the fallback only applies where the original import would fail.
try:
    from sklearn.cross_validation import train_test_split
except ImportError:
    from sklearn.model_selection import train_test_split
# 40% of the rows go to `train`; the remainder becomes `test`.
train, test = train_test_split(data, random_state=11, train_size=0.4)

In [22]:
# Show the feature groups loaded from features.json.
features

{u'CALO': [u'CaloBremMatch',
  u'CaloElectronMatch',
  u'CaloTrMatch',
  u'CaloTrajectoryL',
  u'CaloChargedSpd',
  u'CaloChargedPrs',
  u'CaloChargedEcal',
  u'CaloNeutralSpd',
  u'CaloNeutralPrs',
  u'CaloNeutralEcal',
  u'CaloSpdE',
  u'CaloPrsE',
  u'CaloEcalE',
  u'CaloHcalE',
  u'EcalPIDmu',
  u'HcalPIDmu',
  u'PrsPIDe',
  u'BremPIDe',
  u'EcalPIDe',
  u'HcalPIDe'],
 u'DLL': [u'CombDLLmu', u'CombDLLpi', u'CombDLLp', u'CombDLLe', u'CombDLLk'],
 u'RICH': [u'RichAboveMuThres',
  u'RichAboveElThres',
  u'RichAbovePiThres',
  u'RichAboveKaThres',
  u'RichAbovePrThres',
  u'RichUsedR1Gas',
  u'RichUsedR2Gas',
  u'RichDLLbt',
  u'RichDLLpi',
  u'RichDLLe',
  u'RichDLLp',
  u'RichDLLmu',
  u'RichDLLk'],
 u'acceptance': [u'InAccSpd',
  u'InAccPrs',
  u'InAccBrem',
  u'InAccEcal',
  u'InAccHcal',
  u'InAccMuon'],
 u'muon': [u'MuonNShared',
  u'MuonIsLooseMuon',
  u'MuonIsMuon',
  u'MuonBkgLL',
  u'MuonMuLL'],
 u'track': [u'TrackFitVeloChi2',
  u'TrackFitVeloNDoF',
  u'TrackFitMatchChi2',
 

In [29]:
from hep_ml.nnet import MLPMultiClassifier
from rep.metaml import FoldingClassifier

In [30]:
%%time
nn_COMBO = FoldingClassifier(MLPMultiClassifier(layers=(12, 6), scaler='iron', epochs=700),
                             features=features['DLL'] + ['TrackGhostProbability'], random_state=11)
nn_COMBO.fit(train, train.Signal, train.Weight)

CPU times: user 32min 32s, sys: 5min 42s, total: 38min 14s
Wall time: 20min 41s


In [31]:
%%time
nn_RICH = FoldingClassifier(MLPMultiClassifier(layers=(12, 6), scaler='iron', epochs=700),
                            features=['RichDLLbt', 'RichDLLpi', 'RichDLLe', 'RichDLLp', 'RichDLLmu', 'RichDLLk'],
                            random_state=11)
nn_RICH.fit(train, train.Signal, train.Weight)

CPU times: user 32min 20s, sys: 6min 1s, total: 38min 21s
Wall time: 20min 59s


In [32]:
%%time
nn_CALO = FoldingClassifier(MLPMultiClassifier(layers=(12, 6), scaler='iron', epochs=700),
                            features=['PrsPIDe', 'BremPIDe', 'EcalPIDe', 'HcalPIDe', 'EcalPIDmu', 'HcalPIDmu'],
                            random_state=11)
nn_CALO.fit(train, train.Signal, train.Weight)

CPU times: user 32min 59s, sys: 4min 47s, total: 37min 47s
Wall time: 19min 53s


In [33]:
new_p = numpy.concatenate([nn_COMBO.predict_proba(train), 
                           nn_RICH.predict_proba(train), nn_COMBO.predict_proba(train)], axis=1)

KFold prediction using folds column
KFold prediction using folds column
KFold prediction using folds column


In [34]:
%%time
nn_full = FoldingClassifier(MLPMultiClassifier(layers=(12, 6), scaler='iron', epochs=700),
                            random_state=11)
nn_full.fit(new_p, train.Signal, train.Weight)

CPU times: user 37min 32s, sys: 5min 54s, total: 43min 27s
Wall time: 24min 19s


In [37]:
new_p_test = numpy.concatenate([nn_COMBO.predict_proba(test), 
                               nn_RICH.predict_proba(test), nn_COMBO.predict_proba(test)], axis=1)
new_p_test_full = numpy.concatenate([nn_COMBO.predict_proba(data_test), 
                                     nn_RICH.predict_proba(data_test), nn_COMBO.predict_proba(data_test)], axis=1)
pd_test = pandas.DataFrame(nn_full.predict_proba(new_p_test), columns=['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])
pd_test_full = pandas.DataFrame(nn_full.predict_proba(new_p_test_full), columns=['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])

KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)
KFold prediction using random classifier (length of data passed not equal to length of train)


In [55]:
# Attach the stacked-network probabilities (f1..f6) as extra columns.
# pd_test / pd_test_full are built from plain arrays and so carry a fresh 0..n-1 index; `test` is
# re-indexed positionally so that concat pairs rows by position rather than by the labels it had.
stacked_columns = list(pd_test.columns)
# Columns left over from a previous run of this cell are dropped first, so re-running the cell
# no longer appends duplicate f1..f6 columns.
test = test.drop(stacked_columns, axis=1, errors='ignore').reset_index(drop=True)
test = pandas.concat([test, pd_test], axis=1)
data_test = pandas.concat([data_test.drop(stacked_columns, axis=1, errors='ignore'), pd_test_full], axis=1)

In [65]:
# NOTE(review): `dt_pv` is not defined in any cell visible in this notebook, so this cell fails on
# a fresh kernel. It looks like a leftover inspection — confirm, then remove it or restore the
# cell that defines `dt_pv`.
dt_pv.keys()

['Ghost']

In [None]:
# One binary (one-vs-rest) DecisionTrain model per particle type, trained on the held-out split
# `test`, which carries the stacked-network columns f1..f6.
# NOTE(review): SklearnClassifier / DecisionTrainClassifier are imported in a later cell of this
# notebook; on a fresh kernel that import has to run first.
dt_DLL = {}
for name, label in names_labels_correspondence.items():
    dt_DLL[name] = SklearnClassifier(DecisionTrainClassifier(n_threads=6, depth=7, 
                                    n_estimators=40000, learning_rate=0.1, 
                                    train_features=list(features_original) + ['f1', 'f2', 'f3', 'f4', 'f5', 'f6']))
    dt_DLL[name].fit(test, test.Signal.values == label)
    # Checkpoint the whole dict after every fitted class so finished models survive an interruption.
    # Pickles are written in binary mode ('wb') so the file is byte-exact on every platform.
    with open('models/dt_DLL.pkl', 'wb') as f:
        cPickle.dump(dt_DLL, f)

# DT with PV features

In [21]:
from decisiontrain import DecisionTrainClassifier
from rep.estimators import SklearnClassifier
from utils import ClassifiersFactoryByClass

In [30]:
from utils import convert_DLL_to_LL, compute_cum_sum

In [31]:
# The six RICH DLL column names (they are also part of features['RICH']).
features_RICH_DLL = ['RichDLLbt', 'RichDLLpi', 'RichDLLe', 'RichDLLp', 'RichDLLmu', 'RichDLLk']

In [32]:
def add_constructed_features(data):
    """Return `data` extended with engineered columns, together with the names of those columns.

    Added columns:
      * the outputs of convert_DLL_to_LL for the combined DLLs and for the RICH DLLs
        (all RICH DLLs except 'RichDLLbt');
      * 'acc_cum_sum_3' and 'acc_cum_sum_5' taken from compute_cum_sum over the acceptance flags;
      * eight plain sums of existing RICH / SPD / calorimeter columns.

    convert_DLL_to_LL and compute_cum_sum are project helpers; the exact columns they return
    are not visible here. The result is a new frame built with pandas.concat; this function
    itself does not assign into the frame that was passed in.

    :param data: pandas.DataFrame holding the original feature columns
    :return: (extended DataFrame, list of names of the added columns)
    """
    data_comb = convert_DLL_to_LL(data, features['DLL'])
    # Filter with a comprehension instead of a set difference: the column order then follows
    # features_RICH_DLL and does not depend on string hashing.
    rich_dll_without_bt = [column for column in features_RICH_DLL if column != 'RichDLLbt']
    data_rich = convert_DLL_to_LL(data, rich_dll_without_bt)
    data_acceptance = compute_cum_sum(data, features['acceptance'], prefix_name='acc_cum_sum_')
    # Only these two cumulative sums are kept as features.
    acceptance_columns = ['acc_cum_sum_3', 'acc_cum_sum_5']
    added_features = list(data_comb.columns) + list(data_rich.columns) + acceptance_columns

    data = pandas.concat([data, data_rich, data_comb, data_acceptance[acceptance_columns]], axis=1)

    data['RichAboveSumPiKaElMuTHres'] = data.RichAbovePiThres + data.RichAboveKaThres + \
        data.RichAboveElThres + data.RichAboveMuThres
    data['RichAboveSumKaPrTHres'] = data.RichAboveKaThres + data.RichAbovePrThres
    data['RichUsedGas'] = data.RichUsedR1Gas + data.RichUsedR2Gas
    data['SpdCaloNeutralAcc'] = data.CaloNeutralSpd + data.InAccSpd  # for ghost
    data['SpdCaloChargedAcc'] = data.CaloChargedSpd + data.InAccSpd  # for muon
    data['SpdCaloChargedNeutral'] = data.CaloChargedSpd + data.CaloNeutralSpd  # for electron
    data['CaloSumSpdPrsE'] = data.CaloSpdE + data.CaloPrsE
    data['CaloSumPIDmu'] = data.EcalPIDmu + data.HcalPIDmu
    added_features = added_features + ['RichAboveSumPiKaElMuTHres', 'RichAboveSumKaPrTHres', 'RichUsedGas',
                                       'SpdCaloNeutralAcc', 'SpdCaloChargedAcc', 'SpdCaloChargedNeutral',
                                       'CaloSumSpdPrsE', 'CaloSumPIDmu']
    return data, added_features

In [33]:
# Build the extended train and test frames. Only the added-column list returned for the training
# frame is kept; the one for the test frame is discarded (presumably identical — TODO confirm).
data_extended, features_constructed = add_constructed_features(data)
data_test_extended, _ = add_constructed_features(data_test)

In [None]:
# One binary (one-vs-rest) DecisionTrain model per particle type, trained on the full training
# frame with the original plus the constructed features.
dt_pv_add = {}
for name, label in names_labels_correspondence.items():
    dt_pv_add[name] = SklearnClassifier(DecisionTrainClassifier(n_threads=14, depth=7, 
                                                                n_estimators=40000, learning_rate=0.1),
                                        features=list(features_original) + features_constructed)
    dt_pv_add[name].fit(data_extended, data_extended.Signal.values == label)
    # Checkpoint the whole dict after every fitted class so finished models survive an interruption.
    # Pickles are written in binary mode ('wb') so the file is byte-exact on every platform.
    with open('models/dt_pv_add.pkl', 'wb') as f:
        cPickle.dump(dt_pv_add, f)

In [59]:
# Signal-class probability of every per-particle model on the extended test frame.
# The keys of dt_pv_add are particle names (despite the loop variable being called `label`);
# names_labels_correspondence turns them into the integer class labels used as keys of `preds`.
preds = {}
for label, cl in dt_pv_add.items():
    preds[names_labels_correspondence[label]] = cl.predict_proba(data_test_extended)[:, 1]

# Pickles are written in binary mode ('wb') so the file is byte-exact on every platform.
with open('models/dt_pv_add_probs.pkl', 'wb') as f:
    cPickle.dump(preds, f)

In [57]:
for label, cl in dt_pv_add.items():
    print label
    print cl.get_feature_importances().sort_values(by='effect', ascending=False)[:10]
    print '\n'

Ghost
                         effect
TrackGhostProbability  0.109700
TrackPt                0.076400
TrackFitMatchChi2      0.060225
TrackChi2PerDof        0.056450
TrackP                 0.038550
piplus_IPCHI2_OWNPV    0.038200
TrackFitTNDoF          0.037425
TrackFitTChi2          0.032775
TrackNumDof            0.032325
piplus_IP_OWNPV        0.032175


Muon
                         effect
TrackP                 0.071200
TrackFitMatchChi2      0.056575
TrackGhostProbability  0.048000
TrackPt                0.046950
CombDLLmu_LL           0.042000
CaloSumPIDmu           0.035600
MuonIsMuon             0.032375
piplus_IPCHI2_OWNPV    0.031000
MuonBkgLL              0.030125
TrackNumDof            0.023100


Pion
                         effect
CombDLLpi_LL           0.079475
TrackP                 0.061375
TrackPt                0.056525
TrackGhostProbability  0.054075
RichDLLpi_LL           0.042300
RichDLLe               0.031100
TrackFitMatchChi2      0.027000
piplus_IPCHI2_OWNPV 

In [63]:
# Show the primary-vertex feature names as a one-column table.
pandas.DataFrame(features['PV'], columns=['PV features'])

Unnamed: 0,PV features
0,piplus_OWNPV_X
1,piplus_OWNPV_Y
2,piplus_OWNPV_Z
3,piplus_OWNPV_XERR
4,piplus_OWNPV_YERR
5,piplus_OWNPV_ZERR
6,piplus_OWNPV_CHI2
7,piplus_OWNPV_NDOF
8,piplus_IP_OWNPV
9,piplus_IPCHI2_OWNPV
