In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import os
%matplotlib inline

In [2]:
# Load the training and evaluation samples used for the classifier comparison.
train = pd.read_csv('/notebooks/data/samples_for_comparison/sample_train.csv')
# NOTE(review): `eval` shadows the Python builtin of the same name. The name is
# kept because other cells may refer to it; consider renaming notebook-wide.
eval = pd.read_csv('/notebooks/data/samples_for_comparison/sample_test.csv')

# Particle name -> integer code compared against |MCParticleType| when labels
# are built (values look like PDG particle IDs -- TODO confirm).
part_codes = {'Ghost':0, 'Electron':11, 'Muon':13, 'Pion':211, 'Kaon':321, 'Proton':2212}

# Absolute particle-type codes. These must be extracted BEFORE the drops below,
# because 'MCParticleType' is one of the columns removed from both frames.
train_types = np.abs(train['MCParticleType'])
eval_types = np.abs(eval['MCParticleType'])

# Columns removed from both frames before training (judging by the list name,
# Monte-Carlo truth and bookkeeping columns -- TODO confirm).
mc_features = ['Unnamed: 0', 'HasMC', 'MCParticleType', 'MCParticleP', 'MCParticlePt', 'MCVirtualMass', 
               'MCFromB', 'MCFromD', 'MCVertexType', 'MCVertexX', 'MCVertexY', 'MCVertexZ', 
               'piplus_OWNPV_X', 'piplus_OWNPV_Y', 'piplus_OWNPV_Z', 'piplus_OWNPV_XERR', 
               'piplus_OWNPV_YERR', 'piplus_OWNPV_ZERR', 'piplus_OWNPV_CHI2', 'piplus_OWNPV_NDOF', 
               'piplus_IP_OWNPV', 'piplus_IPCHI2_OWNPV', 'nCandidate', 'totCandidates', 'EventInSequence', 
               'TrackHistory', 'TrackType', 'RecoPIDcode']

# Also removed from both frames (presumably constant in these samples, per the
# list name -- TODO confirm).
constant_features = ['TrackMatchChi2', 'TrackCloneDist', 'RichUsedAero']

# `inplace` must be a real bool: recent pandas versions validate the argument
# and raise ValueError for an int such as 1.
train.drop(mc_features + constant_features, axis=1, inplace=True)
# 'Unnamed: 0.1' is dropped from the training frame only; the evaluation frame
# is not given a matching drop.
train.drop('Unnamed: 0.1', axis=1, inplace=True)
eval.drop(mc_features + constant_features, axis=1, inplace=True)
# Parenthesised single-argument print gives the same text on Python 2 and 3.
print(train.shape)
print(eval.shape)

(120000, 77)
(100000, 77)


In [3]:
config_path = '/notebooks/data/configs/networks/TMVA-Run2-NoTkLikCDVelodEdx/'
# Number of leading entries of each config file that are skipped before the
# feature names start (assumed to be header/settings entries -- TODO confirm
# against the config file format).
N_CONFIG_HEADER_ENTRIES = 5


def get_features_for(part_type, config_dir=None):
    """Return the entries of a particle type's network config that follow its header.

    The file ``GlobalPID_<part_type>_Long_ANN.txt`` is read from ``config_dir``
    as whitespace-separated text, ignoring everything after a ``#`` on a line.
    The first ``N_CONFIG_HEADER_ENTRIES`` entries are discarded and the rest is
    returned; callers in this notebook use the result as the feature list.

    Parameters
    ----------
    part_type : str
        Particle name substituted into the file name (e.g. ``'Muon'``).
    config_dir : str, optional
        Directory containing the config files. Defaults to the module-level
        ``config_path``.

    Returns
    -------
    numpy.ndarray
        1-D array of strings with the remaining config entries.
    """
    if config_dir is None:
        config_dir = config_path
    config_file_name = 'GlobalPID_{0}_Long_ANN.txt'.format(part_type)
    # dtype=str is the byte-string dtype on Python 2 (same as the former 'S')
    # and native text on Python 3, so the names match DataFrame column labels
    # under either interpreter.
    config = np.loadtxt(os.path.join(config_dir, config_file_name), dtype=str, comments='#')
    return config[N_CONFIG_HEADER_ENTRIES:]

In [4]:
import cPickle as pickle
from rep.estimators.tmva import TMVAClassifier

# Particle type treated as the positive class in the one-vs-rest setup below.
main_class = 'Muon'
# Binary target: 1 where the absolute particle code equals the code of
# `main_class`, 0 otherwise.
labels = (train_types == part_codes[main_class]).astype(int)
features = get_features_for(main_class)

# Hidden-layer setting handed to TMVA: 1.2 times the number of input features,
# truncated to an integer.
hidden_layer_size = int(1.2 * len(features))

# TMVA multilayer-perceptron classifier; every keyword besides `method`,
# `factory_options` and `features` is passed through as given.
tmva_kmlp = TMVAClassifier(
    method='kMLP',
    factory_options="V:!Silent:!Color:!DrawProgressBar",
    features=features,
    H='true',
    V='true',
    EpochMonitoring='true',
    HiddenLayers=hidden_layer_size,
    UseRegulator='true',
    ConvergenceImprove="1e-16",
    ConvergenceTests="15",
    VarTransform="Norm",
    NCycles=50,
    NeuronType="sigmoid",
    TrainingMethod="BP",
    EstimatorType="CE",
)

# Training and persistence of the kMLP model are currently disabled; the
# classifier above is only constructed, not fitted.
# tmva_kmlp.fit(train, labels)
#
# with open('models/{0}-{1}.pkl'.format('kMLP', main_class), 'wb') as out:
#     pickle.dump(tmva_kmlp, out, 2)

In [6]:
# Binary target for `main_class`, recomputed here so this cell does not rely on
# `labels` still holding the value assigned in an earlier cell.
labels = (train_types == part_codes[main_class]).astype(int)

# Destination of the pickled model. The directory is created up front so that a
# missing 'models' folder is dealt with before training rather than failing at
# the final write and losing the fitted model.
model_path = 'models/{0}-{1}.pkl'.format('kBDT', main_class)
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# TMVA boosted-decision-tree classifier (AdaBoost, 50 trees, depth 3) with
# cost-complexity pruning; options are passed through as given.
tmva_kbdt = TMVAClassifier(method='kBDT', 
    factory_options="V:!Silent:!Color:!DrawProgressBar",
    features=features, 
    H='false', 
    V='true', 
    NTrees=50, 
    VarTransform = "Norm", 
    BoostType = "AdaBoost", 
    PruneMethod = "CostComplexity", 
    PruneStrength = -1, 
    MaxDepth = 3, 
    PruningValFraction = 0.3)
tmva_kbdt.fit(train, labels)

# Persist the fitted classifier with pickle protocol 2.
with open(model_path, 'wb') as out:
    pickle.dump(tmva_kbdt, out, 2)