In [1]:
import xgboost
import pandas as pd
import os
import utils

In [2]:
FEATURE_COLUMNS = ['ncl[0]', 'ncl[1]', 'ncl[2]', 'ncl[3]', 'avg_cs[0]',
       'avg_cs[1]', 'avg_cs[2]', 'avg_cs[3]', 'ndof', 'MatchedHit_TYPE[0]',
       'MatchedHit_TYPE[1]', 'MatchedHit_TYPE[2]', 'MatchedHit_TYPE[3]',
       'MatchedHit_X[0]', 'MatchedHit_X[1]', 'MatchedHit_X[2]',
       'MatchedHit_X[3]', 'MatchedHit_Y[0]', 'MatchedHit_Y[1]',
       'MatchedHit_Y[2]', 'MatchedHit_Y[3]', 'MatchedHit_Z[0]',
       'MatchedHit_Z[1]', 'MatchedHit_Z[2]', 'MatchedHit_Z[3]',
       'MatchedHit_DX[0]', 'MatchedHit_DX[1]', 'MatchedHit_DX[2]',
       'MatchedHit_DX[3]', 'MatchedHit_DY[0]', 'MatchedHit_DY[1]',
       'MatchedHit_DY[2]', 'MatchedHit_DY[3]', 'MatchedHit_DZ[0]',
       'MatchedHit_DZ[1]', 'MatchedHit_DZ[2]', 'MatchedHit_DZ[3]',
       'MatchedHit_T[0]', 'MatchedHit_T[1]', 'MatchedHit_T[2]',
       'MatchedHit_T[3]', 'MatchedHit_DT[0]', 'MatchedHit_DT[1]',
       'MatchedHit_DT[2]', 'MatchedHit_DT[3]', 'Lextra_X[0]', 'Lextra_X[1]',
       'Lextra_X[2]', 'Lextra_X[3]', 'Lextra_Y[0]', 'Lextra_Y[1]',
       'Lextra_Y[2]', 'Lextra_Y[3]', 'NShared', 'Mextra_DX2[0]',
       'Mextra_DX2[1]', 'Mextra_DX2[2]', 'Mextra_DX2[3]', 'Mextra_DY2[0]',
       'Mextra_DY2[1]', 'Mextra_DY2[2]', 'Mextra_DY2[3]', 'FOI_hits_N']

In [3]:
DATA_PATH = "../MuID_Run_III/extractor"

Reading data takes around 5 minutes. We need to find a better format for the variable-length arrays in FOI*. Possibiities:
1. `pandas.to_hdf` failed with something related to too high number of entries
2. `pandas.to_picke` is possible, but also slow and platform-dependent
3. CERN ROOT should do the job, but will be unfamiliar to participants

In [4]:
train = pd.read_csv(os.path.join(DATA_PATH, "train.csv.gz"), usecols=FEATURE_COLUMNS+["label", "weight"])

In [5]:
test = pd.read_csv(os.path.join(DATA_PATH, "test_public.csv.gz"), usecols=FEATURE_COLUMNS+["label", "weight"])

In [6]:
train.head()

Unnamed: 0,ncl[0],ncl[1],ncl[2],ncl[3],avg_cs[0],avg_cs[1],avg_cs[2],avg_cs[3],ndof,MatchedHit_TYPE[0],...,Mextra_DX2[1],Mextra_DX2[2],Mextra_DX2[3],Mextra_DY2[0],Mextra_DY2[1],Mextra_DY2[2],Mextra_DY2[3],FOI_hits_N,weight,label
0,25.0,8.0,7.0,14.0,1.4,1.25,1.142857,1.571428,8,2.0,...,5029.518,11995.114,24673.143,1660.9003,4950.3394,11806.284,24284.738,5,0.145543,1
1,27.0,5.0,5.0,8.0,2.148148,1.0,1.8,1.25,8,2.0,...,222.80173,495.79556,951.4588,75.818436,216.88867,482.6372,926.20703,4,-0.571444,1
2,22.0,5.0,2.0,18.0,2.636364,3.2,1.0,1.5,8,1.0,...,1135.8945,2581.8015,5005.966,379.30255,1135.3718,2580.613,5003.661,9,0.217224,0
3,25.0,15.0,5.0,4.0,2.04,1.8,1.8,1.0,8,2.0,...,8.44056,18.501749,34.737015,2.933072,8.46011,18.544601,34.81747,6,0.146734,1
4,23.0,4.0,9.0,10.0,1.695652,1.0,1.0,1.2,8,2.0,...,32.87032,73.181076,139.68002,11.306026,32.76765,72.9525,139.24373,4,0.147967,1


In [7]:
train.columns

Index(['ncl[0]', 'ncl[1]', 'ncl[2]', 'ncl[3]', 'avg_cs[0]', 'avg_cs[1]',
       'avg_cs[2]', 'avg_cs[3]', 'ndof', 'MatchedHit_TYPE[0]',
       'MatchedHit_TYPE[1]', 'MatchedHit_TYPE[2]', 'MatchedHit_TYPE[3]',
       'MatchedHit_X[0]', 'MatchedHit_X[1]', 'MatchedHit_X[2]',
       'MatchedHit_X[3]', 'MatchedHit_Y[0]', 'MatchedHit_Y[1]',
       'MatchedHit_Y[2]', 'MatchedHit_Y[3]', 'MatchedHit_Z[0]',
       'MatchedHit_Z[1]', 'MatchedHit_Z[2]', 'MatchedHit_Z[3]',
       'MatchedHit_DX[0]', 'MatchedHit_DX[1]', 'MatchedHit_DX[2]',
       'MatchedHit_DX[3]', 'MatchedHit_DY[0]', 'MatchedHit_DY[1]',
       'MatchedHit_DY[2]', 'MatchedHit_DY[3]', 'MatchedHit_DZ[0]',
       'MatchedHit_DZ[1]', 'MatchedHit_DZ[2]', 'MatchedHit_DZ[3]',
       'MatchedHit_T[0]', 'MatchedHit_T[1]', 'MatchedHit_T[2]',
       'MatchedHit_T[3]', 'MatchedHit_DT[0]', 'MatchedHit_DT[1]',
       'MatchedHit_DT[2]', 'MatchedHit_DT[3]', 'Lextra_X[0]', 'Lextra_X[1]',
       'Lextra_X[2]', 'Lextra_X[3]', 'Lextra_Y[0]', 'Lextra_

In [8]:
train.shape

(12506207, 65)

In [9]:
N_TRAIN = int(1e5)

In [10]:
model = xgboost.XGBClassifier(n_jobs=-1)

In [11]:
model.fit(train.loc[:N_TRAIN, FEATURE_COLUMNS].values,
          train.label.loc[:N_TRAIN],
          sample_weight=train.weight.loc[:N_TRAIN])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [14]:
utils.rejection90_sklearn(model, test[FEATURE_COLUMNS].values, test.label.values, test.weight.values)

0.7123954124904115

In [15]:
model_no_weights = xgboost.XGBClassifier(n_jobs=-1).fit(
    train.loc[:N_TRAIN, FEATURE_COLUMNS].values, train.label.loc[:N_TRAIN])

In [16]:
utils.rejection90_sklearn(model_no_weights, test.loc[:, FEATURE_COLUMNS].values,
                          test.label.values, test.weight.values)

0.7170516795218463