In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

from catboost import CatBoostClassifier, Pool

# Switch matplotlib to the style called 'customStyle' for the figures drawn afterwards.
# NOTE(review): presumably a project-specific style sheet; it has to be resolvable by
# matplotlib (style library or a path relative to the working directory) — confirm it
# is shipped alongside this notebook.
plt.style.use(r'customStyle')

# data processing
import loader as load

# #import the working methods
# import tensorflow as tf
# print("TensorFlow version ",tf.__version__)

# from tensorflow import keras


# from tensorflow.keras import Sequential,Model
# from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
# from tensorflow.keras.utils import plot_model
# from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [7]:
# --------- global configuration and reproducibility

import os
import random

# Batch size. The original note ties it to
#   n_batches_per_layer = NBATCH_FRAC * NUM_EXAMPLES / BATCH_SIZE
# i.e. a large portion of the data (for example half) has to be seen before a
# layer can be built. NBATCH_FRAC and NUM_EXAMPLES are not defined in the cells
# shown here.
BATCH_SIZE = 1000

# Single seed reused for every generator below (different seeds per stage
# would apparently also be acceptable).
SEED_VALUE = 10001

# 1. Hash seed, exported through the environment.
#    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
#    assignment does not change hashing in the already-running kernel; it is
#    only visible to processes started later — confirm that is sufficient.
os.environ['PYTHONHASHSEED'] = str(SEED_VALUE)

# 2. Python's built-in pseudo-random generator.
random.seed(SEED_VALUE)

# 3. numpy's global pseudo-random generator.
np.random.seed(SEED_VALUE)

In [10]:
# Location of the parsed Higgs data file (.h5)
DATA_PATH = r"data/higgs-parsed.h5"

# Everything is read through the project loader; the returned object is indexed
# by key ('feature_names', 'train' and 'valid' are the keys used in this notebook).
hdata = load.load_data_from_path(DATA_PATH)

# Keep every name except the first one.
# NOTE(review): presumably entry 0 is the label column name — confirm in loader.
data_fnames = hdata['feature_names'].to_numpy()[1:]

# Number of retained feature names
n_dims = len(data_fnames)
print("Entries read {} with feature names {}".format(n_dims, data_fnames))

def split_xy_noscale(df, label_col='hlabel'):
    """Split a table into features and labels without applying any scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that holds the label column together with the feature columns.
        It is not modified.
    label_col : str, optional
        Name of the label column. Defaults to 'hlabel', the column used
        throughout this notebook, so existing one-argument calls are unchanged.

    Returns
    -------
    X : pandas.DataFrame
        New frame with every column of ``df`` except ``label_col``.
    y : pandas.Series
        The ``label_col`` column of ``df``.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    y = df[label_col]
    # drop() returns a new frame; the caller's table keeps its label column
    X = df.drop(columns=[label_col])
    return X, y

# Separate labels from features for the two tables provided by the loader
x_trn_raw, y_trn = split_xy_noscale(hdata['train'])
x_val_raw, y_val = split_xy_noscale(hdata['valid'])

# Hold out 10% of the 'train' table as a test set, stratified on the label.
# NOTE(review): the split seed is the literal 42, not SEED_VALUE — confirm intended.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_trn_raw,
    y_trn,
    test_size=0.1,
    random_state=42,
    stratify=y_trn,
)

# The min/max scaler is fitted on the training features only
scaler = MinMaxScaler()
scaler.fit(x_train_raw)

def _tf(scaler, X):
    """Run ``scaler.transform`` on ``X`` and hand the result back as a DataFrame.

    The transformed values are wrapped in a new ``pandas.DataFrame`` that reuses
    the column labels and the index of ``X``.

    Parameters
    ----------
    scaler : object exposing ``transform(X)``
    X : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    return pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

# Apply the fitted scaler to the train, test and validation features (in that order)
x_train, x_test, x_val = (
    _tf(scaler, raw) for raw in (x_train_raw, x_test_raw, x_val_raw)
)

print("x_train shape: ", x_train.shape)
print("x_test shape: ", x_test.shape)
print("x_val shape: ", x_val.shape)

# Baseline: one depth-6 decision tree trained on the scaled training set
dt = DecisionTreeClassifier(max_depth=6, random_state=SEED_VALUE).fit(x_train, y_train)

# Score the held-out test split with the second predict_proba column
y_pred = dt.predict_proba(x_test)[:, 1]
auc = roc_auc_score(y_test, y_pred)
print("Decision Tree AUC: {:.4f}".format(auc))
# AUC noted from a run: 0.7409

Loading data/higgs-parsed.h5...
Loaded.
Entries read 18 with feature names ['lepton-pT' 'lepton-eta' 'missing-energy' 'jet_1-pt' 'jet_1-eta'
 'jet_2-pt' 'jet_2-eta' 'jet_3-pt' 'jet_3-eta' 'jet_4-pt' 'jet_4-eta'
 'm_jj' 'm_jjj' 'm_lv' 'm_jlv' 'm_bb' 'm_wbb' 'm_wwbb']
x_train shape:  (360000, 18)
x_test shape:  (40000, 18)
x_val shape:  (100000, 18)
Decision Tree AUC: 0.7409


In [11]:
# NOTE(review): this cell repeats the loading and preprocessing steps of the
# previous cell; when cells run top to bottom the repetition is redundant.

# Location of the parsed Higgs data file (.h5)
DATA_PATH = r"data/higgs-parsed.h5"

# Read the data through the project loader; the result is indexed by key
hdata = load.load_data_from_path(DATA_PATH)

# Keep every name except the first one.
# NOTE(review): presumably entry 0 is the label column name — confirm in loader.
data_fnames = hdata['feature_names'].to_numpy()[1:]

# Number of retained feature names
n_dims = len(data_fnames)
print("Entries read {} with feature names {}".format(n_dims, data_fnames))

def split_xy_noscale(df, label_col='hlabel'):
    """Split a table into features and labels without applying any scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that holds the label column together with the feature columns.
        It is not modified.
    label_col : str, optional
        Name of the label column. Defaults to 'hlabel' (labels: 0=bkg, 1=sig),
        so existing one-argument calls are unchanged.

    Returns
    -------
    X : pandas.DataFrame
        New frame with every column of ``df`` except ``label_col`` (features).
    y : pandas.Series
        The ``label_col`` column of ``df`` (labels).

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    y = df[label_col]                       # labels
    # drop() returns a new frame; the caller's table keeps its label column
    X = df.drop(columns=[label_col])        # features
    return X, y

# Separate labels from features for the two tables provided by the loader
x_trn_raw, y_trn = split_xy_noscale(hdata['train'])
x_val_raw, y_val = split_xy_noscale(hdata['valid'])

# Hold out 10% of the 'train' table as a test set, stratified on the label.
# NOTE(review): the split seed is the literal 42, not SEED_VALUE — confirm intended.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_trn_raw,
    y_trn,
    test_size=0.1,
    random_state=42,
    stratify=y_trn,
)

# The min/max scaler is fitted on the training features only
scaler = MinMaxScaler()
scaler.fit(x_train_raw)

def _tf(scaler, X):
    """Transform ``X`` with ``scaler`` and return the values as a DataFrame.

    The output of ``scaler.transform(X)`` is placed in a new ``pandas.DataFrame``
    carrying the same column labels and index as ``X``.
    """
    scaled_values = scaler.transform(X)
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

# Apply the fitted scaler to the train, test and validation features (in that order)
x_train, x_test, x_val = (
    _tf(scaler, raw) for raw in (x_train_raw, x_test_raw, x_val_raw)
)

print("x_train shape: ", x_train.shape)
print("x_test shape: ", x_test.shape)
print("x_val shape: ", x_val.shape)

# CatBoost pools: training data, and the validation data used as eval set
train_pool = Pool(x_train, y_train)
val_pool = Pool(x_val, y_val)

# Boosted trees: up to 1000 depth-6 iterations, AUC as the evaluation metric,
# early stopping after 50 rounds, best model kept.
# NOTE(review): logging_level='Silent' here and verbose=100 in fit() below ask for
# conflicting verbosity — confirm which one is intended and drop the other.
bdt = CatBoostClassifier(
    depth=6,
    iterations=1000,
    learning_rate=0.1,
    early_stopping_rounds=50,
    use_best_model=True,
    eval_metric='AUC',
    logging_level='Silent',
    random_seed=SEED_VALUE,
)
bdt.fit(train_pool, eval_set=val_pool, verbose=100)

# Score the held-out test split with the second predict_proba column
y_pred = bdt.predict_proba(x_test)[:, 1]
auc = roc_auc_score(y_test, y_pred)
print("CatBoost AUC: {:.4f}".format(auc))
# AUC noted from a run: 0.8194

Loading data/higgs-parsed.h5...
Loaded.
Entries read 18 with feature names ['lepton-pT' 'lepton-eta' 'missing-energy' 'jet_1-pt' 'jet_1-eta'
 'jet_2-pt' 'jet_2-eta' 'jet_3-pt' 'jet_3-eta' 'jet_4-pt' 'jet_4-eta'
 'm_jj' 'm_jjj' 'm_lv' 'm_jlv' 'm_bb' 'm_wbb' 'm_wwbb']
x_train shape:  (360000, 18)
x_test shape:  (40000, 18)
x_val shape:  (100000, 18)
CatBoost AUC: 0.8194
