In [759]:
import h5py
import pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import shap
import lightgbm as lgb 
import optuna
from optuna.samplers import TPESampler
from optuna.integration import LightGBMPruningCallback
from optuna.pruners import MedianPruner
import xgboost
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow import keras
from sklearn.metrics import mean_absolute_error, r2_score
from tensorflow.keras.metrics import Accuracy, BinaryCrossentropy
from sklearn.feature_selection import SelectKBest, f_regression, chi2, f_classif, mutual_info_regression, mutual_info_classif, VarianceThreshold
from sklearn import preprocessing
from sklearn.metrics import log_loss

In [760]:
# read data
def load_data(name):
    with h5py.File(f'{name}.h5', 'r') as f:
        filename = name.split('/')[-1]
        return pandas.DataFrame(f[filename][:], dtype=np.float64)

train = load_data('train')
test  = load_data('test')

print (f'Shape of training data set: {train.shape}')
print (f'Shape of test data set: {test.shape}')

all_variables = ['actualInteractionsPerCrossing', 'averageInteractionsPerCrossing', 'correctedActualMu', 'correctedAverageMu', 'correctedScaledActualMu', 'correctedScaledAverageMu', 'NvtxReco', 'p_nTracks', 'p_pt_track', 'p_eta', 'p_phi', 'p_charge', 'p_qOverP', 'p_z0', 'p_d0', 'p_sigmad0', 'p_d0Sig', 'p_EptRatio', 'p_dPOverP', 'p_z0theta', 'p_etaCluster', 'p_phiCluster', 'p_eCluster', 'p_rawEtaCluster', 'p_rawPhiCluster', 'p_rawECluster', 'p_eClusterLr0', 'p_eClusterLr1', 'p_eClusterLr2', 'p_eClusterLr3', 'p_etaClusterLr1', 'p_etaClusterLr2', 'p_phiClusterLr2', 'p_eAccCluster', 'p_f0Cluster', 'p_etaCalo', 'p_phiCalo', 'p_eTileGap3Cluster', 'p_cellIndexCluster', 'p_phiModCalo', 'p_etaModCalo', 'p_dPhiTH3', 'p_R12', 'p_fTG3', 'p_weta2', 'p_Reta', 'p_Rphi', 'p_Eratio', 'p_f1', 'p_f3', 'p_Rhad', 'p_Rhad1', 'p_deltaEta1', 'p_deltaPhiRescaled2', 'p_TRTPID', 'p_TRTTrackOccupancy', 'p_numberOfInnermostPixelHits', 'p_numberOfPixelHits', 'p_numberOfSCTHits', 'p_numberOfTRTHits', 'p_numberOfTRTXenonHits', 'p_chi2', 'p_ndof', 'p_SharedMuonTrack', 'p_E7x7_Lr2', 'p_E7x7_Lr3', 'p_E_Lr0_HiG', 'p_E_Lr0_LowG', 'p_E_Lr0_MedG', 'p_E_Lr1_HiG', 'p_E_Lr1_LowG', 'p_E_Lr1_MedG', 'p_E_Lr2_HiG', 'p_E_Lr2_LowG', 'p_E_Lr2_MedG', 'p_E_Lr3_HiG', 'p_E_Lr3_LowG', 'p_E_Lr3_MedG', 'p_ambiguityType', 'p_asy1', 'p_author', 'p_barys1', 'p_core57cellsEnergyCorrection', 'p_deltaEta0', 'p_deltaEta2', 'p_deltaEta3', 'p_deltaPhi0', 'p_deltaPhi1', 'p_deltaPhi2', 'p_deltaPhi3', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhiRescaled0', 'p_deltaPhiRescaled1', 'p_deltaPhiRescaled3', 'p_e1152', 'p_e132', 'p_e235', 'p_e255', 'p_e2ts1', 'p_ecore', 'p_emins1', 'p_etconeCorrBitset', 'p_ethad', 'p_ethad1', 'p_f1core', 'p_f3core', 'p_maxEcell_energy', 'p_maxEcell_gain', 'p_maxEcell_time', 'p_maxEcell_x', 'p_maxEcell_y', 'p_maxEcell_z', 'p_nCells_Lr0_HiG', 'p_nCells_Lr0_LowG', 'p_nCells_Lr0_MedG', 'p_nCells_Lr1_HiG', 'p_nCells_Lr1_LowG', 'p_nCells_Lr1_MedG', 'p_nCells_Lr2_HiG', 'p_nCells_Lr2_LowG', 'p_nCells_Lr2_MedG', 'p_nCells_Lr3_HiG', 'p_nCells_Lr3_LowG', 'p_nCells_Lr3_MedG', 'p_pos', 'p_pos7', 'p_poscs1', 'p_poscs2', 'p_ptconeCorrBitset', 'p_ptconecoreTrackPtrCorrection', 'p_r33over37allcalo', 'p_topoetconeCorrBitset', 'p_topoetconecoreConeEnergyCorrection', 'p_topoetconecoreConeSCEnergyCorrection', 'p_weta1', 'p_widths1', 'p_widths2', 'p_wtots1', 'p_e233', 'p_e237', 'p_e277', 'p_e2tsts1', 'p_ehad1', 'p_emaxs1', 'p_fracs1', 'p_DeltaE', 'p_E3x5_Lr0', 'p_E3x5_Lr1', 'p_E3x5_Lr2', 'p_E3x5_Lr3', 'p_E5x7_Lr0', 'p_E5x7_Lr1', 'p_E5x7_Lr2', 'p_E5x7_Lr3', 'p_E7x11_Lr0', 'p_E7x11_Lr1', 'p_E7x11_Lr2', 'p_E7x11_Lr3', 'p_E7x7_Lr0', 'p_E7x7_Lr1' ]


X = train[all_variables]
sc_X = preprocessing.StandardScaler()
X = sc_X.fit_transform(X)
y = train['Truth']

X = pd.DataFrame(X, columns=all_variables)

print (f'Shape of X: {X.shape}')
print (f'Shape of y: {y.shape}')

print(sum(y==0))
print(sum(y==1))

Shape of training data set: (162500, 166)
Shape of test data set: (160651, 164)
Shape of X: (162500, 160)
Shape of y: (162500,)
41005
121495


In [761]:
# split the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [762]:
# shap value xgbooster

def shap_xgbooster():
    model = xgboost.XGBRegressor().fit(X, y)
    explainer = shap.Explainer(model)
    shap_values = explainer(X)

    feature_names = shap_values.feature_names
    shap_df = pd.DataFrame(shap_values.values, columns=feature_names)
    vals = np.abs(shap_df.values).mean(0)
    shap_importance = pd.DataFrame(list(zip(feature_names, vals)), columns=['col_name', 'feature_importance_vals'])
    shap_importance.sort_values(by=['feature_importance_vals'], ascending=False, inplace=True)
    shap.plots.bar(shap_values)

    return shap_importance.head(20)


In [763]:
# feature ranking lgb

def feature_importance_lgb():
    gbm = lgb.LGBMRegressor()
    gbm.fit(X_train, y_train)
    gbm.booster_.feature_importance()

    feature_imp_ = pd.DataFrame({'cols':X_train.columns, 'feature_imp':gbm.feature_importances_})
    feature_imp_.loc[feature_imp_.feature_imp > 0].sort_values(by=['feature_imp'], ascending=False)

    return feature_imp_.loc[feature_imp_.feature_imp > 0].sort_values(by=['feature_imp'], ascending=False).head(20)

feature_importance_lgb()


Unnamed: 0,cols,feature_imp
14,p_d0,141
53,p_deltaPhiRescaled2,138
129,p_ptconecoreTrackPtrCorrection,129
52,p_deltaEta1,124
45,p_Reta,123
15,p_sigmad0,121
46,p_Rphi,106
16,p_d0Sig,90
50,p_Rhad,86
90,p_deltaPhiFromLastMeasurement,83


In [764]:
# def get_feature_importance():
#     model = SelectKBest(mutual_info_classif, k=20)#选择k个最佳特征
#     X_new = model.fit_transform(X, y)
#     #feature_data是特征数据，label_data是标签数据，该函数可以选择出k个特征 
 
#     print("model shape: ",X_new.shape)
 
#     scores = model.scores_
#     print('model scores:', scores)  # 得分越高，特征越重要
 
#     p_values = model.pvalues_
#     print('model p-values', p_values)  # p-values 越小，置信度越高，特征越重要
 
#     # 按重要性排序，选出最重要的 k 个
#     indices = np.argsort(scores)[::-1]
#     k_best_features = list(X.columns.values[indices[0:20]])
 
#     print('k best features are: ',k_best_features)
    
#     return k_best_features

# get_feature_importance()


# def selection_features(X_train, y_train, X_test):
#     select = SelectKBest(score_func=f_regression, k=20)
#     select.fit(X_train, y_train)
    
#     return select

# select = selection_features(X_train, y_train, X_test)
# X = pd.DataFrame(X)
# names = X.columns.values[select.get_support()]
# scores = select.scores_[select.get_support()]
# names_scores = list(zip(names, scores))
# ns_df = pd.DataFrame(data = names_scores, columns=['Feature_names', 'Feature_scores'])
# #Sort the dataframe for better visualization
# ns_df_sorted = ns_df.sort_values(['Feature_names', 'Feature_scores'], ascending = [False, True])
# print(ns_df_sorted)

In [765]:
# feature ranking by xgboost
shap_variables = ['p_Rhad', 'p_Rphi', 'p_Reta', 'p_sigmad0', 'p_deltaEta1', 'p_ptconecoreTrackPtrCorrection', 'p_deltaPhiRescaled2', 'p_d0', 'p_numberOfInnermostPixelHits', 'p_ambiguityType',
                    'p_rawPhiCluster','p_phiCalo', 'p_ethad', 'p_EptRatio', 'p_Rhad1', 'p_E7x11_Lr3', 'p_ehad1', 'p_Eratio', 'p_deltaPhi2', 'p_nTracks']

# split data with shap
X_shap = train[shap_variables]
sc_X_shap = preprocessing.StandardScaler()
X_shap_pre = sc_X_shap.fit_transform(X_shap)

y_shap = train['Truth']
X_shap_train, X_shap_test, y_shap_train, y_shap_test = train_test_split(X_shap, y_shap, test_size=0.2, random_state=12)
X_shap_train_pre, X_shap_test_pre, y_shap_train_pre, y_shap_test_pre = train_test_split(X_shap_pre, y_shap, test_size=0.2, random_state=12)

sc_input_shap = preprocessing.StandardScaler()
input_valid_shap = test[shap_variables]
input_valid_shap = sc_input_shap.fit_transform(input_valid_shap)
input_valid_shap = pd.DataFrame(input_valid_shap, columns=[shap_variables])


# feature ranking by lgb

# non-preprocess
lgb_variables = ['p_sigmad0', 'p_deltaPhiRescaled2', 'p_deltaEta1', 'p_Reta', 'p_d0', 'p_ptconecoreTrackPtrCorrection',
'p_d0Sig', 'p_Rhad', 'p_Rphi', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhi2', 'p_EptRatio', 'p_dPOverP', 'p_numberOfSCTHits',
 'p_numberOfPixelHits', 'p_pt_track', 'p_nTracks', 'p_ethad', 'p_qOverP','p_ambiguityType']


X_lgb = train[lgb_variables]
sc_X_lgb = preprocessing.StandardScaler()
X_lgb_pre = sc_X_lgb.fit_transform(X_lgb)

y_lgb = train['Truth']
X_lgb_train, X_lgb_test, y_lgb_train, y_lgb_test = train_test_split(X_lgb, y_lgb, test_size=0.2, random_state=12)
X_lgb_train_pre, X_lgb_test_pre, y_lgb_train_pre, y_lgb_test_pre = train_test_split(X_lgb_pre, y_lgb, test_size=0.2, random_state=12)

X_lgb_train, X_lgb_test, y_lgb_train, y_lgb_test = train_test_split(X_lgb, y_lgb, test_size=0.2, random_state=12)
sc_input_lgb = preprocessing.StandardScaler()
input_valid_lgb = test[lgb_variables]
input_valid_lgb = sc_input_lgb.fit_transform(input_valid_lgb)
input_valid_lgb = pd.DataFrame(input_valid_lgb, columns=[lgb_variables])

# feature ranking by kbest
kbest_variables = ['p_ethad', 'p_Rhad', 'p_Rhad1', 'p_ehad1', 'p_ethad1', 'p_deltaEta1', 'p_Reta', 'p_deltaEta2', 'p_Eratio', 'p_E7x11_Lr3', 'p_Rphi', 'p_E7x7_Lr3', 
'p_deltaPhiRescaled2', 'p_E5x7_Lr3', 'p_f3core', 'p_e2tsts1', 'p_e2ts1', 'p_weta2', 'p_E3x5_Lr3', 'p_DeltaE']

X_kbest = train[kbest_variables]
sc_X_kbest = preprocessing.StandardScaler()
X_kbest_pre = sc_X_lgb.fit_transform(X_kbest)

y_kbest = train['Truth']
X_kbest_train, X_kbest_test, y_kbest_train, y_kbest_test = train_test_split(X_kbest, y_kbest, test_size=0.2, random_state=12)
X_kbest_train_pre, X_kbest_test_pre, y_kbest_train_pre, y_kbest_test_pre = train_test_split(X_kbest_pre, y_kbest, test_size=0.2, random_state=12)

sc_input_kbest = preprocessing.StandardScaler()
input_valid_kbest= test[kbest_variables]
input_valid_kbest = sc_input_kbest.fit_transform(input_valid_kbest)
input_valid_kbest = pd.DataFrame(input_valid_kbest, columns=[kbest_variables])

variable_list_shap = pd.DataFrame(shap_variables, columns=['vars'])
variable_list_shap.to_csv('solutions/Classification_HauLamFong_xgboost_VariableList.txt')
variable_list_lgb = pd.DataFrame(lgb_variables, columns=['vars'])
variable_list_lgb.to_csv('solutions/Classification_HauLamFong_lightgbm_VariableList.txt')
variable_list_kbest = pd.DataFrame(kbest_variables, columns=['vars'])
variable_list_kbest.to_csv('solutions/Classification_HauLamFong_neuralnetwork_VariableList.txt')

In [766]:
import optuna.integration.lightgbm as oplgb
import optuna
from sklearn.model_selection import RepeatedKFold


# def objective_lgb():
    
#     train_data = lgb.Dataset(X_lgb_train, label=y_lgb_train)
#     valid_data = lgb.Dataset(X_lgb_test, label=y_lgb_test)
    
#     rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
#     params = {
#             'objective': 'regression',
#             'metric': 'binary_logloss',
#             'boosting_type': 'dart',
#             'max_depth': 10,
#             'learning_rate': 0.2759844445088989,
#             'feature_fraction': 0.8,
#             'num_leaves': 360,
#             'bagging_freq': 1,
#             'bagging_fraction': 0.8,
#             'reg_alpha': 8.200000000000001,
#             'reg_lambda': 103,
#             'verbose':-1,
#             'force_col_wise': True
#     }

#     study_tuner = optuna.create_study(
#     direction="minimize")

#     tuner = oplgb.LightGBMTunerCV(params, train_data, study=study_tuner, num_boost_round=100, folds=rkf, 
#                                 early_stopping_rounds=200, seed=42)

#     tuner.run()

#     return tuner.best_params

# {'objective': 'regression',
#  'metric': 'binary_logloss',
#  'boosting_type': 'dart',
#  'max_depth': 10,
#  'learning_rate': 0.2759844445088989,
#  'feature_fraction': 0.9520000000000001,
#  'num_leaves': 360,
#  'bagging_freq': 1,
#  'bagging_fraction': 0.9993807771263824,
#  'verbose': -1,
#  'force_col_wise': True,
#  'feature_pre_filter': False,
#  'lambda_l1': 8.200000000000001,
#  'lambda_l2': 103,
#  'min_child_samples': 20}



In [767]:
# optimize hyperparameters of lgb
# train_data = lgb.Dataset(X_train, label=y_train)
# valid_data = lgb.Dataset(X_test, label=y_test)

from lightgbm import LGBMRegressor
# predict

# optimize hyperparameters of lgb

# def objective_lgb(trial):

#     boosting_types = ["gbdt", "rf", "dart"]
#     boosting_type = trial.suggest_categorical("boosting_type", boosting_types)

#     params = {
#         'objective': 'binary',
#         'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "rf", "dart"]),
#         'max_depth': trial.suggest_int('max_depth', 2, 100),
#         'min_child_samples': trial.suggest_int('min_child_samples', 0, 1000),
#         'metric': 'mae',
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 0.95, step=0.1),
#         'num_leaves': trial.suggest_int('num_leaves', 20, 3000, step=20),
#         'bagging_freq': trial.suggest_categorical('bagging_freq', [1]),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.2, 0.95, step=0.1),
#         'reg_alpha': trial.suggest_float("reg_alpha", 0, 100, step=0.1),
#         'reg_lambda': trial.suggest_int("reg_lambda", 0, 1000, step=1),
#         'verbosity': -1,
#     }

#     lgbm = LGBMRegressor(**params)
#     lgbm.fit(X_lgb_train_pre, y_lgb_train_pre, eval_set=[(X_lgb_test_pre, y_lgb_test_pre)],early_stopping_rounds=100, verbose=False)
#     pred_lgb=lgbm.predict(X_lgb_test_pre)
#     mae = mean_absolute_error(y_lgb_test_pre, pred_lgb)
#     return mae


# study = optuna.create_study(
#     direction="minimize",
#     sampler=TPESampler(seed=42),
#     pruner=MedianPruner(n_warmup_steps=50),
# ) 

# study.optimize(objective_lgb, n_trials=100, show_progress_bar=True)

# study.best_trial.params

{'boosting_type': 'dart',
 'max_depth': 10,
 'learning_rate': 0.2759844445088989,
 'feature_fraction': 0.8,
 'num_leaves': 360,
 'bagging_freq': 1,
 'bagging_fraction': 0.8,
 'reg_alpha': 8.200000000000001,
 'reg_lambda': 103}

In [768]:
# Neural Network

import kerastuner as kt

# class ClearTrainingOutput(tf.keras.callbacks.Callback):
#     def on_train_end(*args, **kwargs):
#         print("训练完成，调用回调方法")

# def model_builder(hp):
#     model = Sequential()
#     # Tune the number of units in the first Dense layer
#     # Choose an optimal value between 32-512
#     hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
#     model.add(Dense(units=hp_units, activation='relu'))
#     model.add(Dense(units=hp_units, activation='relu'))
#     model.add(Dense(units=1, activation='relu'))

#     # Tune the learning rate for the optimizer
#     # Choose an optimal value from 0.01, 0.001, or 0.0001
#     hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

#     model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
#                   loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#                   metrics=['accuracy'])  # accuracy，用于判断模型效果的函数
#     return model
    
# tuner = kt.Hyperband(model_builder,
#                     objective='val_accuracy',  # 优化的目标，验证集accuracy
#                     max_epochs=10,  # 最大迭代次数
#                     factor=3)

# tuner.search(X_train, y_train, epochs=10, validation_data=(X_test, y_test),
#                  callbacks=[ClearTrainingOutput()])

# tuner.get_best_hyperparameters(num_trials=1)[0]


In [769]:
from tensorboard.plugins.hparams import api as hp


In [770]:
# # hyperparameter tuning for neural network

# HP_NUM_UNITS1 = hp.HParam('num_units1', hp.Discrete([16, 32, 64, 128]))
# HP_NUM_UNITS2 = hp.HParam('num_units2', hp.Discrete([16, 32, 64, 128]))
# HP_NUM_UNITS3 = hp.HParam('num_units3', hp.Discrete([16, 32, 64, 128]))
# HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.1, 0.2))

# METRIC_ACCURACY = 'accuracy'

# with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
#   hp.hparams_config(
#     hparams=[HP_NUM_UNITS1, HP_NUM_UNITS2, HP_NUM_UNITS3],
#     metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
#   )

# def train_test_model(hparams):
#   model = tf.keras.models.Sequential([
#     tf.keras.layers.Dense(hparams[HP_NUM_UNITS1], activation=tf.nn.relu),
#     tf.keras.layers.Dense(hparams[HP_NUM_UNITS2], activation=tf.nn.relu),
#     tf.keras.layers.Dense(hparams[HP_NUM_UNITS3], activation=tf.nn.relu),
#     tf.keras.layers.Dense(1, activation='sigmoid'),
#   ])
#   model.compile(
#       optimizer='adam',
#       loss='BinaryCrossentropy',
#       metrics=['accuracy'],
#   )

#   model.fit(X_kbest_train, y_kbest_train, epochs=5, batch_size=512) # Run with 1 epoch to speed things up for demo purposes
#   _, accuracy = model.evaluate(X_kbest_test, y_kbest_test)
#   return accuracy

# def run(run_dir, hparams):
#   with tf.summary.create_file_writer(run_dir).as_default():
#     hp.hparams(hparams)  # record the values used in this trial
#     accuracy = train_test_model(hparams)
#     tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)

# session_num = 0

# for num_units1 in HP_NUM_UNITS1.domain.values:
#     for num_units2 in HP_NUM_UNITS2.domain.values:
#       for num_units3 in HP_NUM_UNITS3.domain.values:
#             hparams = {
#                 HP_NUM_UNITS1: num_units1,
#                 HP_NUM_UNITS2: num_units2,
#                 HP_NUM_UNITS3: num_units3
#             }
#             run_name = "run-%d" % session_num
#             print('--- Starting trial: %s' % run_name)
#             print({h.name: hparams[h] for h in hparams})
#             run('logs/hparam_tuning/' + run_name, hparams)
#             session_num += 1



In [772]:
def optimized_nn(X_train1, y_train1, X_test1, y_test1, input_valid_kbest):
    model = Sequential([
        Dense(128,activation='relu',name='input_layer'),
        Dense(16,activation='relu',name='hidden_layer1'),
        Dense(32,activation='relu',name='hidden_layer2'),
        Dense(1, activation='sigmoid', name='output')])

    optimizer = tf.keras.optimizers.Adam()
   
    model.compile(optimizer=optimizer,
                loss='BinaryCrossentropy',
                metrics='Accuracy')

    print('--------- TRAINING ---------')
    history = model.fit(x=X_train1, y=y_train1, validation_data=(X_test1, y_test1), epochs = 10, batch_size=512)  
    score = model.evaluate(X_test1,  y_test1, verbose=2)
    y_pred = model.predict(X_test1)
    # y_pred = sc_y.inverse_transform(y_pred)
    y_estimate = model.predict(input_valid_kbest)
    logloss = log_loss(y_test1, y_pred)

    return score, y_estimate, logloss

pred_nn = optimized_nn(X_kbest_train_pre, y_kbest_train_pre, X_kbest_test_pre, y_kbest_test_pre, input_valid_kbest)
print(pred_nn)
solution_nn = pd.DataFrame(data=pred_nn[1], columns=['preds'])
solution_nn.to_csv('solutions/Classification_HauLamFong_neuralnetwork.txt')



--------- TRAINING ---------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1016/1016 - 1s - loss: 0.2207 - Accuracy: 0.9222 - 635ms/epoch - 625us/step
([0.22068017721176147, 0.9221845865249634], array([[9.7067487e-01],
       [9.5303768e-01],
       [9.0532500e-01],
       ...,
       [3.1648385e-03],
       [1.3174102e-04],
       [9.6206820e-01]], dtype=float32), 0.22068000747883024)


In [773]:
# from sklearn.model_selection import RandomizedSearchCV
# from xgboost import XGBRegressor
# from scipy.stats import randint, poisson
# from sklearn.model_selection import cross_val_score
# from bayes_opt import BayesianOptimization

# # hyperparameters tuning for xgboost


# def xgboost_CrossValidation(max_depth, min_child_weight, n_estimators, gamma, subsample, colsample_bytree, booster, objective, learning_rate, eval_metric, data, targets):
#     """Decision Tree cross validation.
#        Fits a Decision Tree with the given paramaters to the target 
#        given data, calculated a CV accuracy score and returns the mean.
#        The goal is to find combinations of max_depth, min_samples_leaf 
#        that maximize the accuracy
#     """
    
#     estimator = XGBRegressor(random_state=42, 
#                                        max_depth=max_depth, 
#                                        min_child_weight=min_child_weight, n_estimators=n_estimators, gamma=gamma, subsample=subsample, colsample_bytree=colsample_bytree, booster=booster, objective=objective, learning_rate=learning_rate, eval_metric=eval_metric)
    
#     cval = cross_val_score(estimator, data, targets, scoring='accuracy', cv=5)
    
#     return cval.mean()


# def optimize_xgboost(data, targets, pars, n_iter=5):
#     """Apply Bayesian Optimization to Decision Tree parameters."""
    
#     def crossval_wrapper(max_depth, min_child_weight, n_estimators, gamma, subsample, colsample_bytree, booster, objective, learning_rate, eval_metric):
#         """Wrapper of Decision Tree cross validation. 
#            Notice how we ensure max_depth, min_samples_leaf 
#            are casted to integer before we pass them along.
#         """
#         return xgboost_CrossValidation(max_depth=int(max_depth), 
#                                             min_child_weight=int(min_child_weight), n_estimators=n_estimators, gamma=gamma, subsample=subsample, colsample_bytree=colsample_bytree, booster=booster, objective=objective, learning_rate=learning_rate, eval_metric=eval_metric,
#                                             data=data, 
#                                             targets=targets)

#     optimizer = BayesianOptimization(f=crossval_wrapper, 
#                                      pbounds=pars, 
#                                      random_state=42, 
#                                      verbose=2)
#     optimizer.maximize(init_points=4, n_iter=n_iter)

#     return optimizer

# params = {
#     'n_estimators':(0,1000),
#     'min_child_weight':(0, 1000), 
#     'gamma':(0, 6),  
#     'subsample':(1, 11),
#     'colsample_bytree':(1, 11), 
#     'max_depth': (3, 100),
#     'objective': ['binary:logistic', 'binary:logitraw'],
#     'booster': ['gbtree', 'gblinear', 'dart'],
#     'eval_metric': ['logloss'],
#     'learning_rate': (0.01, 1),
# }


In [774]:

# BayesianOptimization = optimize_xgboost(X_shap_train_pre, 
#                                              y_shap_train_pre, 
#                                              params, 
#                                              n_iter=5)
# print(BayesianOptimization.max)

In [775]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# def objective_xgboost(trial):
    
#     params = {
#     'n_estimators':trial.suggest_int('n_estimators', 0, 100),
#     'verbosity': 0,
#     'reg_alpha':trial.suggest_int('reg_alpha', 0, 5),
#     'reg_lambda':trial.suggest_int('reg_lambda', 0, 5),
#     'min_child_weight':trial.suggest_int('min_child_weight', 0, 100), 
#     'gamma':trial.suggest_int('gamma', 0, 6),  
#     'subsample':trial.suggest_discrete_uniform('subsample', 0.1, 1, 0.01),
#     'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree', 0.1, 1, 0.01), 
#     'max_depth': trial.suggest_int('max_depth', 3, 100),
#     'objective': trial.suggest_categorical('objective', ['binary:logistic', 'binary:logitraw']),
#     'booster': trial.suggest_categorical('booster',['gbtree', 'gblinear', 'dart']),
#     'eval_metric': ['logloss'],
#     'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1),
# }

#     model = xgboost.XGBRegressor(**params)
#     model.fit(X_shap_train_pre,y_shap_train_pre, eval_set=[(X_shap_test_pre, y_shap_test_pre)], early_stopping_rounds=50, verbose=False)
#     preds = model.predict(X_shap_test_pre)
#     rmse = mean_squared_error(y_shap_test_pre, preds, squared=False)

#     return rmse

# ntrial = 50
# study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42), pruner=MedianPruner(n_warmup_steps=50))
# study.optimize(objective_xgboost, n_trials=ntrial, show_progress_bar=True)

# print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

In [776]:
def optimized_xgboost(X_train1, y_train1, X_test1, y_test1, input_valid_shap):

    params = {'n_estimators': 80, 'reg_alpha': 4, 'reg_lambda': 2, 'min_child_weight': 7, 'gamma': 0, 'subsample': 0.86,
     'colsample_bytree': 0.76, 'max_depth': 87, 'objective': 'binary:logistic', 'booster': 'dart', 'learning_rate': 0.08192147965761523}
    model = xgboost.XGBRegressor(**params)
    model.fit(X_train1, y_train1)
    y_pred_xgboost = model.predict(X_test1)
    y_estimate = model.predict(input_valid_shap)
    acc = accuracy_score(np.around(y_pred_xgboost), y_test1)
    logloss = log_loss(y_test1, y_pred_xgboost)
    n_electrons = np.sum(np.round(y_estimate)==1)

    return y_estimate, acc, logloss, n_electrons

pred_xgboost = optimized_xgboost(X_shap_train_pre, y_shap_train_pre, X_shap_test_pre, y_shap_test_pre, input_valid_shap)
print(pred_xgboost)
solution_xg = pd.DataFrame(data=pred_xgboost[0], columns=['preds'])
solution_xg.to_csv('solutions/Classification_HauLamFong_xgboost.txt')

(array([0.9638293 , 0.8908489 , 0.9439875 , ..., 0.05447009, 0.01041053,
       0.9275919 ], dtype=float32), 0.9424615384615385, 0.1453568831990903, 119811)


In [777]:
def optimized_lgb(X_train1, y_train1, X_test1, y_test1, input_valid_lgb):

    params = {'boosting_type': 'gbdt',
                'max_depth': 19,
                'min_child_samples': 139,
                'learning_rate': 0.3851342929999186,
                'feature_fraction': 0.8,
                'num_leaves': 1020,
                'bagging_freq': 1,
                'bagging_fraction': 0.8,
                'reg_alpha': 3.3000000000000003,
                'reg_lambda': 1,
                'objective':'binary'
            }

    lgb_train = lgb.Dataset(X_train1, y_train1)
    lgb_test = lgb.Dataset(X_test1, y_test1)
    lgbm = lgb.train(params, lgb_train,num_boost_round=1000, valid_sets=lgb_test, early_stopping_rounds=100)
    pred_lgb=lgbm.predict(X_test1)
    logloss = log_loss(y_test1, pred_lgb)
    acc = accuracy_score(y_test1, np.around(pred_lgb))
    y_estimate = lgbm.predict(input_valid_lgb)

    
    return y_estimate, logloss, acc

pred_lgb = optimized_lgb(X_lgb_train_pre, y_lgb_train_pre, X_lgb_test_pre, y_lgb_test_pre, input_valid_lgb)
print(pred_lgb)

solution_lgb = pd.DataFrame(data=pred_lgb[0], columns=['preds'])
solution_lgb.to_csv('solutions/Classification_HauLamFong_lightgbm.txt')


[LightGBM] [Info] Number of positive: 97199, number of negative: 32801
You can set `force_col_wise=true` to remove the overhead.

'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.



[LightGBM] [Info] Total Bins 4136
[LightGBM] [Info] Number of data points in the train set: 130000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.747685 -> initscore=1.086301
[LightGBM] [Info] Start training from score 1.086301
[1]	valid_0's binary_logloss: 0.349103
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.274134
[3]	valid_0's binary_logloss: 0.230989
[4]	valid_0's binary_logloss: 0.204308
[5]	valid_0's binary_logloss: 0.18678
[6]	valid_0's binary_logloss: 0.175738
[7]	valid_0's binary_logloss: 0.167835
[8]	valid_0's binary_logloss: 0.162197
[9]	valid_0's binary_logloss: 0.158373
[10]	valid_0's binary_logloss: 0.155465
[11]	valid_0's binary_logloss: 0.15351
[12]	valid_0's binary_logloss: 0.151855
[13]	valid_0's binary_logloss: 0.150858
[14]	valid_0's binary_logloss: 0.150316
[15]	valid_0's binary_logloss: 0.149929
[16]	valid_0's binary_logloss: 0.149129
[17]	valid_0's binary_logloss: 0.14876
[18]	vali