In [346]:
import h5py
import pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import shap
import lightgbm as lgb 
import optuna
from optuna.samplers import TPESampler
from optuna.integration import LightGBMPruningCallback
from optuna.pruners import MedianPruner
import xgboost
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow import keras
from sklearn.metrics import mean_absolute_error, r2_score
from tensorflow.keras.metrics import Accuracy, BinaryCrossentropy
from sklearn.feature_selection import SelectKBest, f_regression, chi2, f_classif, mutual_info_regression, mutual_info_classif, VarianceThreshold
from sklearn import preprocessing

In [347]:
# read data
def load_data(name):
    with h5py.File(f'{name}.h5', 'r') as f:
        filename = name.split('/')[-1]
        return pandas.DataFrame(f[filename][:], dtype=np.float64)

train = load_data('train')
test  = load_data('test')

print (f'Shape of training data set: {train.shape}')
print (f'Shape of test data set: {test.shape}')

all_variables = ['actualInteractionsPerCrossing', 'averageInteractionsPerCrossing', 'correctedActualMu', 'correctedAverageMu', 'correctedScaledActualMu', 'correctedScaledAverageMu', 'NvtxReco', 'p_nTracks', 'p_pt_track', 'p_eta', 'p_phi', 'p_charge', 'p_qOverP', 'p_z0', 'p_d0', 'p_sigmad0', 'p_d0Sig', 'p_EptRatio', 'p_dPOverP', 'p_z0theta', 'p_etaCluster', 'p_phiCluster', 'p_eCluster', 'p_rawEtaCluster', 'p_rawPhiCluster', 'p_rawECluster', 'p_eClusterLr0', 'p_eClusterLr1', 'p_eClusterLr2', 'p_eClusterLr3', 'p_etaClusterLr1', 'p_etaClusterLr2', 'p_phiClusterLr2', 'p_eAccCluster', 'p_f0Cluster', 'p_etaCalo', 'p_phiCalo', 'p_eTileGap3Cluster', 'p_cellIndexCluster', 'p_phiModCalo', 'p_etaModCalo', 'p_dPhiTH3', 'p_R12', 'p_fTG3', 'p_weta2', 'p_Reta', 'p_Rphi', 'p_Eratio', 'p_f1', 'p_f3', 'p_Rhad', 'p_Rhad1', 'p_deltaEta1', 'p_deltaPhiRescaled2', 'p_TRTPID', 'p_TRTTrackOccupancy', 'p_numberOfInnermostPixelHits', 'p_numberOfPixelHits', 'p_numberOfSCTHits', 'p_numberOfTRTHits', 'p_numberOfTRTXenonHits', 'p_chi2', 'p_ndof', 'p_SharedMuonTrack', 'p_E7x7_Lr2', 'p_E7x7_Lr3', 'p_E_Lr0_HiG', 'p_E_Lr0_LowG', 'p_E_Lr0_MedG', 'p_E_Lr1_HiG', 'p_E_Lr1_LowG', 'p_E_Lr1_MedG', 'p_E_Lr2_HiG', 'p_E_Lr2_LowG', 'p_E_Lr2_MedG', 'p_E_Lr3_HiG', 'p_E_Lr3_LowG', 'p_E_Lr3_MedG', 'p_ambiguityType', 'p_asy1', 'p_author', 'p_barys1', 'p_core57cellsEnergyCorrection', 'p_deltaEta0', 'p_deltaEta2', 'p_deltaEta3', 'p_deltaPhi0', 'p_deltaPhi1', 'p_deltaPhi2', 'p_deltaPhi3', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhiRescaled0', 'p_deltaPhiRescaled1', 'p_deltaPhiRescaled3', 'p_e1152', 'p_e132', 'p_e235', 'p_e255', 'p_e2ts1', 'p_ecore', 'p_emins1', 'p_etconeCorrBitset', 'p_ethad', 'p_ethad1', 'p_f1core', 'p_f3core', 'p_maxEcell_energy', 'p_maxEcell_gain', 'p_maxEcell_time', 'p_maxEcell_x', 'p_maxEcell_y', 'p_maxEcell_z', 'p_nCells_Lr0_HiG', 'p_nCells_Lr0_LowG', 'p_nCells_Lr0_MedG', 'p_nCells_Lr1_HiG', 'p_nCells_Lr1_LowG', 'p_nCells_Lr1_MedG', 'p_nCells_Lr2_HiG', 'p_nCells_Lr2_LowG', 'p_nCells_Lr2_MedG', 'p_nCells_Lr3_HiG', 'p_nCells_Lr3_LowG', 'p_nCells_Lr3_MedG', 'p_pos', 'p_pos7', 'p_poscs1', 'p_poscs2', 'p_ptconeCorrBitset', 'p_ptconecoreTrackPtrCorrection', 'p_r33over37allcalo', 'p_topoetconeCorrBitset', 'p_topoetconecoreConeEnergyCorrection', 'p_topoetconecoreConeSCEnergyCorrection', 'p_weta1', 'p_widths1', 'p_widths2', 'p_wtots1', 'p_e233', 'p_e237', 'p_e277', 'p_e2tsts1', 'p_ehad1', 'p_emaxs1', 'p_fracs1', 'p_DeltaE', 'p_E3x5_Lr0', 'p_E3x5_Lr1', 'p_E3x5_Lr2', 'p_E3x5_Lr3', 'p_E5x7_Lr0', 'p_E5x7_Lr1', 'p_E5x7_Lr2', 'p_E5x7_Lr3', 'p_E7x11_Lr0', 'p_E7x11_Lr1', 'p_E7x11_Lr2', 'p_E7x11_Lr3', 'p_E7x7_Lr0', 'p_E7x7_Lr1' ]


X = train[all_variables]
sc_X = preprocessing.StandardScaler()
X = sc_X.fit_transform(X)
y = train['Truth']

# X = pd.DataFrame(X, columns=all_variables)

print (f'Shape of X: {X.shape}')
print (f'Shape of y: {y.shape}')

print(sum(y==0))
print(sum(y==1))

Shape of training data set: (162500, 166)
Shape of test data set: (160651, 164)
Shape of X: (162500, 160)
Shape of y: (162500,)
41005
121495


In [348]:
# split the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [349]:
# shap value xgbooster

def shap_xgbooster():
    model = xgboost.XGBRegressor().fit(X, y)
    explainer = shap.Explainer(model)
    shap_values = explainer(X)

    feature_names = shap_values.feature_names
    shap_df = pd.DataFrame(shap_values.values, columns=feature_names)
    vals = np.abs(shap_df.values).mean(0)
    shap_importance = pd.DataFrame(list(zip(feature_names, vals)), columns=['col_name', 'feature_importance_vals'])
    shap_importance.sort_values(by=['feature_importance_vals'], ascending=False, inplace=True)
    shap.plots.bar(shap_values)

    return shap_importance.head(20)

shape_variables = ['p_Rhad', 'p_Rphi', 'p_Reta', 'p_sigmad0', 'p_deltaEta1', 'p_ptconecoreTrackPtrCorrection', 'p_deltaPhiRescaled2', 'p_d0', 'p_numberOfInnermostPixelHits', 'p_ambiguityType',
                    'p_rawPhiCluster','p_phiCalo', 'p_ethad', 'p_EptRatio', 'p_Rhad1', 'p_E7x11_Lr3', 'p_ehad1', 'p_Eratio', 'p_deltaPhi2', 'p_nTracks']

shap_variables = ['p_Rhad','p_Reta','p_deltaEta1', 'p_sigmad0',
	'p_Rphi',	
	'p_ambiguityType',	
	'p_ethad',
	'p_numberOfInnermostPixelHits',
	'p_deltaPhiRescaled2',
	'p_ptconecoreTrackPtrCorrection',
	'p_d0',
	'p_Rhad1',
	'p_d0Sig',
	'p_nTracks',
	'p_deltaPhiFromLastMeasurement',
	'p_E7x11_Lr3',
	'p_deltaPhi2',
	'p_numberOfPixelHits',	
	'p_EptRatio',	
	'p_dPOverP']


In [350]:
# feature ranking lgb

def feature_importance_lgb():
    gbm = lgb.LGBMRegressor()
    gbm.fit(X_train, y_train)
    gbm.booster_.feature_importance()

    feature_imp_ = pd.DataFrame({'cols':X_train.columns, 'feature_imp':gbm.feature_importances_})
    feature_imp_.loc[feature_imp_.feature_imp > 0].sort_values(by=['feature_imp'], ascending=False)

    return feature_imp_.loc[feature_imp_.feature_imp > 0].sort_values(by=['feature_imp'], ascending=False).head(20)

feature_importance_lgb()


Unnamed: 0,cols,feature_imp
15,p_sigmad0,150
53,p_deltaPhiRescaled2,130
52,p_deltaEta1,124
45,p_Reta,123
14,p_d0,123
129,p_ptconecoreTrackPtrCorrection,120
16,p_d0Sig,106
50,p_Rhad,88
46,p_Rphi,85
90,p_deltaPhiFromLastMeasurement,80


In [371]:
def get_feature_importance():
    model = SelectKBest(mutual_info_classif, k=20)#选择k个最佳特征
    X_new = model.fit_transform(X, y)
    #feature_data是特征数据，label_data是标签数据，该函数可以选择出k个特征 
 
    print("model shape: ",X_new.shape)
 
    scores = model.scores_
    print('model scores:', scores)  # 得分越高，特征越重要
 
    p_values = model.pvalues_
    print('model p-values', p_values)  # p-values 越小，置信度越高，特征越重要
 
    # 按重要性排序，选出最重要的 k 个
    indices = np.argsort(scores)[::-1]
    k_best_features = list(X.columns.values[indices[0:20]])
 
    print('k best features are: ',k_best_features)
    
    return k_best_features

get_feature_importance()


# def selection_features(X_train, y_train, X_test):
#     select = SelectKBest(score_func=f_regression, k=20)
#     select.fit(X_train, y_train)
    
#     return select

# select = selection_features(X_train, y_train, X_test)
# X = pd.DataFrame(X)
# names = X.columns.values[select.get_support()]
# scores = select.scores_[select.get_support()]
# names_scores = list(zip(names, scores))
# ns_df = pd.DataFrame(data = names_scores, columns=['Feature_names', 'Feature_scores'])
# #Sort the dataframe for better visualization
# ns_df_sorted = ns_df.sort_values(['Feature_names', 'Feature_scores'], ascending = [False, True])
# print(ns_df_sorted)

model shape:  (162500, 20)
model scores: [0.00000000e+00 2.00523377e-03 6.00631961e-04 0.00000000e+00
 0.00000000e+00 2.57362420e-04 1.21159116e-03 6.68945328e-02
 8.66664990e-02 0.00000000e+00 4.63271514e-04 1.17064972e-02
 7.45827949e-02 2.25841968e-04 6.46400995e-02 6.29030490e-02
 4.78289453e-02 8.40571638e-02 3.06450910e-02 2.51914230e-05
 4.29705808e-04 3.49969640e-04 1.78873260e-03 3.70970184e-04
 3.79191462e-04 1.83444385e-03 1.33911324e-03 2.14129361e-02
 2.52082943e-03 1.13266801e-01 1.27613345e-03 9.30126864e-03
 3.75154596e-04 2.99390611e-03 2.35652111e-03 4.90189428e-04
 1.18442724e-03 1.11187978e-03 1.41327540e-03 1.81302056e-03
 7.10784781e-05 0.00000000e+00 8.86168347e-03 1.91565655e-03
 1.45343261e-01 2.21468850e-01 1.70567877e-01 1.77671862e-01
 1.08817552e-02 1.14928325e-01 2.44758413e-01 2.35569220e-01
 2.31145431e-01 1.66139071e-01 4.17810587e-02 3.64978183e-03
 7.13444156e-02 4.97281119e-02 6.55839738e-03 1.43972085e-03
 1.63857177e-03 0.00000000e+00 7.31055953e-0

['p_ethad',
 'p_Rhad',
 'p_Rhad1',
 'p_ehad1',
 'p_ethad1',
 'p_deltaEta1',
 'p_Reta',
 'p_deltaEta2',
 'p_Eratio',
 'p_E7x11_Lr3',
 'p_Rphi',
 'p_E7x7_Lr3',
 'p_deltaPhiRescaled2',
 'p_E5x7_Lr3',
 'p_f3core',
 'p_e2tsts1',
 'p_weta2',
 'p_e2ts1',
 'p_E3x5_Lr3',
 'p_DeltaE']

In [369]:
# feature ranking by xgboost
shap_variables = ['p_Rhad', 'p_Rphi', 'p_Reta', 'p_sigmad0', 'p_deltaEta1', 'p_ptconecoreTrackPtrCorrection', 'p_deltaPhiRescaled2', 'p_d0', 'p_numberOfInnermostPixelHits', 'p_ambiguityType',
                    'p_rawPhiCluster','p_phiCalo', 'p_ethad', 'p_EptRatio', 'p_Rhad1', 'p_E7x11_Lr3', 'p_ehad1', 'p_Eratio', 'p_deltaPhi2', 'p_nTracks']

# split data with shap
X_shap = train[shap_variables]
sc_X_shap = preprocessing.StandardScaler()
X_shap_pre = sc_X_shap.fit_transform(X_shap)

y_shap = train['Truth']
X_shap_train, X_shap_test, y_shap_train, y_shap_test = train_test_split(X_shap, y_shap, test_size=0.2, random_state=12)
X_shap_train_pre, X_shap_test_pre, y_shap_train_pre, y_shap_test_pre = train_test_split(X_shap_pre, y_shap, test_size=0.2, random_state=12)

# feature ranking by lgb

# non-preprocess
lgb_variables = ['p_sigmad0', 'p_deltaPhiRescaled2', 'p_deltaEta1', 'p_Reta', 'p_d0', 'p_ptconecoreTrackPtrCorrection',
'p_d0Sig', 'p_Rhad', 'p_Rphi', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhi2', 'p_EptRatio', 'p_dPOverP', 'p_numberOfSCTHits',
 'p_numberOfPixelHits', 'p_pt_track', 'p_nTracks', 'p_ethad', 'p_qOverP','p_ambiguityType']


X_lgb = train[lgb_variables]
sc_X_lgb = preprocessing.StandardScaler()
X_lgb_pre = sc_X_lgb.fit_transform(X_lgb)

y_lgb = train['Truth']
X_lgb_train, X_lgb_test, y_lgb_train, y_lgb_test = train_test_split(X_lgb, y_lgb, test_size=0.2, random_state=12)
X_lgb_train_pre, X_lgb_test_pre, y_lgb_train_pre, y_lgb_test_pre = train_test_split(X_lgb_pre, y_lgb, test_size=0.2, random_state=12)

# feature ranking by kbest
kbest_variables = ['p_ethad', 'p_Rhad', 'p_Rhad1', 'p_ehad1', 'p_ethad1', 'p_deltaEta1', 'p_Reta', 'p_deltaEta2', 'p_Eratio', 'p_E7x11_Lr3', 'p_Rphi', 'p_E7x7_Lr3', 
'p_deltaPhiRescaled2', 'p_E5x7_Lr3', 'p_f3core', 'p_e2tsts1', 'p_e2ts1', 'p_weta2', 'p_E3x5_Lr3', 'p_DeltaE']

X_kbest = train[kbest_variables]
sc_X_kbest = preprocessing.StandardScaler()
X_kbest_pre = sc_X_lgb.fit_transform(X_kbest)

y_kbest = train['Truth']
X_kbest_train, X_kbest_test, y_kbest_train, y_kbest_test = train_test_split(X_kbest, y_kbest, test_size=0.2, random_state=12)
X_kbest_train_pre, X_kbest_test_pre, y_kbest_train_pre, y_kbest_test_pre = train_test_split(X_kbest_pre, y_kbest, test_size=0.2, random_state=12)

In [353]:
import optuna.integration.lightgbm as oplgb
import optuna
from sklearn.model_selection import RepeatedKFold


X_lgb_train, X_lgb_test, y_lgb_train, y_lgb_test = train_test_split(X_lgb, y_lgb, test_size=0.2, random_state=42)

def objective_lgb():
    
    train_data = lgb.Dataset(X_lgb_train, label=y_lgb_train)
    valid_data = lgb.Dataset(X_lgb_test, label=y_lgb_test)
    
    rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
    params = {
            'objective': 'regression',
            'metric': 'binary_logloss',
            'boosting_type': 'dart',
            'max_depth': 10,
            'learning_rate': 0.2759844445088989,
            'feature_fraction': 0.8,
            'num_leaves': 360,
            'bagging_freq': 1,
            'bagging_fraction': 0.8,
            'reg_alpha': 8.200000000000001,
            'reg_lambda': 103,
            'verbose':-1,
            'force_col_wise': True
    }

    study_tuner = optuna.create_study(
    direction="minimize")

    tuner = oplgb.LightGBMTunerCV(params, train_data, study=study_tuner, num_boost_round=100, folds=rkf, 
                                early_stopping_rounds=200, seed=42)

    tuner.run()

    return tuner.best_params

# {'objective': 'regression',
#  'metric': 'binary_logloss',
#  'boosting_type': 'dart',
#  'max_depth': 10,
#  'learning_rate': 0.2759844445088989,
#  'feature_fraction': 0.9520000000000001,
#  'num_leaves': 360,
#  'bagging_freq': 1,
#  'bagging_fraction': 0.9993807771263824,
#  'verbose': -1,
#  'force_col_wise': True,
#  'feature_pre_filter': False,
#  'lambda_l1': 8.200000000000001,
#  'lambda_l2': 103,
#  'min_child_samples': 20}



In [354]:
def optimized_lgb(X_train, X_test, y_train, y_test):
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)
    params = {'objective': 'regression',
            'metric': 'binary_logloss',
            'boosting_type': 'dart',
            'max_depth': 10,
            'learning_rate': 0.2759844445088989,
            'feature_fraction': 0.8,
            'num_leaves': 360,
            'bagging_freq': 1,
            'bagging_fraction': 0.8,
            'reg_alpha': 8.200000000000001,
            'reg_lambda': 103,
            'verbose':-1,
            'force_col_wise': True}

    lgb_clf = lgb.train(params, train_set=train_data, num_boost_round=1000)
    y_pred = np.around(lgb_clf.predict(X.values))
    acc = accuracy_score(y_pred, y)
    print(f"Train accuracy: {acc*100.0:.2f}%")

    return lgb_clf.predict(X.values)

In [355]:
# optimize hyperparameters of lgb
# train_data = lgb.Dataset(X_train, label=y_train)
# valid_data = lgb.Dataset(X_test, label=y_test)

# def objective_lgb(trial):
    
#     train_data = lgb.Dataset(X_train, label=y_train)
#     valid_data = lgb.Dataset(X_test, label=y_test)

#     boosting_types = ["gbdt", "rf", "dart"]
#     boosting_type = trial.suggest_categorical("boosting_type", boosting_types)

#     params = {
#         'objective': 'regression',
#         'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "rf", "dart"]),
#         'max_depth': trial.suggest_int('max_depth', 2, 12),
#         'metric': {'l2', 'auc'},
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 0.95, step=0.1),
#         'num_leaves': trial.suggest_int('num_leaves', 20, 3000, step=20),
#         'bagging_freq': trial.suggest_categorical('bagging_freq', [1]),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.2, 0.95, step=0.1),
#         'reg_alpha': trial.suggest_float("reg_alpha", 0, 100, step=0.1),
#         'reg_lambda': trial.suggest_int("reg_lambda", 0, 1000, step=1),
#         'verbosity': -1,
#     }

#     N_iterations_max = 10000
#     early_stopping_rounds = 50

#     if boosting_type == "dart":
#         N_iterations_max = 100
#         early_stopping_rounds = None

#     cv_res = lgb.cv(
#         params,
#         train_data,
#         num_boost_round=N_iterations_max,
#         early_stopping_rounds=early_stopping_rounds,
#         verbose_eval=False,
#         seed=42,
#         callbacks=[LightGBMPruningCallback(trial, "binary_logloss")],
#     )

#     num_boost_round = len(cv_res["auc-mean"])
#     trial.set_user_attr("num_boost_round", num_boost_round)

#     return cv_res["auc-mean"][-1]



# study = optuna.create_study(
#     direction="maximize",
#     sampler=TPESampler(seed=42),
#     pruner=MedianPruner(n_warmup_steps=50),
# )

# study.optimize(objective_lgb, n_trials=100, show_progress_bar=True)

# study.best_trial.params

In [356]:
# optimized lgb

def optimized_lgb(X_train, X_test, y_train, y_test):
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)
    params = {'objective': 'regression',
            'metric': 'binary_logloss',
            'boosting_type': 'dart',
            'max_depth': 10,
            'learning_rate': 0.2759844445088989,
            'feature_fraction': 0.8,
            'num_leaves': 360,
            'bagging_freq': 1,
            'bagging_fraction': 0.8,
            'reg_alpha': 8.200000000000001,
            'reg_lambda': 103,
            'verbose':-1,
            'force_col_wise': True}

    lgb_clf = lgb.train(params, train_set=train_data, num_boost_round=1000)
    y_pred = np.around(lgb_clf.predict(X.values))
    acc = accuracy_score(y_pred, y)
    print(f"Train accuracy: {acc*100.0:.2f}%")

    return lgb_clf.predict(X.values), acc
    

{'boosting_type': 'dart',
 'max_depth': 10,
 'learning_rate': 0.2759844445088989,
 'feature_fraction': 0.8,
 'num_leaves': 360,
 'bagging_freq': 1,
 'bagging_fraction': 0.8,
 'reg_alpha': 8.200000000000001,
 'reg_lambda': 103}

In [357]:
# Neural Network

import kerastuner as kt

# class ClearTrainingOutput(tf.keras.callbacks.Callback):
#     def on_train_end(*args, **kwargs):
#         print("训练完成，调用回调方法")

# def model_builder(hp):
#     model = Sequential()
#     # Tune the number of units in the first Dense layer
#     # Choose an optimal value between 32-512
#     hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
#     model.add(Dense(units=hp_units, activation='relu'))
#     model.add(Dense(units=hp_units, activation='relu'))
#     model.add(Dense(units=1, activation='relu'))

#     # Tune the learning rate for the optimizer
#     # Choose an optimal value from 0.01, 0.001, or 0.0001
#     hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

#     model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
#                   loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#                   metrics=['accuracy'])  # accuracy，用于判断模型效果的函数
#     return model
    
# tuner = kt.Hyperband(model_builder,
#                     objective='val_accuracy',  # 优化的目标，验证集accuracy
#                     max_epochs=10,  # 最大迭代次数
#                     factor=3)

# tuner.search(X_train, y_train, epochs=10, validation_data=(X_test, y_test),
#                  callbacks=[ClearTrainingOutput()])

# tuner.get_best_hyperparameters(num_trials=1)[0]


In [358]:
%reload_ext tensorboard
from tensorboard.plugins.hparams import api as hp


In [359]:
# hyperparameter tuning for neural network

# HP_NUM_UNITS1 = hp.HParam('num_units1', hp.Discrete([16, 32, 64, 128]))
# HP_NUM_UNITS2 = hp.HParam('num_units2', hp.Discrete([16, 32, 64, 128]))
# HP_NUM_UNITS3 = hp.HParam('num_units3', hp.Discrete([16, 32, 64, 128]))
# HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.1, 0.2))

# METRIC_ACCURACY = 'accuracy'

# with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
#   hp.hparams_config(
#     hparams=[HP_NUM_UNITS1, HP_NUM_UNITS2, HP_NUM_UNITS3],
#     metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
#   )

# def train_test_model(hparams):
#   model = tf.keras.models.Sequential([
#     tf.keras.layers.Dense(hparams[HP_NUM_UNITS1], activation=tf.nn.relu),
#     tf.keras.layers.Dense(hparams[HP_NUM_UNITS2], activation=tf.nn.relu),
#     tf.keras.layers.Dense(hparams[HP_NUM_UNITS3], activation=tf.nn.relu),
#     tf.keras.layers.Dense(1),
#   ])
#   model.compile(
#       optimizer='adam',
#       loss='mae',
#       metrics=['accuracy'],
#   )

#   model.fit(X_train, y_train, epochs=5) # Run with 1 epoch to speed things up for demo purposes
#   _, accuracy = model.evaluate(X_test, y_test)
#   return accuracy

# def run(run_dir, hparams):
#   with tf.summary.create_file_writer(run_dir).as_default():
#     hp.hparams(hparams)  # record the values used in this trial
#     accuracy = train_test_model(hparams)
#     tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)

# session_num = 0

# for num_units1 in HP_NUM_UNITS1.domain.values:
#     for num_units2 in HP_NUM_UNITS2.domain.values:
#       for num_units3 in HP_NUM_UNITS3.domain.values:
#             hparams = {
#                 HP_NUM_UNITS1: num_units1,
#                 HP_NUM_UNITS2: num_units2,
#                 HP_NUM_UNITS3: num_units3
#             }
#             run_name = "run-%d" % session_num
#             print('--- Starting trial: %s' % run_name)
#             print({h.name: hparams[h] for h in hparams})
#             run('logs/hparam_tuning/' + run_name, hparams)
#             session_num += 1



In [360]:
# %tensorboard --logdir logs/hparam_tuning

In [370]:
def optimized_nn(X_train1, y_train1, X_test1, y_test1):
    model = Sequential([
        Dense(9,activation='relu',name='input_layer'),
        Dense(24,activation='relu',name='hidden_layer1'),
        Dense(12,activation='relu',name='hidden_layer2'),
        Dense(1, name='output')])

   
    model.compile(optimizer='adam',
                loss='BinaryCrossentropy',
                metrics='Accuracy')

    print('--------- TRAINING ---------')
    history = model.fit(x=X_train1, y=y_train1, validation_data=(X_test1, y_test1), epochs = 5)  
    score = model.evaluate(X_test1,  y_test1, verbose=2)
    y_pred = model.predict(X_test1)
    # y_pred = sc_y.inverse_transform(y_pred)

    return score, y_pred

optimized_nn(X_kbest_train_pre, y_kbest_train_pre, X_kbest_test_pre, y_kbest_test_pre)

--------- TRAINING ---------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
1016/1016 - 0s - loss: 0.2396 - Accuracy: 0.9199 - 427ms/epoch - 420us/step


([0.2395523339509964, 0.9198769330978394],
 array([[0.71966153],
        [0.92709726],
        [0.8898367 ],
        ...,
        [0.93942344],
        [0.7674445 ],
        [0.30673143]], dtype=float32))