# case_higgs.py
'''
https://archive.ics.uci.edu/ml/datasets/HIGGS
This is a classification problem to distinguish between a signal process which produces Higgs bosons and a background process which does not.
The data has been produced using Monte Carlo simulations. The first 21 features (columns 2-22) are kinematic properties measured by the particle detectors in the accelerator. The last seven features are functions of the first 21 features; these are high-level features derived by physicists to help discriminate between the two classes. There is an interest in using deep learning methods to obviate the need for physicists to manually develop such features. Benchmark results using Bayesian Decision Trees from a standard physics package and 5-layer neural networks are presented in the original paper. The last 500,000 examples are used as a test set.
https://github.com/Laurae2/boosting_tree_benchmarks/tree/master/data
https://github.com/guolinke/boosting_tree_benchmarks/tree/master/data
https://blog.bigml.com/2017/09/28/case-study-finding-higgs-bosons-with-deepnets/
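5/19/2019 Need to decide whether this should be treated as regression or binary classification.
8/23/2019 subsample and sub-feature sampling both seem to make no difference (tested on 2,000,000 rows):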
lesome_rows=2000000 iter=2000 auc=0.83775(1,1) auc=0.83847(0.8,1);auc=0.83618(0.8,0.5)
'''
import lightgbm as lgb
import time
import sys
import os
import gc
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
import pickle
from litemort import *
#from LiteMORT_EDA import *
# Back-end switch: pass "mort" as the first command-line argument to train
# with LiteMORT; any other invocation (or no argument) selects LightGBM.
# The one-element slice is empty when no argument was given, so no explicit
# length check is needed.
isMORT = sys.argv[1:2] == ["mort"]
model_type = 'lgb' if not isMORT else 'mort'

# Row bookkeeping for the HIGGS CSV.
nTotal = 11000000
nLastForTest = 500000   # The last 500,000 examples are used as a test set.
# Number of leading rows used for training. Smaller alternatives that were
# left commented out here: 200000, 2000000 (and None).
some_rows = 10500000
def read_higgs_data(path):
    """Load the HIGGS train/test split, using a pickle cache when available.

    If the cache file for the current ``some_rows`` setting exists it is
    loaded directly. Otherwise the CSV at ``path`` is parsed twice: the first
    ``some_rows`` rows become the training set and ``nLastForTest`` rows
    starting at offset ``nTotal - nLastForTest`` become the test set. In both
    cases column 0 is taken as the label and the remaining columns as the
    features. The freshly parsed split is then written to the cache.

    Args:
        path: Location of the header-less HIGGS CSV file.

    Returns:
        Tuple ``(X, y, X_test)``. ``y_test`` is loaded and cached as well but
        is not part of the return value.

    Raises:
        AssertionError: If ``some_rows`` would make the training rows overlap
            the held-out test rows.
    """
    pkl_path = 'F:/Datasets/HIGGS_/higgs_data_{}.pickle'.format(some_rows)
    if os.path.isfile(pkl_path):
        print("====== Load pickle @{} ......".format(pkl_path))
        # NOTE: unpickling executes arbitrary code. This is only acceptable
        # because the cache is a local file written by this same script.
        with open(pkl_path, "rb") as fp:
            X, y, X_test, y_test = pickle.load(fp)
    else:
        # Guard against leaking test rows into the training set.
        assert some_rows <= nTotal - nLastForTest, \
            "some_rows={} overlaps the last {} rows reserved for testing".format(
                some_rows, nLastForTest)
        # The training set is the FIRST `some_rows` rows (nrows=...), so the
        # message says "first" (it previously said "last").
        print("====== Read first {} examples as training set ......".format(some_rows))
        df = pd.read_csv(path, nrows=some_rows, header=None)
        y = pd.Series(df.iloc[:, 0])
        X = df.iloc[:, 1:]
        print("====== Read last {} examples as testing set ......".format(nLastForTest))
        df = pd.read_csv(path, skiprows=nTotal - nLastForTest, nrows=nLastForTest, header=None)
        y_test = pd.Series(df.iloc[:, 0])
        X_test = df.iloc[:, 1:]
        # Drop the parsing buffer before serialising to keep peak memory down.
        del df
        gc.collect()
        print("====== Save pickle @{} ......".format(pkl_path))
        with open(pkl_path, "wb") as fp:
            pickle.dump([X, y, X_test, y_test], fp)
    print("====== read_higgs_data X={}, y={}, X_test={} ...... OK".format(X.shape, y.shape, X_test.shape))
    return X, y, X_test
# Load (or build and cache) the training features/labels and test features.
X, y, X_test = read_higgs_data("F:/Datasets/HIGGS_/HIGGS.csv")
# Optional preprocessing, currently disabled:
#   X = Unique_Expand(X)
#   X_test = Unique_Expand(X_test)

# Upper bound on boosting iterations (passed to the model as n_estimators).
num_rounds = 10001

# Hyper-parameters shared by both back-ends (LiteMORT and LightGBM).
params = {
    'objective': 'binary',
    'metric': 'auc',                    # alternative: "binary_logloss"
    'adaptive': 'weight',               # presumably LiteMORT-specific - TODO confirm
    'max_bin': 256,
    'num_leaves': 64,
    'learning_rate': 0.1,
    'tree_learner': 'serial',
    'task': 'train',
    'is_training_metric': 'false',
    'min_data_in_leaf': 512,
    # 'min_sum_hessian_in_leaf': 100,
    # 'bagging_fraction': 1,  # 0.2
    'subsample': 1,
    'bagging_freq': 1,
    'feature_fraction': 1,
    # 'ndcg_eval_at': [1, 3, 5, 10],
    # 'sparse_threshold': 1.0,
    'n_estimators': num_rounds,
    'early_stopping_rounds': 500,
    'verbose': 667,
    # Device selection options that were left disabled:
    # 'device': 'cpu'
    # 'device': 'gpu',
    # 'gpu_platform_id': 0,
    # 'gpu_device_id': 0
}
# ---------------------------------------------------------------------------
# Training driver: split the training data with a shuffled 5-fold KFold, fit
# the selected back-end on one fold, then pause and exit.
# ---------------------------------------------------------------------------
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    t0 = time.time()
    # isinstance (not an exact type comparison) so ndarray subclasses also take
    # the positional-indexing branch instead of failing on `.iloc`.
    if isinstance(X, np.ndarray):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
    else:
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    if False:
        # Disabled debug aid: dump label + features of the training fold as a
        # tab-separated file.
        d_train = pd.concat([y_train, X_train], ignore_index=True, axis=1)
        print("X_train={}, y_train={} d_train={}".format(X_train.shape, y_train.shape, d_train.shape))
        np.savetxt("D:/LightGBM-master/examples/regression/geo_test.csv", d_train, delimiter='\t')

    if model_type == 'mort':
        model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
        #y_pred_valid = model.predict(X_valid)
        #y_pred = model.predict(X_test)
    elif model_type == 'lgb':
        # NOTE(review): a regressor is used together with objective='binary'
        # from `params`; the module docstring records that the choice between
        # regression and binary classification was still open - confirm.
        model = lgb.LGBMRegressor(**params, n_jobs=-1)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='auc', verbose=5)
        model.booster_.save_model('geo_test_.model')
        #y_pred_valid = model.predict(X_valid)
        #y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

    # Only one fold is trained.
    # NOTE(review): indentation was lost in the reviewed copy; `break` is
    # assumed to sit at loop level - confirm against the original file.
    break

# Wait for Enter so the console output stays visible; the reported loss is a
# hard-coded placeholder (0), the time is that of the fold trained above.
input("loss is {} time={:.3g} model={}...".format(0,time.time()-t0,model_type))
# NOTE(review): the script ends with a non-zero exit status even after a
# successful run; kept as-is in case callers rely on it.
sys.exit(-1)
# An `lgb.train(...)` timing snippet used to follow here. It could never run
# (it came after sys.exit) and referenced an undefined `dtrain`, so it was
# removed.