-
Notifications
You must be signed in to change notification settings - Fork 0
/
xgb_hyperop.py
118 lines (103 loc) · 4.1 KB
/
xgb_hyperop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, log_loss
# As we mentioned Signal Dataframe is available at ta_mean.py https://github.com/tzelalouzeir/XGBoost_Indicators
# Merge with this code or at ta_mean.py save Signals column to csv
# Available to modification, choose best parameters for your data
# Remove NaN rows
df.dropna(inplace=True)
# Label encode the target column
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['Signal'])
# Prepare data
X = df.drop(['Signal'], axis=1)
y = y_encoded
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Objective function for hyperopt optimization
def objective(space):
clf = XGBClassifier(
n_estimators = int(space['n_estimators']),
max_depth = int(space['max_depth']),
learning_rate = space['learning_rate'],
gamma = space['gamma'],
min_child_weight = int(space['min_child_weight']),
subsample = space['subsample'],
colsample_bytree = space['colsample_bytree'],
use_label_encoder=False,
eval_metric='logloss'
)
acc = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
return {'loss': -acc, 'status': STATUS_OK }
# Hyperparameter space
space = {
'n_estimators': hp.quniform('n_estimators', 50, 1000, 1),
'max_depth': hp.quniform('max_depth', 3, 14, 1),
'learning_rate': hp.loguniform('learning_rate', -5, 0),
'gamma': hp.uniform('gamma', 0, 1),
'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
'subsample': hp.uniform('subsample', 0.1, 1),
'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1)
}
# Run optimization
trials = Trials()
best_hyperparams = fmin(fn=objective,
space=space,
algo=tpe.suggest,
max_evals=50,
trials=trials)
# Print the best hyperparameters
print("\nBest hyperparameters:")
for key, value in best_hyperparams.items():
if key in ['n_estimators', 'max_depth', 'min_child_weight']:
print(f"{key}: {int(value)}")
else:
print(f"{key}: {value}")
# Use best hyperparameters to fit the model
best_clf = XGBClassifier(
n_estimators=int(best_hyperparams['n_estimators']),
max_depth=int(best_hyperparams['max_depth']),
learning_rate=best_hyperparams['learning_rate'],
gamma=best_hyperparams['gamma'],
min_child_weight=int(best_hyperparams['min_child_weight']),
subsample=best_hyperparams['subsample'],
colsample_bytree=best_hyperparams['colsample_bytree'],
use_label_encoder=False,
eval_metric='logloss',
random_state=42
)
best_clf.fit(X_train, y_train)
# # Get feature importances
importances = best_clf.feature_importances_
feature_names = X.columns
# Get and plot feature importances
importances = best_clf.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
sorted_names = [feature_names[i] for i in sorted_indices]
plt.figure()
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[sorted_indices], align="center")
plt.xticks(range(X.shape[1]), sorted_names, rotation=90)
plt.show()
## performance
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
matrix = confusion_matrix(y_test, y_pred)
probabilities = best_clf.predict_proba(X_test)[:, 1] # get the probability of the positive class
auc = roc_auc_score(y_test, probabilities)
print(matrix)
print(report)
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC-AUC Score: {auc:.2f}")
fpr, tpr, thresholds = roc_curve(y_test, probabilities)
plt.plot(fpr, tpr, label=f"ROC Curve (AUC={auc:.2f})")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()