In [94]:
import pandas as pd

import numpy as np

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from xgboost import plot_importance

from matplotlib import pyplot

from collections import Counter

import os

In [60]:
class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [61]:
pahtname=r'D:\PythonJupyter\MyDev\FinPythonLab\MarkLable-S50F\datasets\S50F15M_LS\ML-Long_S50M15_07To19-Train.csv'
filepath= os.path.abspath(pahtname)
dataset =pd.read_csv(filepath,index_col="datetime", parse_dates=['datetime'],dayfirst=True)
dataset.drop(columns=['open','high','low','close'],inplace=True)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 68292 entries, 2007-01-03 09:45:00 to 2019-12-30 17:00:00
Data columns (total 18 columns):
indy_ma-550               68292 non-null float64
indy_ma-1100              68292 non-null float64
indy_hh-550               68292 non-null float64
indy_ll-550               68292 non-null float64
indy_mid-550              68292 non-null float64
indy_hh2-1100             68292 non-null float64
indy_ll2-1100             68292 non-null float64
indy_mid2-1100            68292 non-null float64
indy_macd110-440          68292 non-null float64
indy_signal110-440-110    68292 non-null float64
indy_hist_macd110-440     68292 non-null float64
indy_rsi25-ma20           68292 non-null float64
indy_6ATRTrail_DC-110     68292 non-null float64
cate_3trend-550_ma110     68292 non-null int64
cate_2trend-1100_ma220    68292 non-null int64
cate_rannkHL1100-ma66     68292 non-null int64
cate_CombineTrend         68292 non-null int64
LongSignal                68292

In [62]:
df=dataset['01-2007':'12-2015']

In [63]:
labelname=df.columns[len(df.columns)-1]

X_df=df.drop(labelname,1)
Y_df=df[labelname]

#Use either to_numpy or .values
X=X_df.to_numpy()
y=Y_df.to_numpy()


In [64]:
# summarize class distribution
counter = Counter(y)
print(counter)

Counter({0: 29437, 1: 17098})


In [68]:
model = XGBClassifier()
#model = XGBClassifier(max_depth=8,learning_rate=0.05,n_estimators=250)

In [69]:
# define evaluation procedure
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1992)
# evaluate model
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)
# summarize performance
print('Mean Accuracy %.5f' % np.mean(scores))

Mean Accuracy 0.97660


In [70]:
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.5f' % np.mean(scores))

Mean ROC AUC: 0.99762


# Weighted XGBoost for Class Imbalance

Although the XGBoost algorithm performs well for a wide range of challenging problems, it offers a large number of hyperparameters, many of which require tuning in order to get the most out of the algorithm on a given dataset.

The implementation provides a hyperparameter designed to tune the behavior of the algorithm for imbalanced classification problems; this is the scale_pos_weight hyperparameter.

By default, the scale_pos_weight hyperparameter is set to the value of 1.0 and has the effect of weighing the balance of positive examples, relative to negative examples when boosting decision trees. For an imbalanced binary classification dataset, the negative class refers to the majority class (class 0) and the positive class refers to the minority class (class 1).

XGBoost is trained to minimize a loss function and the “gradient” in gradient boosting refers to the steepness of this loss function, e.g. the amount of error. A small gradient means a small error and, in turn, a small change to the model to correct the error. A large error gradient during training in turn results in a large correction.


Small Gradient: Small error or correction to the model.

Large Gradient: Large error or correction to the model.

Gradients are used as the basis for fitting subsequent trees added to boost or correct errors made by the existing state of the ensemble of decision trees.

The scale_pos_weight value is used to scale the gradient for the positive class.

This has the effect of scaling errors made by the model during training on the positive class and encourages the model to over-correct them. In turn, this can help the model achieve better performance when making predictions on the positive class. Pushed too far, it may result in the model overfitting the positive class at the cost of worse performance on the negative class or both classes.

As such, the scale_pos_weight can be used to train a class-weighted or cost-sensitive version of XGBoost for imbalanced classification.

A sensible default value to set for the scale_pos_weight hyperparameter is the inverse of the class distribution. For example, for a dataset with a 1 to 100 ratio for examples in the minority to majority classes, the scale_pos_weight can be set to 100. This will give classification errors made by the model on the minority class (positive class) 100 times more impact, and in turn, 100 times more correction than errors made on the majority class.

# scale_pos_weight = total_negative_examples / total_positive_examples

In [91]:
model = XGBClassifier(scale_pos_weight=100)
#model = XGBClassifier(max_depth=8,learning_rate=0.05,n_estimators=250)

In [92]:
# count examples in each class
counter = Counter(y)
# estimate scale_pos_weight value
estimate = counter[0] / counter[1]
print('Estimate: %.3f' % estimate)

Estimate: 1.722


In [93]:
# define evaluation procedure
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1992)
# evaluate model
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.5f' % np.mean(scores))

Mean ROC AUC: 0.85884


In [None]:
model = XGBClassifier()
#1, 10, 25, 50, 75, 99, 100, 1000]

#weights = [1, 10, 50,  100, 1000]
#paramGrid_scale_pos_weight = dict(scale_pos_weight=weights)

paramGrid_scale_pos_weight = {'scale_pos_weight':[1, 10, 50,  100, 1000]}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1992)
grid_result=GridSearchCV(estimator=model,param_grid=paramGrid_scale_pos_weight ,n_jobs=1,cv=cv,scoring='roc_auc')
grid_result = grid.fit(X, y)

In [109]:

# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'