In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from quant_free.dataset.us_equity_load import *
from quant_free.utils.us_equity_utils import *
from quant_free.factor.price import *

symbol = 'TSM'
# symbol = 'AAPL'
# symbol = 'INTC'

factor_name = 'Trend.csv'
like = 'trend'

# factor_name = 'Alpha101.csv'
# like = 'alpha'

thr = 0.00
forward_period = 10
start_date = get_json_config_value("training_start_date")
end_date = get_json_config_value("training_end_date")
market = 'us'

factor = equity_tradedata_load_bt_dates(market, symbols = [symbol], start_date = start_date,
                                end_date = end_date, column_option = "all", file_name = factor_name)[symbol]
factor = factor.replace({True: 1, False: 0})
factor = factor.loc[:, (factor != 0).any(axis=0)]
# trnsX = factor.loc[:, ['alpha1', 'alpha2', 'alpha3', 'alpha4', 'alpha5', 'alpha6', 'alpha7', 'alpha8', 'alpha9', 'alpha10', 'alpha11', 'alpha12', 'alpha13', 'alpha14', 'alpha15']]
trnsX = factor.filter(like=like).astype(np.float64)
# print(trnsX.head(5))


# price_ratio = PriceRatio(start_date, end_date, symbol = symbol, column_option = 'close', dir_option = 'xq')
# y_data = price_ratio.price_ratio(periods = periods)
y_data = factor.loc[:, f'ret_forward_{forward_period}']
cont = pd.DataFrame(y_data.map(lambda x: 1 if x > thr else 0 if x < -thr else 0))
cont = pd.concat([cont, y_data], axis = 1)
cont.columns = ['bin', f'price_ratio_{forward_period}']
cont['t1'] = cont.index

print(cont.tail(10))


            bin  price_ratio_10         t1
2020-09-23    1        0.109306 2020-09-23
2020-09-24    1        0.113493 2020-09-24
2020-09-25    1        0.118233 2020-09-25
2020-09-28    1        0.130722 2020-09-28
2020-09-29    1        0.107420 2020-09-29
2020-09-30    1        0.088820 2020-09-30
2020-10-01    1        0.064789 2020-10-01
2020-10-02    1        0.070477 2020-10-02
2020-10-05    1        0.035099 2020-10-05
2020-10-06    1        0.060002 2020-10-06


In [12]:
from sklearn.ensemble import RandomForestClassifier
from quant_free.finml.feature_importance import *

forest = RandomForestClassifier(
    criterion = 'entropy',
    class_weight = 'balanced_subsample',
    min_weight_fraction_leaf = 0.0,
    random_state = 42,
    n_estimators = 100,
    max_features = 1,
    oob_score = True,
    n_jobs = 1
)

fit = forest.fit(X = trnsX, y = cont['bin'])
oob_score = fit.oob_score_
print(f"oob_score {oob_score}")

from quant_free.finml.cross_validation.cross_validation import PurgedKFold, cross_val_score
from quant_free.finml.feature_importance.importance import *
cv_gen = PurgedKFold(
    n_splits = 20, 
    samples_info_sets = cont['t1']
)

oos_score = cross_val_score(
    forest, # base classifier
    trnsX, # train features
    cont['bin'], # train labels
    cv_gen = cv_gen, # purged k fold cross validation class
    scoring = accuracy_score # optimizing to accuracy score
).mean()
print(f"oos_score {oos_score}")

oob_score 0.7546485260770975
oos_score 0.5573054873054873


In [13]:
from xgboost import XGBClassifier
from quant_free.finml.feature_importance import *
from quant_free.finml.cross_validation.cross_validation import PurgedKFold, cross_val_score

cv_gen = PurgedKFold(
    n_splits = 20, 
    samples_info_sets = cont['t1']
)

forest = XGBClassifier(
                        # max_depth=3,                  # Maximum tree depth for base learners.
                        learning_rate=0.1,            # Boosting learning rate (xgb's "eta")
                        n_estimators=100,             # Number of boosted trees to fit.
                        # silent=True,                  # Whether to print messages while running
                        objective='binary:logistic',  # Task and objective or custom objective function
                        booster='gbtree',             # Select booster: gbtree, gblinear or dart
#                         tree_method='gpu_hist',
                        n_jobs=-1,                    # Number of parallel threads
                        gamma=0,                      # Min loss reduction for further splits
                        min_child_weight=1,           # Min sum of sample weight(hessian) needed
                        max_delta_step=0,             # Max delta step for each tree's weight estimation
                        subsample=1,                  # Subsample ratio of training samples
                        colsample_bytree=1,           # Subsample ratio of cols for each tree
                        colsample_bylevel=1,          # Subsample ratio of cols for each split
                        reg_alpha=0,                  # L1 regularization term on weights
                        reg_lambda=1,                 # L2 regularization term on weights
                        scale_pos_weight=1,           # Balancing class weights
                        base_score=0.5,               # Initial prediction score; global bias
                        random_state=42)              # random seed

oos_score = cross_val_score(
    forest, # base classifier
    trnsX, # train features
    cont['bin'], # train labels
    cv_gen = cv_gen, # purged k fold cross validation class
    scoring = accuracy_score # optimizing to accuracy score
).mean()
print(f"oos_score {oos_score}")


oos_score 0.5622768222768223


In [14]:
from sklearn.ensemble import GradientBoostingClassifier
forest = GradientBoostingClassifier(loss='log_loss',
                                    learning_rate=0.1,
                                    n_estimators=100,
                                    subsample=1.0,
                                    criterion='friedman_mse',
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    # max_depth=3,
                                    min_impurity_decrease=0.0,
                                    # min_impurity_split=None,
                                    init=None,
                                    random_state=42,
                                    max_features=1,
                                    verbose=0,
                                    max_leaf_nodes=None,
                                    warm_start=False,
                                    # presort='auto',
                                    validation_fraction=0.1,
                                    n_iter_no_change=None,
                                    tol=0.0001)

param_grid = dict(
        learning_rate=[.005, .01, .1],
        # max_depth=list(range(3, 13, 3)),
        # max_features=['sqrt', .8, 1],
        min_impurity_decrease=[0, .01],
        min_samples_split=[2, 10, 50],
        n_estimators=[150, 100],
        subsample=[.8, 1])
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(forest,
                  param_grid,
                  # cv=accuracy_score,
                  scoring='roc_auc',
                  verbose=0,
                  n_jobs=-1,
                  return_train_score=True)
gs.fit(X = trnsX, y = cont['bin'])

print(f"param: {gs.best_params_}")
print(f"best_score: {gs.best_score_}")
print(f"best_estimator: {gs.best_estimator_}")

from quant_free.finml.cross_validation.cross_validation import PurgedKFold, cross_val_score
cv_gen = PurgedKFold(
    n_splits = 20, 
    samples_info_sets = cont['t1']
)
oos_score = cross_val_score(
    gs.best_estimator_, # base classifier
    trnsX, # train features
    cont['bin'], # train labels
    cv_gen = cv_gen, # purged k fold cross validation class
    scoring = accuracy_score # optimizing to accuracy score
).mean()
print(f"oos_score {oos_score}")


param: {'learning_rate': 0.1, 'min_impurity_decrease': 0, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 0.8}
best_score: 0.48999721390431333
best_estimator: GradientBoostingClassifier(max_features=1, min_impurity_decrease=0,
                           min_samples_split=10, random_state=42,
                           subsample=0.8)
oos_score 0.5818509418509419
