In [92]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier as RFC
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler


In [93]:
# Load the two CSV extracts and stack them into a single frame.
df = pd.read_csv('../data/stocks_quarterly.csv')

df2 = pd.read_csv('../data/new_data_test.csv')

df = pd.concat([df, df2])

# The 45 columns with the highest share of missing values are discarded ...
cols_to_drop = df.isnull().mean().sort_values(ascending=False).head(45).index

# ... together with the date, price, symbol and performance columns.
# NOTE(review): presumably these are identifiers or would leak the target — confirm.
cols_to_drop = list(cols_to_drop) + [
    'fiscalDateEnding', 'reportedDate', 'price', 'nasd_price',
    'next_year_date', 'next_year_price', 'nasd_ny_price', 'symbol',
    'Nasdaq_Performance', 'Stock_Performance',
]

# Chained (non in-place) form: produces a new frame instead of mutating `df`.
df = df.drop(columns=cols_to_drop).drop_duplicates()

X = df.drop(columns='Label')
y = df.Label

# Undersample the majority class until minority/majority == 0.75.
# The sampler is seeded so that a fresh Restart & Run All draws the same rows;
# without a random_state every run would tune on a different subsample.
undersample = RandomUnderSampler(sampling_strategy=0.75, random_state=1234)
X, y = undersample.fit_resample(X, y)


In [94]:
# Replace missing values with the per-column median. The imputer is fitted on
# the whole feature matrix that the previous cell produced.
median_imputer = SimpleImputer(strategy='median')
pipline = make_pipeline(median_imputer)

X = pipline.fit_transform(X)

In [95]:
# JSON logger that is later subscribed to the optimizer's step events.
log_path = "./logs.json"
logger = JSONLogger(path=log_path)

In [96]:
def rfc_cv(min_child_weight, max_bin, max_leaves, eta, gamma, colsample_bytree, alpha, data, targets):
    """Cross-validated objective for tuning an XGBoost classifier.

    The ``rfc`` in the name is historical: the estimator built here is an
    ``XGBClassifier``, not a random forest.

    Parameters
    ----------
    min_child_weight, max_bin, max_leaves, eta, gamma, colsample_bytree, alpha
        Hyper-parameters forwarded unchanged to ``XGBClassifier``.
    data
        Feature matrix passed to ``cross_validate``.
    targets
        Labels passed to ``cross_validate``. The ``precision`` and ``recall``
        scorers used below require a binary target.

    Returns
    -------
    float
        ``(2 * mean_precision + mean_recall) / 3`` over 4 cross-validation
        folds, i.e. a score in which precision counts twice as much as recall.
        Higher is better, so the optimizer should maximise it.
    """
    # NOTE(review): max_bin and max_leaves are only honoured by some XGBoost
    # tree methods — confirm the installed version's default actually uses them.
    estimator = XGBClassifier(
        min_child_weight=min_child_weight,
        max_bin=max_bin,
        max_leaves=max_leaves,
        eta=eta,
        gamma=gamma,
        colsample_bytree=colsample_bytree,
        alpha=alpha,
        # Binary task, so use the binary log-loss metric; 'mlogloss' is the
        # multiclass variant and does not match this problem.
        eval_metric='logloss',
        use_label_encoder=False,
    )

    cv = cross_validate(estimator, data, targets,
                        scoring=['precision', 'recall'], cv=4)

    mean_precision = cv['test_precision'].mean()
    mean_recall = cv['test_recall'].mean()

    # Precision-weighted average (weights 2:1); already a scalar, so no further
    # averaging is needed.
    return float((2 * mean_precision + mean_recall) / 3)

In [97]:
def optimize_rfc(data, targets, init_points=30, n_iter=3000):
    """Run Bayesian optimisation over the XGBoost hyper-parameters of ``rfc_cv``.

    Parameters
    ----------
    data
        Feature matrix forwarded to ``rfc_cv`` on every evaluation.
    targets
        Labels forwarded to ``rfc_cv`` on every evaluation.
    init_points : int, default 30
        Number of initial exploration points passed to ``maximize``.
    n_iter : int, default 3000
        Number of optimisation iterations passed to ``maximize``.

    Returns
    -------
    BayesianOptimization
        The optimizer after ``maximize`` has run, so the caller can inspect
        ``optimizer.max`` and the evaluated points. The best result is also
        printed.

    Notes
    -----
    Each step is reported to the module-level ``logger``.
    """
    def rfc_crossval(min_child_weight, max_bin, max_leaves, eta, gamma, colsample_bytree, alpha):
        """Adapt the optimizer's float-valued suggestions for ``rfc_cv``.

        ``min_child_weight``, ``max_bin`` and ``max_leaves`` are truncated to
        integers; the remaining parameters are passed through as floats.
        """
        return rfc_cv(
            min_child_weight=int(min_child_weight),
            max_bin=int(max_bin),
            max_leaves=int(max_leaves),
            eta=eta,
            gamma=gamma,
            colsample_bytree=colsample_bytree,
            alpha=alpha,
            # Use the arguments given to optimize_rfc, not the notebook globals,
            # so the function tunes on whatever data the caller passes in.
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            'min_child_weight': (0, 20),
            'max_bin': (30, 300),  # cast to int in rfc_crossval
            'max_leaves': (32, 1024),
            'eta': (0.01, 0.3),
            'gamma': (0, 5),
            'colsample_bytree': (0.2, 1),
            'alpha': (0, 5),
        },
        random_state=1234,
        verbose=2,
    )

    optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    print("Final result:", optimizer.max)

    # Hand the optimizer back so that `optimizer = optimize_rfc(...)` is usable.
    return optimizer

In [None]:
# Start the hyper-parameter search on the imputed, undersampled data.
optimizer = optimize_rfc(data=X, targets=y)