In [1]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier as RFC

In [2]:
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

In [5]:
# Read the two source CSV files and stack them row-wise into one working frame.
df = pd.read_csv('../data/stocks_quarterly.csv')
df2 = pd.read_csv('../data/new_data_test.csv')
df = pd.concat([df, df2], axis=0)

In [6]:
# Rank columns by their fraction of missing values (highest first) and keep
# the labels of the top 45.
cols_to_drop = (
    df.isnull()
    .mean()
    .sort_values(ascending=False)
    .head(45)
    .index
)

# Extend the list with columns that are removed explicitly.
# NOTE(review): presumably these are identifiers, dates and price-derived fields
# that must not be used as features — confirm against the dataset description.
cols_to_drop = list(cols_to_drop) + [
    'fiscalDateEnding', 'reportedDate', 'price', 'nasd_price',
    'next_year_date', 'next_year_price', 'nasd_ny_price', 'symbol',
    'Nasdaq_Performance', 'Stock_Performance',
]

df = df.drop(columns=cols_to_drop)

In [7]:
# Remove rows that are exact duplicates across all remaining columns.
df = df.drop_duplicates()

In [8]:
# Separate the target column from the feature columns.
y = df['Label']
X = df.drop(columns=['Label'])

In [9]:
# Preprocessing pipeline with a single step: median imputation of missing values.
# NOTE(review): the name is spelled `pipline`; the next cell refers to it under
# this spelling, so both places must change together if it is renamed.
pipline = make_pipeline(SimpleImputer(strategy='median'))

In [10]:
# Fit the imputer on every row of X and rebind X to the transformed output.
# NOTE(review): the medians are learned from the full dataset before it is split
# by cross-validation, so validation folds influence the imputation. Consider
# moving the imputer into the cross-validated estimator instead.
X = pipline.fit_transform(X)

In [11]:
def rfc_cv(n_estimators, min_samples_split, max_features, max_depth, min_samples_leaf, data, targets):
    """Score one random-forest configuration with 4-fold cross validation.

    A random forest classifier (fixed ``random_state=2``) is built from the
    five hyperparameters and cross-validated on ``data`` / ``targets`` with
    the ``precision`` and ``recall`` scorers.

    Returns the weighted average ``(2 * mean_precision + mean_recall) / 3``
    of the fold-averaged test scores, so precision counts twice as much as
    recall. Larger values are better.
    """
    forest_params = dict(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        max_features=max_features,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=2,
    )
    fold_scores = cross_validate(
        RFC(**forest_params),
        data,
        targets,
        scoring=['precision', 'recall'],
        cv=4,
    )

    # Average each metric over the folds before combining them.
    mean_precision = fold_scores['test_precision'].mean()
    mean_recall = fold_scores['test_recall'].mean()

    return (mean_precision * 2 + mean_recall) / 3

In [12]:
def optimize_rfc(data, targets, init_points=30, n_iter=1000):
    """Apply Bayesian Optimization to Random Forest parameters.

    Parameters
    ----------
    data, targets
        Feature matrix and labels forwarded unchanged to ``rfc_cv`` for every
        evaluated configuration.
    init_points : int, default 30
        Number of random exploration steps passed to ``maximize``.
    n_iter : int, default 1000
        Number of Bayesian optimization steps passed to ``maximize``.

    Returns
    -------
    The ``BayesianOptimization`` instance after ``maximize`` has run, so the
    caller can inspect ``.max`` or continue the search.
    """
    def rfc_crossval(n_estimators, min_samples_split, max_features, max_depth, min_samples_leaf):
        """Adapt the optimizer's float suggestions to valid forest arguments.

        The optimizer proposes floats for every parameter, so the count-like
        parameters are truncated to ``int`` and ``max_features`` is clamped to
        the range [1e-3, 0.99] before being handed to ``rfc_cv``.
        """
        return rfc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max(min(max_features, 0.99), 1e-3),
            max_depth=int(max_depth),
            min_samples_leaf=int(min_samples_leaf),
            # Use the arguments of optimize_rfc rather than notebook globals,
            # so the function scores exactly the data it was given.
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            "n_estimators": (10, 400),
            "min_samples_split": (2, 25),
            "max_features": (0.1, 0.99),
            'max_depth': (3, 200),
            'min_samples_leaf': (2, 10),
        },
        random_state=1234,
        verbose=2,
    )

    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    print("Final result:", optimizer.max)

    # Hand the optimizer back; callers assign the result of this function.
    return optimizer

In [13]:
# Launch the Bayesian hyperparameter search on the imputed features and labels.
# This is the expensive cell: the recorded output below ends in a KeyboardInterrupt.
optimizer  = optimize_rfc(X, y)

|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.4178  [0m | [0m 40.73   [0m | [0m 0.6537  [0m | [0m 5.502   [0m | [0m 20.06   [0m | [0m 314.2   [0m |
| [95m 2       [0m | [95m 0.4209  [0m | [95m 56.7    [0m | [95m 0.3461  [0m | [95m 8.415   [0m | [95m 24.04   [0m | [95m 351.6   [0m |
| [0m 3       [0m | [0m 0.4055  [0m | [0m 73.49   [0m | [0m 0.5459  [0m | [0m 7.468   [0m | [0m 18.39   [0m | [0m 154.4   [0m |
| [0m 4       [0m | [0m 0.4085  [0m | [0m 113.6   [0m | [0m 0.5477  [0m | [0m 2.11    [0m | [0m 19.78   [0m | [0m 354.2   [0m |
| [0m 5       [0m | [0m 0.4154  [0m | [0m 74.88   [0m | [0m 0.6477  [0m | [0m 2.603   [0m | [0m 10.48   [0m | [0m 373.9   [0m |
| [95m 6       [0m | [95m 0.4248  [0m | [95m 131.3   [0m | [95m 0.4535  [0m | [95m 8.31    [0m | [95

KeyboardInterrupt: 

In [57]:
# Create a bayes_opt JSON logger configured with the path ./logs.json.
# NOTE(review): nothing in the visible cells subscribes this logger to an
# optimizer, and this cell's execution count (57) is out of sequence with the
# cells above — confirm it is still needed and re-run the notebook top to bottom.
logger = JSONLogger(path="./logs.json")
