In [2]:
import pandas as pd
import numpy as np
from DataLoader import DataLoader
from river import stream
from river import tree
from river import metrics
from river import preprocessing
from river import feature_selection
from river import drift
from river import stats
from collections import deque
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import warnings
from itertools import product

pd.options.display.max_columns = 500

class BollingerBandDriftDetector:
    """Drift detector that fires when a value leaves its rolling Bollinger band.

    The band is ``mean +/- num_std * std`` over the last ``window_size``
    values, where the window already contains the value being tested.
    After a detection, further detections are suppressed until
    ``next_drift_delay`` additional updates have been seen.

    Attributes:
        values: Rolling window holding the most recent observations.
        drift_detected: Result of the latest ``update`` call.
        current_next_drift_delay: Updates left before a drift may fire again.
    """

    def __init__(self, window_size=20, num_std=3.5, next_drift_delay=100):
        self.window_size = window_size
        self.num_std = num_std
        self.next_drift_delay = next_drift_delay
        self.current_next_drift_delay = 0
        self.drift_detected = False
        self.values = deque(maxlen=window_size)

    def update(self, value: float):
        """Record ``value`` and refresh ``drift_detected`` for this step."""
        self.values.append(value)
        # The flag only ever describes the most recent update.
        self.drift_detected = False

        # Count the cool-down towards zero, never below it.
        remaining = self.current_next_drift_delay - 1
        self.current_next_drift_delay = remaining if remaining > 0 else 0

        # The band is undefined until the window has filled up.
        if len(self.values) < self.window_size:
            return

        centre = np.mean(self.values)
        half_width = self.num_std * np.std(self.values)
        outside_band = value > centre + half_width or value < centre - half_width

        if outside_band and self.current_next_drift_delay == 0:
            self.drift_detected = True
            # Minimum number of updates before the next drift may be reported.
            self.current_next_drift_delay = self.next_drift_delay

class FixedSizeBuffer:
    """Sliding window over the most recent ``max_size`` feature rows and labels.

    Rows live in a 2-D NumPy array (``buffer``) and labels in a parallel list
    (``labels``). The expected protocol is ``append(x)`` followed by
    ``append_label(y)`` for each sample, which keeps ``labels[k]`` paired with
    ``buffer[k]``.
    """

    def __init__(self, max_size, num_features):
        """Create an empty window.

        Args:
            max_size: Maximum number of rows kept; must be at least 1.
            num_features: Number of columns in every appended row.

        Raises:
            ValueError: If ``max_size`` is smaller than 1.
        """
        if max_size < 1:
            # A non-positive capacity could never hold a row; fail up front
            # instead of on the first append.
            raise ValueError(f"max_size must be >= 1, got {max_size}")
        self.max_size = max_size
        self.buffer = np.empty((0, num_features))
        self.labels = []

    def append(self, x):
        """Add feature row ``x``, evicting the oldest row when the window is full."""
        if len(self.buffer) >= self.max_size:
            self.buffer = self.buffer[1:]
            # Evict the paired label only if one was ever recorded; popping
            # from an empty list would raise when rows are added without labels.
            if self.labels:
                self.labels.pop(0)
        self.buffer = np.vstack([self.buffer, x])

    def append_label(self, y):
        """Record the label belonging to the most recently appended row."""
        self.labels.append(y)

    def get_data(self):
        """Return ``(rows, labels)`` as a 2-D array and a 1-D array."""
        return self.buffer, np.array(self.labels)

class StockPredictor:
    """Test-then-train evaluation of a classifier over a stock feature frame.

    Rows of ``stock_data`` are streamed in order. The last column is treated
    as the label (and must be named ``'target'``); all other columns are
    features. Models exposing ``learn_one`` are driven one sample at a time
    through a river pipeline; all other models are refit on a sliding window
    through a scikit-learn pipeline. A drift detector watches either the
    prediction error or, for the ``'bollingerband'`` detector, the ``'close'``
    price.
    """

    def __init__(self, stock_data, model_name, drift_name, feature_selector_name,
                 provided_model=None, provided_detector=None, provided_feature_selector=None,
                 learning_threshold=1000):
        """Wire up model, feature selector, pipeline and drift detector.

        Args:
            stock_data: DataFrame whose last column is ``'target'``.
            model_name: Key understood by ``get_model``; used when no model is
                provided and when a default model is rebuilt after a drift.
            drift_name: Key understood by ``get_drift_detector``.
            feature_selector_name: Key understood by ``get_feature_selector``.
            provided_model: Optional ready-made estimator used instead of
                ``model_name``.
            provided_detector: Optional ready-made drift detector.
            provided_feature_selector: Optional ready-made feature selector.
            learning_threshold: Number of leading rows before predictions
                start; also the sliding-window size and refit period of the
                batch path.
        """
        self.stock_data = stock_data
        self.data_stream = StockPredictor.ohlc_stream(stock_data)
        self.metric = metrics.ClassificationReport()
        self.learning_threshold = learning_threshold
        self.drifts_detected = 0

        self.model_name = model_name
        self.provided_model = provided_model
        # Compare against None rather than truth-testing: bool() on an
        # unfitted scikit-learn ensemble goes through __len__, which reads
        # the not-yet-existing `estimators_` attribute and raises.
        if provided_model is not None:
            self.model = provided_model
            self.is_incremental = hasattr(provided_model, 'learn_one')
        else:
            self.model, self.is_incremental = StockPredictor.get_model(model_name)

        self.feature_selector_name = feature_selector_name
        if provided_feature_selector is not None:
            self.feature_selector = provided_feature_selector
        else:
            self.feature_selector = StockPredictor.get_feature_selector(feature_selector_name)

        if self.is_incremental:
            self.pipeline = StockPredictor.get_pipeline(self.model, self.feature_selector)
        else:
            self.pipeline = StockPredictor.get_sklearn_pipeline(self.model, self.feature_selector)

        self.drift_name = drift_name
        if provided_detector is not None:
            self.drift_detector = provided_detector
        else:
            self.drift_detector = StockPredictor.get_drift_detector(drift_name)

    @staticmethod
    def ohlc_stream(df):
        """Yield ``(features_dict, target)`` for every row of ``df`` in order.

        Features are all columns except the last one; the label is read from
        the ``'target'`` column.
        """
        for _, row in df.iterrows():
            features = row.iloc[:-1].to_dict()
            yield features, row['target']

    @staticmethod
    def get_model(name: str):
        """Return ``(model, is_incremental)`` for a case-insensitive model key.

        Raises:
            ValueError: If ``name`` is not a known model key.
        """
        key = name.lower()
        if key == 'hoeffdingtreeclassifier':
            return tree.HoeffdingTreeClassifier(), True
        if key == 'extremelyfastdecisiontreeclassifier':
            return tree.ExtremelyFastDecisionTreeClassifier(), True

        # Non-incremental (batch) models below.
        if key == 'mlp':
            return MLPClassifier(hidden_layer_sizes=(50,), learning_rate_init=1e-4, max_iter=200), False
        if key == 'xgboost':
            return XGBClassifier(), False
        if key == 'lgbm':
            return LGBMClassifier(verbosity=0), False
        if key == 'randomforest':
            return RandomForestClassifier(), False
        raise ValueError(f"Unknown model: {name!r}")

    @staticmethod
    def get_drift_detector(name: str):
        """Return a fresh drift detector for a case-insensitive detector key.

        Raises:
            ValueError: If ``name`` is not a known detector key.
        """
        key = name.lower()
        if key == "adwin":
            return drift.ADWIN()
        if key == "kswin":
            return drift.KSWIN()
        if key == "dummydriftdetector":
            return drift.DummyDriftDetector()
        if key == "pagehinkley":
            return drift.PageHinkley()
        if key == 'bollingerband':
            return BollingerBandDriftDetector()
        raise ValueError(f"Unknown detector: {name!r}")

    @staticmethod
    def get_feature_selector(name: str):
        """Return a fresh feature selector for a case-insensitive selector key.

        ``'selectkbest'`` builds the river selector, ``'selectkbest_sklearn'``
        the scikit-learn one; both keep 15 features.

        Raises:
            ValueError: If ``name`` is not a known selector key.
        """
        key = name.lower()
        if key == "selectkbest":
            return feature_selection.SelectKBest(similarity=stats.PearsonCorr(), k=15)
        if key == 'selectkbest_sklearn':
            return SelectKBest(score_func=f_classif, k=15)
        raise ValueError(f"Unknown selector: {name!r}")

    @staticmethod
    def get_pipeline(model, feature_selector):
        """Compose the river pipeline: standard scaler | selector | model."""
        scaler = preprocessing.StandardScaler()
        return scaler | feature_selector | model

    @staticmethod
    def get_sklearn_pipeline(model, feature_selector):
        """Compose the scikit-learn pipeline: min-max scaler, selector, model."""
        return Pipeline([
            ('scaler', MinMaxScaler()),
            ('selector', feature_selector),
            # The step keeps its historical name 'mlp' whatever the model is,
            # so existing lookups by step name continue to work.
            ('mlp', model)
        ])

    def _feeds_price(self):
        """Tell whether the detector consumes the close price, not the error.

        The comparison is case-insensitive so it agrees with
        ``get_drift_detector``, which lower-cases the name before matching.
        """
        return str(self.drift_name).lower() == 'bollingerband'

    def _predict_incremental(self):
        """Run the sample-by-sample loop for models exposing ``learn_one``."""
        import copy

        # Untrained snapshot of a caller-supplied model. Re-assigning the
        # supplied instance itself after a drift would keep all learned state.
        pristine = copy.deepcopy(self.provided_model) if self.provided_model is not None else None
        feed_price = self._feeds_price()

        for i, (x, y) in enumerate(self.data_stream):
            # NOTE(review): rows before the threshold are neither predicted
            # nor learned, so the model starts cold at `learning_threshold`.
            # Confirm this is intended rather than a missing warm-up.
            if i < self.learning_threshold:
                continue

            y_pred = self.pipeline.predict_one(x)
            self.pipeline.learn_one(x, y)

            error = int(y_pred != y) if y_pred is not None else 0
            # Read the price from the streamed row instead of a positional
            # `.loc[i]` lookup, which only works for a 0..n-1 index.
            self.drift_detector.update(float(x['close']) if feed_price else error)

            if y_pred is not None:
                self.metric.update(y, y_pred)

            if self.drift_detector.drift_detected:
                self.drifts_detected += 1
                # Start over with an untrained model and a fresh scaler; the
                # feature selector instance is carried over.
                if pristine is not None:
                    self.model = copy.deepcopy(pristine)
                else:
                    self.model, _ = StockPredictor.get_model(self.model_name)
                self.pipeline = StockPredictor.get_pipeline(self.model, self.feature_selector)

    def _predict_batch(self):
        """Run the sliding-window refit loop for batch (scikit-learn style) models."""
        buffer = FixedSizeBuffer(max_size=self.learning_threshold,
                                 num_features=self.stock_data.shape[1] - 1)
        feed_price = self._feeds_price()

        for i, (x, y) in enumerate(self.data_stream):
            x_array = np.array(list(x.values())).reshape(1, -1)

            if i >= self.learning_threshold:
                # Refit once per `learning_threshold` rows, or right after
                # the step on which the detector reported a drift.
                if i % self.learning_threshold == 0 or self.drift_detector.drift_detected:
                    X_train, y_train = buffer.get_data()
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")  # MLP produces warnings
                        self.pipeline.fit(X_train, y_train)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    y_pred = self.pipeline.predict(x_array)[0]

                error = int(y_pred != y)
                self.drift_detector.update(float(x['close']) if feed_price else error)
                self.metric.update(y, y_pred)

                if self.drift_detector.drift_detected:
                    self.drifts_detected += 1

            # Store the sample only after it has been predicted, so a refit
            # on this step never trains on the label it is about to be
            # scored against.
            buffer.append(x_array[0])
            buffer.append_label(y)

    def prediction(self):
        """Consume the whole stream and return ``(accuracy, per_class_frame)``.

        The stream is a generator created in ``__init__``, so this method
        yields meaningful results only on its first call per instance.
        """
        if self.is_incremental:
            self._predict_incremental()
        else:
            self._predict_batch()

        accuracy, metrics_result = self.get_metrics()
        return accuracy, metrics_result

    def get_metrics(self):
        """Summarise the collected metric.

        Returns:
            Tuple ``(accuracy, frame)``: accuracy rounded to 3 decimals and a
            DataFrame with columns ``class``, ``precision``, ``recall``,
            ``f1`` for classes 0 and 1, rounded to 3 decimals.
        """
        f1_by_class = self.metric._f1s
        # The report creates its per-class F1 trackers lazily. Make sure one
        # exists for every observed class and for the two classes reported
        # below, so a class that never occurred does not raise KeyError.
        for c in [*sorted(self.metric.cm.classes), 0, 1]:
            if c not in f1_by_class:
                f1_by_class[c] = metrics.F1(cm=self.metric.cm, pos_val=c)

        accuracy = round(self.metric._accuracy.get(), 3)

        rows = []
        for c in (0, 1):
            f1 = f1_by_class[c]
            rows.append([c, f1.precision.get(), f1.recall.get(), f1.get()])

        metrics_result = pd.DataFrame(rows, columns=['class', 'precision', 'recall', 'f1']).round(3)

        return accuracy, metrics_result
    


### DATA LOADING

In [None]:
# Symbol to analyse; it is also written into every row of the grid-search results.
ticker = 'AAPL'
# DataLoader comes from the project-local DataLoader module.
dataLoader = DataLoader() # if yahoo does not work use "dataLoader.get_data_locally('AAPL')"
# Feature frame consumed by StockPredictor, which treats the last column
# ('target') as the label and reads the 'close' column for the Bollinger detector.
stock_data = dataLoader.pipeline(ticker)

In [15]:
# Bare expression: the notebook renders the feature frame as this cell's output.
stock_data

Unnamed: 0,open,close,high,low,volume,max_5,min_5,max_10,min_10,max_20,min_20,max_40,min_40,max_80,min_80,max_125,min_125,max_250,min_250,max_500,min_500,max_lifetime,min_lifetime,sma_5,sma_10,sma_20,sma_50,sma_100,sma_200,daily_return,1_week_return,2_weeks_return,1_month_return,6_months_return,12_months_return,day_variation,day_change,downward_pressure,upward_pressure,rsi,macd,ppo,stochastic_fast,stochastic_slow,%r,atr,cmo,cci,mom,bias,wnr,target
0,0.064816,0.064816,0.065245,0.064816,34272000,0.065674,0.063528,0.065674,0.063099,0.066962,0.061382,0.068679,0.061382,0.074688,0.048934,0.111174,0.048934,0.123622,0.048934,0.123622,0.048934,0.123622,0.048934,0.064730,0.064386,0.064043,0.063751,0.067984,0.081131,0.006666,0.006666,0.048608,0.027209,-0.376038,-0.343480,0.006622,0.000000,-0.006579,0.000000,50.000000,0.000137,0.213123,69.988073,73.320574,42.870975,0.001196,33.333884,38.810183,0.003005,0.666625,-0.333331,0
1,0.064816,0.064816,0.065245,0.064816,37408000,0.065674,0.064386,0.065674,0.063528,0.066962,0.061382,0.068679,0.061382,0.074688,0.048934,0.109028,0.048934,0.123622,0.048934,0.123622,0.048934,0.123622,0.048934,0.064987,0.064558,0.064043,0.064000,0.067808,0.081015,0.000000,0.020269,0.027209,0.000000,-0.351933,-0.307344,0.006622,0.000000,-0.006579,0.000000,50.000000,0.000160,0.249286,69.988073,66.655340,50.014735,0.001196,22.222704,38.746981,0.001717,0.398909,-0.388886,0
2,0.064816,0.064386,0.065245,0.064386,76092800,0.065674,0.064386,0.065674,0.063528,0.065674,0.061382,0.068679,0.061382,0.074688,0.048934,0.107311,0.048934,0.123622,0.048934,0.123622,0.048934,0.123622,0.048934,0.064816,0.064515,0.063914,0.064241,0.067675,0.080882,-0.006622,-0.013157,-0.006622,-0.038470,-0.367089,-0.257427,0.013333,-0.006622,-0.013157,0.000000,46.666605,0.000143,0.221614,59.989876,66.655340,66.676683,0.001196,-6.666759,31.135463,-0.000429,-0.199587,-0.533334,0
3,0.063099,0.062240,0.063099,0.062240,25244800,0.064816,0.06224,0.065674,0.06224,0.065674,0.061382,0.068679,0.061382,0.074688,0.048934,0.106882,0.048934,0.123622,0.048934,0.123622,0.048934,0.123622,0.048934,0.064129,0.064343,0.063914,0.064352,0.067499,0.080736,-0.033331,-0.052284,-0.026844,0.000000,-0.372294,-0.299516,0.013792,-0.013605,-0.013605,0.000000,50.000000,-0.000044,-0.068957,9.998631,46.658860,100.000000,0.001134,-22.222592,-88.315612,-0.001717,-3.268640,-0.611113,1
4,0.063957,0.063957,0.064386,0.063957,31315200,0.064816,0.06224,0.065674,0.06224,0.065674,0.061811,0.068679,0.061382,0.074688,0.048934,0.103877,0.048934,0.123622,0.048934,0.123622,0.048934,0.123622,0.048934,0.064043,0.064343,0.064043,0.064464,0.067314,0.080606,0.027584,-0.006666,0.000000,0.041956,-0.368644,-0.300475,0.006711,0.000000,-0.006666,0.000000,58.064651,-0.000053,-0.082774,44.436044,38.141517,55.563956,0.001257,0.000000,-7.831600,0.000000,-0.600358,-0.500000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10937,206.089996,205.350006,206.990005,202.160004,101010600,213.320007,205.350006,213.320007,193.160004,213.320007,172.419998,239.070007,172.419998,247.100006,172.419998,258.735504,172.419998,258.735504,172.419998,258.735504,164.224564,258.735504,0.037773,210.504004,206.767003,199.663002,215.589601,228.001728,226.786334,-0.037362,-0.018779,0.042492,0.010630,-0.075961,0.192392,0.023892,-0.003591,-0.007923,0.015780,57.614226,-0.723095,-0.348525,62.787913,84.968448,37.212087,6.983572,26.197218,28.654023,8.370010,-0.685311,-0.369014,0
10938,203.100006,198.889999,204.100006,198.210007,69018500,213.320007,198.889999,213.320007,198.889999,213.320007,172.419998,227.479996,172.419998,247.100006,172.419998,258.735504,172.419998,258.735504,172.419998,258.735504,164.224564,258.735504,0.037773,208.254004,207.340002,200.188502,214.656401,227.564895,226.640217,-0.031459,-0.053536,0.029665,0.055791,-0.123757,0.089700,0.029716,-0.020729,-0.025527,0.003431,46.323673,-1.296527,-0.626835,36.686876,65.041315,84.292637,6.437143,16.565463,3.639740,5.729996,-4.075433,-0.417173,0
10939,198.210007,198.509995,200.649994,197.020004,51216500,213.320007,198.509995,213.320007,198.509995,213.320007,172.419998,223.889999,172.419998,247.100006,172.419998,258.735504,172.419998,258.735504,172.419998,258.735504,164.224564,258.735504,0.037773,205.714001,207.217001,201.041001,213.684601,227.085205,226.515622,-0.001911,-0.060130,-0.006158,0.093960,-0.124391,0.097613,0.018424,0.001513,-0.010665,0.007563,46.323673,-1.761336,-0.854104,35.151503,44.875431,91.505184,6.431428,-4.332548,-11.938346,-1.230011,-4.201879,-0.521663,0
10940,199.169998,196.250000,199.440002,193.250000,68616900,213.320007,196.25,213.320007,196.25,213.320007,190.419998,223.889999,172.419998,247.100006,172.419998,258.735504,172.419998,258.735504,172.419998,258.735504,164.224564,258.735504,0.037773,202.464001,206.382001,202.232501,212.668801,226.572727,226.379079,-0.011385,-0.076471,-0.040811,0.138209,-0.123820,0.081012,0.032031,-0.014661,-0.015995,0.015524,52.262335,-2.285714,-1.112368,26.020212,32.619530,85.922101,6.175714,-32.376903,-40.056531,-8.350006,-4.909343,-0.661885,1


### PREDICTION (EXAMPLE USAGE)

In [4]:
# Single example run: incremental Hoeffding tree, ADWIN drift detection fed with
# the prediction error, and the river SelectKBest feature selector.
# Rows before index `learning_threshold` are not scored.
stock_predictor = StockPredictor(stock_data=stock_data, 
                                 model_name='hoeffdingtreeclassifier',
                                 drift_name='adwin',
                                 feature_selector_name='selectkbest',
                                 learning_threshold = 1000
                                 )
# Returns (accuracy, per-class precision/recall/f1 frame); as the last expression
# of the cell, the tuple is rendered as the cell output.
stock_predictor.prediction()

(0.522,
    class  precision  recall     f1
 0      0      0.518   0.471  0.494
 1      1      0.525   0.572  0.547)

### GRID SEARCH REPORT

In [None]:
# Models
HT_model = tree.HoeffdingTreeClassifier(grace_period=200, max_depth=None, delta=1e-7)
EFDT_model = tree.ExtremelyFastDecisionTreeClassifier(grace_period=200, max_depth=None, delta=1e-7)
MLPClassifier(hidden_layer_sizes=(50,), learning_rate_init=1e-4, max_iter=200)
XGBClassifier()
LGBMClassifier(verbosity=0)
RandomForestClassifier()

# Feature selection
k_best_selector = feature_selection.SelectKBest(k=7, similarity=stats.PearsonCorr())

# Drift detectors
adwin_detector = drift.ADWIN(delta=0.002, clock=32, max_buckets=5, min_window_length=5, grace_period=10)
kswin_detector = drift.KSWIN(alpha = 0.005, window_size = 100, stat_size = 30)
page_hinley_detector = drift.PageHinkley(min_instances = 30, delta = 0.005, threshold = 50.0, alpha = 0.9999, mode = "both")

In [5]:
def provide_models(factory, args, args_values):
    """Instantiate ``factory`` for every combination of a hyper-parameter grid.

    Args:
        factory: Callable accepting the hyper-parameters as keyword arguments.
        args: Hyper-parameter names.
        args_values: One sequence of candidate values per name, same order as
            ``args``.

    Returns:
        Tuple ``(models, arg_strings)``: the instances in
        ``itertools.product`` order, and for each one a readable
        ``"name=value, ..."`` description.

    Raises:
        ValueError: If ``args`` and ``args_values`` differ in length; ``zip``
            would otherwise drop the surplus entries without notice.
    """
    args = list(args)
    args_values = list(args_values)
    if len(args) != len(args_values):
        raise ValueError(
            f"Got {len(args)} argument names but {len(args_values)} value lists"
        )

    models = []
    arg_strings = []
    for values in product(*args_values):
        kwargs = dict(zip(args, values))
        models.append(factory(**kwargs))
        # Readable description stored next to the results.
        arg_strings.append(', '.join(f"{key}={value}" for key, value in kwargs.items()))

    return models, arg_strings


def provide_hoeffdingtreeclassifier(args, args_values):
    """Build one ``tree.HoeffdingTreeClassifier`` per grid combination."""
    return provide_models(tree.HoeffdingTreeClassifier, args, args_values)


def provide_extremelyfastdecisiontreeclassifier(args, args_values):
    """Build one ``tree.ExtremelyFastDecisionTreeClassifier`` per grid combination."""
    return provide_models(tree.ExtremelyFastDecisionTreeClassifier, args, args_values)


def provide_mlp(args, args_values):
    """Build one ``MLPClassifier`` per grid combination."""
    return provide_models(MLPClassifier, args, args_values)


def provide_xgboost(args, args_values):
    """Build one ``XGBClassifier`` per grid combination."""
    return provide_models(XGBClassifier, args, args_values)


def provide_lgbm(args, args_values):
    """Build one ``LGBMClassifier`` per grid combination."""
    return provide_models(LGBMClassifier, args, args_values)


def provide_randomforest(args, args_values):
    """Build one ``RandomForestClassifier`` per grid combination."""
    return provide_models(RandomForestClassifier, args, args_values)

In [6]:

from copy import deepcopy

result_rows = []
iterations = 5

# Hyper-parameter grid per model: (provider function, argument names, candidate values).
# A lookup table replaces the chain of independent `if`s, so a model name
# without a grid fails with a KeyError instead of reusing `models` left over
# from a previous loop pass.
param_grids = {
    'hoeffdingtreeclassifier': (
        provide_hoeffdingtreeclassifier,
        ['grace_period', 'max_depth', 'delta'],
        [[100, 200, 300], [4, 8, 12], [1e-3, 1e-5, 1e-7]],
    ),
    'extremelyfastdecisiontreeclassifier': (
        provide_extremelyfastdecisiontreeclassifier,
        ['grace_period', 'max_depth', 'delta'],
        [[100, 200, 300], [4, 8, 12], [1e-3, 1e-5, 1e-7]],
    ),
    'mlp': (
        provide_mlp,
        ['hidden_layer_sizes', 'learning_rate_init', 'alpha'],
        [[(50,), (100,), (150,)], [0.001, 0.01, 0.1], [1e-5, 1e-4, 1e-3]],
    ),
    'xgboost': (
        provide_xgboost,
        ['n_estimators', 'max_depth', 'learning_rate'],
        [[100, 200, 300], [3, 6, 9], [0.01, 0.1, 0.2]],
    ),
    'lgbm': (
        provide_lgbm,
        ['n_estimators', 'max_depth', 'learning_rate'],
        [[100, 200, 300], [3, 6, 9], [0.01, 0.1, 0.2]],
    ),
    'randomforest': (
        provide_randomforest,
        ['n_estimators', 'max_depth', 'max_features'],
        [[100, 200, 300], [None, 10, 20], ['sqrt', 'log2']],
    ),
}
incremental_model_names = ['hoeffdingtreeclassifier', 'extremelyfastdecisiontreeclassifier']
result_columns = ['model_name', 'drift_name', 'feature_selector_name', 'learning_threshold',
                  'iteration', 'accuracy', 'drifts_detected', 'model_args', 'ticker']

# Other available names: 'hoeffdingtreeclassifier', 'extremelyfastdecisiontreeclassifier',
# 'mlp', 'xgboost', 'lgbm'
for model_name in ['randomforest']:

    provider, arg_names, arg_values = param_grids[model_name]
    models, args_strs = provider(arg_names, arg_values)

    # Incremental models use the river selector, batch models the scikit-learn one.
    feature_selector_name = 'selectkbest' if model_name in incremental_model_names else 'selectkbest_sklearn'

    for model, model_args in zip(models, args_strs):

        for drift_name in ['adwin', 'kswin', 'dummydriftdetector', 'pagehinkley', 'bollingerband']:

            for learning_threshold in [100, 200, 500, 1000, 2000]:

                for iteration in range(iterations):

                    print(
                        f"\nmodel_name = {model_name}\n"
                        f"model_args = {model_args}\n"
                        f"drift_name = {drift_name}\n"
                        f"learning_threshold = {learning_threshold}\n"
                        f"iteration = {iteration + 1}\n"
                    )

                    stock_predictor = StockPredictor(stock_data=stock_data,
                                    model_name=model_name,
                                    drift_name=drift_name,
                                    feature_selector_name=feature_selector_name,
                                    learning_threshold = learning_threshold
                                    )

                    # Put the grid candidate in place of the default model; without
                    # this every run trains the default configuration while
                    # `model_args` claims otherwise. A fresh copy per run keeps runs
                    # independent. The model is swapped in after construction
                    # because the constructor truth-tests `provided_model`, and
                    # bool() of an unfitted scikit-learn ensemble raises.
                    candidate = deepcopy(model)
                    stock_predictor.model = candidate
                    if stock_predictor.is_incremental:
                        # Separate untrained copy used when the model is reset on drift.
                        stock_predictor.provided_model = deepcopy(model)
                        stock_predictor.pipeline = StockPredictor.get_pipeline(
                            candidate, stock_predictor.feature_selector)
                    else:
                        stock_predictor.pipeline = StockPredictor.get_sklearn_pipeline(
                            candidate, stock_predictor.feature_selector)

                    accuracy, metrics_result = stock_predictor.prediction()

                    result_rows.append([model_name, drift_name, feature_selector_name, learning_threshold, iteration+1, round(accuracy, 3), stock_predictor.drifts_detected, model_args, ticker])
                    print('accuracy = {:.4f}'.format(accuracy))

                    # Rewrite the CSV after every run so partial results survive an
                    # interrupted search.
                    result_df = pd.DataFrame(result_rows, columns=result_columns)
                    result_df.to_csv('results_df_2.csv', index=False)



model_name = randomforest
model_args = n_estimators=100, max_depth=None, max_features=sqrt
drift_name = adwin
learning_threshold = 100
iteration = 1
                        
accuracy = 0.5020

model_name = randomforest
model_args = n_estimators=100, max_depth=None, max_features=sqrt
drift_name = adwin
learning_threshold = 100
iteration = 2
                        
accuracy = 0.5050

model_name = randomforest
model_args = n_estimators=100, max_depth=None, max_features=sqrt
drift_name = adwin
learning_threshold = 100
iteration = 3
                        
accuracy = 0.5030

model_name = randomforest
model_args = n_estimators=100, max_depth=None, max_features=sqrt
drift_name = adwin
learning_threshold = 100
iteration = 4
                        
accuracy = 0.5000

model_name = randomforest
model_args = n_estimators=100, max_depth=None, max_features=sqrt
drift_name = adwin
learning_threshold = 100
iteration = 5
                        
accuracy = 0.5010

model_name = randomforest
model_ar