In [None]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate


# Folder (under the current working directory) that is scanned for
# "<TICKER>.csv" files; the trailing slash is part of the value.
DATA_DIR = f"{os.getcwd()}/data/"

# Date-range constants for the analysis; they are not referenced by the
# cells visible in this part of the notebook.
start_date = "1999-01-01"
end_date = "2019-12-31"


def get_all_symbols(data_dir=None):
    """Return the ticker symbols that have a CSV file in the data directory.

    A symbol is the file name with its ``.csv`` extension removed, e.g.
    ``AAPL.csv`` -> ``AAPL``. Entries without a ``.csv`` extension are ignored.

    Parameters
    ----------
    data_dir : str, optional
        Directory to scan. Defaults to the module-level ``DATA_DIR``.

    Returns
    -------
    list of str
        Symbols in sorted order, so the result does not depend on the
        arbitrary ordering of ``os.listdir``.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` if the directory cannot be read.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    symbols = []
    for file_name in os.listdir(data_dir):
        # splitext removes only the extension. str.strip('.csv') would strip
        # any leading/trailing '.', 'c', 's' or 'v' characters and corrupt
        # names such as 'csv.csv' or 'vics.csv'.
        stem, extension = os.path.splitext(file_name)
        if extension == '.csv':
            symbols.append(stem)
    return sorted(symbols)


# Build the ticker universe from the file names found in DATA_DIR.
# NOTE: the modelling cell further down reassigns `tickers`.
tickers = get_all_symbols()


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
# Stacked next-day-close model: a RandomForest and an MLP are tuned with grid
# searches and used as base learners; a gradient-boosting meta-learner is then
# tuned on top of them, and the result is evaluated on a held-out test set.

# Columns of the per-ticker CSV that are fed to the models.
FEATURE_COLUMNS = [
    'Volume', 'Close',
    'Returns', 'Short Term Reversal', 'Stock Momentum',
    'Long Term Reversal', 'Market_Beta', 'Turnover Volatility', 'Dividends',
    'Total Returns', 'Total Return Volatility', 'SMA_5', 'SMA_20', 'SMA_50',
    'SMA_252', 'adv20', 'VWAP', 'log_returns', 'volatility_30',
    'volatility_60', 'annual_volatility', 'RSI(2)', 'RSI(7)', 'RSI(14)',
    'CCI(30)', 'CCI(50)', 'CCI(100)', 'BBWidth', 'Williams',
]
TARGET_COLUMN = 'Next Day Close'
PREDICTION_COLUMN = 'Next Day Close Predictions'

# Restrict this run to a single ticker (replaces any previously built list).
tickers = ['AAPL']
for ticker in tickers:
    # ---- Load data and build the target -----------------------------------
    ticker_data = pd.read_csv(
        os.path.join(DATA_DIR, f"{ticker}.csv"), index_col=0)
    # Target for row t is the close of row t+1; the last row has no target.
    ticker_data[TARGET_COLUMN] = ticker_data['Close'].shift(-1)
    # Drop every row with a missing value in any column. `.copy()` yields an
    # independent frame so the prediction column can be added later safely.
    ticker_data = ticker_data.dropna().copy()

    features = ticker_data[FEATURE_COLUMNS]
    target = ticker_data[TARGET_COLUMN]

    # 2-D (rows x features) and 1-D arrays; no hard-coded column count.
    X = features.to_numpy()
    y = target.to_numpy()

    # shuffle=False keeps file order, so the test set is the final 30% of
    # rows. This is required for two reasons: the predictions are written back
    # onto the last len(y_pred) rows below, and a shuffled split would train
    # on rows that come after the ones being tested.
    feature_train, feature_test, target_train, target_test = train_test_split(
        X, y, test_size=0.3, shuffle=False)

    # ---- Tune the RandomForest base learner -------------------------------
    random_forest = RandomForestRegressor(
        n_jobs=-1, random_state=123, oob_score=True)
    param_grid = {
        'n_estimators': [100, 125, 130, 150],
        'max_depth': [15, 10, 20, 25, None],
        'min_samples_leaf': [75, 100, 125],
        'criterion': ['absolute_error', 'squared_error', 'friedman_mse'],
        'max_features': [None, 'sqrt', 'log2'],
    }

    # NOTE(review): cv=5 is plain K-fold; for ordered data a time-series
    # splitter may be more appropriate - confirm before relying on scores.
    grid_search = GridSearchCV(
        estimator=random_forest, param_grid=param_grid, cv=5)
    grid_search.fit(feature_train, target_train)

    # Keep the winning parameters as a dict and pass them by keyword later,
    # rather than unpacking .values() positionally (order-dependent).
    rf_best_params = grid_search.best_params_
    print(rf_best_params)

    # ---- Tune the MLP base learner ----------------------------------------
    # random_state makes the weight initialisation reproducible.
    mlp = MLPRegressor(max_iter=1000, random_state=123)

    param_grid = {
        'hidden_layer_sizes': [(50, 50), (100, 50, 25), (100, 100), (50, 50, 50)],
        'activation': ['relu', 'tanh', 'identity'],
        # 'learning_rate': ['constant', 'adaptive'],
        'alpha': [0.0001, 0.001, 0.01],
    }

    grid_search = GridSearchCV(mlp, param_grid, cv=5,
                               n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search.fit(feature_train, target_train)

    best_params = grid_search.best_params_
    print(best_params)

    # ---- Assemble the stack and tune the meta-learner ---------------------
    base_models = [
        ('rf', RandomForestRegressor(oob_score=True, random_state=123,
                                     n_jobs=-1, **rf_best_params)),
        ('nn', MLPRegressor(max_iter=1000, random_state=123, **best_params)),
    ]

    meta_learner = GradientBoostingRegressor(
        random_state=42, n_estimators=750, learning_rate=0.01, max_depth=5, max_features='sqrt')
    stack_model = StackingRegressor(
        estimators=base_models, final_estimator=meta_learner)

    # Only the final (meta) estimator's hyper-parameters are searched here.
    param_grid = {
        'final_estimator__n_estimators': [250, 225, 275],
        'final_estimator__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'final_estimator__max_depth': [6, 5, 8, None],
    }

    grid_search = GridSearchCV(estimator=stack_model, param_grid=param_grid,
                               cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(feature_train, target_train)
    print(grid_search.best_params_)

    # best_estimator_ is the complete StackingRegressor, already refit on the
    # whole training set (GridSearchCV refit=True is the default). Use it
    # directly: passing it as the final_estimator of another
    # StackingRegressor would nest a stack inside a stack.
    final_stack_model = grid_search.best_estimator_
    best_meta_learner = final_stack_model.final_estimator_

    # ---- Evaluate on the held-out rows ------------------------------------
    y_pred = final_stack_model.predict(feature_test)

    print(f"Mean Absolute Error: {mean_absolute_error(target_test, y_pred)}")
    print(f"Mean Squared Error: {mean_squared_error(target_test, y_pred)}")
    print(f"R^2: {r2_score(target_test, y_pred)}")

    # ---- Attach predictions and plot --------------------------------------
    # Predictions exist only for the test rows, which (because the split is
    # unshuffled) are exactly the last len(y_pred) rows of the frame.
    ticker_data[PREDICTION_COLUMN] = np.nan
    ticker_data.iloc[-len(y_pred):,
                     ticker_data.columns.get_loc(PREDICTION_COLUMN)] = y_pred
    ticker_data[[TARGET_COLUMN, PREDICTION_COLUMN]].plot(
        figsize=(15, 5))

    # Monthly view of the test period: keep only rows that have a prediction
    # and plot the last value of each month. (Multiplying price levels within
    # a month, as .prod() would, has no meaningful interpretation.)
    data = ticker_data[[TARGET_COLUMN, PREDICTION_COLUMN]].dropna().copy()
    data.index = pd.to_datetime(data.index)
    data.resample('M').last().plot(figsize=(15, 5))
