For this final project, we will be using various models to predict Apple (AAPL) stock data.

Unfortunately, Scikit-learn does not have LSTM, so we will go with the following strategies:
1. Linear Regression
2. Multi Layer Perceptron
3. SVM, specifically SVR

In [213]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler

# help us view progress of parallel processes
from tqdm.notebook import tqdm
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed, parallel_backend


In [214]:
RANDOM_STATE = 0

In [215]:
df = pd.read_csv('AAPL_data.csv')
df.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,67.7142,68.4014,66.8928,67.8542,158168416,AAPL
1,2013-02-11,68.0714,69.2771,67.6071,68.5614,129029425,AAPL
2,2013-02-12,68.5014,68.9114,66.8205,66.8428,151829363,AAPL
3,2013-02-13,66.7442,67.6628,66.1742,66.7156,118721995,AAPL
4,2013-02-14,66.3599,67.3771,66.2885,66.6556,88809154,AAPL


In [216]:
df.isna().sum()

date      0
open      0
high      0
low       0
close     0
volume    0
Name      0
dtype: int64

In [217]:
df.dtypes


date       object
open      float64
high      float64
low       float64
close     float64
volume      int64
Name       object
dtype: object

The name column is irrelevant, so we will drop it.

In [218]:
df.drop(columns=['Name'], inplace=True)

We need to deal with the date column. Since we are dealing with time series data, we may get temporal patterns out of them. We will use a technique called "cyclic feature encoding". This is useful since time data is cyclical: months roll over, time rolls over, etc. All code references are listed in the appendix.

In [219]:
df['date'] = pd.to_datetime(df['date'])
df['day'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

df.drop(columns=['date'], inplace=True)

df.dtypes

open      float64
high      float64
low       float64
close     float64
volume      int64
day         int32
month       int32
year        int32
dtype: object

In [220]:
def encodeCyclicalData(df, dateType, max):
    column_sin = dateType + '_sin'
    column_cos = dateType + '_cos'

    df[column_sin] = np.sin(2 * np.pi * df[dateType] / max)
    df[column_cos] = np.cos(2 * np.pi * df[dateType] / max)

    df.drop(columns=[dateType], inplace=True)
    
    return df


In [221]:
df = encodeCyclicalData(df=df, dateType='day', max=7)
df = encodeCyclicalData(df=df, dateType='month', max=12)
# years are not cyclical... yet




In [222]:
df.head()

Unnamed: 0,open,high,low,close,volume,year,day_sin,day_cos,month_sin,month_cos
0,67.7142,68.4014,66.8928,67.8542,158168416,2013,-0.433884,-0.900969,0.866025,0.5
1,68.0714,69.2771,67.6071,68.5614,129029425,2013,0.0,1.0,0.866025,0.5
2,68.5014,68.9114,66.8205,66.8428,151829363,2013,0.781831,0.62349,0.866025,0.5
3,66.7442,67.6628,66.1742,66.7156,118721995,2013,0.974928,-0.222521,0.866025,0.5
4,66.3599,67.3771,66.2885,66.6556,88809154,2013,0.433884,-0.900969,0.866025,0.5


Now, we want to have lagging data for this. Basically, instead of a row containing a day's data, we will have the target, and the previous day's data (or several in this case).
Since we are greedy, we will predict the high of the target day. Again, all sourced code is cited.

In [223]:
lagged_days = 3
for i in range(1, lagged_days + 1):
    df[f'open_-{i}_days'] = df['open'].shift(i)
    df[f'high_-{i}_days'] = df['high'].shift(i)
    df[f'low_-{i}_days'] = df['low'].shift(i)
    df[f'close_-{i}_days'] = df['close'].shift(i)
    df[f'volume_-{i}_days'] = df['volume'].shift(i)

df.dropna(inplace=True)

# we only want historical data, so current data really doesn't help much. We do want current day's high though.
columns_to_drop = ['open', 'low', 'close', 'volume']
df.drop(columns=columns_to_drop, inplace=True)


In [224]:
df.head()

Unnamed: 0,high,year,day_sin,day_cos,month_sin,month_cos,open_-1_days,high_-1_days,low_-1_days,close_-1_days,...,open_-2_days,high_-2_days,low_-2_days,close_-2_days,volume_-2_days,open_-3_days,high_-3_days,low_-3_days,close_-3_days,volume_-3_days
3,67.6628,2013,0.974928,-0.222521,0.866025,0.5,68.5014,68.9114,66.8205,66.8428,...,68.0714,69.2771,67.6071,68.5614,129029425.0,67.7142,68.4014,66.8928,67.8542,158168416.0
4,67.3771,2013,0.433884,-0.900969,0.866025,0.5,66.7442,67.6628,66.1742,66.7156,...,68.5014,68.9114,66.8205,66.8428,151829363.0,68.0714,69.2771,67.6071,68.5614,129029425.0
5,67.1656,2013,-0.433884,-0.900969,0.866025,0.5,66.3599,67.3771,66.2885,66.6556,...,66.7442,67.6628,66.1742,66.7156,118721995.0,68.5014,68.9114,66.8205,66.8428,151829363.0
6,66.1042,2013,0.781831,0.62349,0.866025,0.5,66.9785,67.1656,65.7028,65.7371,...,66.3599,67.3771,66.2885,66.6556,88809154.0,66.7442,67.6628,66.1742,66.7156,118721995.0
7,65.3842,2013,0.974928,-0.222521,0.866025,0.5,65.8714,66.1042,64.8356,65.7128,...,66.9785,67.1656,65.7028,65.7371,97924631.0,66.3599,67.3771,66.2885,66.6556,88809154.0


Now, let's create the training and testing sets.

In [225]:
features = list(df.columns)
features.remove('high')

target = 'high'

X = df[features]
y = df[target]

print(X.shape)
print(y.shape)

(1256, 20)
(1256,)


In [226]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

We decided to use gridsearch to find the best of the 3 proposed techniques.
Robust scaler is used since it handles outliers better, and stock data will likely have outliers.

In [242]:
pipeline_lrg = Pipeline([
    ('scaler', RobustScaler()),
    ('lrg', LinearRegression()),
])

pipeline_mlr = Pipeline([
    ('scaler', RobustScaler()),
    ('mlr', MLPRegressor(max_iter=500, early_stopping=True, random_state=RANDOM_STATE)),
])

pipeline_svr = Pipeline([
    ('scaler', RobustScaler()),
    ('svr', SVR(max_iter=500)),  
])

alpha_params = list(np.logspace(-4, 2, 7))

gamma_range = np.logspace(-9, 3, 13)
gamma_params = gamma_range.tolist()
gamma_params.append('scale')
gamma_params.append('auto')

param_grid_lrg = {
    'lrg__fit_intercept': [True, False],
}

param_grid_mlr = {
    'mlr__hidden_layer_sizes': [1, 10, 100],
    'mlr__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'mlr__solver': ['sgd', 'adam'],
    'mlr__alpha': alpha_params,
}

param_grid_svr = {
    'svr__C': [0.1, 0.5, 1.0, 10.0, 100.0],
    'svr__gamma': gamma_params,
    'svr__kernel': ['linear', 'poly', 'rbf']
}

scoring = {
    'r2': make_scorer(r2_score),
    'mae': make_scorer(mean_absolute_error),
    'mse': make_scorer(mean_squared_error),
}



In [251]:
class TqdmGridSearchCV(GridSearchCV):
    def __init__(self, estimator, param_grid, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, return_train_score=False):
        self.total_iterations = len(ParameterGrid(param_grid))
        super().__init__(estimator=estimator, param_grid=param_grid, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score)

    # def _run_search(self, evaluate_candidates):
    #     with tqdm(total=self.total_iterations, dynamic_ncols=True, leave=True, file=sys.stdout) as pbar:
    #         def callback(params):
    #             # pbar.set_description(str(list(params)))
    #             pbar.update(1)
    #             return evaluate_candidates(params)
    #         super()._run_search(callback)

In [253]:

grid_search_lrg = GridSearchCV(estimator=pipeline_lrg, param_grid=param_grid_lrg, cv=5, scoring=scoring, refit='r2', verbose=3, n_jobs=-1, return_train_score=True)
grid_search_lrg.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [245]:
grid_search_mlr = TqdmGridSearchCV(estimator=pipeline_mlr, param_grid=param_grid_mlr, cv=5, scoring=scoring, refit='r2', verbose=3, n_jobs=-1, return_train_score=True)
grid_search_mlr.fit(X_train, y_train)

  0%|          | 0/168 [00:00<?, ?it/s]

Fitting 5 folds for each of 168 candidates, totalling 840 fits


In [246]:
grid_search_svr = TqdmGridSearchCV(estimator=pipeline_svr, param_grid=param_grid_svr, cv=5, scoring=scoring, refit='r2', verbose=3, n_jobs=-1, return_train_score=True)
grid_search_svr.fit(X_train, y_train)

  0%|          | 0/225 [00:00<?, ?it/s]

Fitting 5 folds for each of 225 candidates, totalling 1125 fits




Above, some grid searches failed to find convergence and finished prematurely. That is ok, as long as we get one set of parameters that fit the data well.

In [250]:

print(grid_search_lrg.best_params_)
print(grid_search_lrg.best_estimator_)
print(grid_search_lrg.best_score_)
print(grid_search_lrg.best_index_)
print('\n\n')

print(grid_search_mlr.best_params_)
print(grid_search_mlr.best_estimator_)
print(grid_search_mlr.best_score_)
print(grid_search_mlr.best_index_)


print('\n\n')

print(grid_search_svr.best_params_)
print(grid_search_svr.best_estimator_)
print(grid_search_svr.best_score_)
print(grid_search_svr.best_index_)

{'mean_fit_time': array([0.02760687, 0.0307178 , 0.04591298, 0.03899927, 0.03960218,
       0.03870974, 0.03210306, 0.03640604, 0.0477097 , 0.03730636,
       0.03360438, 0.0413043 , 0.03340454, 0.03370628, 0.04362173,
       0.0353086 , 0.03761272, 0.04410505, 0.03572049, 0.03520494,
       0.03812776, 0.0338037 , 0.03870649, 0.04653096, 0.03500485,
       0.03340211, 0.03992887, 0.03240418, 0.02260079, 0.04040804,
       0.03330302, 0.0198287 , 0.0527133 , 0.03993211, 0.01830168,
       0.0575604 , 0.03350177, 0.01922798, 0.0612452 , 0.03152866,
       0.03431249, 0.04261937, 0.03761449, 0.03561659, 0.04204836,
       0.03331618, 0.04071698, 0.04132175, 0.03311262, 0.03461185,
       0.03477755, 0.0345468 , 0.03758554, 0.03906946, 0.03526421,
       0.033426  , 0.0491745 , 0.03196945, 0.03568234, 0.04086103,
       0.03113737, 0.03713555, 0.04271207, 0.03393016, 0.0372159 ,
       0.04201798, 0.02792578, 0.03443017, 0.0479311 , 0.03402643,
       0.03541398, 0.03840671, 0.03580904, 0

In [248]:
predictions = grid_search.best_estimator_.predict(X_test)

AttributeError: 'TqdmGridSearchCV' object has no attribute 'best_estimator_'

In [None]:
paired_list = list(zip(predictions, y_test))

print(paired_list)

for prediction, actual in paired_list:
    print(f'predicted value: {prediction}, actual value: {actual}')

[(64.8190746627765, 64.1671), (119.44871866571147, 120.0), (62.76766980519956, 63.6071), (118.87714886524287, 119.43), (74.75285679618464, 74.7414), (63.39476871459344, 62.6971), (86.29914359549846, 86.7613), (132.9374900195893, 131.6), (97.32187050236638, 97.84), (130.81208650430742, 131.63), (127.44932032299977, 128.57), (154.61056005198822, 154.17), (114.90152853028185, 116.75), (113.31961098199793, 113.8), (97.50361475323501, 101.46), (151.14603051007282, 152.44), (93.7949934414612, 93.77), (161.63752470351477, 160.5), (62.45768834448155, 62.2428), (99.54995029130035, 100.69), (118.06473573320007, 117.44), (65.83840175454674, 66.9499), (144.88984560663718, 144.49), (110.5033188875492, 118.12), (159.63261906672236, 160.97), (80.61992441310801, 80.4113), (137.60172055644227, 140.15), (108.0669457153623, 107.44), (119.71807082002762, 119.25), (110.74828663865928, 112.56), (70.41527765447049, 70.3356), (96.11395391468729, 95.9), (146.87811670039486, 145.13), (57.12880617774937, 57.0857