# Full Process

In [1]:
import pandas as pd

# Silence FutureWarning and DeprecationWarning for the whole session.
# NOTE: no module filter is given, so these categories are muted for every
# library (pandas, xgboost, ...), not only sklearn.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

In [2]:
# Load the raw price history and parse the Date column as datetimes
df = pd.read_csv('files/datasets/AAPL.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Keep every row dated 2016 or later (despite the `df_2019` name) and
# renumber the rows from 0
df_2019 = df.loc[df['Date'].dt.year >= 2016].copy().reset_index(drop=True)

In [3]:
# Independent snapshot of the filtered frame, taken before any indicator columns exist
df_copy = df_2019.copy()

In [4]:
# Preview the first rows of the filtered data
df_2019.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2016-01-04,25.6525,26.342501,25.5,26.3375,24.111498,270597600
1,2016-01-05,26.4375,26.4625,25.602501,25.6775,23.50728,223164000
2,2016-01-06,25.139999,25.592501,24.967501,25.174999,23.047255,273829600
3,2016-01-07,24.67,25.032499,24.1075,24.112499,22.074543,324377600
4,2016-01-08,24.637501,24.7775,24.190001,24.24,22.191273,283192000


In [5]:
# Create Moving Average Indicators
# Each indicator is shifted down one row, so a row only carries averages
# computed from closes of earlier rows.

# NOTE(review): the first positional argument of `ewm` is `com`, so this is a
# centre-of-mass-9 average (alpha = 0.1), not a span-9 EMA — confirm the intent.
df_2019['EMA_9'] = df_2019['Close'].ewm(com=9).mean().shift()

for column, window in (('SMA_5', 5), ('SMA_10', 10), ('SMA_15', 15), ('SMA_30', 30)):
    df_2019[column] = df_2019['Close'].rolling(window).mean().shift()

In [6]:
# Show the frame with the new indicator columns (pandas truncates the middle rows)
df_2019

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30
0,2016-01-04,25.652500,26.342501,25.500000,26.337500,24.111498,270597600,,,,,
1,2016-01-05,26.437500,26.462500,25.602501,25.677500,23.507280,223164000,26.337500,,,,
2,2016-01-06,25.139999,25.592501,24.967501,25.174999,23.047255,273829600,25.990132,,,,
3,2016-01-07,24.670000,25.032499,24.107500,24.112499,22.074543,324377600,25.689345,,,,
4,2016-01-08,24.637501,24.777500,24.190001,24.240000,22.191273,283192000,25.230826,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1758,2022-12-27,131.380005,131.410004,128.720001,130.029999,130.029999,69007800,138.976031,132.841998,136.839000,138.912001,143.646999
1759,2022-12-28,129.669998,131.029999,125.870003,126.040001,126.040001,85438400,138.081427,132.373999,135.392999,137.805334,142.991333
1760,2022-12-29,127.989998,130.479996,127.730003,129.610001,129.610001,75703700,136.877285,131.121999,133.449999,136.680667,142.249999
1761,2022-12-30,128.410004,129.949997,127.430000,129.929993,129.929993,76960600,136.150556,129.954000,132.089999,135.925333,141.569000


In [7]:
# Create RSI Indicator
def relative_strength_idx(df, n=14):
    """Compute a simple-moving-average RSI from the ``Close`` column of ``df``.

    Args:
        df: Frame with a numeric ``Close`` column.
        n: Rolling window length, in rows, used to average gains and losses.

    Returns:
        Series of RSI values indexed like ``df`` without its first row. Entries
        are NaN until the rolling window holds ``n`` price changes.
    """
    # Row-to-row price change; the first row has no predecessor, so drop it.
    change = df['Close'].diff().iloc[1:]

    # Split into gains and losses, with losses kept as positive magnitudes.
    gains = change.clip(lower=0)
    losses = change.clip(upper=0).abs()

    avg_gain = gains.rolling(n).mean()
    avg_loss = losses.rolling(n).mean()

    relative_strength = avg_gain / avg_loss
    return 100.0 - (100.0 / (1.0 + relative_strength))

# Attach the RSI column. The returned series has no entry for row 0, so that row
# stays NaN after assignment; the other NaNs are replaced by 0.
# NOTE(review): fillna(0) also turns any later undefined RSI (0/0, when a window has
# neither gains nor losses) into 0 — confirm that is acceptable.
df_2019['RSI'] = relative_strength_idx(df_2019).fillna(0)
df_2019.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI
0,2016-01-04,25.6525,26.342501,25.5,26.3375,24.111498,270597600,,,,,,
1,2016-01-05,26.4375,26.4625,25.602501,25.6775,23.50728,223164000,26.3375,,,,,0.0
2,2016-01-06,25.139999,25.592501,24.967501,25.174999,23.047255,273829600,25.990132,,,,,0.0
3,2016-01-07,24.67,25.032499,24.1075,24.112499,22.074543,324377600,25.689345,,,,,0.0
4,2016-01-08,24.637501,24.7775,24.190001,24.24,22.191273,283192000,25.230826,,,,,0.0


In [8]:
# Create MACD Indicator
# MACD line   = span-12 EMA of Close minus span-26 EMA of Close
# Signal line = span-9 EMA of the MACD line
# min_periods keeps each average NaN until that many observations are available.

EMA_12 = df_2019['Close'].ewm(span=12, min_periods=12).mean()
EMA_26 = df_2019['Close'].ewm(span=26, min_periods=26).mean()

df_2019['MACD'] = EMA_12 - EMA_26
df_2019['MACD_signal'] = df_2019['MACD'].ewm(span=9, min_periods=9).mean()

df_2019.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI,MACD,MACD_signal
0,2016-01-04,25.6525,26.342501,25.5,26.3375,24.111498,270597600,,,,,,,,
1,2016-01-05,26.4375,26.4625,25.602501,25.6775,23.50728,223164000,26.3375,,,,,0.0,,
2,2016-01-06,25.139999,25.592501,24.967501,25.174999,23.047255,273829600,25.990132,,,,,0.0,,
3,2016-01-07,24.67,25.032499,24.1075,24.112499,22.074543,324377600,25.689345,,,,,0.0,,
4,2016-01-08,24.637501,24.7775,24.190001,24.24,22.191273,283192000,25.230826,,,,,0.0,,


In [9]:
# Shift Labels
# After this cell, 'Close' on each row holds the NEXT row's close, which is the
# value the model is asked to predict; the last row becomes NaN.
# NOTE: not idempotent — re-running the cell shifts the column by another row.

df_2019['Close'] = df_2019['Close'].shift(-1)
df_2019['Close'].head()

0    25.677500
1    25.174999
2    24.112499
3    24.240000
4    24.632500
Name: Close, dtype: float64

In [10]:
# Drop Invalid Samples
#   - the first 33 rows: incomplete because of the moving averages and MACD line
#   - the last row: has no label after the close price was shifted
# The remaining rows are renumbered from 0.
# NOTE: not idempotent — re-running the cell trims another 33 + 1 rows.

df_2019 = df_2019.iloc[33:-1].reset_index(drop=True)

df_2019.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI,MACD,MACD_signal
0,2016-02-22,24.077499,24.225,23.98,23.672501,22.293282,137123200,24.011677,24.0525,23.826,23.904,24.152417,51.381201,-0.058951,-0.120254
1,2016-02-23,24.1,24.125,23.637501,24.025,21.789339,127770400,24.033105,24.197,23.8975,23.896333,24.156,50.635208,-0.079591,-0.111143
2,2016-02-24,23.495001,24.094999,23.33,24.190001,22.113792,145022800,23.996119,24.0995,23.8895,23.867333,24.137083,49.222154,-0.068312,-0.101772
3,2016-02-25,24.012501,24.190001,23.8125,24.227501,22.265671,110330800,23.999074,23.9985,23.91725,23.894333,24.116833,50.485448,-0.046239,-0.089846
4,2016-02-26,24.299999,24.504999,24.145,24.172501,22.300192,115964400,24.018561,24.0235,23.9795,23.901167,24.090167,60.284714,-0.025592,-0.076247


In [11]:
# Split Data into Train, Valid and Test Sets
# Rows are cut by position, not shuffled: leading rows go to train, trailing rows to test.

test_size  = 0.15
valid_size = 0.15

test_split_idx  = int(len(df_2019) * (1-test_size))
valid_split_idx = int(len(df_2019) * (1-(valid_size+test_size)))

# .loc slices by label and includes both endpoints, so each following slice
# starts one label past the previous split point.
train_df  = df_2019.loc[:valid_split_idx].copy()
valid_df  = df_2019.loc[valid_split_idx+1:test_split_idx].copy()
test_df   = df_2019.loc[test_split_idx+1:].copy()

In [12]:
# Drop unnecessary columns
# `columns=` replaces the positional axis argument (`drop(cols, 1)`), which
# pandas 2.0 no longer accepts.

drop_cols = ['Date', 'Volume', 'Open', 'Low', 'High', 'Adj Close']

train_df = train_df.drop(columns=drop_cols)
valid_df = valid_df.drop(columns=drop_cols)
test_df  = test_df.drop(columns=drop_cols)

In [13]:
# Separate the label column ('Close') from the feature columns for each split.
# `columns=` replaces the positional axis argument (`drop(cols, 1)`), which
# pandas 2.0 no longer accepts.
y_train = train_df['Close'].copy()
X_train = train_df.drop(columns=['Close'])

y_valid = valid_df['Close'].copy()
X_valid = valid_df.drop(columns=['Close'])

y_test  = test_df['Close'].copy()
X_test  = test_df.drop(columns=['Close'])

# Confirm the feature matrix has the expected columns and no missing values
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1211 entries, 0 to 1210
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   EMA_9        1211 non-null   float64
 1   SMA_5        1211 non-null   float64
 2   SMA_10       1211 non-null   float64
 3   SMA_15       1211 non-null   float64
 4   SMA_30       1211 non-null   float64
 5   RSI          1211 non-null   float64
 6   MACD         1211 non-null   float64
 7   MACD_signal  1211 non-null   float64
dtypes: float64(8)
memory usage: 75.8 KB


In [None]:
%%time

import xgboost as xgb
from sklearn.model_selection import GridSearchCV


# Hyperparameter grid: 5 * 5 * 6 * 4 = 600 combinations, each cross-validated.
# NOTE(review): `max_depth` and `gamma` are tree-booster parameters; with
# booster='gblinear' they are presumably ignored, which would make most of
# these fits redundant — confirm and trim the grid or switch booster.
parameters = {
    'n_estimators': [300, 400, 500, 600, 700],
    'learning_rate': [0.01, 0.05, 0.08, 0.10, 0.15],
    'max_depth': [5, 8, 10, 12, 15, 18],
    'gamma': [0.01, 0.02, 0.05, 0.08],
    'random_state': [42]
}

# Evaluation sets for fit(); kept as a module-level name for the final model fit.
# eval_set is an argument of fit(), not an estimator hyperparameter, so it is
# no longer passed to the XGBRegressor constructor.
eval_set = [(X_train, y_train), (X_valid, y_valid)]
model = xgb.XGBRegressor(objective='reg:squarederror', verbosity = 0, booster='gblinear')
clf = GridSearchCV(model, parameters)

# Only the training split is passed in: the score reported below is the mean
# cross-validated score on X_train, not a score on X_valid.
# NOTE(review): the default CV folds are not time-ordered — consider
# TimeSeriesSplit for this kind of data.
clf.fit(X_train, y_train)

print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')

In [None]:
%%time

# Rebuild the regressor with the grid search's best settings and fit it on the
# training split; eval_set is handed to fit() so XGBoost evaluates on those sets.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    verbosity=0,
    booster='gblinear',
    **clf.best_params_,
)
model.fit(X_train, y_train, eval_set=eval_set, verbose=0)

In [None]:
import numpy as np

# Predict the held-out test rows and print the first five labels next to the
# first five predictions as a quick sanity check
y_pred = model.predict(X_test)
print(f'y_true = {np.array(y_test)[:5]}')
print(f'y_pred = {y_pred[:5]}')

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go


# Rows of the test period, with 'Close' replaced by the model's predictions
predicted_prices = df_2019.loc[test_split_idx+1:].copy()
predicted_prices['Close'] = y_pred

# Shared look for the two kinds of series
truth_style = dict(name='Truth', marker_color='LightSkyBlue')
prediction_style = dict(name='Prediction', marker_color='MediumPurple')

fig = make_subplots(rows=2, cols=1)

# Row 1: full price history from `df`, with the predictions overlaid
fig.add_trace(go.Scatter(x=df.Date, y=df.Close, **truth_style), row=1, col=1)
fig.add_trace(
    go.Scatter(x=predicted_prices.Date, y=predicted_prices.Close, **prediction_style),
    row=1, col=1,
)

# Row 2: test period only, labels against predictions (legend entries not repeated)
fig.add_trace(
    go.Scatter(x=predicted_prices.Date, y=y_test, showlegend=False, **truth_style),
    row=2, col=1,
)
fig.add_trace(
    go.Scatter(x=predicted_prices.Date, y=y_pred, showlegend=False, **prediction_style),
    row=2, col=1,
)

fig.show()

In [None]:
from sklearn.metrics import mean_squared_error

print(f'mean_squared_error = {mean_squared_error(y_test, y_pred)}')

# Test-set mean squared error of the final model, in squared units of 'Close'.

In [None]:
# Wrap the predictions in a DataFrame; it gets a fresh default index starting at 0
y_pred_df = pd.DataFrame(y_pred)

In [None]:
# One-column frame of the test-period dates; it keeps predicted_prices' row labels
date_preds = pd.DataFrame(predicted_prices.Date)

In [None]:
# Put each date next to its prediction. concat(axis=1) aligns rows on index labels:
# predicted_prices keeps its labels from df_2019 while y_pred_df is numbered from 0,
# so the dates are renumbered first. Without that the rows do not line up and the
# gaps are filled with NaN.
result = pd.concat([predicted_prices.Date.reset_index(drop=True), y_pred_df], axis=1)

In [None]:
# Write the predictions (without the index) to CSV.
# NOTE(review): the file name says VOO, but the data is read from AAPL.csv — confirm the name.
y_pred_df.to_csv("VOO_predictions.csv", index=False)

In [None]:
# Write the matching test-period dates (without the index) to a separate CSV.
# NOTE(review): the file name says VOO, but the data is read from AAPL.csv — confirm the name.
date_preds.to_csv("VOO_date_preds.csv", index=False)