# Full Process

In [1]:
import pandas as pd

# Mute FutureWarning / DeprecationWarning globally (this silences every library, not only sklearn)
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

In [2]:
# Load the raw AMZN price table from disk
df = pd.read_csv('files/datasets/AMZN.csv')

# Parse 'Date' into datetimes so the .dt accessor can be used below
df['Date'] = pd.to_datetime(df['Date'])

# Keep every row dated 2016 or later and renumber the rows from 0.
# Note: despite its name, df_2019 is not restricted to 2019.
df_2019 = df.loc[df['Date'].dt.year >= 2016].reset_index(drop=True)

In [5]:
df_2019

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2016-01-04,32.814499,32.886002,31.375500,31.849501,31.849501,186290000
1,2016-01-05,32.342999,32.345501,31.382500,31.689501,31.689501,116452000
2,2016-01-06,31.100000,31.989500,31.015499,31.632500,31.632500,106584000
3,2016-01-07,31.090000,31.500000,30.260500,30.396999,30.396999,141498000
4,2016-01-08,30.983000,31.207001,30.299999,30.352501,30.352501,110258000
...,...,...,...,...,...,...,...
1758,2022-12-27,84.970001,85.349998,83.000000,83.040001,83.040001,57284000
1759,2022-12-28,82.800003,83.480003,81.690002,81.820000,81.820000,58228600
1760,2022-12-29,82.870003,84.550003,82.550003,84.180000,84.180000,54995900
1761,2022-12-30,83.120003,84.050003,82.470001,84.000000,84.000000,62330000


In [6]:
# Create Moving Average Indicators
#
# Every series is shifted down by one row, so the value stored on a row is
# computed only from the closes of earlier rows.
# NOTE(review): com=9 (which is what a bare positional ewm(9) means) is the
# centre of mass, not the span; a conventional 9-period EMA would be
# ewm(span=9). Confirm which was intended before relying on the column name.
df_2019 = df_2019.assign(
    EMA_9=df_2019['Close'].ewm(com=9).mean().shift(1),
    **{
        f'SMA_{window}': df_2019['Close'].rolling(window=window).mean().shift(1)
        for window in (5, 10, 15, 30)
    },
)

In [7]:
df_2019

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30
0,2016-01-04,32.814499,32.886002,31.375500,31.849501,31.849501,186290000,,,,,
1,2016-01-05,32.342999,32.345501,31.382500,31.689501,31.689501,116452000,31.849501,,,,
2,2016-01-06,31.100000,31.989500,31.015499,31.632500,31.632500,106584000,31.765290,,,,
3,2016-01-07,31.090000,31.500000,30.260500,30.396999,30.396999,141498000,31.716290,,,,
4,2016-01-08,30.983000,31.207001,30.299999,30.352501,30.352501,110258000,31.332664,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1758,2022-12-27,84.970001,85.349998,83.000000,83.040001,83.040001,57284000,89.185616,85.184,87.685,88.267333,91.802666
1759,2022-12-28,82.800003,83.480003,81.690002,81.820000,81.820000,58228600,88.571055,84.808,86.934,87.736000,91.210999
1760,2022-12-29,82.870003,84.550003,82.550003,84.180000,84.180000,54995900,87.895949,84.134,85.867,87.307333,90.655333
1761,2022-12-30,83.120003,84.050003,82.470001,84.000000,84.000000,62330000,87.524354,83.616,85.127,87.022000,90.163333


In [9]:
# Create RSI Indicator
def relative_strength_idx(df, n=14):
    """Compute a simple-moving-average RSI from the 'Close' column.

    Args:
        df: DataFrame with a numeric 'Close' column.
        n: Rolling window length, in rows, used to average gains and losses.

    Returns:
        A Series of RSI values. The first row of ``df`` has no entry
        (its price change is undefined and is dropped), and the first
        ``n - 1`` returned entries are NaN because the window is not yet full.
        A window with gains but no losses yields 100; a window with neither
        gains nor losses yields NaN (0 / 0).
    """
    # Row-to-row price change; the leading NaN produced by diff() is dropped.
    change = df['Close'].diff().iloc[1:]

    # Separate the changes into gains and loss magnitudes (both >= 0).
    gains = change.clip(lower=0)
    losses = change.clip(upper=0).abs()

    avg_gain = gains.rolling(n).mean()
    avg_loss = losses.rolling(n).mean()

    relative_strength = avg_gain / avg_loss
    return 100.0 - (100.0 / (1.0 + relative_strength))

# Attach the 14-row RSI as a feature. NaNs returned during the warm-up window
# are replaced by 0; row 0 has no entry in the returned Series at all, so index
# alignment on assignment leaves it as NaN.
df_2019['RSI'] = relative_strength_idx(df_2019, n=14).fillna(value=0)
df_2019

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI
0,2016-01-04,32.814499,32.886002,31.375500,31.849501,31.849501,186290000,,,,,,
1,2016-01-05,32.342999,32.345501,31.382500,31.689501,31.689501,116452000,31.849501,,,,,0.000000
2,2016-01-06,31.100000,31.989500,31.015499,31.632500,31.632500,106584000,31.765290,,,,,0.000000
3,2016-01-07,31.090000,31.500000,30.260500,30.396999,30.396999,141498000,31.716290,,,,,0.000000
4,2016-01-08,30.983000,31.207001,30.299999,30.352501,30.352501,110258000,31.332664,,,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1758,2022-12-27,84.970001,85.349998,83.000000,83.040001,83.040001,57284000,89.185616,85.184,87.685,88.267333,91.802666,38.589575
1759,2022-12-28,82.800003,83.480003,81.690002,81.820000,81.820000,58228600,88.571055,84.808,86.934,87.736000,91.210999,36.073826
1760,2022-12-29,82.870003,84.550003,82.550003,84.180000,84.180000,54995900,87.895949,84.134,85.867,87.307333,90.655333,37.309752
1761,2022-12-30,83.120003,84.050003,82.470001,84.000000,84.000000,62330000,87.524354,83.616,85.127,87.022000,90.163333,39.044346


In [10]:
# Create MACD Indicator
#
# MACD is the 12-row EMA of 'Close' minus its 26-row EMA; the signal line is a
# 9-row EMA of MACD. min_periods keeps each series NaN until its window has
# enough observations.
EMA_12 = df_2019['Close'].ewm(span=12, min_periods=12).mean()
EMA_26 = df_2019['Close'].ewm(span=26, min_periods=26).mean()

df_2019['MACD'] = EMA_12 - EMA_26
df_2019['MACD_signal'] = df_2019['MACD'].ewm(span=9, min_periods=9).mean()

df_2019

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI,MACD,MACD_signal
0,2016-01-04,32.814499,32.886002,31.375500,31.849501,31.849501,186290000,,,,,,,,
1,2016-01-05,32.342999,32.345501,31.382500,31.689501,31.689501,116452000,31.849501,,,,,0.000000,,
2,2016-01-06,31.100000,31.989500,31.015499,31.632500,31.632500,106584000,31.765290,,,,,0.000000,,
3,2016-01-07,31.090000,31.500000,30.260500,30.396999,30.396999,141498000,31.716290,,,,,0.000000,,
4,2016-01-08,30.983000,31.207001,30.299999,30.352501,30.352501,110258000,31.332664,,,,,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1758,2022-12-27,84.970001,85.349998,83.000000,83.040001,83.040001,57284000,89.185616,85.184,87.685,88.267333,91.802666,38.589575,-3.426447,-3.253638
1759,2022-12-28,82.800003,83.480003,81.690002,81.820000,81.820000,58228600,88.571055,84.808,86.934,87.736000,91.210999,36.073826,-3.574356,-3.317782
1760,2022-12-29,82.870003,84.550003,82.550003,84.180000,84.180000,54995900,87.895949,84.134,85.867,87.307333,90.655333,37.309752,-3.461244,-3.346474
1761,2022-12-30,83.120003,84.050003,82.470001,84.000000,84.000000,62330000,87.524354,83.616,85.127,87.022000,90.163333,39.044346,-3.347538,-3.346687


In [11]:
# Shift Labels
#
# Overwrite 'Close' with the next row's close, so each row's features are
# paired with the following row's closing price as the prediction target.
# The last row has no successor and becomes NaN.
# NOTE: this is not idempotent -- running the cell again shifts the column a
# second time.
df_2019['Close'] = df_2019['Close'].shift(periods=-1)
df_2019

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI,MACD,MACD_signal
0,2016-01-04,32.814499,32.886002,31.375500,31.689501,31.849501,186290000,,,,,,,,
1,2016-01-05,32.342999,32.345501,31.382500,31.632500,31.689501,116452000,31.849501,,,,,0.000000,,
2,2016-01-06,31.100000,31.989500,31.015499,30.396999,31.632500,106584000,31.765290,,,,,0.000000,,
3,2016-01-07,31.090000,31.500000,30.260500,30.352501,30.396999,141498000,31.716290,,,,,0.000000,,
4,2016-01-08,30.983000,31.207001,30.299999,30.886999,30.352501,110258000,31.332664,,,,,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1758,2022-12-27,84.970001,85.349998,83.000000,81.820000,83.040001,57284000,89.185616,85.184,87.685,88.267333,91.802666,38.589575,-3.426447,-3.253638
1759,2022-12-28,82.800003,83.480003,81.690002,84.180000,81.820000,58228600,88.571055,84.808,86.934,87.736000,91.210999,36.073826,-3.574356,-3.317782
1760,2022-12-29,82.870003,84.550003,82.550003,84.000000,84.180000,54995900,87.895949,84.134,85.867,87.307333,90.655333,37.309752,-3.461244,-3.346474
1761,2022-12-30,83.120003,84.050003,82.470001,85.820000,84.000000,62330000,87.524354,83.616,85.127,87.022000,90.163333,39.044346,-3.347538,-3.346687


In [12]:
# Drop Invalid Samples
#
# The first 33 rows are cut because the indicators are still warming up there
# (MACD_signal is the last to become available: a 26-row EMA followed by a
# 9-row EMA). The final row is cut because its shifted 'Close' label has no
# next-row value. The remaining rows are renumbered from 0.
df_2019 = df_2019.iloc[33:-1].reset_index(drop=True)

df_2019

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI,MACD,MACD_signal
0,2016-02-22,27.110001,28.032499,27.054001,27.646999,27.975000,111332000,26.741336,26.2218,25.443900,26.233400,27.999600,46.148433,-0.552951,-0.842483
1,2016-02-23,27.777500,27.845501,27.266500,27.702000,27.646999,81016000,26.868232,26.7460,25.730750,26.141733,27.918867,50.230005,-0.429432,-0.749935
2,2016-02-24,27.287500,27.713499,26.657499,27.757500,27.702000,124634000,26.948108,27.0644,26.054950,26.068833,27.828683,57.060310,-0.323733,-0.656685
3,2016-02-25,27.775999,27.969500,27.264500,27.761499,27.757500,90510000,27.025235,27.2638,26.414800,26.075300,27.722517,55.955607,-0.233121,-0.565721
4,2016-02-26,28.006001,28.125000,27.658501,27.625999,27.761499,97540000,27.099977,27.5653,26.738150,26.155567,27.617950,71.318429,-0.159234,-0.479694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,2022-12-23,83.250000,85.779999,82.930000,83.040001,85.250000,57433700,89.622907,85.7060,88.068999,88.859333,92.181999,37.681774,-3.312041,-3.210436
1725,2022-12-27,84.970001,85.349998,83.000000,81.820000,83.040001,57284000,89.185616,85.1840,87.685000,88.267333,91.802666,38.589575,-3.426447,-3.253638
1726,2022-12-28,82.800003,83.480003,81.690002,84.180000,81.820000,58228600,88.571055,84.8080,86.934000,87.736000,91.210999,36.073826,-3.574356,-3.317782
1727,2022-12-29,82.870003,84.550003,82.550003,84.000000,84.180000,54995900,87.895949,84.1340,85.867000,87.307333,90.655333,37.309752,-3.461244,-3.346474


# Everything above this point works as intended

---

In [None]:
# Split Data into Train, Valid and Test Sets
#
# The split is chronological (no shuffling): the first ~70% of rows train the
# model, the next ~15% validate it and the last ~15% test it.

test_size  = 0.15
valid_size = 0.15

test_split_idx  = int(len(df_2019) * (1 - test_size))
valid_split_idx = int(len(df_2019) * (1 - (valid_size + test_size)))

# Positional slices; the "+ 1" keeps the boundary row in the earlier split,
# matching an end-inclusive label slice on the 0..n-1 index.
train_df = df_2019.iloc[:valid_split_idx + 1].copy()
valid_df = df_2019.iloc[valid_split_idx + 1:test_split_idx + 1].copy()
test_df  = df_2019.iloc[test_split_idx + 1:].copy()

In [None]:
# Drop unnecessary columns

drop_cols = ['Date', 'Volume', 'Open', 'Low', 'High', 'Adj Close']

# `columns=` replaces the positional axis argument: passing the axis
# positionally (drop(cols, 1)) was deprecated in pandas 1.3 and raises
# TypeError from pandas 2.0, where only `labels` may be positional.
train_df = train_df.drop(columns=drop_cols)
valid_df = valid_df.drop(columns=drop_cols)
test_df  = test_df.drop(columns=drop_cols)

In [None]:
# Separate the label ('Close', which by this point holds the next row's close)
# from the feature columns of each split.
# `columns=` is used because a positional axis argument (drop(cols, 1)) is
# rejected with TypeError by pandas 2.x.
y_train = train_df['Close'].copy()
X_train = train_df.drop(columns=['Close'])

y_valid = valid_df['Close'].copy()
X_valid = valid_df.drop(columns=['Close'])

y_test  = test_df['Close'].copy()
X_test  = test_df.drop(columns=['Close'])

X_train.info()

In [None]:
%%time

import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit


# Search grid for the linear booster. Only options that gblinear consumes are
# swept: tree-booster settings such as max_depth and gamma are ignored when
# booster='gblinear', so including them would multiply the number of fits
# without changing any fitted model.
parameters = {
    'n_estimators': [300, 400, 500, 600, 700],
    'learning_rate': [0.01, 0.05, 0.08, 0.10, 0.15],
    'random_state': [42]
}

# Train/validation pairs consumed by the final model.fit(...) call in the next
# cell. eval_set is an argument of fit(), not of the XGBRegressor constructor,
# so it is deliberately not passed to the estimator below.
eval_set = [(X_train, y_train), (X_valid, y_valid)]

model = xgb.XGBRegressor(objective='reg:squarederror', verbosity=0, booster='gblinear')

# The rows are in date order, so use forward-chaining splits: every fold is
# validated on rows that come after the rows it was trained on. The default
# unshuffled K-fold would also train on later rows than it validates on.
clf = GridSearchCV(model, parameters, cv=TimeSeriesSplit(n_splits=5))

clf.fit(X_train, y_train)

print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')

In [None]:
%%time

# Rebuild the regressor with the hyper-parameters selected by the grid search
# and fit it on the training split, passing the train/validation pairs in
# eval_set to fit().
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    booster='gblinear',
    verbosity=0,
    **clf.best_params_,
)
model.fit(X_train, y_train, eval_set=eval_set, verbose=0)

In [None]:
import numpy as np

# Predict on the test features and print the first five labels next to the
# first five predictions as a quick sanity check.
y_pred = model.predict(X_test)
print(f'y_true = {y_test.to_numpy()[:5]}')
print(f'y_pred = {y_pred[:5]}')

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go


# Rows of the test split, with 'Close' replaced by the model's predictions.
predicted_prices = df_2019.iloc[test_split_idx + 1:].assign(Close=y_pred)

fig = make_subplots(rows=2, cols=1)

# Top panel: the full 'Close' series from the raw frame, predictions overlaid.
# NOTE(review): the labels and predictions are next-row closes (see the label
# shift earlier) but are plotted at the current row's Date, so they sit one
# row earlier than the raw series -- confirm this offset is acceptable.
fig.add_trace(
    go.Scatter(
        x=df['Date'],
        y=df['Close'],
        name='Truth',
        marker_color='LightSkyBlue',
    ),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(
        x=predicted_prices['Date'],
        y=predicted_prices['Close'],
        name='Prediction',
        marker_color='MediumPurple',
    ),
    row=1, col=1,
)

# Bottom panel: test-split labels against predictions only. The legend entries
# are hidden here because the top panel already shows the same two names.
fig.add_trace(
    go.Scatter(
        x=predicted_prices['Date'],
        y=y_test,
        name='Truth',
        marker_color='LightSkyBlue',
        showlegend=False,
    ),
    row=2, col=1,
)
fig.add_trace(
    go.Scatter(
        x=predicted_prices['Date'],
        y=y_pred,
        name='Prediction',
        marker_color='MediumPurple',
        showlegend=False,
    ),
    row=2, col=1,
)

fig.show()

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions against the test-split labels
test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'mean_squared_error = {test_mse}')

# SHET AHAHAH IM SO PROUD

In [None]:
# Wrap the prediction array in a one-column frame (default 0..n-1 index,
# column label 0) so it can be concatenated and written to CSV.
y_pred_df = pd.DataFrame(data=y_pred)

In [None]:
# One-column frame holding the Date of every test-split row; it keeps the
# row labels of predicted_prices.
date_preds = predicted_prices[['Date']].copy()

In [None]:
# Pair each date with its prediction by position. predicted_prices keeps the
# row labels of the test split while y_pred_df is numbered from 0, so the Date
# index is reset first; concatenating as-is would align by label and pad the
# non-matching rows with NaN.
result = pd.concat(
    [predicted_prices.Date.reset_index(drop=True), y_pred_df],
    axis=1,
)

In [None]:
# Write the predictions (values only, no dates, no index column) to disk
y_pred_df.to_csv(path_or_buf="AMZN_predictions.csv", index=False)

In [None]:
# Write the matching test-split dates to a separate file, without the index
date_preds.to_csv(path_or_buf="AMZN_date_preds.csv", index=False)