# Full Process

In [1]:
import pandas as pd

# Mute sklearn warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

----

### Process 1: Load the price data

In [None]:
def get_data(path, year):
    """Load a price CSV and keep only the rows dated in `year` or later.

    Parameters
    ----------
    path : str or path-like
        CSV file to read; it must contain a 'Date' column.
    year : int
        First calendar year to keep (inclusive).

    Returns
    -------
    pandas.DataFrame
        A filtered copy whose 'Date' column is datetime-typed and whose
        index runs from 0 to n-1.
    """
    import pandas as pd

    prices = pd.read_csv(path)
    prices['Date'] = pd.to_datetime(prices['Date'])

    # Keep everything from `year` onwards (not just that single year).
    is_recent = prices['Date'].dt.year >= year
    recent = prices.loc[is_recent].copy()

    # Renumber the rows so positions and index labels coincide.
    recent.index = range(len(recent))
    return recent

# Load the AMZN rows dated 2016 or later. `df` is the name the feature
# pipeline cell further down reads; the loading loop below also rebinds it.
df = get_data('files/datasets/AMZN.csv', 2016)

In [None]:
# Load the four tickers (rows dated 2016 or later), one frame per symbol.
df_AAPL, df_AMZN, df_MSFT, df_VOO = (
    get_data(csv_path, 2016)
    for csv_path in (
        'files/datasets/AAPL.csv',
        'files/datasets/AMZN.csv',
        'files/datasets/MSFT.csv',
        'files/datasets/VOO.csv',
    )
)

In [16]:
# Preview the CSV paths built from the ticker symbols, one per line.
print(*(f'files/datasets/{i}.csv' for i in ['AAPL', 'AMZN', 'MSFT', 'VOO']), sep='\n')

files/datasets/AAPL.csv
files/datasets/AMZN.csv
files/datasets/MSFT.csv
files/datasets/VOO.csv


In [None]:
datasets = ['AAPL', 'AMZN', 'MSFT', 'VOO']

# One frame per ticker, keyed by symbol (e.g. stock_frames['AAPL']).
# The earlier `eval(f'df_{symbol} = df')` raised SyntaxError on the first
# iteration: eval() only evaluates expressions, and an assignment is a
# statement. A dict avoids generating variable names from strings.
# `df` is deliberately not rebound here, so the frame that the pipeline cell
# reads does not silently change to the last ticker of the loop.
stock_frames = {
    symbol: get_data(f'files/datasets/{symbol}.csv', 2016)
    for symbol in datasets
}

# Keep the per-ticker names the loop was meant to create.
df_AAPL, df_AMZN, df_MSFT, df_VOO = (stock_frames[symbol] for symbol in datasets)


### Process 2: Moving-average indicators (EMA, SMA)

In [3]:
def ma_indicators(dataframe):
    """Add lagged moving-average columns of 'Close' to `dataframe` in place.

    Columns written: 'EMA_9' and 'SMA_5', 'SMA_10', 'SMA_15', 'SMA_30'.
    Every column is shifted down one row, so row i only reflects closes up
    to row i-1.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Close' column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    close = dataframe['Close']

    # Exponential moving average. The first positional argument of ewm() is
    # `com`, so this is centre-of-mass 9.
    # NOTE(review): com=9 is not the same as span=9; confirm which one the
    # 'EMA_9' name is meant to describe.
    dataframe['EMA_9'] = close.ewm(com=9).mean().shift()

    # Simple moving averages over four window lengths.
    for window in (5, 10, 15, 30):
        dataframe[f'SMA_{window}'] = close.rolling(window).mean().shift()

    return dataframe

#df = ma_indicators(df)

### Process 3: Relative Strength Index (RSI)

In [6]:
def add_rsi_indicator(dataframe, n=14):
    """Add an 'RSI' column computed from 'Close' to `dataframe` in place.

    The index is built from simple rolling means of the gains and of the
    absolute losses over `n` rows: RSI = 100 - 100 / (1 + gain / loss).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Close' column; it is modified in place.
    n : int, default 14
        Rolling window length.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Row-to-row change; the first entry has no predecessor and is dropped,
    # so the first row of 'RSI' ends up NaN after index alignment.
    change = dataframe['Close'].diff()[1:]

    gains = change.clip(lower=0)
    losses = change.clip(upper=0).abs()

    avg_gain = gains.rolling(n).mean()
    avg_loss = losses.rolling(n).mean()

    strength = avg_gain / avg_loss
    rsi = 100.0 - (100.0 / (1.0 + strength))

    # Rows without a full window (or with 0/0 strength) are reported as 0.
    dataframe['RSI'] = rsi.fillna(0)
    return dataframe

### Process 4: MACD and signal line

In [8]:
def macd_indicator(dataframe):
    """Add 'MACD' and 'MACD_signal' columns to `dataframe` in place.

    'MACD' is the 12-span EMA of 'Close' minus its 26-span EMA;
    'MACD_signal' is the 9-span EMA of 'MACD'. Each EMA only produces a
    value once it has seen `min_periods` observations, so the leading rows
    of both columns are NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Close' column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    close = dataframe['Close']

    # Fast and slow exponential moving averages of the close price.
    fast_ema = close.ewm(span=12, min_periods=12).mean()
    slow_ema = close.ewm(span=26, min_periods=26).mean()

    macd_line = fast_ema - slow_ema
    dataframe['MACD'] = macd_line
    dataframe['MACD_signal'] = macd_line.ewm(span=9, min_periods=9).mean()

    return dataframe

### Process 5: Shift the close price to create the label

In [10]:
def label_shift(dataframe):
    """Overwrite 'Close' with the next row's close, in place.

    After the call, row i holds the close of row i+1 and the last row is
    NaN. Calling it again shifts the column one more step.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Close' column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    next_close = dataframe['Close'].shift(periods=-1)
    dataframe['Close'] = next_close
    return dataframe

### Process 6: Drop invalid samples

In [12]:
def drop_invalid_samples(dataframe, warmup=33):
    """Return `dataframe` without its warm-up rows and without its last row.

    The leading rows are dropped because the indicator columns are NaN there;
    the last row is dropped because label_shift() leaves its 'Close' as NaN.
    The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to trim.
    warmup : int, default 33
        Number of leading rows to discard. The default matches
        macd_indicator(): 'MACD' first appears at position 25 (26-row
        minimum) and 'MACD_signal' needs 9 of those values, so position 33
        is the first row where every indicator is defined.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. It is empty when the input
        has `warmup + 1` rows or fewer.

    Raises
    ------
    ValueError
        If `warmup` is negative.
    """
    if warmup < 0:
        raise ValueError(f'warmup must be non-negative, got {warmup}')

    # One positional slice removes the head and the tail; reset_index returns
    # an independent frame, so later column assignments on the result do not
    # trigger chained-assignment warnings.
    return dataframe.iloc[warmup:-1].reset_index(drop=True)

In [13]:
# Build the model-ready frame from a copy of `df`.
# ma_indicators, add_rsi_indicator, macd_indicator and label_shift all write
# into the frame they receive, so piping `df` itself would shift its 'Close'
# one more row every time this cell is re-run. Copying first keeps the cell
# idempotent and leaves `df` holding the original close prices.
pipe_clean = (df.copy()
                .pipe(ma_indicators)
                .pipe(add_rsi_indicator)
                .pipe(macd_indicator)
                .pipe(label_shift)
                .pipe(drop_invalid_samples)
             )
pipe_clean

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI,MACD,MACD_signal
0,2016-02-22,27.110001,28.032499,27.054001,27.702000,27.975000,111332000,26.852347,26.7460,25.73075,26.141733,27.918867,50.230005,-0.404848,-0.710852
1,2016-02-23,27.777500,27.845501,27.266500,27.757500,27.646999,81016000,26.934086,27.0644,26.05495,26.068833,27.828683,57.060310,-0.301058,-0.619034
2,2016-02-24,27.287500,27.713499,26.657499,27.761499,27.702000,124634000,27.012849,27.2638,26.41480,26.075300,27.722517,55.955607,-0.212231,-0.530028
3,2016-02-25,27.775999,27.969500,27.264500,27.625999,27.757500,90510000,27.089030,27.5653,26.73815,26.155567,27.617950,71.318429,-0.139992,-0.446265
4,2016-02-26,28.006001,28.125000,27.658501,28.952000,27.761499,97540000,27.157669,27.7686,26.99520,26.218800,27.573650,78.449010,-0.092132,-0.371318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,2022-12-23,83.250000,85.779999,82.930000,81.820000,85.250000,57433700,89.185616,85.1840,87.68500,88.267333,91.802666,38.589575,-3.426447,-3.253638
1725,2022-12-27,84.970001,85.349998,83.000000,84.180000,83.040001,57284000,88.571055,84.8080,86.93400,87.736000,91.210999,36.073826,-3.574356,-3.317782
1726,2022-12-28,82.800003,83.480003,81.690002,84.000000,81.820000,58228600,87.895949,84.1340,85.86700,87.307333,90.655333,37.309752,-3.461244,-3.346474
1727,2022-12-29,82.870003,84.550003,82.550003,85.820000,84.180000,54995900,87.524354,83.6160,85.12700,87.022000,90.163333,39.044346,-3.347538,-3.346687


# STOP HERE

----

In [14]:
# Chronological train / validation / test split of `pipe_clean`: rows stay in
# their original order, the last 15% go to test and the 15% before that to
# validation.
test_size  = 0.15
valid_size = 0.15

# Row labels at which the validation and test partitions begin (minus one).
test_split_idx  = int(pipe_clean.shape[0] * (1-test_size))
valid_split_idx = int(pipe_clean.shape[0] * (1-(valid_size+test_size)))

# .loc slices by label and includes BOTH endpoints, hence the `+1` on the
# next partition's start so that no row lands in two sets. This relies on the
# 0..n-1 index assigned by drop_invalid_samples().
train_df  = pipe_clean.loc[:valid_split_idx].copy()
valid_df  = pipe_clean.loc[valid_split_idx+1:test_split_idx].copy()
test_df   = pipe_clean.loc[test_split_idx+1:].copy()

In [4]:


# NOTE(review): `new_data` is not defined in any visible cell, so this fails on
# a fresh kernel. label_shift() also modifies the frame in place, so each
# re-run shifts 'Close' one more row.
new_data = label_shift(new_data)

In [6]:
# The trimmed frame is only displayed here; it is not assigned to a name, so
# `new_data` itself still contains the warm-up rows and the last row.
drop_invalid_samples(new_data)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI,MACD,MACD_signal
0,2016-02-22,24.077499,24.225000,23.980000,23.672501,22.293282,137123200,24.011677,24.052500,23.826000,23.904000,24.152417,51.381201,-0.058951,-0.120254
1,2016-02-23,24.100000,24.125000,23.637501,24.025000,21.789339,127770400,24.033105,24.197000,23.897500,23.896333,24.156000,50.635208,-0.079591,-0.111143
2,2016-02-24,23.495001,24.094999,23.330000,24.190001,22.113792,145022800,23.996119,24.099500,23.889500,23.867333,24.137083,49.222154,-0.068312,-0.101772
3,2016-02-25,24.012501,24.190001,23.812500,24.227501,22.265671,110330800,23.999074,23.998500,23.917250,23.894333,24.116833,50.485448,-0.046239,-0.089846
4,2016-02-26,24.299999,24.504999,24.145000,24.172501,22.300192,115964400,24.018561,24.023500,23.979500,23.901167,24.090167,60.284714,-0.025592,-0.076247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,2022-12-23,130.919998,132.419998,129.639999,130.029999,131.860001,63814900,139.766701,133.371997,137.869000,139.975334,144.147332,26.261623,-3.670694,-2.570162
1725,2022-12-27,131.380005,131.410004,128.720001,126.040001,130.029999,69007800,138.976031,132.841998,136.839000,138.912001,143.646999,27.960271,-3.936230,-2.843375
1726,2022-12-28,129.669998,131.029999,125.870003,129.610001,126.040001,85438400,138.081427,132.373999,135.392999,137.805334,142.991333,26.152342,-4.417705,-3.158241
1727,2022-12-29,127.989998,130.479996,127.730003,129.929993,129.610001,75703700,136.877285,131.121999,133.449999,136.680667,142.249999,30.302111,-4.459798,-3.418553


In [7]:
# Shift labels: label_shift() overwrites 'Close' in place with the next row's
# close and returns the same frame.
# NOTE(review): `df_2019` is not created in any visible cell, and re-running
# this cell shifts the labels one more row each time.
df_2019 = label_shift(df_2019)
df_2019['Close'].head()

0    25.174999
1    24.112499
2    24.240000
3    24.632500
4    24.990000
Name: Close, dtype: float64

In [9]:
# Drop invalid samples: drop_invalid_samples() removes the first 33 rows
# (moving averages and MACD line are not defined there) and the last row
# (its shifted close is NaN), then renumbers the index from 0.
# NOTE(review): the rebinding makes this cell non-idempotent; every re-run
# trims another 33 + 1 rows.
df_2019 = drop_invalid_samples(df_2019)

df_2019

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI,MACD,MACD_signal
0,2016-04-08,27.227501,27.442499,27.042500,27.610001,25.004004,94326800,26.751381,27.521000,27.187750,26.966500,26.122167,59.500705,0.594005,0.617386
1,2016-04-11,27.242500,27.652500,27.207500,28.010000,25.086843,117630000,26.792779,27.454500,27.262500,27.014167,26.221334,60.528070,0.561695,0.606247
2,2016-04-12,27.334999,27.625000,27.165001,28.025000,25.413607,108929200,26.839037,27.349500,27.358250,27.065833,26.322250,62.093632,0.558166,0.596631
3,2016-04-13,27.700001,28.084999,27.700001,27.462500,25.781784,133029200,26.916187,27.381000,27.427250,27.141333,26.436833,68.029294,0.580810,0.593466
4,2016-04-14,27.905001,28.097500,27.832500,26.870001,25.795591,101895600,27.025637,27.435000,27.489250,27.230000,26.532750,70.106306,0.593118,0.593397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1690,2022-12-22,134.350006,134.559998,130.300003,130.029999,132.229996,77852100,140.604112,134.225998,138.911000,141.047334,144.235332,25.595210,-3.464708,-2.295029
1691,2022-12-23,130.919998,132.419998,129.639999,126.040001,131.860001,63814900,139.766701,133.371997,137.869000,139.975334,144.147332,26.261623,-3.670694,-2.570162
1692,2022-12-27,131.380005,131.410004,128.720001,129.610001,130.029999,69007800,138.976031,132.841998,136.839000,138.912001,143.646999,27.960271,-3.936230,-2.843375
1693,2022-12-28,129.669998,131.029999,125.870003,129.929993,126.040001,85438400,138.081427,132.373999,135.392999,137.805334,142.991333,26.152342,-4.417705,-3.158241


In [12]:
# Split Data into Train, Valid and Test Sets
# Chronological split (no shuffling): the last 15% of rows are the test set and
# the 15% before them the validation set.
# NOTE(review): this repeats the earlier split of `pipe_clean` and rebinds the
# same names (test_split_idx, valid_split_idx, train_df, valid_df, test_df).

test_size  = 0.15
valid_size = 0.15

test_split_idx  = int(df_2019.shape[0] * (1-test_size))
valid_split_idx = int(df_2019.shape[0] * (1-(valid_size+test_size)))

# .loc slices by label and includes both endpoints, hence the `+1` offsets;
# this assumes `df_2019` carries a 0..n-1 index.
train_df  = df_2019.loc[:valid_split_idx].copy()
valid_df  = df_2019.loc[valid_split_idx+1:test_split_idx].copy()
test_df   = df_2019.loc[test_split_idx+1:].copy()

In [13]:
# Drop unnecessary columns: only 'Close' (the label) and the indicator
# features are kept for modelling.

drop_cols = ['Date', 'Volume', 'Open', 'Low', 'High', 'Adj Close']

# Use the `columns=` keyword. The positional form drop(labels, 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on; the
# FutureWarning filter at the top of the notebook was hiding the deprecation.
train_df = train_df.drop(columns=drop_cols)
valid_df = valid_df.drop(columns=drop_cols)
test_df  = test_df.drop(columns=drop_cols)

In [14]:
# Separate the label ('Close', already shifted to the next row's close) from
# the feature columns in each partition.
# `columns=` replaces the positional axis argument drop(['Close'], 1), which
# pandas 2.0 no longer accepts.
y_train = train_df['Close'].copy()
X_train = train_df.drop(columns=['Close'])

y_valid = valid_df['Close'].copy()
X_valid = valid_df.drop(columns=['Close'])

y_test  = test_df['Close'].copy()
X_test  = test_df.drop(columns=['Close'])

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1211 entries, 0 to 1210
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   EMA_9        1211 non-null   float64
 1   SMA_5        1211 non-null   float64
 2   SMA_10       1211 non-null   float64
 3   SMA_15       1211 non-null   float64
 4   SMA_30       1211 non-null   float64
 5   RSI          1211 non-null   float64
 6   MACD         1211 non-null   float64
 7   MACD_signal  1211 non-null   float64
dtypes: float64(8)
memory usage: 75.8 KB


In [15]:
%%time

import xgboost as xgb
from sklearn.model_selection import GridSearchCV


parameters = {
    'n_estimators': [300, 400, 500, 600, 700],
    'learning_rate': [0.01, 0.05, 0.08, 0.10, 0.15],
    'max_depth': [5, 8, 10, 12, 15, 18],
    'gamma': [0.01, 0.02, 0.05, 0.08],
    'random_state': [42]
}

eval_set = [(X_train, y_train), (X_valid, y_valid)]
model = xgb.XGBRegressor(eval_set=eval_set, objective='reg:squarederror', verbosity = 0, booster='gblinear')
clf = GridSearchCV(model, parameters)

clf.fit(X_train, y_train)

print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')

Best params: {'gamma': 0.01, 'learning_rate': 0.15, 'max_depth': 18, 'n_estimators': 500, 'random_state': 42}
Best validation score = 0.965298498274034
CPU times: total: 16min 48s
Wall time: 4min 22s


In [16]:
%%time

# Refit a single model with the best grid-search parameters. `eval_set`
# (train and validation pairs, built in the grid-search cell) is passed to
# fit() so both sets are evaluated during training; verbose=0 silences the
# per-round log.
model = xgb.XGBRegressor(**clf.best_params_, objective='reg:squarederror', 
                         verbosity = 0, 
                         booster='gblinear')
model.fit(X_train, y_train, eval_set=eval_set, verbose=0)

CPU times: total: 625 ms
Wall time: 228 ms


In [17]:
import numpy as np

# Predict the held-out test rows and eyeball the first five label/prediction
# pairs side by side.
y_pred = model.predict(X_test)
print(f'y_true = {np.asarray(y_test)[:5]}')
print(f'y_pred = {y_pred[:5]}')

y_true = [175.639999 176.279999 180.330002 179.289993 179.380005]
y_pred = [173.58078 174.20074 174.7468  176.54182 177.77231]


In [18]:
# Two stacked panels: the full price history with the predictions overlaid
# (row 1) and a close-up of the test period only (row 2).
# NOTE(review): this cell needs the `plotly` package; the recorded output is a
# ModuleNotFoundError, so the figure was never rendered in this environment.
from plotly.subplots import make_subplots
import plotly.graph_objects as go


# Test-period rows of df_2019 with 'Close' replaced by the model's predictions.
predicted_prices = df_2019.loc[test_split_idx+1:].copy()
predicted_prices['Close'] = y_pred

fig = make_subplots(rows=2, cols=1)
# NOTE(review): `df` is whichever frame earlier cells last bound to that name
# (the get_data call or the loading loop), not `df_2019`. Confirm it is the
# same ticker as the predictions before trusting this overlay.
fig.add_trace(go.Scatter(x=df.Date, y=df.Close,
                         name='Truth',
                         marker_color='LightSkyBlue'), row=1, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=predicted_prices.Close,
                         name='Prediction',
                         marker_color='MediumPurple'), row=1, col=1)

# Row 2 repeats both series for the test period; legend entries are hidden
# because row 1 already provides them.
fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=y_test,
                         name='Truth',
                         marker_color='LightSkyBlue',
                         showlegend=False), row=2, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=y_pred,
                         name='Prediction',
                         marker_color='MediumPurple',
                         showlegend=False), row=2, col=1)

fig.show()

ModuleNotFoundError: No module named 'plotly'

In [19]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the test labels and the predictions; the value is
# in squared price units.
print(f'mean_squared_error = {mean_squared_error(y_test, y_pred)}')

mean_squared_error = 21.357888413124407


# SHET AHAHAH IM SO PROUD

In [None]:
# Wrap the predictions in a frame for export; no column name is given, so the
# single column gets the default label 0 and the index starts at 0.
y_pred_df = pd.DataFrame(y_pred)

In [None]:
# Dates of the predicted rows as a one-column frame; it keeps the row labels of
# `predicted_prices` (which start at test_split_idx+1, not at 0).
date_preds = pd.DataFrame(predicted_prices.Date)

In [None]:
# Pair every prediction with its date. concat(axis=1) aligns rows on index
# labels: the dates are labelled from test_split_idx+1 onwards while
# y_pred_df is labelled from 0, so without resetting the date index the two
# columns would not line up and the result would be padded with NaN rows.
result = pd.concat([predicted_prices.Date.reset_index(drop=True), y_pred_df], axis=1)

In [None]:
# Write the predictions to disk without the index column.
# NOTE(review): the file name is hard-coded to AAPL; confirm it matches the
# ticker the model was actually trained on.
y_pred_df.to_csv("AAPL_predictions.csv", index=False)

In [None]:
# Write the matching dates to a separate file, again without the index column;
# rows correspond to AAPL_predictions.csv by position only.
date_preds.to_csv("AAPL_date_preds.csv", index=False)