In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/apple-aapl-historical-stock-data/HistoricalQuotes.csv


In [None]:
import matplotlib.pyplot as plt

# Load the historical quotes and give the six columns clean, predictable names.
QUOTES_CSV = '/kaggle/input/apple-aapl-historical-stock-data/HistoricalQuotes.csv'
QUOTE_COLUMNS = ['Date', 'Close', 'Volume', 'Open', 'High', 'Low']

df = pd.read_csv(QUOTES_CSV)
df.columns = QUOTE_COLUMNS
df.head()

In [None]:
import yfinance as yf



# Preprocess Data

In [None]:
# Reverse the row order and renumber the index from 0.
# NOTE(review): presumably the CSV lists the most recent quote first, so this
# yields chronological order — confirm against the raw file.
df = df.iloc[::-1].reset_index(drop=True)

# Parse the date, keep the year as a separate feature (offset from 2010), and
# encode the day of the year as a cyclical feature with a sine transform.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
df['Year'] = df['Date'].dt.year - 2010
day_of_year = df['Date'].dt.dayofyear
df['DayOfYear'] = np.sin(day_of_year / day_of_year.max() * 2 * np.pi)

# Strip the literal dollar sign and convert the prices to floats right away, so
# every later cell (EMA, rolling min/max, model features) works on numeric data
# instead of relying on implicit string coercion.
# regex=False: '$' must be treated as a plain character, not as the regex
# end-of-string anchor. astype(str) keeps this step safe to re-run on floats.
for price_col in ['Close', 'Open', 'High', 'Low']:
    df[price_col] = (
        df[price_col]
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.strip()
        .astype('float64')
    )

# Min-max normalize volume to the range [0, 1].
volume = df['Volume']
df['Volume'] = (volume - volume.min()) / (volume.max() - volume.min())

df.head()

In [None]:
# Create MACD column
#
# Traditional MACD = 12-day EMA - 26-day EMA of the closing price
# (EMA = exponential moving average).

# Work on an explicit float copy of the closing price; this is a no-op when the
# column is already numeric and avoids computing the EMA on string data.
close = df['Close'].astype('float64')

ema12 = close.ewm(span=12, min_periods=12).mean()  # first 11 values are NaN
ema26 = close.ewm(span=26, min_periods=26).mean()  # first 25 values are NaN

# Add MACD to the dataframe.
df['MACD'] = ema12 - ema26

df.head()

In [None]:
# Create stochastic oscillator columns (%K and its moving average %D)
# https://stackoverflow.com/questions/30261541/slow-stochastic-implementation-in-python-pandas

k = 14  # look-back window for %K, in rows
d = 3   # smoothing window for %D, in rows

# Make sure all three price series are numeric *before* any rolling statistic
# is computed; these conversions are no-ops when the columns are already floats.
df['Close'] = df['Close'].astype('float64')
low = df['Low'].astype('float64')
high = df['High'].astype('float64')

low_min = low.rolling(window=k).min()
high_max = high.rolling(window=k).max()

# %K: position of the close inside the k-row low/high range, scaled to 0-100.
df['Stoch_k'] = 100 * (df['Close'] - low_min) / (high_max - low_min)
# %D: d-row simple moving average of %K.
df['Stoch_d'] = df['Stoch_k'].rolling(window=d).mean()

In [None]:
# Create label(s)
#
# The rows are ordered oldest -> newest by the preprocessing step, so the price
# N rows *ahead* is obtained with a negative shift. (A positive shift would
# pull in the price N rows earlier, i.e. a past value rather than a target.)
# Horizons are counted in rows of the table, not calendar days.

df['Y1'] = df['Close'].shift(periods=-1)    # Close 1 row ahead
df['Y7'] = df['Close'].shift(periods=-7)    # Close 7 rows ahead
df['Y30'] = df['Close'].shift(periods=-30)  # Close 30 rows ahead

# Remove rows with missing data: the indicator warm-up rows at the start and
# the rows at the end that have no future price yet.

df = df.dropna().reset_index(drop=True)
df

In [None]:
# Build the feature matrices and label vectors used by the experiments below.
# X1 and X2 are paired with y1, y7, and y30
# X3 is paired with y30_2

calendar_cols = ['Year', 'DayOfYear']
indicator_cols = ['MACD', 'Stoch_k', 'Stoch_d']

X1 = df[['Close', 'Volume'] + calendar_cols + indicator_cols]  # no Open, High, Low
X2 = df[['Close', 'Volume', 'Open', 'High', 'Low'] + calendar_cols + indicator_cols]
X3 = df[['Volume'] + calendar_cols + indicator_cols]  # no price columns at all

y1, y7, y30 = df['Y1'], df['Y7'], df['Y30']

# Price change between the two closing prices rather than the absolute price.
y30_2 = df['Y30'] - df['Close']

# Control for the 30 day prediction: same target, but without the technical
# indicators, to see whether the indicators helped predict the price or not.
X_control = df[['Close', 'Volume', 'Open', 'High', 'Low'] + calendar_cols]
y_control = df['Y30']

# Create Model

In [None]:
# X1 and y1: 1 DAY PREDICTION
# 90% train, 5% valid, 5% test
from sklearn import tree, metrics, datasets, model_selection

# A fixed seed makes the partition and the fitted tree reproducible across runs.
X1_train, X1_temp, y1_train, y1_temp = model_selection.train_test_split(
    X1, y1, test_size=0.1, random_state=42)
X1_valid, X1_test, y1_valid, y1_test = model_selection.train_test_split(
    X1_temp, y1_temp, test_size=0.5, random_state=42)
print(X1_train.shape, X1_valid.shape, X1_test.shape)

model1 = tree.DecisionTreeRegressor(max_depth=10, min_samples_split=2, random_state=42)
model1.fit(X1_train, y1_train)

y1_pred = model1.predict(X1_valid)
print('Model1 1 Day MAE Validation Score:',metrics.mean_absolute_error(y1_valid, y1_pred))

In [None]:
# X1 and y7: 7 DAY PREDICTION
# 90% train, 5% valid, 5% test; the test labels are kept as y2_test.
# A fixed seed makes the partition and the fitted tree reproducible across runs.
X1_train, X1_temp, y7_train, y7_temp = model_selection.train_test_split(
    X1, y7, test_size=0.1, random_state=42)
X1_valid, X1_test, y7_valid, y2_test = model_selection.train_test_split(
    X1_temp, y7_temp, test_size=0.5, random_state=42)

model2 = tree.DecisionTreeRegressor(max_depth=10, min_samples_split=2, random_state=42)
model2.fit(X1_train, y7_train)

y7_pred = model2.predict(X1_valid)
print('Model2 7 Day MAE Validation Score:',metrics.mean_absolute_error(y7_valid, y7_pred))

In [None]:
# X2 and y7: 7 DAY PREDICTION w/ high, low, open prices as features
# 90% train, 5% valid, 5% test; the test labels are kept as y3_test.
# A fixed seed makes the partition and the fitted tree reproducible across runs.

X2_train, X2_temp, y7_train, y7_temp = model_selection.train_test_split(
    X2, y7, test_size=0.1, random_state=42)
X2_valid, X2_test, y7_valid, y3_test = model_selection.train_test_split(
    X2_temp, y7_temp, test_size=0.5, random_state=42)

model3 = tree.DecisionTreeRegressor(max_depth=10, min_samples_split=2, random_state=42)
model3.fit(X2_train, y7_train)

y7_pred = model3.predict(X2_valid)
print('Model3 7 Day MAE Validation Score:',metrics.mean_absolute_error(y7_valid, y7_pred))

# Predictions are plotted in validation-set order (shuffled rows, not chronological).
fig, ax = plt.subplots()
ax.plot(y7_pred)
ax.set(title='Model3: predicted Y7 on the validation set',
       xlabel='Validation sample', ylabel='Predicted Y7')
plt.show()

In [None]:
# X2 and y30 vs. X3 and y30_2
# Both are 30 day predictions, but have different features and targets:
# model4 predicts the absolute price (y30), model5 predicts the price change
# (y30_2 = Y30 - Close). Both errors are in price units, so the MAEs should be
# roughly comparable.
# NOTE(review): newer scikit-learn releases spell this criterion
# 'absolute_error' instead of 'mae' — confirm against the installed version.

# A fixed seed makes the partitions and the fitted trees reproducible across runs.
X2_train, X2_temp, y30_train, y30_temp = model_selection.train_test_split(
    X2, y30, test_size=0.1, random_state=42)
X2_valid, X2_test, y30_valid, y4_test = model_selection.train_test_split(
    X2_temp, y30_temp, test_size=0.5, random_state=42)

model4 = tree.DecisionTreeRegressor(criterion='mae', max_depth=10, min_samples_split=2, random_state=42)
model4.fit(X2_train, y30_train)

y30_pred = model4.predict(X2_valid)

# The X3 experiment must split X3 itself (not X2), and the second split must
# use the X3_temp that belongs to y30_2_temp; otherwise features and labels
# come from different rows and different column sets.
X3_train, X3_temp, y30_2_train, y30_2_temp = model_selection.train_test_split(
    X3, y30_2, test_size=0.1, random_state=42)
X3_valid, X3_test, y30_2_valid, y5_test = model_selection.train_test_split(
    X3_temp, y30_2_temp, test_size=0.5, random_state=42)

model5 = tree.DecisionTreeRegressor(criterion='mae', max_depth=10, min_samples_split=2, random_state=42)
model5.fit(X3_train, y30_2_train)

y30_2_pred = model5.predict(X3_valid)

In [None]:
# Control feature set: prices, volume and calendar features only (no technical
# indicators). The name must be `X_control` (lower-case c): that is the
# variable the control-model cell reads.
X_control = df[['Close', 'Volume', 'Open', 'High', 'Low', 'Year', 'DayOfYear']]
y_control = df['Y30']

In [None]:
# Control model: 30 day prediction without technical indicators
# 90% train, 5% valid, 5% test.
# A fixed seed makes the partition and the fitted tree reproducible across runs.
# NOTE(review): newer scikit-learn releases spell this criterion
# 'absolute_error' instead of 'mae' — confirm against the installed version.

X_control_train, X_control_temp, y_control_train, y_control_temp = model_selection.train_test_split(
    X_control, y_control, test_size=0.1, random_state=42)
X_control_valid, X_control_test, y_control_valid, y_control_test = model_selection.train_test_split(
    X_control_temp, y_control_temp, test_size=0.5, random_state=42)

model6 = tree.DecisionTreeRegressor(criterion='mae', max_depth=10, min_samples_split=2, random_state=42)
model6.fit(X_control_train, y_control_train)

y_control_pred = model6.predict(X_control_valid)



Tested hyperparameters:

max_depth: doesn't seem to have a significant effect for values from 1 to 1024

splitter: best > random

min_samples_split: 2, 4, 8 are best. tested powers of 2 until 2^6, then powers of 10 until 10^6

In [None]:
# Validation MAE of the three 30 day models.
print('Model4 30 Day MAE Validation score: ',metrics.mean_absolute_error(y30_valid, y30_pred))
print('Model5 30 Day MAE Validation score: ',metrics.mean_absolute_error(y30_2_valid, y30_2_pred))
print('Control Model 30 Day MAE Validation score: ',metrics.mean_absolute_error(y_control_valid, y_control_pred))

# Label the lines explicitly instead of relying on the default colour cycle.
fig, ax = plt.subplots()
ax.plot(y30_pred, label='Model4 (X2 features)')
ax.plot(y_control_pred, label='Control (no technical indicators)')
ax.set(title='30 day predictions on the validation sets',
       xlabel='Validation sample', ylabel='Predicted Y30')
ax.legend()
plt.show()

In [None]:
# Model5 predicts the 30 day price *change* relative to the starting close.
model5_valid_mae = metrics.mean_absolute_error(y30_2_valid, y30_2_pred)
print('Model5 MAE Validation score: ', model5_valid_mae)

plt.plot(y30_2_pred)
plt.show()

In [None]:
def _aligned_test_features(train_features, y_test):
    """Return the feature rows that belong to the held-out labels `y_test`.

    The `X*_test` variables are reassigned by several cells, so by the time
    this cell runs they may hold rows from a different split than the matching
    `y*_test`. Selecting rows from `df` by the label index (and the columns the
    model was fitted on) guarantees features and labels describe the same rows.
    """
    return df.loc[y_test.index, train_features.columns]


# Final evaluation on the held-out test rows of each experiment.
model1_predictions = model1.predict(_aligned_test_features(X1_train, y1_test))
model2_predictions = model2.predict(_aligned_test_features(X1_train, y2_test))
model3_predictions = model3.predict(_aligned_test_features(X2_train, y3_test))
model4_predictions = model4.predict(_aligned_test_features(X2_train, y4_test))
# model5 predicts price change instead of absolute price
model5_predictions = model5.predict(_aligned_test_features(X3_train, y5_test))
control_model_predictions = model6.predict(_aligned_test_features(X_control_train, y_control_test))

print('Model1 1 Day MAE Test Score:',metrics.mean_absolute_error(y1_test, model1_predictions))
print('Model2 7 Day MAE Test Score:',metrics.mean_absolute_error(y2_test, model2_predictions))
print('Model3 7 Day MAE Test Score:',metrics.mean_absolute_error(y3_test, model3_predictions))
print('Model4 30 Day MAE Test score: ',metrics.mean_absolute_error(y4_test, model4_predictions))
print('Model5 30 Day MAE Test score: ',metrics.mean_absolute_error(y5_test, model5_predictions))
print('Control 30 Day MAE Test score:',metrics.mean_absolute_error(y_control_test, control_model_predictions))