In [28]:
# Import libraries
import os
import sys
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
#from sklearn.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score, classification_report

#from config import ACCOUNT_NUMBER, ACCOUNT_PASSWORD, CONSUMER_ID, REDIRECT_URI

In [29]:
import yfinance as yf

# Ticker symbol whose daily price history is downloaded from Yahoo Finance.
ticker = 'AAPL'

# Fetch the history for the requested date window.
price_data = yf.download(ticker, start='2010-01-01', end='2020-01-01')

# Tag every row with its ticker so later cells can group by 'symbol'.
price_data.insert(0, 'symbol', ticker)

# Lower-case the price/volume column names; any other column keeps its name.
lowercase_names = {name: name.lower() for name in ('Open', 'High', 'Low', 'Close', 'Volume')}
price_data = price_data.rename(columns=lowercase_names)


[*********************100%%**********************]  1 of 1 completed


In [30]:
# Row-over-row change in the closing price; the first row has no predecessor, so it is NaN.
close_prices = price_data['close']
price_data['change_in_price'] = close_prices - close_prices.shift(1)
price_data

Unnamed: 0_level_0,symbol,open,high,low,close,Adj Close,volume,change_in_price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04,AAPL,7.622500,7.660714,7.585000,7.643214,6.461976,493729600,
2010-01-05,AAPL,7.664286,7.699643,7.616071,7.656429,6.473147,601904800,0.013215
2010-01-06,AAPL,7.656429,7.686786,7.526786,7.534643,6.370185,552160000,-0.121786
2010-01-07,AAPL,7.562500,7.571429,7.466071,7.520714,6.358409,477131200,-0.013929
2010-01-08,AAPL,7.510714,7.571429,7.466429,7.570714,6.400681,447610800,0.050000
...,...,...,...,...,...,...,...,...
2019-12-24,AAPL,71.172501,71.222504,70.730003,71.067497,69.054337,48478800,0.067497
2019-12-26,AAPL,71.205002,72.495003,71.175003,72.477501,70.424400,93121200,1.410004
2019-12-27,AAPL,72.779999,73.492500,72.029999,72.449997,70.397682,146266000,-0.027504
2019-12-30,AAPL,72.364998,73.172501,71.305000,72.879997,70.815491,144114400,0.430000


In [31]:
# Calculate the 14 day RSI
n = 14

# Work on two independent copies so the clipping below never touches price_data itself.
up_df = price_data[['symbol', 'change_in_price']].copy()
down_df = price_data[['symbol', 'change_in_price']].copy()

# Up days keep only gains: any negative change becomes 0.
# The row mask and the column label must be passed to a single .loc[...] call.
# Writing `df.loc['change_in_price'] = ... = 0` instead also assigns to a *row*
# labelled 'change_in_price', which appends a bogus all-zero row to the frame.
up_df.loc[up_df['change_in_price'] < 0, 'change_in_price'] = 0

# Down days keep only losses: any positive change becomes 0.
down_df.loc[down_df['change_in_price'] > 0, 'change_in_price'] = 0

# Losses are needed as magnitudes, so drop the sign.
down_df['change_in_price'] = down_df['change_in_price'].abs()

# Exponentially weighted moving averages (newer values weigh more), computed per symbol.
ewma_up = up_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())
ewma_down = down_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())

# Relative Strength: average gain divided by average loss.
# When the average loss is 0 this is inf, which maps to an RSI of 100 below.
relative_strength = ewma_up / ewma_down

# Relative Strength Index, scaled to the 0-100 range.
relative_strength_index = 100.0 - (100.0 / (1.0 + relative_strength))

# Add the info to the data frame (assignment aligns on the index).
price_data['down_days'] = down_df['change_in_price']
price_data['up_days'] = up_df['change_in_price']
price_data['RSI'] = relative_strength_index

# Display the head.
#price_data.head(30)

In [32]:
# Stochastic Oscillator (%K) over a 14 row look-back window
n = 14

# Rolling extremes are taken per symbol so one ticker's window never spans another's rows.
stochastic_groups = price_data.groupby('symbol')
low_14 = stochastic_groups['low'].transform(lambda lows: lows.rolling(window = n).min())
high_14 = stochastic_groups['high'].transform(lambda highs: highs.rolling(window = n).max())

# %K: where the close sits inside the window's low-high range, as a percentage.
window_range = high_14 - low_14
k_percent = 100 * ((price_data['close'] - low_14) / window_range)

# Store the rolling extremes and the oscillator on the main frame.
price_data['low_14'] = low_14
price_data['high_14'] = high_14
price_data['k_percent'] = k_percent

# Display the head.
#price_data.head(30)

In [33]:
# Williams %R over a 14 row look-back window
n = 14

# Per-symbol rolling minimum of the lows and rolling maximum of the highs.
williams_groups = price_data.groupby('symbol')
low_14 = williams_groups['low'].transform(lambda lows: lows.rolling(window = n).min())
high_14 = williams_groups['high'].transform(lambda highs: highs.rolling(window = n).max())

# %R: distance of the close below the window high, relative to the window range,
# expressed on a 0 to -100 scale.
distance_from_high = high_14 - price_data['close']
r_percent = (distance_from_high / (high_14 - low_14)) * - 100

# Store the indicator on the main frame.
price_data['r_percent'] = r_percent

# Display the head.
#price_data.head(30)

In [34]:
# Calculate the MACD: the 12-span EMA of the close minus the 26-span EMA,
# each computed per symbol.
close_by_symbol = price_data.groupby('symbol')['close']
ema_26 = close_by_symbol.transform(lambda x: x.ewm(span = 26).mean())
ema_12 = close_by_symbol.transform(lambda x: x.ewm(span = 12).mean())
macd = ema_12 - ema_26

# Signal line: 9-span EMA of the MACD.
# This is grouped by symbol like every other indicator, so the smoothing of one
# ticker never continues into the rows of the next. With a single symbol the
# result is the same as smoothing the whole column.
ema_9_macd = macd.groupby(price_data['symbol']).transform(lambda x: x.ewm(span = 9).mean())

# Store the data in the data frame.
price_data['MACD'] = macd
price_data['MACD_EMA'] = ema_9_macd

# Print the head.
#price_data.head(30)

In [35]:
# Price Rate of Change over a 9 row look-back
n = 9

# Fractional change of the close versus the close n rows earlier, computed per symbol.
roc_close_groups = price_data.groupby('symbol')['close']
price_data['Price_Rate_Of_Change'] = roc_close_groups.transform(lambda closes: closes.pct_change(periods = n))

# Show the first row
price_data.head(1)



Unnamed: 0_level_0,symbol,open,high,low,close,Adj Close,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-01-04,AAPL,7.6225,7.660714,7.585,7.643214,6.461976,493729600,,,,,,,,,0.0,0.0,


In [36]:
def obv(group):
    """Return the On Balance Volume (OBV) series for one symbol's rows.

    OBV is a running total of volume: a row's volume is added when the close
    rose versus the previous row, subtracted when it fell, and ignored when
    the close did not change. The first row has no previous close, so it
    contributes nothing and the running total starts at 0.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single symbol, with 'close' and 'volume' columns,
        in the order the running total should follow (assumed chronological).

    Returns
    -------
    pandas.Series
        The running OBV, indexed like `group`.
    """
    # +1 / -1 / 0 per row; the leading NaN produced by diff() counts as "no change".
    direction = np.sign(group['close'].diff()).fillna(0).astype('int64')

    # Signed volume, accumulated from the first row onward.
    return (direction * group['volume']).cumsum()

# Compute OBV separately for every symbol so that one ticker's running total
# never carries over into another's.
obv_groups = {symbol: obv(group) for symbol, group in price_data.groupby('symbol')}

# Write each symbol's values back by row position (boolean mask) rather than by
# index label, so this also works when several symbols share the same dates.
on_balance_volume = pd.Series(0, index = price_data.index, dtype = price_data['volume'].dtype)
for symbol, symbol_obv in obv_groups.items():
    on_balance_volume.loc[(price_data['symbol'] == symbol).to_numpy()] = symbol_obv.to_numpy()

price_data['On Balance Volume'] = on_balance_volume.to_numpy()

price_data

# display the data frame.
#price_data.head(30)

Unnamed: 0_level_0,symbol,open,high,low,close,Adj Close,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2010-01-04,AAPL,7.622500,7.660714,7.585000,7.643214,6.461976,493729600,,,,,,,,,0.000000,0.000000,,0
2010-01-05,AAPL,7.664286,7.699643,7.616071,7.656429,6.473147,601904800,0.013215,0.000000,0.013215,100.000000,,,,,0.000296,0.000165,,601904800
2010-01-06,AAPL,7.656429,7.686786,7.526786,7.534643,6.370185,552160000,-0.121786,0.121786,0.000000,8.595610,,,,,-0.003399,-0.001296,,49744800
2010-01-07,AAPL,7.562500,7.571429,7.466071,7.520714,6.358409,477131200,-0.013929,0.013929,0.000000,7.670347,,,,,-0.005499,-0.002720,,-427386400
2010-01-08,AAPL,7.510714,7.571429,7.466429,7.570714,6.400681,447610800,0.050000,0.000000,0.050000,36.141065,,,,,-0.004535,-0.003260,,20224400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-24,AAPL,71.172501,71.222504,70.730003,71.067497,69.054337,48478800,0.067497,0.000000,0.067497,80.321854,65.682503,71.222504,97.202051,-2.797949,1.656653,1.501481,0.049858,14213572800
2019-12-26,AAPL,71.205002,72.495003,71.175003,72.477501,70.424400,93121200,1.410004,0.000000,1.410004,86.309094,66.227501,72.495003,99.720753,-0.279247,1.781618,1.557508,0.067966,14306694000
2019-12-27,AAPL,72.779999,73.492500,72.029999,72.449997,70.397682,146266000,-0.027504,0.027504,0.000000,85.722067,66.227501,73.492500,85.650331,-14.349669,1.857027,1.617412,0.053244,14160428000
2019-12-30,AAPL,72.364998,73.172501,71.305000,72.879997,70.815491,144114400,0.430000,0.000000,0.430000,87.282434,66.464996,73.492500,91.284202,-8.715798,1.929248,1.679779,0.041664,14304542400


In [37]:
# Create the column we wish to predict.
#
# The target is the direction of the closing price relative to the previous row
# of the same symbol: 1.0 when the close at time 't' is greater than at 't-1'
# and -1.0 when it is lower. Flat days are folded into the "up" class below so
# the problem stays binary.

# Group by the `symbol` column, then grab the `close` column.
close_groups = price_data.groupby('symbol')['close']

# Apply the lambda function which will return -1.0 for down, 1.0 for up and 0.0 for no change.
close_groups = close_groups.transform(lambda x : np.sign(x.diff()))

# add the data to the main dataframe.
price_data['Prediction'] = close_groups

# Treat flat days as up days to keep this a binary classifier.
# The column label is required here: without it, .loc would overwrite *every*
# column of the matching rows (symbol, prices and all indicators) with 1.0.
price_data.loc[price_data['Prediction'] == 0.0, 'Prediction'] = 1.0

# print the head
#price_data.head(5)

# OPTIONAL CODE: Dump the data frame to a CSV file to examine the data yourself.
# price_data.to_csv('final_metrics.csv')

In [38]:
# Rows containing any NaN cannot be used downstream, so report the frame size,
# drop those rows, and report the size again.
rows_before, columns_before = price_data.shape
print('Before NaN Drop we have {} rows and {} columns'.format(rows_before, columns_before))

# dropna() with default arguments removes every row holding at least one NaN.
price_data = price_data.dropna()

rows_after, columns_after = price_data.shape
print('After NaN Drop we have {} rows and {} columns'.format(rows_after, columns_after))

# Print the head.
#price_data.head()

Before NaN Drop we have 2516 rows and 20 columns
After NaN Drop we have 2503 rows and 20 columns


In [39]:
# Indicator columns used as model inputs, and the direction label used as the target.
feature_names = ['RSI', 'k_percent', 'r_percent', 'Price_Rate_Of_Change', 'MACD', 'On Balance Volume']
X_Cols = price_data[feature_names]
Y_Cols = price_data['Prediction']

# Split features and target into train and test portions (default split size, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X_Cols, Y_Cols, random_state = 0)

# Random Forest configuration: 100 trees, Gini criterion, out-of-bag scoring enabled, fixed seed.
forest_settings = dict(n_estimators = 100, oob_score = True, criterion = "gini", random_state = 0)
rand_frst_clf = RandomForestClassifier(**forest_settings)

# Train on the training portion.
rand_frst_clf.fit(X_train, y_train)

# Predict the direction for the held-out test portion.
y_pred = rand_frst_clf.predict(X_test)

In [40]:
#train = price_data
#valid = data[training_data_len:]
#valid['Predictions'] = y_pred

In [41]:


#plt.figure(figsize = (16,8))
#plt.title('Model')  
#plt.xlabel('Date', fontsize=18 )
#plt.ylabel('Close Price USD $', fontsize=18)
#plt.plot(train['Close'])
#plt.plot(valid[['Close', 'Predictions']])
#plt.legend(['Train', 'Val', 'Predictions'], loc = 'lower right')
#plt.show()

In [42]:
# Report the share of test rows the model classified correctly, as a percentage.
test_predictions = rand_frst_clf.predict(X_test)
accuracy_fraction = accuracy_score(y_test, test_predictions, normalize = True)
print('Correct Prediction (%): ', accuracy_fraction * 100.0)

Correct Prediction (%):  69.32907348242811


In [43]:
# Display names for the two classes.
# NOTE(review): assumes the report orders classes ascending (-1.0 first, then 1.0) - confirm.
target_names = ['Down Day', 'Up Day']

# Build the classification report as a nested dictionary.
report = classification_report(
    y_true = y_test,
    y_pred = y_pred,
    target_names = target_names,
    output_dict = True,
)

# Turn it into a data frame with one row per class / summary entry.
report_df = pd.DataFrame(report).T
report_df

Unnamed: 0,precision,recall,f1-score,support
Down Day,0.68932,0.68932,0.68932,309.0
Up Day,0.697161,0.697161,0.697161,317.0
accuracy,0.693291,0.693291,0.693291,0.693291
macro avg,0.693241,0.693241,0.693241,626.0
weighted avg,0.693291,0.693291,0.693291,626.0


In [44]:
from sklearn.metrics import confusion_matrix
from yellowbrick.classifier import ConfusionMatrix

# Confusion matrix of the test-set predictions: rows are true classes, columns predicted classes.
rf_matrix = confusion_matrix(y_test, y_pred)

# For a binary problem the flattened 2x2 matrix reads: tn, fp, fn, tp.
# Unpacking also fails loudly if the matrix is not 2x2 instead of silently
# picking four cells out of a larger matrix.
true_negatives, false_positives, false_negatives, true_positives = rf_matrix.ravel()

# Share of all predictions that were correct.
accuracy = (true_negatives + true_positives) / (true_negatives + true_positives + false_negatives + false_positives)
# Share of predicted positives that were truly positive.
precision = true_positives / (true_positives + false_positives)
# Share of true positives that were predicted positive.
recall = true_positives / (true_positives + false_negatives)
# Share of true negatives that were predicted negative.
specificity = true_negatives / (true_negatives + false_positives)

# Backward-compatible alias for the original, misspelled variable name.
percision = precision

print('Accuracy: {}'.format(float(accuracy)))
print('Precision: {}'.format(float(precision)))
print('Recall: {}'.format(float(recall)))
print('Specificity: {}'.format(float(specificity)))

#disp = ConfusionMatrix(rand_frst_clf, X_test, y_test, display_labels = ['Down Day', 'Up Day'], normalize = 'true', cmap=plt.cm.Blues)
#disp.ax_.set_title('Confusion Matrix - Normalized')
#plt.show()

Accuracy: 0.6932907348242812
Percision: 0.6971608832807571
Recall: 0.6971608832807571
Specificity: 0.6893203883495146
