# Objective

The objective of this notebook is to transfer and edit all functions of `spy_stock_eda.ipynb` and build them into a single class. 

In [38]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller

from datetime import datetime, timedelta

import plotly.graph_objects as go
import plotly.express as px

import sys
sys.path.append('../')

import src.tda_api_tools as tda

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.simplefilter('ignore', category=UserWarning)

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

%matplotlib inline

## Test Data

In [39]:
# Load the daily SPY price history from CSV and index it by timestamp
symbol = "SPY"
file_path = f"../data/{symbol}_daily01.csv"
data_df = pd.read_csv(file_path, index_col="datetime")
data_df.index = pd.DatetimeIndex(data_df.index)  # convert the "datetime" index read from the CSV into a DatetimeIndex
data_df.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-10-14 01:00:00,200.18,200.87,198.94,199.29,99106161
2015-10-15 01:00:00,200.08,202.36,199.64,202.35,134142195
2015-10-16 01:00:00,202.83,203.29,201.92,203.27,114580052
2015-10-19 01:00:00,202.5,203.37,202.13,203.37,76523897
2015-10-20 01:00:00,202.85,203.84,202.5471,203.09,78448484


## Data Manipulation


In [40]:
def moving_averages(input_df, days_list:list = [10, 50], col:str="close"):
    """
    Build a frame holding one price column alongside its simple moving averages.
    
    Input
    ========
    input_df: Pandas DataFrame of stock prices. It must contain the column named by col. The frame is copied, never modified.
    
    days_list: window sizes handed to rolling(); one "MA(<window>)" column is produced per entry. DEFAULT = [10, 50]
    
    col: name of the column of input_df to average. DEFAULT = "close"
    
    Output
    ========
    output_df: Pandas DataFrame with col followed by its "MA(<window>)" columns in days_list order. Rows that contain any NaN (such as the rows before a window is full) are dropped.
    
    """
    frame = input_df.copy()  # work on a copy so the caller's frame stays untouched
    windows = list(days_list)
    labels = [f"MA({window})" for window in windows]

    # add one rolling-mean column per requested window
    for window, label in zip(windows, labels):
        frame[label] = frame[col].rolling(window).mean()

    # keep only the source column and its averages, then drop incomplete rows
    selected = frame[[col] + labels]
    return selected.dropna()

In [41]:
# Demo: closing price with its 10- and 100-window moving averages
moving_averages(data_df, [10, 100], "close")

Unnamed: 0_level_0,close,MA(10),MA(100)
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-03-08 01:00:00,198.40,197.36942,200.046192
2016-03-09 01:00:00,199.38,197.98742,200.047092
2016-03-10 01:00:00,199.54,198.38742,200.018992
2016-03-11 01:00:00,202.76,199.15450,200.013892
2016-03-14 01:00:00,202.50,200.04850,200.005192
...,...,...,...
2020-12-11 01:00:00,366.30,367.05500,343.327100
2020-12-14 01:00:00,364.66,367.31500,343.744100
2020-12-15 01:00:00,369.59,367.67200,344.231200
2020-12-16 01:00:00,370.17,368.01000,344.700700


In [49]:
def differences(input_df, lags=1):
    """
    Append i-period differences of every column, for i = 1 .. lags.
    
    Input
    ========
    input_df: Pandas DataFrame of stock prices. Every column is differenced, so every column must support diff(). The frame is copied, never modified.
    
    lags: highest difference period to compute. DEFAULT = 1
    
    Output
    ========
    output_df: Pandas DataFrame with the original columns plus one "<column>_lag_<i>" column per original column and period i. Despite the "_lag_" label, each new column holds column.diff(i) (the value minus the value i rows earlier). Rows that contain any NaN are dropped.
    
    """
    frame = input_df.copy()  # work on a copy so the caller's frame stays untouched
    base_columns = list(frame.columns)  # snapshot taken before new columns are appended

    for period in range(1, lags + 1):
        for name in base_columns:
            frame["{}_lag_{}".format(name, period)] = frame[name].diff(period)

    return frame.dropna()

In [50]:
# Demo: 1-period differences of every column (default lags=1)
differences(data_df)

Unnamed: 0_level_0,open,high,low,close,volume,open_lag_1,high_lag_1,low_lag_1,close_lag_1,volume_lag_1
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-10-15 01:00:00,200.08,202.3600,199.6400,202.35,134142195,-0.10,1.4900,0.7000,3.06,35036034.0
2015-10-16 01:00:00,202.83,203.2900,201.9200,203.27,114580052,2.75,0.9300,2.2800,0.92,-19562143.0
2015-10-19 01:00:00,202.50,203.3700,202.1300,203.37,76523897,-0.33,0.0800,0.2100,0.10,-38056155.0
2015-10-20 01:00:00,202.85,203.8400,202.5471,203.09,78448484,0.35,0.4700,0.4171,-0.28,1924587.0
2015-10-21 01:00:00,203.61,203.7900,201.6500,201.85,102038033,0.76,-0.0500,-0.8971,-1.24,23589549.0
...,...,...,...,...,...,...,...,...,...,...
2020-12-11 01:00:00,364.90,366.7402,363.2600,366.30,57698614,-0.47,-1.1198,-1.1700,-0.43,-36739.0
2020-12-14 01:00:00,368.64,369.8000,364.4700,364.66,69216174,3.74,3.0598,1.2100,-1.64,11517560.0
2020-12-15 01:00:00,367.40,369.5900,365.9200,369.59,64071099,-1.24,-0.2100,1.4500,4.93,-5145075.0
2020-12-16 01:00:00,369.82,371.1600,368.8676,370.17,58420517,2.42,1.5700,2.9476,0.58,-5650582.0


## Statistical Tests

In [53]:
# Augmented Dickey-Fuller (ADF) test

def adf_test(timeseries:pd.Series, lags="AIC", alpha=0.05):
    """
    Run the Augmented Dickey-Fuller test on a series and summarise the result.
    
    Input
    ========
    timeseries: Pandas Series to test. Its name is reported in the summary.
    
    lags: value passed to adfuller as its autolag argument. DEFAULT = "AIC"
    
    alpha: significance level the p-value is compared against. DEFAULT = 0.05
    
    Output
    ========
    dfoutput: Pandas Series labelled "series name", "Test Statistic", "p-value", "#Lags Used", "Number of Observations Used" and "reject null hypothesis" (True when p-value <= alpha), followed by one "Critical Value (<level>)" entry per critical value returned by adfuller.
    
    """
    adf_result = adfuller(timeseries, autolag=lags)

    # assemble the summary: series name first, then the four leading ADF outputs
    name_part = pd.Series([timeseries.name], index=["series name"])
    stat_labels = ["Test Statistic",
                   "p-value",
                   "#Lags Used",
                   "Number of Observations Used"]
    stats_part = pd.Series(adf_result[0:4], index=stat_labels)
    dfoutput = pd.concat([name_part, stats_part])

    # the null hypothesis is rejected when the p-value does not exceed alpha;
    # the data has potential to be stationary if this is True
    dfoutput["reject null hypothesis"] = bool(dfoutput["p-value"] <= alpha)

    # append the critical values reported by adfuller (fifth element of its result)
    for level, critical_value in adf_result[4].items():
        dfoutput["Critical Value (%s)"%level] = critical_value

    return dfoutput

In [55]:
# Demo: ADF test on the closing-price series with the default autolag and alpha
adf_test(data_df.close)

series name                       close
Test Statistic                -0.626937
p-value                        0.864779
#Lags Used                            9
Number of Observations Used        1295
reject null hypothesis            False
Critical Value (1%)            -3.43541
Critical Value (5%)            -2.86377
Critical Value (10%)           -2.56796
dtype: object

In [59]:
def get_pacf_lag(data, columns:list=["close"], nlags=20):
    """
    Pick, for each requested column, the significant PACF lag with the largest jump from the previous lag.
    
    Input
    ========
    data: Pandas DataFrame containing every column named in columns.
    
    columns: list of column names to analyse. DEFAULT = ["close"]
    
    nlags: number of lags passed to pacf. DEFAULT = 20
    
    Output
    ========
    lag_dict: dict mapping each column name to its selected lag. Only lags whose absolute PACF value exceeds 2/sqrt(N) (N = length of the column) are considered; among them, the lag with the largest absolute change in PACF value relative to the previous lag is chosen. Lag 0 has no previous lag, so its change is NaN and it sorts last.
    
    Raises
    ========
    IndexError: if no lag of a column passes the significance threshold.
    
    """
    lag_list = []
    for col in columns: # for specified columns
        series = data[col]

        # rule-of-thumb significance threshold and PACF values for lags 0..nlags
        pacf_sig = 2/np.sqrt(len(series))
        pacf_results = pacf(series, nlags=nlags) # use the caller's nlags, not a fixed 20
        pacf_df = pd.DataFrame({"sig": pacf_results})

        # absolute change from the previous lag; used as the sort key
        pacf_df["sig_diff"] = pacf_df["sig"].diff().abs()

        # keep only the significant lags
        pacf_df = pacf_df[pacf_df["sig"].abs() > pacf_sig]

        # largest lag-to-lag change first; reassigning avoids an in-place sort on a filtered frame
        pacf_df = pacf_df.sort_values(by="sig_diff", ascending=False)

        # the index label of the top row is the lag number
        lag_list.append(pacf_df.index[0])

    # return a dict of column name -> selected lag
    return dict(zip(columns, lag_list))

In [60]:
# Demo: PACF-based lag selection for the default "close" column
get_pacf_lag(data_df)

{'close': 2}