In [None]:
# https://www.kdnuggets.com/2020/01/predict-electricity-consumption-time-series-analysis.html
# Download the data

In [None]:
import pandas as pd
import pandas_datareader.data as data

# Date range (inclusive) requested from the data source for every series.
start_date = '2001-01-01'
end_date = '2019-12-31'

# Series to download. The dict key becomes the column name of the series'
# closing price in the combined frame; 'ticker' is the symbol requested from
# the data source.
predictors = {
    'sp500': {
        'ticker': '^GSPC',
    },
    'gdax': {
        'ticker': '^GDAXI',
    },
    'nikkei': {
        'ticker': '^N225',
    },
    'gold': {
        'ticker': 'GLD',
    },
    '10y_treasury': {
        'ticker': '^TNX',
    },
    # Apple's symbol is 'AAPL'; the previous value 'APPL' was a typo.
    # The key is deliberately left as 'APPL' so the resulting column name
    # does not change for anything that already reads it.
    'APPL': {
        'ticker': 'AAPL',
    },
}

# Fetch each series and store the raw frame next to its ticker.
for predictor, spec in predictors.items():
    spec['data'] = data.DataReader(spec['ticker'], 'yahoo', start_date, end_date)
    

In [None]:
# Columns of each downloaded frame that are discarded; only "Close" is kept.
drop_cols = ['High','Low','Open','Adj Close','Volume']

# Combined frame: one column per predictor, outer-joined on the index so
# that a row is kept when at least one series has a value for it.
df = None

for predictor, entry in predictors.items():
    # Rename "Close" to the predictor's key (errors="raise" fails loudly if
    # the column is missing), then strip the remaining columns.
    close_only = (
        entry['data']
        .rename(columns={"Close": predictor}, errors="raise")
        .drop(columns=drop_cols)
    )
    entry['data_mod'] = close_only
    df = close_only if df is None else df.join(close_only, how='outer')

df.tail(10)

In [None]:
# Persist the combined close-price frame so later cells can reload it from
# disk instead of downloading again. The path is relative to the notebook's
# working directory; the ../data directory must already exist.
df.to_csv('../data/stock.csv')

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Price columns are loaded as float32; columns not listed here keep the
# dtype pandas infers.
col_dtypes = {
    'sp500': np.float32,
    'gdax': np.float32,
    'nikkei': np.float32,
    'gold': np.float32,
    '10y_treasury': np.float32,
}

# Format of the 'Date' column in the CSV.
DATE_FORMAT = '%Y-%m-%d'

# No longer passed to read_csv (its `date_parser` argument is deprecated in
# recent pandas); kept defined in case another cell still calls it.
dateparse = lambda x: datetime.strptime(x, DATE_FORMAT)

df = pd.read_csv('../data/stock.csv', dtype=col_dtypes)
# Vectorised, strict conversion: raises if a value does not match DATE_FORMAT,
# like the per-row strptime parser did.
df['Date'] = pd.to_datetime(df['Date'], format=DATE_FORMAT)
df.tail(10)

In [None]:
# Discard every row in which at least one column is missing, so that all
# remaining rows have a value for every series; then summarise the frame.
df.dropna(axis=0, how='any', inplace=True)
df.info(memory_usage='deep')

In [None]:
import plotly.express as px
# Line chart of the sp500 column against Date.
fig = px.line(data_frame=df, x='Date', y='sp500')
fig.show()

In [None]:
# Working copy used by the transforms in the following cells.
# NOTE: no scaling is applied at the moment. The StandardScaler and log
# variants below are commented out, so df_scaled holds the same values as df.
#from sklearn.preprocessing import StandardScaler
df_scaled = df.copy()
#df_scaled.iloc[:,1:] = StandardScaler().fit_transform(df_scaled.iloc[:,1:])
#df_scaled.iloc[:,1:] = np.log(df_scaled.iloc[:,1:])
#df_scaled.head()

In [None]:
import plotly.graph_objects as go

# Trace drawing mode shared by the plotting cells and helpers below.
mode = 'lines'
#mode = 'lines+markers'

# One trace per series, all against the same Date axis, added in this order.
fig = go.Figure()
for series_name in ('sp500', 'gdax', 'gold', 'nikkei', '10y_treasury'):
    fig.add_trace(
        go.Scatter(
            x=df_scaled['Date'],
            y=df_scaled[series_name],
            mode=mode,
            name=series_name,
        )
    )
fig.show()

In [None]:
#import matplotlib.pyplot as plt
#plt.style.use('fivethirtyeight')
#from pylab import rcParams
#rcParams['figure.figsize'] = 15, 10

from statsmodels.tsa.stattools import adfuller
def graph_stationarity(timeseries, column, window=100):
    """Plot a column together with its rolling mean and rolling std.

    Three traces are drawn on one plotly figure: the raw values of
    ``timeseries[column]``, its rolling mean and its rolling standard
    deviation. No x values are passed, so the traces are plotted against
    their position in the series.

    Parameters
    ----------
    timeseries : DataFrame-like
        Object indexable by ``column`` whose result supports ``.rolling()``.
    column : str
        Name of the column to plot; also used as the name of the raw trace.
    window : int, optional
        Number of consecutive rows in each rolling window (default 100,
        the value that was previously hard-coded).

    Notes
    -----
    Reads the module-level ``mode`` for the trace drawing mode and uses the
    module-level ``go`` (plotly.graph_objects). Shows the figure and returns
    nothing.
    """
    series = timeseries[column]
    rolmean = series.rolling(window).mean()
    rolstd = series.rolling(window).std()

    fig = go.Figure()
    for values, label in (
        (series, column),
        (rolmean, 'rollingmean'),
        (rolstd, 'rollingstd'),
    ):
        fig.add_trace(go.Scatter(y=values, mode=mode, name=label))
    fig.show()

def dickey_fuller(timeseries, column):
    """Run the Dickey-Fuller test on one column and print a labelled summary.

    ``adfuller`` is called on ``timeseries[column]`` with ``autolag='AIC'``.
    Its first four results are labelled and, together with one entry per
    critical value, printed as a single pandas Series. Nothing is returned.
    """
    print("Results of Dickey Fuller test")
    result = adfuller(timeseries[column], autolag='AIC')

    # The raw result is an unlabelled sequence; attach names to the leading
    # four entries so the printed output is self-explanatory.
    labels = ['Test Statistics','p-value','No. of lags used','Number of observations used']
    output = pd.Series(result[0:4], index=labels)

    # The fifth entry maps a significance level to its critical value.
    for level, threshold in result[4].items():
        output['critical value ({})'.format(level)] = threshold

    print(output)
    
# Baseline: rolling statistics and Dickey-Fuller test on the untransformed
# sp500 series.
graph_stationarity(timeseries=df, column='sp500')
dickey_fuller(timeseries=df, column='sp500')

In [None]:
# Make the data stationary, step 1: compute a 100-row rolling mean and
# rolling standard deviation of every price column.
# (No log transform is applied here: df_scaled is a plain copy of df.)
#
# 'Date' is dropped before rolling. A rolling mean/std over a datetime
# column is not meaningful; older pandas silently discarded such columns
# while newer releases raise instead. Dropping it explicitly gives the same
# result on both.
price_cols = df_scaled.drop(columns=['Date'])

moving_avg = price_cols.rolling(100).mean()
std_dev = price_cols.rolling(100).std()

moving_avg.tail(14)

In [None]:
# Subtract the rolling mean from the series at every point, then discard
# the rows that contain missing values.
df_log_moving_avg_diff = (
    df_scaled
    .drop(columns=['Date'])
    .sub(moving_avg)
    .dropna()
)

In [None]:
# Line chart of sp500 after subtracting its rolling mean.
fig = px.line(data_frame=df_log_moving_avg_diff, y='sp500')
fig.show()

In [None]:
# Re-check stationarity on the mean-subtracted sp500 series.
graph_stationarity(timeseries=df_log_moving_avg_diff, column='sp500')
dickey_fuller(timeseries=df_log_moving_avg_diff, column='sp500')

In [None]:
# Exponentially weighted mean of every price column (half-life of 365 rows).
# 'Date' is dropped first: an exponentially weighted mean of a datetime
# column is not meaningful, and newer pandas raises on it instead of
# silently discarding the column.
weighted_average = (
    df_scaled
    .drop(columns=['Date'])
    .ewm(halflife=365, min_periods=0, adjust=True)
    .mean()
)

In [None]:
# Subtract the exponentially weighted mean from every price column and
# check the result for stationarity.
logScale_weightedMean = df_scaled.drop(['Date'], axis=1)-weighted_average
graph_stationarity(logScale_weightedMean, 'sp500')
# Test the series that was just computed and plotted. This previously
# re-tested df_log_moving_avg_diff, so the printed statistics did not belong
# to the plot shown in this cell.
dickey_fuller(logScale_weightedMean, 'sp500')

In [None]:
# Alternative approach: first-order differencing. Each row has the previous
# row subtracted from it; rows with missing values (the first row has no
# predecessor) are then discarded.
df_scaled_diff = df_scaled.sub(df_scaled.shift(periods=1)).dropna()

graph_stationarity(timeseries=df_scaled_diff, column='sp500')
dickey_fuller(timeseries=df_scaled_diff, column='sp500')