In [104]:
import datetime

import numpy as np
import pandas as pd

import yfinance as yf

import plotly.express as px

import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller, kpss, grangercausalitytests, coint
from statsmodels.stats.stattools import durbin_watson

import warnings
warnings.filterwarnings('ignore')

### Artificial Historical Series

We'll use some artificially generated series first as they are much cleaner and easier to work with. In general when learning or developing a new technique, use simulated data to provide a clean environment. Simulated data also allows you to control the level of noise and difficulty level for your model. a. Cointegration Test I

#### Test 1: Positive Cointegration

Determine whether the following two artificial series A and B are cointegrated using the **coint()** function and a reasonable confidence level.

In [55]:
A_returns = np.random.normal(0, 1, 100)
A = pd.Series(np.cumsum(A_returns), name='X') + 50

some_noise = np.random.exponential(1, 100)
 
B = A - 7 + some_noise # (Linear Function)

In [67]:
alpha = .05

score, p_value, _ = coint(A,B)

if p_value < alpha:
    print ("Asset A and Asset B ARE cointegrated")
    print (p_value)
else:
    print ("Asset A and Asset B AREN'T cointegrated")
    print (p_value)
    
A.name = "Asset A"
B.name = "Asset B"
fake_data = pd.concat([A, B], axis=1)

Asset A and Asset B ARE cointegrated
1.8248846276754193e-18


In [68]:
fig = px.line(fake_data, facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

#### Test 2: Negative Cointegration

Determine whether the following two artificial series C and D are cointegrated using the coint() function and a reasonable confidence level.

In [62]:
C_returns = np.random.normal(1, 1, 100) 
C = pd.Series(np.cumsum(C_returns), name='X') + 100

D_returns = np.random.normal(2, 1, 100)
D = pd.Series(np.cumsum(D_returns), name='X') + 100

In [65]:
alpha = .05

score, p_value, _ = coint(C,D)

if p_value < alpha:
    print ("Asset C and Asset D ARE cointegrated")
    print (p_value)
else:
    print ("Asset C and Asset D AREN'T cointegrated")
    print (p_value)

C.name = "Asset C"
D.name = "Asset D"
fake_data = pd.concat([C, D], axis=1)

Asset C and Asset D AREN'T cointegrated
0.17854737392973746


In [66]:
fig = px.line(fake_data, facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

### Let's test on some real data

Adjusted close stock price for PETR4, VALE3 and PRIO3.

In [71]:
start_date = datetime.datetime(2020,1,1)
end_date   = datetime.datetime.now()

df_petr4 = yf.download(tickers=['PETR4.SA'], start=start_date, end = end_date, rounding=True)['Adj Close']
df_vale3 = yf.download(tickers=['VALE3.SA'], start=start_date, end = end_date, rounding=True)['Adj Close']
df_prio3 = yf.download(tickers=['PRIO3.SA'], start=start_date, end = end_date, rounding=True)['Adj Close']

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [72]:
alpha = .05

score, p_value, _ = coint(df_petr4, df_vale3)

if p_value < alpha:
    print ("PETR4 and VALE3 ARE cointegrated")
    print (p_value)
else:
    print ("PETR4 and VALE3 AREN'T cointegrated")
    print (p_value)

df_petr4.name = "PETRA4"
df_vale3.name = "VALE3"
petra4_and_vale3 = pd.concat([df_petr4, df_vale3], axis=1)

PETR4 and VALE3 AREN'T cointegrated
1.0


In [73]:
fig = px.line(petra4_and_vale3, facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

In [75]:
alpha = .05

score, p_value, _ = coint(df_prio3, df_petr4)

if p_value < alpha:
    print ("PETR4 and PRIO3 ARE cointegrated")
    print (p_value)
else:
    print ("PETR4 and PRIO3 AREN'T cointegrated")
    print (p_value)

df_petr4.name = "PETRA4"
df_prio3.name = "PRIO3"
petra4_and_prio3 = pd.concat([df_petr4, df_prio3], axis=1)

PETR4 and PRIO3 AREN'T cointegrated
0.41063836389870445


In [76]:
fig = px.line(petra4_and_vale3, facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

Unfortunately none of assets choosed by us are cointegrated, wha we can do now? 

### Searching for Cointegrated Pairs

Let's define a **find_cointegrated_pairs** function, to help us to find any cointegrated pairs among a set assets.

In [90]:
# Useful Functions
def find_cointegrated_pairs(data):
    
    n = data.shape[1]
    score_matrix = np.zeros((n, n))
    p_value_matrix = np.ones((n, n))
    keys = data.keys()
    pairs = []
    
    for i in range(n):
        for j in range(i+1, n):

            asset_1 = data[keys[i]]
            asset_2 = data[keys[j]]

            result = coint(asset_1, asset_2)

            score = result[0]
            p_value = result[1]
            score_matrix[i, j] = score
            p_value_matrix[i, j] = p_value

            if p_value < 0.05:
                pairs.append((keys[i], keys[j]))

    return score_matrix, p_value_matrix, pairs

In [124]:
stocks = ['VALE3','GGBR4','CPLE6','BBSE3','ITUB4', 'WEGE3','JBSS3','PRIO3', 'MULT3', 'ITSA4']

tickers = (pd.Series(stocks)+'.SA').tolist()

data = yf.download(tickers=tickers, start=start_date, end = end_date, rounding=True)['Adj Close']

# symbol_list = ['MTRN', 'CMP', 'TRQ', 'SCCO', 'HCLP','SPY']
# prices_df = get_pricing(symbol_list, fields=['price']
#                                , start_date='2015-01-01', end_date='2016-01-01')['price']
# prices_df.columns = map(lambda x: x.symbol, prices_df.columns)

[*********************100%%**********************]  10 of 10 completed


In [125]:
scores, pvalues, pairs = find_cointegrated_pairs(data)

In [126]:
fig = px.imshow(pvalues, 
                labels=dict(x="Assets Correlation", y="Assets", color="p-value"),
                x=stocks, y=stocks,
                aspect="auto", text_auto=True)
fig.show()

### Model Validation

In [127]:
asset1  = data['PRIO3.SA']
asset2 = data['JBSS3.SA']

In [131]:
alpha = .05

score, p_value, _ = coint(asset1, asset2)

if p_value < alpha:
    print ("JBSS3 and PRIO3 ARE cointegrated")
    print (p_value)
else:
    print ("JBSS3 and PRIO3 AREN'T cointegrated")
    print (p_value)

asset2.name = "JBSS3"
asset1.name = "PRIO3"
jbss3_and_prio3 = pd.concat([asset2, asset1], axis=1)

JBSS3 and PRIO3 AREN'T cointegrated
0.9504702112382228


In [135]:
fig = px.line(jbss3_and_prio3, facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

In [110]:
asset1 = sm.add_constant(asset1)
results = sm.OLS(asset2, asset1).fit()

In [111]:
results.params

const       22.420862
PRIO3.SA     0.022851
dtype: float64

In [112]:
asset1 = sm.add_constant(asset1)
results = sm.OLS(asset2, asset1).fit()
b = results.params['PRIO3.SA']
asset1 = asset1['PRIO3.SA']

print (b)
spread = asset2 - b * asset1
print (f"p-value for in-sample stationarity: {adfuller(spread)[1]}")



0.022851490863802462
p-value for in-sample stationarity: 0.6748769124117779


The p-value is less than **0.05** so we conclude that this spread calculation is stationary in sample


In [116]:
fig = px.line(spread, facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.add_hline(y=spread.mean())
fig.show()

### A more deepest analysis

In [2]:
start_date = datetime.datetime(2020,1,1)
end_date   = datetime.datetime.now()

# df_apple = yf.download(tickers=['AAPL'], start=start_date, end = end_date, rounding=True)['Adj Close']
# df_walmart = yf.download(tickers=['WMT'], start=start_date, end = end_date, rounding=True)['Adj Close']
# df_tesla = yf.download(tickers=['TSLA'], start=start_date, end = end_date, rounding=True)['Adj Close']

df_petr4 = yf.download(tickers=['PETR4.SA'], start=start_date, end = end_date, rounding=True)['Adj Close']
df_vale3 = yf.download(tickers=['VALE3.SA'], start=start_date, end = end_date, rounding=True)['Adj Close']
df_prio3 = yf.download(tickers=['PRIO3.SA'], start=start_date, end = end_date, rounding=True)['Adj Close']

# df_apple = pd.read_csv('data/AAPL.csv')
# df_walmart = pd.read_csv('data/WMT.csv')
# df_tesla = pd.read_csv('data/TSLA.csv')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [3]:
df_petr4.shape, df_vale3.shape, df_prio3.shape

((1000,), (1000,), (1000,))

In [4]:
data = pd.concat([df_petr4, df_vale3, df_prio3], axis=1)
data.columns = ['petrobras', 'vale', 'petrorio']
data

Unnamed: 0_level_0,petrobras,vale,petrorio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-02,11.30,38.04,7.14
2020-01-03,11.21,37.76,7.56
2020-01-06,11.34,37.54,7.71
2020-01-07,11.30,37.81,7.72
2020-01-08,11.23,37.82,7.56
...,...,...,...
2024-01-04,38.63,75.62,46.42
2024-01-05,38.72,74.65,46.08
2024-01-08,38.43,74.27,45.75
2024-01-09,38.10,73.33,46.28


In [5]:
data.columns

Index(['petrobras', 'vale', 'petrorio'], dtype='object')

In [6]:
data.isnull().sum()

petrobras    0
vale         0
petrorio     0
dtype: int64

## Visualize the Time Series

In [7]:
fig = px.line(data, facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

In [8]:
fig = px.area(data, facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

Time series data preparation process has several steps we consider as data mining process.

Cross-check ADF test and KPSS test.

## ADF Test for Stationarity (Unitary Root)

***Null hypothesis (H0)***: If failed to be rejected, it suggests the time series is not stationarity

***Alternative hypothesis(H1)***: The null hypothesis is rejected, it suggests the time series is stationary

In [9]:
n_obs = 20
train, test = data[0:-n_obs], data[-n_obs:]

In [10]:
train.shape, test.shape

((980, 3), (20, 3))

In [11]:
def adf_test(df):
    result = adfuller(df.values)
    print('ADF Statistics: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))

In [12]:
print('ADF Test: Petrobras time series')
adf_test(train['petrobras'])
print()
print('ADF Test: Vale time series')
adf_test(train['vale'])
print()
print('ADF Test: Petrorio time series')
adf_test(train['petrorio'])

ADF Test: Petrobras time series
ADF Statistics: 0.871003
p-value: 0.992700
Critical values:
	1%: -3.437
	5%: -2.864
	10%: -2.568

ADF Test: Vale time series
ADF Statistics: -1.810292
p-value: 0.375364
Critical values:
	1%: -3.437
	5%: -2.864
	10%: -2.568

ADF Test: Petrorio time series
ADF Statistics: -0.595281
p-value: 0.872045
Critical values:
	1%: -3.437
	5%: -2.864
	10%: -2.568


The p-values are all well above the 0.05 alpha level, we fail in reject the null hypothesis. So the three time series are NOT stationary.

## KPSS test for Stationary

***Null hypothesis (H0)***: The time series is stationarity

***Alternative hypothesis (H1)***: The time series is non-stationary

In [13]:
def kpss_test(df):    
    statistic, p_value, n_lags, critical_values = kpss(df.values)
    
    print(f'KPSS Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print(f'num lags: {n_lags}')
    print('Critial Values:')
    for key, value in critical_values.items():
        print(f'   {key} : {value}')

In [14]:
print('KPSS Test: Petrobras time series')
kpss_test(train['petrobras'])
print()
print('KPSS Test: Vale time series')
kpss_test(train['vale'])
print()
print('KPSS Test: Petrorio time series')
kpss_test(train['petrorio'])

KPSS Test: Petrobras time series
KPSS Statistic: 4.25131132656736
p-value: 0.01
num lags: 19
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739

KPSS Test: Vale time series
KPSS Statistic: 2.1292123837158994
p-value: 0.01
num lags: 19
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739

KPSS Test: Petrorio time series
KPSS Statistic: 4.630115259336143
p-value: 0.01
num lags: 19
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739


The p-value are all less than 0.05 alpha level, we can reject the null hypothesis and derive that the three time series are NOT stationary.

After cross-check ADF test and KPSS test. We can conclude that the three time series data we have here are not stationary. We will transform the time series to be stationary by difference method.

In [15]:
df_train_transformed = train.diff().dropna()

In [16]:
fig = px.line(df_train_transformed, facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

In [17]:
print('ADF Test: Petrobras time series transformed')
adf_test(df_train_transformed['petrobras'])
print()
print('ADF Test: Vale time series transformed')
adf_test(df_train_transformed['vale'])
print()
print('ADF Test: Petrorio time series transformed')
adf_test(df_train_transformed['petrorio'])

ADF Test: Petrobras time series transformed
ADF Statistics: -31.569718
p-value: 0.000000
Critical values:
	1%: -3.437
	5%: -2.864
	10%: -2.568

ADF Test: Vale time series transformed
ADF Statistics: -30.826968
p-value: 0.000000
Critical values:
	1%: -3.437
	5%: -2.864
	10%: -2.568

ADF Test: Petrorio time series transformed
ADF Statistics: -30.212569
p-value: 0.000000
Critical values:
	1%: -3.437
	5%: -2.864
	10%: -2.568


After transform the data, the p-values are all well below the 0.05 alpha level, we reject the null hypothesis. So the current data is stationary. Let's check with KPSS test as well.

In [18]:
print('KPSS Test: Patrobras time series transformed')
kpss_test(df_train_transformed['petrobras'])
print()
print('KPSS Test: Vale time series transformed')
kpss_test(df_train_transformed['vale'])
print()
print('KPSS Test: Petrorio time series transformed')
kpss_test(df_train_transformed['petrorio'])

KPSS Test: Patrobras time series transformed
KPSS Statistic: 0.38881978851881965
p-value: 0.08197422908671567
num lags: 4
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739

KPSS Test: Vale time series transformed
KPSS Statistic: 0.12558110358543592
p-value: 0.1
num lags: 3
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739

KPSS Test: Petrorio time series transformed
KPSS Statistic: 0.03929836741570954
p-value: 0.1
num lags: 6
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739


Some of the KPSS Null Hypothesis could not be rejected.

## VAR model

The VAR class assumes that the passed time series are stationary. Non-stationary or trending data can often be transformed to be stationary by first-differencing or some other method.

Decide the Order (_p_) of VAR model : For our data decide on which lag is better to fit model. Choosed lowest AIC and other metrics.

In [19]:
model = VAR(df_train_transformed)

for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]:
    result = model.fit(i)
    print('Lag Order =', i)
    print('AIC : ', result.aic)
    print('BIC : ', result.bic)
    print('FPE : ', result.fpe)
    print('HQIC: ', result.hqic, '\n')

Lag Order = 1
AIC :  -1.926077363529201
BIC :  -1.8661324596023936
FPE :  0.14571870020492736
HQIC:  -1.9032695314901908 

Lag Order = 2
AIC :  -1.9095512026207484
BIC :  -1.8045622367119474
FPE :  0.14814696858012594
HQIC:  -1.869603030767671 

Lag Order = 3
AIC :  -1.8955290610600608
BIC :  -1.7454226291006418
FPE :  0.15023915171676747
HQIC:  -1.83841091711028 

Lag Order = 4
AIC :  -1.8801842110429718
BIC :  -1.684886712203058
FPE :  0.15256272289892905
HQIC:  -1.8058663812208189 

Lag Order = 5
AIC :  -1.8713665706124634
BIC :  -1.6308042065725847
FPE :  0.15391454957209952
HQIC:  -1.7798192593432638 

Lag Order = 6
AIC :  -1.8555076361711018
BIC :  -1.5696064103904213
FPE :  0.15637587091833188
HQIC:  -1.7467009657737496 

Lag Order = 7
AIC :  -1.8468541236424103
BIC :  -1.5155398406234917
FPE :  0.15773625283932566
HQIC:  -1.7207581340204756 

Lag Order = 8
AIC :  -1.8367401413260047
BIC :  -1.4599384058761082
FPE :  0.15934143433595727
HQIC:  -1.6933247896573718 

Lag Order = 9

There is no hard-and-fast-rule on the choice of lag order. It is basically an empirical issue. However, it is often advised to use the AIC in selecting the lag order with the smallest value. Here we will select lag order = 15.

In [20]:
model.select_order(15)

<statsmodels.tsa.vector_ar.var_model.LagOrderResults at 0x2259a32b0d0>

Select the lowest AIC lag.

In [21]:
results = model.fit(maxlags=15, ic='aic')
results.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Thu, 11, Jan, 2024
Time:                     02:46:58
--------------------------------------------------------------------
No. of Equations:         3.00000    BIC:                   -1.92536
Nobs:                     979.000    HQIC:                  -1.93464
Log likelihood:          -3214.63    FPE:                   0.143656
AIC:                     -1.94034    Det(Omega_mle):        0.143216
--------------------------------------------------------------------
Results for equation petrobras
           coefficient       std. error           t-stat            prob
------------------------------------------------------------------------
const         0.023687         0.013294            1.782           0.075

Results for equation vale
           coefficient       std. error           t-stat            prob
-------------------------------------------------------------

The biggest correlation is 0.50 (Apple & Tesla).

## Check the Durbin-Watson statistic

Serial correlation of residuals is used to check if there is any leftover pattern in the residuals (errors). If there is any correlation left in the residuals, then, there is some pattern in the time series that is still left to be explained by the model. In that case, the typical course of action is to either increase the order of the model or induce more predictors into the system or look for a different algorithm to model the time series.

The null hypothesis of the test is that there is no serial correlation in the residuals. The Durbin-Watson test statistic is defined as:

The test statistic is approximately equal to 2*(1-r) where **r** is the sample autocorrelation of the residuals. Thus, for **r == 0**, indicating no serial correlation, the test statistic equals 2. This statistic will always be between **0** and **4**. The closer to **0** the statistic, the more evidence for positive serial correlation. The closer to **4**, the more evidence for negative serial correlation.

\begin{equation}
    \sum_{t=2}^{T}((e_{t}-e_{t-1})^2)/\sum_{t=1}^{T}e_{t}^2 \\
\end{equation}

In [22]:
out = durbin_watson(results.resid)

for col, val in zip(data.columns, out):
    print(col, ':', round(val, 2))

petrobras : 2.02
vale : 1.97
petrorio : 1.93


A value of 2.0 (or around it) means that there is no autocorrelation detected in the residuals.

## Granger Causality Test

In [23]:
maxlag=15
test = 'ssr_chi2test'

def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False):    
   
    df = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    
    for c in df.columns:
        for r in df.index:
            test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=True)
            p_values = [round(test_result[i+1][0][test][1],4) for i in range(maxlag)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            min_p_value = np.min(p_values)
            df.loc[r, c] = min_p_value
    df.columns = [var + '_x' for var in variables]
    df.index = [var + '_y' for var in variables]
    return df

grangers_causation_matrix(df_train_transformed, variables = df_train_transformed.columns)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=-0.0000 , p=1.0000  , df_denom=976, df_num=1
ssr based chi2 test:   chi2=-0.0000 , p=1.0000  , df=1
likelihood ratio test: chi2=-0.0000 , p=1.0000  , df=1
parameter F test:         F=0.1814  , p=0.6702  , df_denom=976, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.0000  , p=1.0000  , df_denom=974, df_num=2
ssr based chi2 test:   chi2=0.0000  , p=1.0000  , df=2
likelihood ratio test: chi2=-0.0000 , p=1.0000  , df=2
parameter F test:         F=0.2547  , p=0.7752  , df_denom=974, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.0000  , p=1.0000  , df_denom=972, df_num=3
ssr based chi2 test:   chi2=0.0000  , p=1.0000  , df=3
likelihood ratio test: chi2=-0.0000 , p=1.0000  , df=3
parameter F test:         F=0.9190  , p=0.4310  , df_denom=972, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.0000  , p=1.0000  

Unnamed: 0,petrobras_x,vale_x,petrorio_x
petrobras_y,1.0,0.1867,0.3893
vale_y,0.0497,1.0,0.8899
petrorio_y,0.3279,0.1668,1.0


The row are the Response (Y) and the columns are the predictor series (X). If a given _p-value_ is < **significance level** (0.05), for example, when we take the value 0.03 in (row 2, column 3), we can reject the null hypothesis and conclude that **tesla_x** Granger causing **walmart_y**. 
<!-- Likewise, the 0.0 in (row 2, column 1) refers to walmart_y Granger causing apple_x. -->

<!-- All the time series in the above data are interchangeably Granger causing each other. -->

In [24]:
lag_order = result.k_ar
print(lag_order)

15


## Forecast

In [40]:
df_train_transformed.values[-lag_order:]

array([[ 0.03,  1.8 ,  1.04],
       [-0.37,  1.82,  0.45],
       [ 0.15, -2.28, -0.74],
       [ 0.01, -0.47,  0.  ],
       [ 0.1 , -0.63, -0.03],
       [-0.2 , -0.44, -0.98],
       [ 0.53,  0.16, -0.13],
       [-0.37, -0.02, -0.18],
       [ 0.68,  0.4 ,  0.02],
       [-0.24,  1.37, -1.36],
       [-0.76, -1.69, -0.35],
       [-0.16, -0.68, -1.06],
       [-1.25, -0.48, -1.72],
       [-0.08,  0.23,  0.98],
       [ 1.07,  0.2 ,  2.13]])

In [41]:
df_input = df_train_transformed.values[-lag_order:]
df_forecast = results.forecast(y=df_input, steps=15)
df_forecast = (pd.DataFrame(df_forecast, index=test.index, columns=test.columns + '_pred'))

IndexError: index 0 is out of bounds for axis 0 with size 0

In [31]:
def invert_transformation(df, pred):
    forecast = df_forecast.copy()
    columns = df.columns
    for col in columns:
        forecast[str(col)+'_pred'] = df[col].iloc[-1] + forecast[str(col)+'_pred'].cumsum()
    return forecast
output = invert_transformation(df_train, df_forecast)

In [32]:
output

company,apple_pred,walmart_pred,tesla_pred
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-11-20,119.526205,151.759086,509.314704
2020-11-23,120.218499,153.136059,516.392665
2020-11-24,119.914257,153.234075,510.919619
2020-11-25,118.250224,152.394891,498.043869
2020-11-27,117.696289,152.065309,494.52758
2020-11-30,118.152242,152.838528,489.821496
2020-12-01,118.581646,153.648201,485.367727
2020-12-02,118.872377,153.622405,486.124842
2020-12-03,120.841659,154.285421,497.479445
2020-12-04,121.548198,155.107928,504.946124


In [33]:
combined = pd.concat([output['apple_pred'], df_test['apple'], output['walmart_pred'], df_test['walmart'], output['tesla_pred'], df_test['tesla']], axis=1)

In [34]:
combined

Unnamed: 0_level_0,apple_pred,apple,walmart_pred,walmart,tesla_pred,tesla
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-11-20,119.526205,117.339996,151.759086,149.692825,509.314704,489.609985
2020-11-23,120.218499,113.849998,153.136059,150.380295,516.392665,521.849976
2020-11-24,119.914257,115.169998,153.234075,150.808746,510.919619,555.380005
2020-11-25,118.250224,116.029999,152.394891,151.277039,498.043869,574.0
2020-11-27,117.696289,116.589996,152.065309,151.047882,494.52758,585.76001
2020-11-30,118.152242,119.050003,152.838528,152.233536,489.821496,567.599976
2020-12-01,118.581646,122.720001,153.648201,152.084076,485.367727,584.76001
2020-12-02,118.872377,123.080002,153.622405,149.971802,486.124842,568.820007
2020-12-03,120.841659,122.940002,154.285421,148.756256,497.479445,593.380005
2020-12-04,121.548198,122.25,155.107928,148.367676,504.946124,599.039978


In [35]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


rmse = mean_squared_error(combined['apple_pred'], combined['apple'], squared=False)
mae = mean_absolute_error(combined['apple_pred'], combined['apple'])

print('Forecast accuracy of Apple')
print('RMSE: ', round(rmse,2))
print('MAE: ', round(mae,2))

Forecast accuracy of Apple
RMSE:  7.31
MAE:  6.0


In [36]:
rmse = mean_squared_error(combined['walmart_pred'], combined['walmart'], squared=False)
mae = mean_absolute_error(combined['walmart_pred'], combined['walmart'])

print('Forecast accuracy of Walmart')
print('RMSE: ', round(rmse,2))
print('MAE: ', round(mae,2))

Forecast accuracy of Walmart
RMSE:  5.05
MAE:  4.52


In [37]:
rmse = mean_squared_error(combined['tesla_pred'], combined['tesla'], squared=False)
mae = mean_absolute_error(combined['tesla_pred'], combined['tesla'])

print('Forecast accuracy of Tesla')
print('RMSE: ', round(rmse,2))
print('MAE: ', round(mae,2))

Forecast accuracy of Tesla
RMSE:  125.16
MAE:  113.18
