- [Johansen Cointegration Test: Learn How to Implement it in Python](https://blog.quantinsti.com/johansen-test-cointegration-building-stationary-portfolio/)
- [Unveiling Cointegration: Johansen Test Explained with Python Examples](https://medium.com/@cemalozturk/unveiling-cointegration-johansen-test-explained-with-python-examples-db8385219f1f)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import statsmodels.api as sm

from itertools import combinations, permutations
from time import sleep
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.vector_ar.vecm import coint_johansen

In [2]:
# Load the price table: the file has no header row, so the 50 columns are
# labelled stock0 .. stock49 here.
column_names = [f"stock{i}" for i in range(50)]
df = pd.read_csv(
    'prices.txt',
    sep='   ',
    header=None,
    names=column_names,
    engine='python',
)
stocks = df.columns

print(stocks)
df['stock0'].name

Index(['stock0', 'stock1', 'stock2', 'stock3', 'stock4', 'stock5', 'stock6',
       'stock7', 'stock8', 'stock9', 'stock10', 'stock11', 'stock12',
       'stock13', 'stock14', 'stock15', 'stock16', 'stock17', 'stock18',
       'stock19', 'stock20', 'stock21', 'stock22', 'stock23', 'stock24',
       'stock25', 'stock26', 'stock27', 'stock28', 'stock29', 'stock30',
       'stock31', 'stock32', 'stock33', 'stock34', 'stock35', 'stock36',
       'stock37', 'stock38', 'stock39', 'stock40', 'stock41', 'stock42',
       'stock43', 'stock44', 'stock45', 'stock46', 'stock47', 'stock48',
       'stock49'],
      dtype='object')


'stock0'

In [3]:
# Make plot directories.
# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race of testing os.path.isdir() before calling os.makedirs().
for dirname in ['acf-pacf/', 'adfuller/']:
    os.makedirs(dirname, exist_ok=True)

#### Autocorrelation + Partial Autocorrelation

In [4]:
def acf_pacf_plot(data, lags=20):
    """Save side-by-side ACF and PACF plots of one series.

    Args:
        data: Series-like object exposing ``.values`` and ``.name`` (called
            with a single DataFrame column). ``.name`` becomes the file name.
        lags: Number of lags passed to both plots. Defaults to 20.

    The figure is written to ``acf-pacf/<data.name>.png`` and then closed.
    """
    fig, (acf_ax, pacf_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
    values = data.values.squeeze()

    # Left panel: autocorrelation; right panel: partial autocorrelation
    sm.graphics.tsa.plot_acf(values, ax=acf_ax, lags=lags)
    sm.graphics.tsa.plot_pacf(values, ax=pacf_ax, lags=lags)

    out_path = f"acf-pacf/{data.name}.png"
    fig.savefig(out_path)
    plt.close(fig)

# Render and save the ACF/PACF figure for every stock column
for stock in stocks:
    series = df[stock]
    acf_pacf_plot(series)

#### Augmented Dickey-Fuller Test

In [5]:
def adf_test(data, y_str, x_str, significance=0.1, plot_threshold=90):
    """Check a pair of series for cointegration with an ADF test on their spread.

    The hedge ratio is estimated by an OLS regression of ``data[y_str]`` on
    ``data[x_str]`` (no intercept column is added), the spread
    ``Y - hedge_ratio * X`` is formed, and an Augmented Dickey-Fuller test
    is run on that spread. If the pair qualifies and its confidence level is
    at least ``plot_threshold``, a figure with the spread and both price
    series is saved to ``adfuller/<y_str>-<x_str>``.

    Args:
        data: Table indexable by column name (called with a two-column
            DataFrame) holding both series.
        y_str: Column name of the dependent series.
        x_str: Column name of the regressor series.
        significance: Maximum ADF p-value for the pair to qualify at all.
            Defaults to 0.1.
        plot_threshold: Minimum confidence level (90, 95 or 99) at which the
            spread plot is saved. Defaults to 90.

    Returns:
        tuple: ``(confidence, p_val, lag)`` where ``confidence`` is the
        highest of 99, 95 or 90 whose critical value the test statistic
        reaches (``None`` if the pair does not qualify), ``p_val`` is the
        ADF p-value and ``lag`` is the number of lags used by the test.
    """
    Y, X = data[y_str], data[x_str]

    # Hedge ratio: slope of the regression of Y on X (single regressor)
    hedge_ratio = sm.OLS(Y, X).fit().params.iloc[0]

    # Spread
    spread = Y - hedge_ratio * X

    # Compute ADF test statistics
    # NOTE(review): these critical values are for a directly observed series;
    # for a spread built from an estimated hedge ratio, consider
    # statsmodels.tsa.stattools.coint (Engle-Granger) — TODO confirm.
    adf = adfuller(spread, regression='c', autolag='AIC')
    t_stat = adf[0]
    p_val = adf[1]
    lag = adf[2]
    crit_values = adf[4]

    # Check co-integration, strictest level first. The 1% critical value is
    # the most negative, so testing the 10% value first would report every
    # qualifying pair as 90 and never reach 95 or 99.
    confidence = None
    if p_val <= significance:
        for level, key in ((99, '1%'), (95, '5%'), (90, '10%')):
            if t_stat <= crit_values[key]:
                confidence = level
                break

    # Save spread plot for reference
    if confidence is not None and confidence >= plot_threshold:
        title = f"Cointegration between y={y_str} x={x_str}:\n{confidence}% confidence"

        fig, (ax1, ax2) = plt.subplots(1, 2)
        # plot spread
        ax1.plot(spread)
        ax1.set_ylabel('Spread')

        # plot historical prices
        ax2.plot(Y, label=y_str)
        ax2.plot(X, label=x_str)
        ax2.set_ylabel('Price')
        ax2.yaxis.set_label_position('right')
        ax2.yaxis.tick_right()
        ax2.legend(loc='lower left')

        # Act on the figure object rather than pyplot's implicit current figure
        fig.suptitle(title)
        fig.savefig(f"adfuller/{y_str}-{x_str}")
        plt.close(fig)

    return confidence, p_val, lag

# Run the ADF spread test on every ordered (y, x) pair of stocks; the order
# matters because the first stock is regressed on the second.
for s1, s2 in permutations(stocks, 2):
    pair_prices = df[[s1, s2]]
    adf_test(pair_prices, s1, s2, significance=0.01, plot_threshold=90)

# Single pair kept for ad-hoc checks (the call below is disabled)
y_stock = 'stock0'
x_stock = 'stock30'
# adf_test(df[[y_stock, x_stock]], y_stock, x_stock)

#### Johansen Cointegration Test

In [None]:
def johansen_test(data, det_order=0, k_ar_diff=1):
    """Run the Johansen cointegration test, print the results and return them.

    Args:
        data (array-like): Time series data for the cointegration test, one
            column per series (called with a DataFrame).
        det_order (int): The order of the deterministic terms. Defaults to 0.
                         -1: (most restrictive) No constant or trend
                          0: Constant term only
                          1: (least restrictive) Constant and trend terms
        k_ar_diff (int): The number of lagged differences in the model.
            Defaults to 1.

    Returns:
        The result object returned by ``coint_johansen``; the printed values
        are its ``lr1``, ``lr2``, ``cvt`` and ``cvm`` attributes.
    """

    test_result = coint_johansen(data, det_order, k_ar_diff)

    # Extract the trace and max-eigenvalue statistics with their critical values
    trace_stats = test_result.lr1
    eigen_stats = test_result.lr2
    trace_crit = test_result.cvt  # critical values for the trace statistic
    eigen_crit = test_result.cvm  # critical values for the max-eigenvalue statistic

    print(f'Johansen Test Results (det_order={det_order})')
    print('Trace Statistics:', trace_stats)
    print('Max Eigenvalue Statistics:', eigen_stats)
    # Each label is paired with its own critical values (they were swapped)
    print('Critical Values (Trace):', trace_crit)
    print('Critical Values (Eigenvalue):', eigen_crit)

    return test_result

# Johansen test on a four-stock basket
johansen_columns = ['stock0', 'stock1', 'stock2', 'stock4']
johansen_test(df[johansen_columns])

### Testing the exact number of cointegrating relationships

In [None]:


# Interpret the results for each pair
# NOTE(review): `rand_stocks`, `tracevalues` and `critical_values` are not
# defined in the visible part of this notebook; unless they are created
# elsewhere this cell raises NameError — TODO confirm.
# Presumably `tracevalues[i]` holds the Johansen trace statistics for pair i
# and `critical_values[:, 1]` is the 95% critical-value column — verify.
for i, (stock1, stock2) in enumerate(combinations(rand_stocks, 2)):
    # A pair is reported as cointegrated only if every compared value
    # exceeds its critical value.
    if (tracevalues[i] > critical_values[:, 1]).all():
        print(f"Pair {i + 1} ({stock1} and {stock2}) is cointegrated.")
    else:
        print(f"Pair {i + 1} ({stock1} and {stock2}) is not cointegrated.")