# Johansen Cointegration

In [285]:
import binance_api
import numpy as np
import pandas as pd
import math

import statsmodels
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.vector_ar.vecm import coint_johansen
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts

import matplotlib.pyplot as plt
from datetime import datetime, date

import seaborn as sb
# https://www.marketcalls.in/amibroker/computing-cointegration-and-augmented-dickey-fuller-test-in-amibroker-using-python.html#:~:text=Augmented%20Dicky%20Fuller%20test%20is,stationary%20and%20cointegrated%20or%20not.&text=The%20Augmented%20Dicky%20Fuller%20test,want%20to%20reject%20this%20hypothesis.


In [286]:
# default values
nbr_of_coins = 20

interval_long = '1h'
interval_mid ='15m'
interval_short = '5m'

# lookback periods are in days
lookback_long = 40 * 24
lookback_mid = 10 * 4 * 24
lookback_short = 3 * 12 * 24

#startTime = round(datetime(2021, 12, 5).timestamp()) * 1000
endTime = round(datetime(2022, 2, 16).timestamp()) * 1000

In [287]:
# Get all coins, sort them by volume and keep the top x nbr of coins
tickers = pd.DataFrame(binance_api.get_all_tickers())
tickers['volume_usd'] = tickers.apply(lambda row: float(row['volume']) * float(row['lastPrice']), axis=1)
tickers = tickers.sort_values(by='volume_usd', ascending=False)
top_tickers = tickers[:nbr_of_coins]

top_tickers = top_tickers[top_tickers['symbol'] != 'PEOPLEUSDT']
top_tickers = top_tickers[top_tickers['symbol'] != 'BTCUSDT_220325']
top_tickers = top_tickers[top_tickers['symbol'] != 'ROSEUSDT']
top_tickers = top_tickers[top_tickers['symbol'] != 'API3USDT']
top_tickers = top_tickers[top_tickers['symbol'] != 'BTCBUSD']
top_tickers = top_tickers[top_tickers['symbol'] != 'ETHBUSD']
print(top_tickers['symbol'].values.tolist())

['BTCUSDT', 'ETHUSDT', 'LUNAUSDT', 'SOLUSDT', 'GALAUSDT', '1000SHIBUSDT', 'AVAXUSDT', 'ADAUSDT', 'XRPUSDT', 'FTMUSDT', 'SANDUSDT', 'MANAUSDT', 'BNBUSDT', 'MATICUSDT', 'DOTUSDT', 'ATOMUSDT', 'LINKUSDT', 'DOGEUSDT']


In [288]:
# short
historical_data_short = {}
for i in range(0, len(top_tickers)):
    historical_data_short[top_tickers.iloc[i]['symbol']] = pd.DataFrame(binance_api.get_historical_prices(top_tickers.iloc[i]['symbol'], interval_short, None, endTime))[4]
historical_prices_short = pd.DataFrame(historical_data_short)
print('Downloaded {} data'.format(interval_short))

# mid
historical_data_mid = {}
for i in range(0, len(top_tickers)):
    historical_data_mid[top_tickers.iloc[i]['symbol']] = pd.DataFrame(binance_api.get_historical_prices(top_tickers.iloc[i]['symbol'], interval_mid, None, endTime))[4]
historical_prices_mid = pd.DataFrame(historical_data_mid)
print('Downloaded {} data'.format(interval_mid))


# long
historical_data_long = {}
for i in range(0, len(top_tickers)):
    historical_data_long[top_tickers.iloc[i]['symbol']] = pd.DataFrame(binance_api.get_historical_prices(top_tickers.iloc[i]['symbol'], interval_long, None, endTime))[4]
historical_prices_long = pd.DataFrame(historical_data_long)
print('Downloaded {} data'.format(interval_long))

Downloaded 5m data
Downloaded 15m data
Downloaded 1h data


In [289]:
# Example Johansen
df = pd.DataFrame({'x': pd.to_numeric(historical_prices_mid['BTCUSDT']), 'y': pd.to_numeric(historical_prices_mid['ETHUSDT'])})
jres = coint_johansen(df, 0, 1)
print('trace: {}, critial value at 0.05%: {}, reject null? {}'.format(jres.trace_stat[0], jres.cvt[0][1], jres.trace_stat[0] > jres.cvt[0][1]))


# From my comprehension, we reject the null hypothesis if the trace of max-eigen statistic is higher than the value at the 0.05% critical value

trace: 11.10389548309806, critial value at 0.05%: 15.4943, reject null? False


In [293]:
# Cointegration Matrix
symbols = historical_prices_short.columns.tolist()

# short
coint_matrix_short = pd.DataFrame(index=symbols, columns=symbols)
for i in range(0, len(coint_matrix_short)):
    for j in range(0, len(coint_matrix_short)):
        if i != j:
            #print('{},{}'.format(i,j))
            data_1 = np.asarray(historical_prices_short[symbols[i]].tail(lookback_short)).astype(float)
            data_2 = np.asarray(historical_prices_short[symbols[j]].tail(lookback_short)).astype(float)
            if len(data_1) == len(data_2):
                df = pd.DataFrame({'x': pd.to_numeric(historical_prices_short[symbols[i]]), 'y': pd.to_numeric(historical_prices_short[symbols[j]])})
                jres = coint_johansen(df, 0, 1)
                # From my comprehension, we reject the null hypothesis if the trace of max-eigen statistic is higher than the value at the 0.05% critical value
                coint_matrix_short[symbols[i]][symbols[j]] = jres.trace_stat[0] > jres.cvt[0][1]
                
# mid
coint_matrix_mid = pd.DataFrame(index=symbols, columns=symbols)
for i in range(0, len(coint_matrix_mid)):
    for j in range(0, len(coint_matrix_mid)):
        if i != j:
            #print('{},{}'.format(i,j))
            data_1 = np.asarray(historical_prices_mid[symbols[i]].tail(lookback_mid)).astype(float)
            data_2 = np.asarray(historical_prices_mid[symbols[j]].tail(lookback_mid)).astype(float)
            if len(data_1) == len(data_2):
                df = pd.DataFrame({'x': pd.to_numeric(historical_prices_mid[symbols[i]]), 'y': pd.to_numeric(historical_prices_short[symbols[j]])})
                jres = coint_johansen(df, 0, 1)
                # From my comprehension, we reject the null hypothesis if the trace of max-eigen statistic is higher than the value at the 0.05% critical value
                coint_matrix_mid[symbols[i]][symbols[j]] = jres.trace_stat[0] > jres.cvt[0][1]

# long
coint_matrix_long = pd.DataFrame(index=symbols, columns=symbols)
for i in range(0, len(coint_matrix_long)):
    for j in range(0, len(coint_matrix_long)):
        if i != j:
            #print('{},{}'.format(i,j))
            data_1 = np.asarray(historical_prices_long[coint_matrix_long.index[i]].tail(lookback_long)).astype(float)
            data_2 = np.asarray(historical_prices_long[coint_matrix_long.index[j]].tail(lookback_long)).astype(float)
            if len(data_1) == len(data_2):
                df = pd.DataFrame({'x': pd.to_numeric(historical_prices_long[symbols[i]]), 'y': pd.to_numeric(historical_prices_long[symbols[j]])})
                jres = coint_johansen(df, 0, 1)
                # From my comprehension, we reject the null hypothesis if the trace of max-eigen statistic is higher than the value at the 0.05% critical value
                coint_matrix_long[symbols[i]][symbols[j]] = jres.trace_stat[0] > jres.cvt[0][1]

## do an AND operation between the matrices of different timeframes

In [296]:
# Short & Mid & Long
coint_short_mid_long = coint_matrix_short & coint_matrix_mid & coint_matrix_long
coint_short_mid = coint_matrix_short & coint_matrix_mid
coint_mid_long = coint_matrix_mid & coint_matrix_long

In [304]:
coint_matrix_long & coint_matrix_mid

Unnamed: 0,BTCUSDT,ETHUSDT,LUNAUSDT,SOLUSDT,GALAUSDT,1000SHIBUSDT,AVAXUSDT,ADAUSDT,XRPUSDT,FTMUSDT,SANDUSDT,MANAUSDT,BNBUSDT,MATICUSDT,DOTUSDT,ATOMUSDT,LINKUSDT,DOGEUSDT
BTCUSDT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
ETHUSDT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
LUNAUSDT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
SOLUSDT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
GALAUSDT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1000SHIBUSDT,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False
AVAXUSDT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
ADAUSDT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
XRPUSDT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
FTMUSDT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [305]:
# Get pairs that are cointegrated (short and mid and long)
cointegrated_pairs_sml = []
for i in range(0, len(coint_matrix_short)):
    for j in range(0, len(coint_matrix_short)):
        if i != j and coint_short_mid_long[symbols[i]][symbols[j]]:
            cointegrated_pairs_sml.append('{}/{}'.format(symbols[i],symbols[j]))
            
print('{} pairs are cointegrated in {}, {} and {} timeframes'.format(cointegrated_pairs_sml, interval_short, interval_mid, interval_long))

# Get pairs that are cointegrated (short and mid)
cointegrated_pairs_sm = []
for i in range(0, len(coint_matrix_short)):
    for j in range(0, len(coint_matrix_short)):
        if i != j and coint_short_mid[symbols[i]][symbols[j]]:
            cointegrated_pairs_sm.append('{}/{}'.format(symbols[i],symbols[j]))
            
print('{} pairs are cointegrated in {} and {} timeframes'.format(cointegrated_pairs_sm, interval_short, interval_mid))

# Get pairs that are cointegrated (mid and long)
cointegrated_pairs_ml = []
for i in range(0, len(coint_matrix_short)):
    for j in range(0, len(coint_matrix_short)):
        if i != j and coint_mid_long[symbols[i]][symbols[j]]:
            cointegrated_pairs_ml.append('{}/{}'.format(symbols[i],symbols[j]))
            
print('{} pairs are cointegrated in {} and {} timeframes'.format(cointegrated_pairs_ml, interval_mid, interval_long))

[] pairs are cointegrated in 5m, 15m and 1h timeframes
[] pairs are cointegrated in 5m and 15m timeframes
['GALAUSDT/1000SHIBUSDT', 'XRPUSDT/1000SHIBUSDT'] pairs are cointegrated in 15m and 1h timeframes


In [306]:
nbr_of_pairs = nbr_of_coins * nbr_of_coins
print('{:.2f}% of {} pairs are cointegrated in {}, {} and {} timeframes'.format(len(cointegrated_pairs_sml)/nbr_of_pairs*100/2, nbr_of_pairs, interval_short, interval_mid, interval_long))

nbr_of_pairs = nbr_of_coins * nbr_of_coins
print('{:.2f}% of {} pairs are cointegrated in {} and {} timeframes'.format(len(cointegrated_pairs_sm)/nbr_of_pairs*100/2, nbr_of_pairs, interval_short, interval_mid))

nbr_of_pairs = nbr_of_coins * nbr_of_coins
print('{:.2f}% of {} pairs are cointegrated in {} and {} timeframes'.format(len(cointegrated_pairs_ml)/nbr_of_pairs*100/2, nbr_of_pairs, interval_mid, interval_long))

0.00% of 400 pairs are cointegrated in 5m, 15m and 1h timeframes
0.00% of 400 pairs are cointegrated in 5m and 15m timeframes
0.25% of 400 pairs are cointegrated in 15m and 1h timeframes
