# Kalman Filter cointegration search

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts
import statsmodels.tsa.vector_ar.vecm as vm

from IPython.display import clear_output

In [3]:
# Load the list of symbols to scan.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# `.squeeze("columns")` is the documented replacement and behaves the same way.
# NOTE(review): assumes 0-pairs.csv holds a single column of symbols, so the
# squeeze yields a Series and `list(...)` returns its values — confirm.
symbols = list(pd.read_csv("../data/hour/0-pairs.csv").squeeze("columns"))

In [4]:
# Empty results table for the pair scan: the two symbols (A, B), the
# cointegration t-statistic (t), its p-value (p) and the half-life (h).
stats = pd.DataFrame(
    columns=["A", "B", "t", "p", "h"],
)

In [6]:
# Screen every unordered pair of symbols for cointegration on hourly closes.
# For each pair this records the CADF (augmented Engle-Granger) t-statistic and
# p-value from `ts.coint`, plus the mean-reversion half-life of the spread built
# from the first Johansen eigenvector column.
MIN_ROWS = 10000  # symbols whose CSV has fewer rows than this are skipped
SKIP_ROWS = 1000  # leading aligned rows discarded before testing (rationale not recorded in this notebook)

# Read each symbol's file once up front rather than once per pair.
closes = {}
for symbol in symbols:
    bars = pd.read_csv(f"../data/hour/{symbol}-hour.csv", index_col=0, parse_dates=True)
    if len(bars) >= MIN_ROWS:
        closes[symbol] = bars.close

# Collect one dict per pair and build the frame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the frame on every call.
records = []
for i, a in enumerate(symbols):
    if a not in closes:
        continue
    for b in symbols[i + 1:]:
        if b not in closes:
            continue
        # wait=True keeps the previous pair's printout visible until the next one replaces it
        clear_output(wait=True)
        print(f"========== {a} and {b} ==========")

        pair = closes[a].rename("A").to_frame()
        pair["B"] = closes[b]  # aligned onto A's index; timestamps missing in B become NaN
        pair = pair.dropna().iloc[SKIP_ROWS:]
        if pair.empty:
            # Nothing left after alignment and trimming; the tests below cannot run on no data.
            continue

        # cadf test
        coint_t, pvalue, crit_value = ts.coint(pair["B"], pair["A"])
        print(f"t-statistic={round(coint_t, 2)}, pvalue={round(pvalue, 2)}, crits={crit_value}")

        johansen = vm.coint_johansen(pair[["A", "B"]].values, det_order=0, k_ar_diff=1)
        spread = pd.Series(np.dot(pair.values, johansen.evec[:, 0]))  # (net) market value of portfolio
        lagged = spread.shift()
        regression = pd.DataFrame({"ylag": lagged, "deltaY": spread - lagged})
        fit = sm.ols(formula="deltaY ~ ylag", data=regression).fit()
        # Half-life of mean reversion in bars; positive only when the slope on ylag is negative.
        halflife = -np.log(2) / fit.params["ylag"]
        # Bars are read from the "-hour.csv" files, so the unit is hours, not days.
        print(f"halflife={round(halflife, 0)} hours")

        records.append({"A": a, "B": b, "t": coint_t, "p": pvalue, "h": halflife})

clear_output()
# Rebuilt from scratch on every run, so re-executing this cell does not duplicate rows.
stats = pd.DataFrame(records, columns=["A", "B", "t", "p", "h"])
stats

Unnamed: 0,A,B,t,p,h
0,ADAUSDT,AIONUSDT,-2.275228,0.385977,396.492170
1,ADAUSDT,ALGOUSDT,-3.561068,0.027352,383.726304
2,ADAUSDT,ANKRUSDT,-2.731827,0.188341,589.788687
3,ADAUSDT,ARDRUSDT,-2.491285,0.282926,309.530976
4,ADAUSDT,ARPAUSDT,-1.796791,0.631280,775.545660
...,...,...,...,...,...
4846,XTZUSDT,ZILUSDT,-2.539095,0.262105,286.454922
4847,XTZUSDT,ZRXUSDT,-3.459891,0.036121,220.971923
4848,ZECUSDT,ZILUSDT,-2.898122,0.136450,278.392094
4849,ZECUSDT,ZRXUSDT,-3.375855,0.045104,299.603853


In [7]:
# Persist the complete, unfiltered pair statistics (row index is not written).
stats.to_csv(
    "../data/pairs/coint-stats.csv",
    index=False,
)

In [12]:
# Reload the saved pair statistics so the filtering cells below can be run
# without repeating the pair scan.
stats = pd.read_csv(
    "../data/pairs/coint-stats.csv",
)

In [13]:
# Drop every pair that has BUSDUSDT on either leg.
# NOTE(review): presumably excluded because BUSD is a USD-pegged stablecoin — confirm.
involves_busd = (stats.A == "BUSDUSDT") | (stats.B == "BUSDUSDT")
stats = stats[~involves_busd]

In [14]:
# Keep only pairs with a positive half-life and a cointegration p-value of at
# most 0.05.
keep = (stats.h > 0) & (stats.p <= 0.05)
stats = stats[keep]

In [15]:
# Display the surviving pairs ordered by half-life, shortest first.
# The sorted copy is only shown; `stats` itself keeps its current order.
stats.sort_values(by="h", ascending=True)

Unnamed: 0,A,B,t,p,h
4806,WINUSDT,WRXUSDT,-5.242792,5.748465e-05,25.229770
3272,GTOUSDT,MITHUSDT,-8.140074,1.341800e-11,26.715229
2630,DREPUSDT,WINUSDT,-13.090466,1.846776e-23,28.139002
3487,HOTUSDT,MTLUSDT,-11.295900,1.630165e-19,30.706599
389,ARDRUSDT,BATUSDT,-7.405931,8.990992e-10,30.903800
...,...,...,...,...,...
1266,BTCUSDT,TRXUSDT,-3.574518,2.633685e-02,835.224358
2865,ETCUSDT,NULSUSDT,-4.171475,4.027924e-03,854.085550
4296,NPXSUSDT,PERLUSDT,-3.512942,3.126525e-02,995.877743
1076,BNBUSDT,NPXSUSDT,-3.727968,1.686223e-02,1117.663657


In [16]:
# Persist the filtered pair statistics (row index is not written).
stats.to_csv(
    "../data/pairs/coint-stats-filtered.csv",
    index=False,
)