# Kalman Filter correlation search

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts
import statsmodels.tsa.vector_ar.vecm as vm

from IPython.display import clear_output

In [7]:
# Symbols to scan, read from the pairs file (presumably a single column of tickers -- confirm).
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0, so the loaded
# frame is squeezed explicitly; a one-column frame becomes a Series whose values are listed.
symbols = list(pd.read_csv("../data/testing-hourly/0-pairs.csv").squeeze("columns"))

In [47]:
# Load the ZECUSDT and XMRUSDT hourly files with the first column as a parsed date index.
csv_options = dict(index_col=0, parse_dates=True)
df1 = pd.read_csv("../data/testing-hourly/ZECUSDT-hour.csv", **csv_options)
df2 = pd.read_csv("../data/testing-hourly/XMRUSDT-hour.csv", **csv_options)

In [50]:
# Empty result table: symbol names (A, B) plus the t, p and h values filled in by the pair scan.
stat_columns = ["A", "B", "t", "p", "h"]
stats = pd.DataFrame(columns=stat_columns)

In [51]:
# Pairwise cointegration scan over every unordered pair of symbols.
#
# For each pair the cell records the Engle-Granger (CADF) t-statistic and p-value from
# `ts.coint`, and the mean-reversion half-life of the spread built from the first Johansen
# eigenvector. Re-running the cell rebuilds `stats` from scratch, so it is idempotent.

MIN_BARS = 20000    # symbols whose raw file has fewer rows than this are skipped
WARMUP_BARS = 1000  # aligned rows dropped from the start of each pair before testing


def load_close(symbol):
    """Return the `close` column of a symbol's hourly CSV, or None if the file is too short.

    The file is read with its first column as a parsed date index. "Too short" means the
    raw frame has fewer than MIN_BARS rows.
    """
    frame = pd.read_csv(f"../data/testing-hourly/{symbol}-hour.csv", index_col=0, parse_dates=True)
    if len(frame) < MIN_BARS:
        return None
    return frame["close"]


def half_life(spread):
    """Return -ln(2)/lambda, where lambda is the OLS slope of the spread's change on its lag.

    The result is expressed in bars of the input series. It is negative (or infinite) when
    lambda >= 0, i.e. when the fitted spread does not revert; such values are passed through
    unchanged, exactly as computed.
    """
    regression_frame = pd.DataFrame({"ylag": spread.shift(), "deltaY": spread.diff()})
    fit = sm.ols(formula="deltaY ~ ylag", data=regression_frame).fit()
    return -np.log(2) / fit.params["ylag"]


# Read every symbol once instead of once per pair; too-short symbols are left out entirely.
closes = {}
for symbol in symbols:
    close = load_close(symbol)
    if close is not None:
        closes[symbol] = close

records = []  # one dict per tested pair; turned into a DataFrame once, after the loop
for position, a in enumerate(symbols):
    if a not in closes:
        continue
    for b in symbols[position + 1:]:
        if b not in closes:
            continue
        # wait=True swaps the previous progress text for the next one without flicker.
        clear_output(wait=True)
        print(f"========== {a} and {b} ==========")

        # Align B onto A's index, keep rows where both are present, drop the warm-up rows.
        pair = closes[a].rename("A").to_frame().assign(B=closes[b]).dropna().iloc[WARMUP_BARS:]
        if pair.empty:
            # No usable overlap between the two series: the tests below cannot run.
            continue

        # cadf test
        coint_t, pvalue, crit_value = ts.coint(pair["B"], pair["A"])
        print(f"t-statistic={round(coint_t, 2)}, pvalue={round(pvalue, 2)}, crits={crit_value}")

        # Spread = both prices weighted by the first Johansen eigenvector.
        johansen = vm.coint_johansen(pair[["A", "B"]].values, det_order=0, k_ar_diff=1)
        spread = pd.Series(np.dot(pair[["A", "B"]].values, johansen.evec[:, 0]))
        halflife = half_life(spread)
        # The inputs are the *-hour.csv files, so one bar is taken to be one hour.
        print(f"halflife={round(halflife, 0)} hours")

        records.append({"A": a, "B": b, "t": coint_t, "p": pvalue, "h": halflife})

clear_output()
stats = pd.DataFrame(records, columns=["A", "B", "t", "p", "h"])
stats

Unnamed: 0,A,B,t,p,h
0,ADAUSDT,BATUSDT,-2.367053,0.340614,640.911332
1,ADAUSDT,BNBUSDT,-4.142902,0.004445,633.786976
2,ADAUSDT,BTCUSDT,-2.452713,0.300312,1319.580604
3,ADAUSDT,BTTUSDT,-3.311399,0.053188,634.386944
4,ADAUSDT,CELRUSDT,-3.223612,0.066067,477.700251
...,...,...,...,...,...
523,XRPUSDT,ZILUSDT,-2.988906,0.112866,546.327286
524,XRPUSDT,ZRXUSDT,-2.543192,0.260359,553.126809
525,ZECUSDT,ZILUSDT,-2.898122,0.136450,278.392094
526,ZECUSDT,ZRXUSDT,-3.375855,0.045104,299.603853


In [52]:
# Write the pair statistics to the hourly data directory; the row index is not written.
stats.to_csv(path_or_buf="../data/testing-hourly/0-stats.csv", index=False)

In [56]:
# Show the 20 pairs with the smallest half-life `h` (ascending sort).
stats.sort_values(by="h").head(20)

Unnamed: 0,A,B,t,p,h
154,CELRUSDT,FETUSDT,-6.228586,5.133366e-07,54.134228
128,BTTUSDT,HOTUSDT,-6.798772,2.564003e-08,64.272073
165,CELRUSDT,ONGUSDT,-6.151417,7.59844e-07,68.949839
259,ETHUSDT,NANOUSDT,-7.752715,1.254302e-10,69.544149
338,IOSTUSDT,IOTAUSDT,-7.153289,3.687065e-09,76.353783
417,NANOUSDT,XLMUSDT,-6.203437,5.835447e-07,82.543564
408,NANOUSDT,NEOUSDT,-4.520835,0.001131589,89.847225
51,BATUSDT,ONGUSDT,-5.38624,3.019534e-05,90.142973
411,NANOUSDT,ONGUSDT,-5.143551,8.88941e-05,91.329823
439,NULSUSDT,ONTUSDT,-5.751204,5.477745e-06,92.929736
