# Kalman Filter cointegration search

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts
import statsmodels.tsa.vector_ar.vecm as vm

from IPython.display import clear_output

In [2]:
# Load the list of tradable symbols. A single-column CSV is collapsed to a
# Series so that list() yields its values; `read_csv(squeeze=True)` was
# deprecated in pandas 1.4 and removed in 2.0, so squeeze the frame explicitly.
symbols = list(pd.read_csv("../data/testing-hourly/0-pairs.csv").squeeze("columns"))

In [47]:
# Ad-hoc load of two individual hourly files (ZEC and XMR), indexed by the
# first CSV column with dates parsed.
df1 = pd.read_csv(
    "../data/testing-hourly/ZECUSDT-hour.csv",
    parse_dates=True,
    index_col=0,
)
df2 = pd.read_csv(
    "../data/testing-hourly/XMRUSDT-hour.csv",
    parse_dates=True,
    index_col=0,
)

In [3]:
# Empty results table: pair members (A, B), cointegration t-statistic (t),
# its p-value (p) and the mean-reversion half-life (h).
stats = pd.DataFrame(
    columns=[
        "A",
        "B",
        "t",
        "p",
        "h",
    ]
)

In [6]:
# Screen every unordered pair of symbols for cointegration and estimate the
# half-life of mean reversion of the Johansen portfolio. The result is the
# `stats` frame with one row per tested pair (columns A, B, t, p, h).
#
# NOTE: `stats` is rebuilt from scratch here, so re-running this cell does not
# duplicate rows.
loc = symbols

# Only symbols whose file has between MIN_ROWS and MAX_ROWS rows are screened.
MIN_ROWS, MAX_ROWS = 10000, 20000
# Number of leading rows of each aligned pair that are discarded before testing.
SKIP_ROWS = 1000

# Read each symbol's file once (instead of once per pair) and keep the close
# series of the symbols that pass the length filter.
closes = {}
for symbol in loc:
    candles = pd.read_csv(f"../data/testing-hourly/{symbol}-hour.csv", index_col=0, parse_dates=True)
    if MIN_ROWS <= len(candles) <= MAX_ROWS:
        closes[symbol] = candles.close

# Collect plain dicts and build the frame once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
records = []

for i, a in enumerate(loc):
    if a not in closes:
        continue
    for b in loc[i + 1:]:
        if b not in closes:
            continue
        clear_output()
        print(f"========== {a} and {b} ==========")

        # Align the two close series on their index and keep common rows only.
        df = closes[a].rename("A").to_frame()
        df["B"] = closes[b]
        df = df.dropna()
        df = df[SKIP_ROWS:]

        # cadf test
        coint_t, pvalue, crit_value = ts.coint(df['B'], df['A'])
        print(f"t-statistic={round(coint_t, 2)}, pvalue={round(pvalue, 2)}, crits={crit_value}")

        # Johansen test; the first eigenvector column gives the portfolio weights.
        result = vm.coint_johansen(df[['A', 'B']].values, det_order=0, k_ar_diff=1)
        yport = pd.DataFrame(np.dot(df.values, result.evec[:, 0]))  # (net) market value of portfolio

        # Regress the one-step change of the portfolio on its lagged level;
        # half-life = -ln(2) / slope. A non-negative slope (no mean reversion)
        # produces a negative half-life, which is stored as-is.
        ylag = yport.shift()
        deltaY = yport - ylag
        regression_frame = pd.concat([ylag, deltaY], axis=1)
        regression_frame.columns = ['ylag', 'deltaY']
        regress_results = sm.ols(formula="deltaY ~ ylag", data=regression_frame).fit()
        halflife = -np.log(2) / regress_results.params['ylag']

        records.append({"A": a, "B": b, "t": coint_t, "p": pvalue, "h": halflife})
        # The half-life is measured in bars of the input data, which is hourly.
        print(f'halflife={round(halflife, 0)} hours')
        clear_output()

stats = pd.DataFrame(records, columns=["A", "B", "t", "p", "h"])
stats



Unnamed: 0,A,B,t,p,h
0,AIONUSDT,ALGOUSDT,-3.000905,0.109991,259.103964
1,AIONUSDT,ANKRUSDT,-3.716908,0.017428,153.546312
2,AIONUSDT,ARDRUSDT,-4.746055,0.000469,54.963380
3,AIONUSDT,ARPAUSDT,-4.319891,0.002384,97.922426
4,AIONUSDT,ATOMUSDT,-2.603922,0.235252,245.652174
...,...,...,...,...,...
2140,WINUSDT,WTCUSDT,-4.287435,0.002678,155.949472
2141,WINUSDT,XTZUSDT,-4.219123,0.003412,218.758035
2142,WRXUSDT,WTCUSDT,-2.839588,0.153413,202.633699
2143,WRXUSDT,XTZUSDT,-3.402174,0.042110,193.428475


In [11]:
# Persist the pair statistics without writing the row index.
stats.to_csv(
    path_or_buf="../data/testing-hourly/0-stats-short.csv",
    index=False,
)

In [9]:
# Discard every pair in which either leg is BUSDUSDT.
keep_pair = (stats.A != "BUSDUSDT") & (stats.B != "BUSDUSDT")
stats = stats[keep_pair]

In [10]:
# First 20 pairs ordered by ascending half-life. Negative half-lives sort
# first; they come from a positive regression slope, i.e. no mean reversion.
stats.sort_values(by="h").head(20)

Unnamed: 0,A,B,t,p,h
1894,MITHUSDT,NPXSUSDT,-5.158836,8.316443e-05,-287.369433
1970,NPXSUSDT,WINUSDT,-0.55235,0.960389,-45.538941
1971,NPXSUSDT,WRXUSDT,7.773635,1.0,-41.552263
2139,WINUSDT,WRXUSDT,-5.242792,5.748465e-05,25.22977
1561,GTOUSDT,MITHUSDT,-8.140074,1.3418e-11,26.715229
1321,DREPUSDT,WINUSDT,-13.090466,1.846776e-23,28.139002
249,ARDRUSDT,WANUSDT,-7.255326,2.090812e-09,32.125883
1783,LSKUSDT,TCTUSDT,-7.595086,3.083587e-10,33.135142
206,ARDRUSDT,CTXCUSDT,-5.520683,1.628224e-05,33.169403
1322,DREPUSDT,WRXUSDT,-9.740095,1.127896e-15,34.215237


In [12]:
# Pairs ranked 21-40 by ascending half-life.
stats.sort_values(by="h").iloc[20:40]

Unnamed: 0,A,B,t,p,h
1952,NKNUSDT,WRXUSDT,-8.074652,1.961354e-11,40.568023
17,AIONUSDT,CTXCUSDT,-7.146749,3.823091e-09,40.764996
1059,CTXCUSDT,WANUSDT,-7.70426,1.654974e-10,41.946452
1747,KEYUSDT,NPXSUSDT,-7.291016,1.713022e-09,42.554862
223,ARDRUSDT,IOTXUSDT,-5.887418,2.828822e-06,43.760857
1302,DREPUSDT,MITHUSDT,-7.622141,2.643692e-10,44.155615
243,ARDRUSDT,TCTUSDT,-5.71127,6.633075e-06,46.484418
1053,CTXCUSDT,TCTUSDT,-6.365579,2.537582e-07,46.490509
226,ARDRUSDT,LSKUSDT,-4.677998,0.0006152211,47.390261
54,AIONUSDT,TCTUSDT,-5.923837,2.365806e-06,48.101489
