# Initial check for CryptoQuant data
   * tasks
       * classify tables 
       * classify data types
       * build an initial data loader
   * data structure
       * instrument_list ['btc', 'stablecoin', 'erc20', 'eth']
       * exchange_list = ['coinbase_pro', 'derivative_exchange', 'deribit', 'binance', 'all_exchange', 'spot_exchange']
       * datatype (number of tables)
           * exchange-flows (5)
           * flow-indicator (5)
           * Market Indicator (3)
           * Network Indicator (5)
           * miner-flows (3)
           * Bank Flows (0)
           * Inter Entity Flows (1)
           * Fund Data (1)
           * market data (5)
           * network data (4)

In [None]:
import pandas as pd
from croqr.common.config import LOCAL_DATA_DIR
import os
from croqr.data.cq.config import CryptoQuantData
from datetime import datetime
import matplotlib.pyplot as plt
from croqr.common.utils import timeit

## check pickle files

In [None]:
# Pickle file names to inspect, derived from their base names.
files = list(map('{}.pkl'.format, ('cq', 'cq1', 'cq2')))
files

In [None]:
# One list of top-level keys per pickle file, in the same order as `files`.
features = []
features0 = []
features1 = []
features2 = []
for file in files:
    display(file, '------------------------------------------------------------------------')
    # `df` stays bound to the last file loaded once the loop finishes.
    df = pd.read_pickle(os.path.join(LOCAL_DATA_DIR, file))
    file_keys = list(df.keys())
    features.append(file_keys)
    # Total key count next to the distinct key count exposes duplicates.
    display(len(file_keys))
    display(len(set(file_keys)))

In [None]:
# Show the keys collected from the second file in `files` (cq1.pkl).
features[1]

In [None]:
# Keys that appear in the second file (cq1.pkl) but not in the first (cq.pkl).
[name for name in features[1] if name not in features[0]]

## check file name

In [None]:
# Keys that appear in the third file (cq2.pkl) but not in the second (cq1.pkl).
all_features = [name for name in features[2] if name not in features[1]]

In [None]:
# Break every feature name into its '-'-separated components.
all_features_seps = [name.split('-') for name in all_features]

In [None]:
# Distinct first components of the feature names (the instrument part).
instruments = list({parts[0] for parts in all_features_seps})
instruments

In [None]:
# Distinct second components of the feature names (the exchange part).
exchanges = list({parts[1] for parts in all_features_seps})
exchanges

In [None]:
def find_nth(haystack, needle, n):
    """Return the index of the n-th non-overlapping `needle` in `haystack`.

    Occurrences are counted from 1; a count of 1 or less yields the first
    occurrence. Returns -1 when there are fewer than `n` occurrences.
    """
    index = haystack.find(needle)
    for _ in range(n - 1):
        if index < 0:
            # Ran out of occurrences before reaching the n-th one.
            break
        # Resume the search just past the previous match.
        index = haystack.find(needle, index + len(needle))
    return index

In [None]:
def get_cq_feature_name_datatype_part(feature_name):
    """Return the data-type part of a CryptoQuant feature name.

    Feature names have the form ``<instrument>-<exchange>-<datatype...>``;
    everything after the second '-' is returned, e.g.
    ``'btc-all_exchange-market-data-price-usd'`` -> ``'market-data-price-usd'``.

    Args:
        feature_name: Feature name string.

    Returns:
        The text after the second '-'. A name with fewer than two '-'
        separators is returned unchanged.
    """
    # maxsplit=2 keeps any further '-' inside the data-type part intact.
    parts = feature_name.split('-', 2)
    if len(parts) < 3:
        return feature_name
    return parts[2]

In [None]:
# Sanity-check the helper on a sample feature name.
get_cq_feature_name_datatype_part('btc-all_exchange-market-data-price-usd')

In [None]:
# Sorted, de-duplicated data-type parts across all feature names.
data_type_list = sorted({get_cq_feature_name_datatype_part(name) for name in all_features})

### quick check each feature/table

In [None]:
# Load 'cqerc20.pkl' from the local data directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data_dict = pd.read_pickle(os.path.join(LOCAL_DATA_DIR, 'cqerc20.pkl'))

In [None]:
# Every key of the loaded pickle, i.e. the available feature/table names.
all_features = list(data_dict)

In [None]:
# For each entry of CryptoQuantData.data_type_list, preview the first table
# whose name contains it; entries with no matching table are skipped.
for data_type in CryptoQuantData.data_type_list:
    first_match = next((name for name in all_features if data_type in name), None)
    if first_match is None:
        continue
    display(first_match)
    display(data_dict[first_match].head())

### Calculate correlation between features and returns
   * for each feature, get the change rate of the feature values
   * calculate the correlation between the change in each feature and the change in close price (ret)
   * try different time scale, sampling frequency, 1Min, 5Min
   * try different lag
   * rolling historical corr
 

In [None]:
# Take the close-price column of the BTC all-exchange price table and reverse
# its row order. `df` is whatever the file-scan loop loaded last.
# NOTE(review): presumably the raw table is newest-first, so the reversal
# yields chronological order — confirm.
btc_close = df['btc-all_exchange-market-data-price-usd']['price_usd_close'][::-1]

In [None]:
# Display the reversed close-price series.
btc_close

In [None]:
# Preview the last 10000 entries of the reversed close-price series.
btc_close.tail(10000)

In [None]:
# Percentage change between consecutive close prices over the last 10000 entries.
btc_ret = btc_close.tail(10000).pct_change()

In [None]:
# Index of those same 10000 price entries; used below as the target index
# when aligning feature tables.
all_time_index_1min = btc_close.tail(10000).index

In [None]:
# Display the target index.
all_time_index_1min

In [None]:
# Pull the BTC all-exchange inflow table out of the loaded data and display it.
exchangeflows = df['btc-all_exchange-exchange-flows-inflow']
exchangeflows

In [None]:
# Reverse the row order, move the index into a regular column, and keep the
# last 10000 rows.
exchangeflows = exchangeflows[::-1].reset_index().tail(10000)

In [None]:
# Round each timestamp up to the next whole minute.
# NOTE(review): assumes the 'datetime' column holds pandas Timestamps — confirm.
exchangeflows['signal_time'] = exchangeflows['datetime'].apply(lambda x: x.ceil('min'))

In [None]:
# Display the table with the added signal_time column.
exchangeflows

In [None]:
# Columns of the inflow table to analyse.
feature_list = ['inflow_total','inflow_top10','inflow_mean']

In [None]:
# Keep only the selected feature columns and index them by signal_time.
exchangeflows_f = exchangeflows[feature_list + ['signal_time']].set_index('signal_time')

In [None]:
# Where several rows share a signal_time, keep only the first of them.
exchangeflows_f = exchangeflows_f[~exchangeflows_f.index.duplicated(keep='first')]

In [None]:
# Align onto the price index; timestamps with no feature row take the most
# recent earlier value (forward fill).
exchangeflows_f = exchangeflows_f.reindex(index=all_time_index_1min).ffill()

In [None]:
# Check the (rows, columns) size of the aligned feature table.
exchangeflows_f.shape

In [None]:

feature = 'inflow_total'
# Percentage change of the selected feature, forward-filling missing values.
# `ffill()` is called without the positional axis argument: pandas >= 2.0
# only accepts it as a keyword, and axis 0 is the default anyway.
# NOTE(review): if the intent was to replace NaN with 0, use `.fillna(0)`.
f_chg = exchangeflows_f[feature].pct_change().ffill()


In [None]:
# Rolling correlation between the feature change and BTC returns over a
# window of 60*24 rows, with the feature shifted by several lags.
look_back_window = 60*24
for lag in (0, 5, 10, 20, 30):
    f_chg.shift(lag).rolling(look_back_window).corr(btc_ret).plot()
plt.legend(['lag=0','lag=5','lag=10','lag=20','lag=30'])
plt.title(feature)
plt.show()

In [None]:
# One figure per feature: rolling correlation between the feature's change
# and BTC returns, with the feature shifted by several lags.
look_back_window = 60*24
for feature in feature_list:
    # Select the column being plotted (previously always 'inflow_total', so
    # every figure showed the same series under a different title).
    # `ffill()` takes no positional axis argument in pandas >= 2.0.
    f_chg = exchangeflows_f[feature].pct_change().ffill()
    for lag in (0, 5, 10, 20, 30):
        f_chg.shift(lag).rolling(look_back_window).corr(btc_ret).plot()
    plt.legend(['lag=0','lag=5','lag=10','lag=20','lag=30'])
    plt.title(feature)
    plt.show()

In [None]:
@timeit
def align_feature_df(raw_feature_df, feature_list, ret_df):
    """Align selected feature columns onto the index of `ret_df`.

    The raw table is reversed, each timestamp is rounded up to the next whole
    minute (`signal_time`), rows sharing a `signal_time` are reduced to the
    first one, and the result is reindexed onto `ret_df.index` with forward
    fill.

    Args:
        raw_feature_df: Raw feature table.
            NOTE(review): assumed to expose its timestamps as an index (or
            column) named 'datetime' holding pandas Timestamps — confirm
            against the loader.
        feature_list: Names of the columns of `raw_feature_df` to keep.
        ret_df: Pandas object whose index is the target time index.

    Returns:
        DataFrame with the `feature_list` columns, indexed like `ret_df`.
    """
    # reverse time index
    feature_df = raw_feature_df[::-1].reset_index()

    # get signal_time: round each timestamp up to the next whole minute
    feature_df['signal_time'] = feature_df['datetime'].apply(lambda x: x.ceil('min'))

    # select relevant columns from the argument (not a notebook global) and
    # index by signal_time so the de-duplication and reindex below act on it
    feature_df = feature_df[list(feature_list) + ['signal_time']].set_index('signal_time')

    # drop duplicated index; reindex requires unique labels
    feature_df = feature_df[~feature_df.index.duplicated(keep='first')]

    # align to ret dataframe, carrying the last known value forward
    aligned_feature_df = feature_df.reindex(index=ret_df.index).ffill()

    return aligned_feature_df

In [None]:
@timeit
def get_feature_df_corr_with_ret(df_features, ret_df, look_back_window = 60*24,
                                 lags=(0, 5, 10, 20, 30)):
    """Plot rolling correlations between feature changes and returns.

    One figure is shown per column of `df_features`. Each figure holds one
    line per lag: the rolling correlation between the column's percentage
    change, shifted by that lag, and `ret_df`.

    Args:
        df_features: DataFrame of feature values, one column per feature.
        ret_df: Returns to correlate against.
            NOTE(review): presumably a Series sharing the index of
            `df_features` — confirm against callers.
        look_back_window: Rolling window length, in rows.
        lags: Shifts, in rows, applied to the feature change before
            correlating.

    Returns:
        None. The figures are shown with `plt.show()`.
    """
    # Compute the change for all columns once instead of once per feature.
    # `ffill()` takes no positional axis argument in pandas >= 2.0.
    df_features_chg = df_features.pct_change().ffill()
    legend_labels = ['lag={}'.format(lag) for lag in lags]
    for feature in df_features.columns:
        # Correlate this column only, against the argument rather than a
        # notebook global.
        f_chg = df_features_chg[feature]
        for lag in lags:
            f_chg.shift(lag).rolling(look_back_window).corr(ret_df).plot()
        plt.legend(legend_labels)
        plt.title(feature)
        plt.show()

In [None]:
# Load 'cqerc20.pkl' from the local data directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data_dict = pd.read_pickle(os.path.join(LOCAL_DATA_DIR, 'cqerc20.pkl'))