In [None]:
import pandas as pd
import polars as pl
import numpy as np
from lightgbm import LGBMRegressor
from tqdm import tqdm
import matplotlib.pyplot as plt

# import sys
# sys.path.insert(0, './input/g-research-crypto-forecasting')
# import gresearch_crypto

import sys
sys.path.insert(0, '../tyche')
import tool; from importlib import reload; reload(tool)
from tool import Tool

In [None]:
# Kaggle data processing helpers
def kaggle_change_col_name(df):
    """Rename the capitalised Kaggle candle columns to their lowercase names.

    Args:
        df: object whose `rename` method accepts an old-name -> new-name
            mapping (used with polars frames in this notebook).

    Returns:
        The value returned by `df.rename(...)` for the columns `Open`, `High`,
        `Low`, `Close`, `Volume`, `VWAP` and `Count`.
    """
    kaggle_columns = ('Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Count')
    lowercase_names = {column: column.lower() for column in kaggle_columns}
    return df.rename(lowercase_names)

def change_col_name(df):
    """Rename `volccy` to `volume` and cast that column to Float64.

    Args:
        df: polars frame containing a `volccy` column.

    Returns:
        A frame in which `volccy` is called `volume` and holds Float64 values.
    """
    renamed = df.rename({'volccy': 'volume'})
    volume_as_float = pl.col('volume').cast(pl.Float64)
    return renamed.with_columns(volume_as_float)

def kaggle_fix_data(df, all_timestamps=None):
    """Outer-join `df` onto a regular grid of timestamps.

    Args:
        df: polars frame with a `timestamp` column.
        all_timestamps: iterable of grid timestamps. When None, a grid from
            1514764860000 (inclusive) to 1632182400000 (exclusive) in steps of
            60000 is used.

    Returns:
        The outer join of `df` with the grid, where `timestamp` is overwritten
        by the grid-side key `timestamp_right` (that column is kept as well).
    """
    if all_timestamps is None:
        grid_start = 1514764860000
        grid_end = 1632182400000
        all_timestamps = range(grid_start, grid_end, 60000)
    grid = pl.DataFrame({'timestamp': all_timestamps})
    joined = df.join(grid, on='timestamp', how='outer')
    # Relies on the outer join exposing the right-hand key as `timestamp_right`.
    # NOTE(review): rows of `df` whose timestamp is not on the grid presumably
    # end up with a null `timestamp` after this overwrite - confirm.
    return joined.with_columns(pl.col('timestamp_right').alias('timestamp'))

def ret(df):
    """Append forward log-return columns of `close` for a fixed list of horizons.

    For each horizon of `k` rows the new column `ret_{k*60}s` holds
    log(close shifted by -(k+1) / close shifted by -1), i.e. the ratio of the
    close `k+1` rows ahead to the close one row ahead.

    Args:
        df: polars frame with a `close` column.

    Returns:
        The frame with one `ret_<seconds>s` column per horizon.
    """
    horizons_in_rows = [2, 5, 15, 30, 60, 120, 240, 480, 720, 1440, 2880, 10080]
    for rows_ahead in horizons_in_rows:
        seconds = rows_ahead * 60
        entry_price = pl.col('close').shift(-1)
        exit_price = pl.col('close').shift(-rows_ahead - 1)
        log_return = (exit_price / entry_price).log()
        df = df.with_columns(log_return.alias(f'ret_{seconds}s'))
    return df

In [None]:
# Load the Kaggle data
def load_kaggle_kline():
    """Read the Kaggle training candles and build one frame per asset.

    For every asset listed in the asset-details CSV the rows of the training
    CSV are filtered by `Asset_ID`, the columns are renamed, `timestamp` is
    multiplied by 1000, the rows are joined onto the regular grid produced by
    `kaggle_fix_data`, a `time` datetime column and a `symbol` column are
    added, and the forward-return columns from `ret` are appended.

    Returns:
        tuple: `(kaggle_dfs, asset_df)` where `kaggle_dfs` maps each symbol to
        its frame and `asset_df` is the asset-details frame sorted by
        `Asset_ID`.
    """
    TRAIN_CSV = './input/g-research-crypto-forecasting/train.csv'
    ASSET_DETAILS_CSV = './input/g-research-crypto-forecasting/asset_details.csv'

    data_df = pl.read_csv(TRAIN_CSV)
    asset_df = pl.read_csv(ASSET_DETAILS_CSV).sort('Asset_ID')
    asset_2_symbol = dict(zip(asset_df['Asset_ID'], asset_df['Symbol']))

    kaggle_dfs = {}
    for asset_id in tqdm(asset_df['Asset_ID'].to_numpy()):
        symbol = asset_2_symbol[asset_id]
        df = data_df.filter(pl.col('Asset_ID') == asset_id)

        df = kaggle_change_col_name(df)
        df = df.with_columns((pl.col('timestamp')*1000).alias('timestamp'))
        df = df.sort('timestamp')
        df = kaggle_fix_data(df)

        # `timestamp` was already scaled by 1000 above; it is scaled once more
        # before the cast to Datetime.
        df = df.with_columns((pl.col('timestamp')*1000).cast(pl.Datetime).alias('time'))
        # Sort again: the join in kaggle_fix_data is not followed by a sort.
        df = df.sort('timestamp')
        df = df.with_columns(pl.lit(symbol).alias('symbol'))

        kaggle_dfs[symbol] = ret(df)
    return kaggle_dfs, asset_df


def load_kline(symbols, file_root, file_names):
    """Load and merge the per-symbol candle CSVs found under several folders.

    Each file is read from
    `{file_root}/{file_name}/OKEX_UM_1m/{symbol}-USDT-SWAP.csv`; empty files
    are skipped. The parts are concatenated, duplicate rows are collapsed, the
    `time` and `symbol` columns are added and the forward-return columns from
    `ret` are appended, with nulls and NaNs replaced by 0.

    Args:
        symbols: iterable of symbol names.
        file_root: root directory of the data folders.
        file_names: folder names to read for every symbol.

    Returns:
        dict mapping each symbol to its merged frame.
    """
    float_columns = {'vol': pl.Float64, 'volccy': pl.Float64,  'volccy_qoute': pl.Float64}
    dfs = {}
    for symbol in tqdm(symbols):
        parts = []
        for file_name in file_names:
            file_path = f"{file_root}/{file_name}/OKEX_UM_1m/{symbol}-USDT-SWAP.csv"
            part = pl.read_csv(file_path, dtypes=float_columns).fill_null(0)
            if part.shape[0] != 0:
                parts.append(part)

        merged = change_col_name(pl.concat(parts))
        # Grouping by every column leaves one row per distinct row.
        merged = merged.group_by(merged.columns).first()
        merged = merged.with_columns((pl.col('timestamp')*1000).cast(pl.Datetime).alias('time'))
        merged = merged.sort('timestamp')
        merged = merged.with_columns(pl.lit(symbol).alias('symbol'))

        # Generate the target columns, then zero out nulls and NaNs.
        merged = ret(merged).fill_null(0).fill_nan(0)
        dfs[symbol] = merged
    return dfs



In [None]:
def get_clip_target(df, feature):
    """Add `{feature}_clip`: the column `feature` limited to [-0.01, 0.01].

    Args:
        df: polars frame containing `feature`.
        feature: name of the column to clip.

    Returns:
        The frame with the extra `{feature}_clip` column.
    """
    p = 0.01
    value = pl.col(feature)
    # Lower bound is tested first, then the upper bound; anything else passes through.
    clipped = (
        pl.when(value < -p).then(-p)
        .when(value > p).then(p)
        .otherwise(value)
    )
    return df.with_columns(clipped.alias(f"{feature}_clip"))

def gf_SM_A_M(df, feature, lb):
    """Add the rolling mean, standard deviation and median of `feature`.

    The window is a time-based rolling window of length `lb` over the `time`
    column. The new columns are `bl.{feature}_mean_{lb}`,
    `bl.{feature}_std_{lb}` and `bl.{feature}_median_{lb}`.

    Args:
        df: polars frame with `time` and `feature` columns.
        feature: column to aggregate.
        lb: rolling period passed to `DataFrame.rolling` (e.g. '5m').

    Returns:
        The frame with the three rolling statistics attached.
    """
    column = pl.col(feature)
    rolling_stats = df.rolling(index_column="time", period=lb).agg([
        column.mean().alias(f'bl.{feature}_mean_{lb}'),
        column.std().alias(f'bl.{feature}_std_{lb}'),
        column.median().alias(f'bl.{feature}_median_{lb}'),
    ])
    return df.with_columns(rolling_stats)

def gf_EMA(df, feature, n):
    """Add `bl.{feature}_{n}_ema`, an exponentially weighted mean of `feature`.

    The smoothing factor is alpha = 2 / (n + 1).

    Args:
        df: polars frame containing `feature`.
        feature: column to smooth.
        n: span used to derive alpha.

    Returns:
        The frame with the EMA column attached.
    """
    smoothing = 2 / (n + 1)
    ema = pl.col(feature).ewm_mean(alpha=smoothing)
    return df.with_columns(ema.alias(f'bl.{feature}_{n}_ema'))

def gf_MACD(df, feature, span1=12, span2=26, span3=9):
    """Add a ratio-style MACD of `feature` and the EMA of that MACD.

    `bl.{feature}_macd` is (EMA(span1) / EMA(span2) - 1) * 100. The smoothed
    line is produced by `gf_EMA`, so its column is named
    `bl.bl.{feature}_macd_{span3}_ema`.

    Args:
        df: polars frame containing `feature`.
        feature: column the indicator is computed on.
        span1: span of the first EMA.
        span2: span of the second EMA.
        span3: span of the EMA applied to the MACD column.

    Returns:
        The frame with both EMAs, the MACD column and its EMA attached.
    """
    for span in (span1, span2):
        df = gf_EMA(df, feature, span)

    macd_name = f'bl.{feature}_macd'
    first_ema = pl.col(f'bl.{feature}_{span1}_ema')
    second_ema = pl.col(f'bl.{feature}_{span2}_ema')
    df = df.with_columns(((first_ema / second_ema - 1) * 100).alias(macd_name))
    return gf_EMA(df, macd_name, span3)

def gf_Bollinger_Bands(df, feature, lb, no_of_std):
    """Add upper and lower bands around the rolling mean of `feature`.

    The bands are mean +/- `no_of_std` * std, using the rolling statistics
    from `gf_SM_A_M`, and are stored as `bl.{feature}_upper_band_{lb}` and
    `bl.{feature}_lower_band_{lb}`.

    Args:
        df: polars frame with `time` and `feature` columns.
        feature: column the bands are built on.
        lb: rolling period.
        no_of_std: band half-width in standard deviations.

    Returns:
        The frame with the rolling statistics and both bands attached.
    """
    df = gf_SM_A_M(df, feature, lb)
    centre = pl.col(f'bl.{feature}_mean_{lb}')
    half_width = no_of_std * pl.col(f'bl.{feature}_std_{lb}')
    return df.with_columns(
        [
            (centre + half_width).alias(f'bl.{feature}_upper_band_{lb}'),
            (centre - half_width).alias(f'bl.{feature}_lower_band_{lb}'),
        ]
    )

def gf_RSI(df, feature, lb):
    """Add `bl.{feature}_rsi_{lb}`, an RSI built from rolling mean gains and losses.

    Working columns `tp.delta`, `tp.up`, `tp.down`, `tp.up_mean`,
    `tp.down_mean` and `tp.rs` are written to the frame along the way and are
    left in place. NaN/null values of the ratio and of the final value are
    replaced by 0.

    Args:
        df: polars frame with `time` and `feature` columns.
        feature: column the indicator is computed on.
        lb: rolling period for the mean gain and mean loss.

    Returns:
        The frame with the working columns and the RSI column attached.
    """
    # Row-to-row change of the feature.
    df = df.with_columns(pl.col(feature).diff().fill_nan(0).alias("tp.delta"))

    # Split the change into its positive part and the magnitude of its negative part.
    delta = pl.col("tp.delta")
    gain = pl.when(delta > 0).then(delta).otherwise(0).alias("tp.up")
    loss = pl.when(delta < 0).then(-delta).otherwise(0).alias("tp.down")
    df = df.with_columns(gain, loss)

    rolling_means = df.rolling(index_column="time", period=lb).agg(
        [
            pl.col("tp.up").mean().alias("tp.up_mean"),
            pl.col("tp.down").mean().alias("tp.down_mean"),
        ]
    )
    df = df.with_columns(rolling_means)

    relative_strength = pl.col("tp.up_mean") / pl.col("tp.down_mean")
    df = df.with_columns(relative_strength.fill_nan(0).fill_null(0).alias("tp.rs"))

    rsi = 100 - 100 / (1 + pl.col("tp.rs"))
    df = df.with_columns(rsi.fill_nan(0).fill_null(0).alias(f'bl.{feature}_rsi_{lb}'))

    return df

def gf_alpha_RSI(df, feature, lb):
    """Add `al.{feature}_rsi_{lb}`: the RSI column mapped through x / 100 * 2 - 1.

    Args:
        df: polars frame with `time` and `feature` columns.
        feature: column the RSI is computed on.
        lb: rolling period handed to `gf_RSI`.

    Returns:
        The frame with the RSI columns and the rescaled alpha column attached.
    """
    with_rsi = gf_RSI(df, feature, lb)
    rescaled = pl.col(f'bl.{feature}_rsi_{lb}') / 100 * 2 - 1
    return with_rsi.with_columns(
        rescaled.fill_nan(0).fill_null(0).alias(f'al.{feature}_rsi_{lb}')
    )

def gf_alpha_back_ret(df, feature, n):
    """Add `al.{feature}_ret_{n*60}s`, a symmetric backward change of `feature`.

    The value is (x - x shifted by n) / (x + x shifted by n), with NaN and
    null replaced by 0.

    Args:
        df: polars frame containing `feature`.
        feature: column to compare against its own past.
        n: number of rows to look back.

    Returns:
        The frame with the alpha column attached.
    """
    current = pl.col(feature)
    previous = pl.col(feature).shift(n)
    change = (current - previous) / (current + previous)
    return df.with_columns(
        change.fill_nan(0).fill_null(0).alias(f'al.{feature}_ret_{n*60}s')
    )

def gf_alpha_sma(df, feature, lb):
    """Add `al.{feature}_sma_{lb}`: symmetric distance of `feature` from its rolling mean.

    The value is (mean - x) / (mean + x), with NaN and null replaced by 0. The
    rolling statistics from `gf_SM_A_M` are attached as a side product.

    Args:
        df: polars frame with `time` and `feature` columns.
        feature: column to compare with its rolling mean.
        lb: rolling period.

    Returns:
        The frame with the rolling statistics and the alpha column attached.
    """
    with_stats = gf_SM_A_M(df, feature, lb)
    rolling_mean = pl.col(f'bl.{feature}_mean_{lb}')
    value = pl.col(feature)
    distance = (rolling_mean - value) / (rolling_mean + value)
    return with_stats.with_columns(
        distance.fill_nan(0).fill_null(0).alias(f'al.{feature}_sma_{lb}')
    )


def calc_slope(x):
    """Return the slope of the least-squares line through `x` plotted against 0..len(x)-1.

    Args:
        x: sequence of values.

    Returns:
        The leading coefficient of the degree-1 `np.polyfit` fit.
    """
    positions = range(len(x))
    coefficients = np.polyfit(positions, x, 1)
    return coefficients[0]

def gf_time(df):
    """Add the hour and weekday of `time` as `bl.timeofhour` and `bl.timeofweek`.

    Args:
        df: polars frame with a temporal `time` column.

    Returns:
        The frame with both calendar columns attached.
    """
    timestamp = pl.col("time").dt
    hour_of_day = timestamp.hour().alias("bl.timeofhour")
    day_of_week = timestamp.weekday().alias("bl.timeofweek")
    return df.with_columns([hour_of_day, day_of_week])


def gf_alpha_shadow(df, lb):
    """Add candle-shape columns computed over a rolling window of length `lb`.

    Step 1 aggregates open (first), high (max), low (min), close (last) and
    volume (sum) over the window. Step 2 derives shadow and range columns
    from those aggregates. Step 3 adds two alphas:
    `al.shadow_pressure_{lb}` = (upper - lower shadow) / (upper + lower shadow)
    and `al.from_high_low_{lb}` = (high_close - close_low) / close, where the
    divisor is the row's own `close` column. NaNs in the alphas become 0.

    Args:
        df: polars frame with `time`, `open`, `high`, `low`, `close`, `volume`.
        lb: rolling period passed to `DataFrame.rolling`.

    Returns:
        The frame with all aggregate, derived and alpha columns attached.
    """
    open_name = f'bl.open_{lb}'
    high_name = f'bl.high_{lb}'
    low_name = f'bl.low_{lb}'
    close_name = f'bl.close_{lb}'

    window_candle = df.rolling(index_column="time", period=lb).agg(
        [
            pl.col('open').first().alias(open_name),
            pl.col('high').max().alias(high_name),
            pl.col('low').min().alias(low_name),
            pl.col('close').last().alias(close_name),
            pl.col('volume').sum().alias(f'bl.volume_{lb}'),
        ]
    )
    df = df.with_columns(window_candle)

    w_open = pl.col(open_name)
    w_high = pl.col(high_name)
    w_low = pl.col(low_name)
    w_close = pl.col(close_name)
    df = df.with_columns(
        [
            (w_high - pl.max_horizontal(w_close, w_open)).alias(f'bl.upper_shadow_{lb}'),
            (pl.min_horizontal(w_close, w_open) - w_low).alias(f'bl.lower_shadow_{lb}'),
            (w_high - w_close).alias(f'bl.high_close_{lb}'),
            (w_close - w_low).alias(f'bl.close_low_{lb}'),
            (w_high - w_low).alias(f'bl.high_low_{lb}'),
            (w_close - w_open).alias(f'bl.close_open_{lb}'),
        ]
    )

    upper_shadow = pl.col(f'bl.upper_shadow_{lb}')
    lower_shadow = pl.col(f'bl.lower_shadow_{lb}')
    shadow_pressure = (upper_shadow - lower_shadow) / (upper_shadow + lower_shadow)
    from_high_low = (pl.col(f'bl.high_close_{lb}') - pl.col(f'bl.close_low_{lb}')) / pl.col('close')
    df = df.with_columns(
        [
            shadow_pressure.fill_nan(0).alias(f'al.shadow_pressure_{lb}'),
            from_high_low.fill_nan(0).alias(f'al.from_high_low_{lb}'),
        ]
    )

    return df

# Feature Engineering
def get_features(df):
    """Sort by `time` and attach every `al.*` and `bl.*` feature column.

    Time-based features use the periods '1m', '5m', '15m', '30m', '1h', '3h';
    row-based features use the look-backs 1, 5, 15, 30, 60, 180.

    Args:
        df: polars frame with `time`, `open`, `high`, `low`, `close`, `volume`.

    Returns:
        The sorted frame with all feature columns attached.
    """
    time_windows = ['1m', '5m', '15m', '30m', '1h', '3h']
    row_windows = [1, 5, 15, 30, 60, 180]
    price_and_volume = ('close', 'volume')

    df = df.sort('time')

    # Alpha features on time-based windows.
    for period in time_windows:
        df = gf_alpha_shadow(df, period)
        df = gf_alpha_RSI(df, 'close', period)
        df = gf_alpha_sma(df, 'close', period)

    # Alpha features on row-based look-backs.
    for rows in row_windows:
        for column in price_and_volume:
            df = gf_alpha_back_ret(df, column, rows)

    df = gf_time(df)
    df = gf_MACD(df, 'close')

    # Baseline rolling statistics and bands.
    for period in time_windows:
        for column in price_and_volume:
            df = gf_SM_A_M(df, column, period)
        df = gf_Bollinger_Bands(df, 'close', period, 2.5)

    # Baseline exponential moving averages.
    for rows in row_windows:
        for column in price_and_volume:
            df = gf_EMA(df, column, rows)

    return df




In [None]:
def weighted_correlation(a, b, weights):
    """Return the weighted Pearson correlation between `a` and `b`.

    All three inputs are flattened with `np.ravel` before use.

    Args:
        a: first array-like of values.
        b: second array-like of values.
        weights: array-like of per-element weights.

    Returns:
        Weighted covariance divided by the square root of the product of the
        weighted variances.
    """
    w = np.ravel(weights)
    a = np.ravel(a)
    b = np.ravel(b)
    sum_w = np.sum(w)

    def weighted_mean(values):
        # Weighted average of `values` under the weights `w`.
        return np.sum(values * w) / sum_w

    mean_a = weighted_mean(a)
    mean_b = weighted_mean(b)
    var_a = weighted_mean(np.square(a - mean_a))
    var_b = weighted_mean(np.square(b - mean_b))
    cov = weighted_mean(a * b) - mean_a * mean_b

    return cov / np.sqrt(var_a * var_b)

def total_corr(res, symbol_2_wight):
    """Return the weighted correlation between predictions and targets pooled over symbols.

    Every element of a symbol's series is given that symbol's weight, the
    per-symbol series are concatenated, and `weighted_correlation` is applied
    to the pooled arrays.

    The symbols are taken from `res` itself rather than from a notebook-level
    variable, so the function works on any result dict, including one that
    covers only part of the asset universe.

    Args:
        res: dict mapping symbol -> two-element container; index 0 is the
            prediction array and index 1 the target array.
        symbol_2_wight: dict mapping symbol -> scalar weight.

    Returns:
        The pooled weighted correlation.

    Raises:
        KeyError: if a symbol in `res` has no entry in `symbol_2_wight`.
    """
    predictions = []
    actuals = []
    weights = []
    for symbol, series in res.items():
        predictions.append(series[0])
        actuals.append(series[1])
        weights.append(np.ones(len(series[0])) * symbol_2_wight[symbol])

    return weighted_correlation(
        np.concatenate(predictions),
        np.concatenate(actuals),
        np.concatenate(weights),
    )

def get_augment_data(df, select_features, targets):
    """Return a copy of `df` with the listed feature and target columns negated.

    Columns are negated one at a time, features first and then targets, each
    keeping its original name.

    Args:
        df: polars frame containing the listed columns.
        select_features: feature column names to negate.
        targets: target column names to negate.

    Returns:
        The frame with the sign of those columns flipped.
    """
    for name in [*select_features, *targets]:
        df = df.with_columns((-pl.col(name)).alias(f'{name}'))
    return df
    
def fitting(train_df, test_df, target, model, data_select=True, data_augment=False):
    """Fit `model`, predict on both frames and evaluate the predictions per symbol.

    The model inputs are all columns of `train_df` whose name contains 'al.'.
    Predictions are stored in a `pred` column; for every symbol present in
    `test_df` the symbol is printed and `Tool.evaluation` is called on the
    train and on the test predictions.

    Args:
        train_df: polars frame used for fitting (sorted by `time` first).
        test_df: polars frame used for out-of-sample prediction.
        target: name of the column to regress on.
        model: estimator exposing `fit(X, y)` and `predict(X)`.
        data_select: currently has no effect; both settings select the same
            feature columns.
        data_augment: when True, the sign-flipped copy from `get_augment_data`
            (features, `target` and `{target}_clip` negated) is appended to
            the training data before fitting.

    Returns:
        tuple: `(res, test_dfs)` where `res[symbol]` is
        `[test predictions, test targets]` as numpy arrays and
        `test_dfs[symbol]` is the test frame of that symbol including `pred`.
    """
    train_df = train_df.sort('time')
    select_features = [name for name in train_df.columns if 'al.' in name]

    if data_augment:
        mirrored_df = get_augment_data(
            train_df, select_features, [target, f'{target}_clip'])
        fit_df = pl.concat([train_df, mirrored_df])
    else:
        fit_df = train_df

    model.fit(fit_df[select_features], fit_df[target])

    train_df = train_df.with_columns(pl.Series('pred', model.predict(train_df[select_features])))
    test_df = test_df.with_columns(pl.Series('pred', model.predict(test_df[select_features])))

    res = {}
    test_dfs = {}
    for symbol in test_df['symbol'].unique().sort():
        is_symbol = pl.col('symbol') == symbol
        symbol_train = train_df.filter(is_symbol)
        symbol_test = test_df.filter(is_symbol)

        print(symbol)
        # In-sample evaluation first, out-of-sample second.
        Tool.evaluation(symbol_train['pred'].to_numpy(), symbol_train[target].to_numpy())
        Tool.evaluation(symbol_test['pred'].to_numpy(), symbol_test[target].to_numpy())

        res[symbol] = [symbol_test['pred'].to_numpy(), symbol_test[target].to_numpy()]
        test_dfs[symbol] = symbol_test
    return res, test_dfs
        
def process_df(df, target):
    """Drop unusable rows, then attach the features and the clipped target.

    Rows are removed when `target` is null or NaN, or when `close` or `time`
    is null.

    Args:
        df: polars frame with `time`, candle columns and `target`.
        target: name of the target column.

    Returns:
        The filtered frame with the columns from `get_features` and
        `get_clip_target` attached.
    """
    usable = (
        df.filter(pl.col(target).is_not_null())
        .filter(pl.col(target).is_not_nan())
        .filter(pl.col('close').is_not_null())
        .filter(pl.col('time').is_not_null())
    )
    with_features = get_features(usable)
    return get_clip_target(with_features, target)
    
def select_data(df, start_time, end_time):
    """Return the rows whose `time` lies in the half-open interval [start_time, end_time).

    Args:
        df: polars frame with a `time` column.
        start_time: inclusive lower bound.
        end_time: exclusive upper bound.

    Returns:
        The filtered frame.
    """
    at_or_after_start = pl.col('time') >= start_time
    before_end = pl.col('time') < end_time
    return df.filter(at_or_after_start & before_end)


def get_kaggle_target(dfs, asset_df):
    """Attach a market-residual target `RE_Target` to every asset frame.

    `m` is the weight-normalised sum over assets of weight * `ret_900s`, with
    NaNs treated as 0. For every asset, `beta` is the ratio of the rolling sums
    (window 3750) of `m * ret_900s` and `m * m`, and `RE_Target` is
    `ret_900s - beta * m`. The columns `m`, `rm`, `mm` and `beta` are kept.

    Args:
        dfs: dict symbol -> polars frame with a `ret_900s` column. The market
            array is sized from `dfs['BTC']`; the entries are replaced in place.
        asset_df: frame with `Symbol` and `Weight` columns.

    Returns:
        The same `dfs` dict, with the updated frames.
    """
    symbol_2_wight = dict(zip(asset_df['Symbol'], asset_df['Weight']))
    total_weight = asset_df['Weight'].sum()
    window = 3750

    # Pass 1: accumulate the weighted market return.
    m = np.zeros(dfs['BTC'].shape[0])
    for symbol in tqdm(symbol_2_wight):
        w = symbol_2_wight[symbol]
        weighted_ret = dfs[symbol].with_columns((pl.col('ret_900s') * w).alias('m'))['m']
        m = m + np.nan_to_num(weighted_ret.to_numpy()) / total_weight

    # Pass 2: regress each asset on the market and keep the residual.
    for symbol in tqdm(symbol_2_wight):
        rolling_rm = pl.col("rm").rolling_sum(window_size=window)
        rolling_mm = pl.col("mm").rolling_sum(window_size=window)
        df = (
            dfs[symbol]
            .with_columns(pl.lit(m).alias('m'))
            .with_columns(pl.col('m').fill_null(0).fill_nan(0).alias('m'))
            .with_columns((pl.col('m') * pl.col('ret_900s')).fill_nan(0).fill_null(0).alias('rm'))
            .with_columns((pl.col('m') * pl.col('m')).alias('mm'))
            .with_columns((rolling_rm / rolling_mm).alias('beta'))
            .with_columns((pl.col('ret_900s') - pl.col('beta') * pl.col('m')).alias('RE_Target'))
        )
        dfs[symbol] = df
        print(df.shape)
    return dfs

In [None]:
# Load the per-asset Kaggle frames and the asset details table.
kaggle_dfs, asset_df = load_kaggle_kline()

# Lookup tables derived from the asset details.
asset_2_symbol = dict(zip(asset_df['Asset_ID'], asset_df['Symbol']))
symbol_2_wight = dict(zip(asset_df['Symbol'], asset_df['Weight']))

# Sorted array of asset symbols, used to drive the per-symbol loops below.
symbols = np.sort(asset_df['Symbol'].to_numpy())

# Attach the market-residual target columns to every asset frame.
kaggle_dfs = get_kaggle_target(kaggle_dfs, asset_df)

# Alternative data source (exchange candles loaded with load_kline), kept for reference:
# symbols = symbols[symbols != 'XMR'] 
# symbols = pl.read_csv("../panel/kline/symbols.csv")['symbol'].to_list()
# file_root = "..//panel/kline/"
# file_names = ["update_okex_202404", "update_okex_202403","update_okex_2022_2023", "update_okex_2021_2022", "update_okex_2020_2021"]
# dfs = load_kline(symbols[symbols != 'XMR'], file_root, file_names)

In [None]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# Build the feature frame of every symbol and keep it both by symbol and in
# one concatenated frame.
target = 'ret_900s'
kaggle_df_list = []
kaggle_df_dict = {}

for symbol in tqdm(symbols):
    # Work on a clone so the frames in kaggle_dfs are left untouched.
    kaggle_df = process_df(kaggle_dfs[symbol].clone(), target)
    kaggle_df_dict[symbol] = kaggle_df
    kaggle_df_list.append(kaggle_df)

kaggle_df = pl.concat(kaggle_df_list)


In [None]:
# Target column used for training and evaluation.
target = 'ret_900s'
# target = 'Target'
# target = 'RE_Target'
COMBINE = True
JOIN_FEATURE = True

select_df = kaggle_df.clone()
if JOIN_FEATURE:
    # Attach the 'al.' features of BTC and ETH to every row, matched on
    # timestamp and suffixed with the reference symbol.
    features = [name for name in kaggle_df.columns if 'al.' in name]
    for symbol in ['BTC', 'ETH']:
        suffixed_names = {name: f"{name}.{symbol}" for name in features}
        df = kaggle_df_dict[symbol][features + ['timestamp']].rename(suffixed_names)
        select_df = select_df.join(df, on='timestamp', how='left')

# Keep only the rows that have a usable target value.
select_df = select_df.filter(pl.col(target).is_not_nan() & pl.col(target).is_not_null())


In [None]:
# Single fixed split (no rolling): train on the first 3 years of data and
# test on everything after that.

COMBINE = True
JOIN_FEATURE = True

start_date = kaggle_df['time'].min()
end_date = kaggle_df['time'].max()

start_time = start_date
mid_time = start_date + relativedelta(years=3)
end_time = end_date
train_df = select_data(select_df, start_time, mid_time)
# The former bare `train_df.drop_nulls()` call was removed: polars returns a
# new frame and the result was discarded, so it only cost a full scan.
# It is deliberately not assigned back - that would drop every row that has a
# null in any column, which changes the training set.
test_df = select_data(select_df, mid_time, end_time)
model = LGBMRegressor(num_leaves=10, max_depth=5, n_estimators=100)
# model = Lasso(alpha=0.000001) 
# model = LinearRegression()
test_dfs = []
if(COMBINE):
    # One model fitted on all symbols together.
    res, tmp_dfs = fitting(train_df, test_df, target, model, data_select=True, data_augment=False)
else:
    # One fit per symbol; only that symbol's entry of the result is kept.
    res = {}
    for symbol in symbols:
        _, tmp_dfs = fitting(train_df.filter(pl.col("symbol") == symbol), test_df.filter(pl.col("symbol") == symbol), target, model, data_select=True, data_augment=True)
        res[symbol] = _[symbol]


corr = total_corr(res, symbol_2_wight)
print(f"TEST CORR: {corr:.4f}")



In [None]:
# Rolling evaluation: train on TRAIN_MONTH months, test on the following
# ROLLING_MONTH months, then slide every boundary forward and repeat.
from sklearn.linear_model import LinearRegression, Lasso
TRAIN_MONTH = 36
ROLLING_MONTH = 1
SKIP_MONTH = 36 - TRAIN_MONTH

start_date = select_df['time'].min()
end_date = select_df['time'].max()
start_time = start_date + relativedelta(months=SKIP_MONTH)
mid_time = start_time + relativedelta(months=TRAIN_MONTH)
end_time = mid_time + relativedelta(months=ROLLING_MONTH)
# Slide by the test-window length so consecutive test windows neither overlap
# nor leave gaps when ROLLING_MONTH is changed.
step = relativedelta(months=ROLLING_MONTH)

corrs = []
ress = []
test_dfs = []
while mid_time < end_date:
    print(start_time, mid_time, end_time)
    train_df = select_data(select_df, start_time, mid_time)
    # The former bare `train_df.drop_nulls()` call was removed: its result was
    # discarded, so it only cost a full scan of the frame in every window.
    test_df = select_data(select_df, mid_time, end_time)
    model = LGBMRegressor(num_leaves=10, max_depth=5, n_estimators=100)
    # model = LinearRegression()
    # model = Lasso(alpha=0.000001) 
    if(COMBINE):
        res, tmp_dfs = fitting(train_df, test_df, target, model, data_select=True, data_augment=False)
        test_dfs.append(tmp_dfs)
    else:
        res = {}
        for symbol in symbols:
            _, tmp_dfs = fitting(train_df.filter(pl.col("symbol") == symbol).fill_nan(0), test_df.filter(pl.col("symbol") == symbol).fill_nan(0), target, model, data_select=True, data_augment=False)
            res[symbol] = _[symbol]
    corr = total_corr(res, symbol_2_wight)
    ress.append(res)
    print(f"TEST CORR: {corr:.4f}")
    corrs.append(corr)
    start_time = start_time + step
    mid_time = mid_time + step
    end_time = min(end_time + step, end_date)

# Pool the per-window results of every symbol and score them together.
res = {}
for i in ress:
    for k, v in i.items():
        res.setdefault(k, []).append(v)
for k in res:
    # Each entry is [predictions, targets]; join the windows along the sample axis.
    res[k] = np.concatenate(res[k], axis=1)
corr = total_corr(res, symbol_2_wight)
print(f"TEST CORR: {corr:.4f}")

In [None]:
# Print the correlation of every rolling window, then the correlation of all
# windows pooled together.
res = {}
for i in ress:
    r = dict(i.items())
    for k, v in r.items():
        res.setdefault(k, []).append(v)
    corr = total_corr(r, symbol_2_wight)
    print(f"TEST CORR: {corr:.4f}")

for k in res:
    # Each entry is [predictions, targets]; join the windows along the sample axis.
    res[k] = np.concatenate(res[k], axis=1)
corr = total_corr(res, symbol_2_wight)
print(f"TEST CORR: {corr:.4f}")

In [None]:
# Count, per asset, the rows that have a non-null `Target` and the number of
# 60000-unit timestamp steps between the first and last such row that have no row.
number = 0
number2 = 0
miss_data = {}
total_data = {}
for symbol in kaggle_dfs:
    df = kaggle_dfs[symbol].filter(pl.col('Target').is_not_null())
    present_rows = df.shape[0]
    expected_rows = (df['timestamp'].max() - df['timestamp'].min())//60000 + 1
    missing_rows = expected_rows - present_rows

    number += present_rows
    number2 += missing_rows
    miss_data[symbol] = missing_rows
    total_data[symbol] = present_rows
print(number, number2 / number)

import matplotlib.pyplot as plt

data_counts = dict(total_data)
missing_counts = dict(miss_data)

plt.figure(figsize=(10, 6))

# Stacked bars: present rows at the bottom, missing rows on top.
plt.bar(data_counts.keys(), data_counts.values(), label='Data Count')
plt.bar(missing_counts.keys(), missing_counts.values(), bottom=list(data_counts.values()), label='Missing Count')

plt.title('Data and Missing Counts for Each Asset')
plt.xlabel('Asset')
plt.ylabel('Count')

plt.legend()

plt.show()

btc_after_warmup = kaggle_dfs['BTC'][1000:]
Tool.show_single_feature(btc_after_warmup, '', [], row_peroid=btc_after_warmup.shape[0], point_period=30*24*60)# output_path=f"./output/BTC.html")