In [1]:
from binance_historical_data import BinanceDataDumper
import pandas as pd
import pathlib
from tqdm import tqdm


In [3]:
data_dumper = BinanceDataDumper(
    path_dir_where_to_dump = '../data/',
    asset_class = 'spot',
    data_type = 'klines',
    data_frequency = '1m'
)
x = data_dumper.dump_data(
    tickers = ['BTCUSDT'],
    date_start = None,
    date_end = None,
    is_to_update_existing = True,
    # tickers_to_exclude = ["UST"]
)

---> Found overall tickers: 2371
---> Filter to asked tickers: 1
------> Tickers left: 1
Download full data for 1 tickers: 
---> Data will be saved here: /Users/johnz/Library/CloudStorage/GoogleDrive-john23@berkeley.edu/My Drive/CryptoFutures/data/spot
---> Data Frequency: 1m
---> Start Date: 20170101
---> End Date: 20231201


Tickers:   0%|          | 0/1 [00:00<?, ?it/s]

monthly files to download:   0%|          | 0/76 [00:00<?, ?files/s]

daily files to download: 0files [00:00, ?files/s]

Tried to dump data for 1 tickers:
---> For BTCUSDT new data saved for: 75 months 0 days


In [2]:
def process_klines_data(columns, token_lists, time: str):
    """
    futures
    :param columns: for data columns
    :param token_lists: tokens to get data for
    :return:
    """
    path = pathlib.Path.cwd().parent / 'data' / "futures" / 'um' / 'monthly' / 'klines'
    for token in token_lists:
        try:
            files = path / token / f'{time}'
            csv_files = files.glob('*.csv')
            dfs = []
            for file in csv_files:
                df = pd.read_csv(file, index_col=None, names=columns)
                idx = df.index[df['open_time'] != 'open_time']
                df = df.loc[idx].reset_index(drop=True)
                for col in columns:
                    df[col] = df[col].astype(float)
                dfs.append(df)
            df_all = pd.concat(dfs, axis=0, ignore_index=True)
        except Exception as e:
            print(f'Error for {token}: {e}')
    return df_all

In [3]:
def process_klines_data_spot(columns, token : str, time: str):
    """
    futures
    :param columns: for data columns
    :param token: single token to get data for (ie: BTCUSDT)
    :return:
    """
    monthly_path = pathlib.Path.cwd().parent / 'data' / "spot" / 'monthly' / 'klines'
    daily_path = pathlib.Path.cwd().parent / 'data' / "spot" / 'daily' / 'klines'
    all_paths = [monthly_path, daily_path]
    # for token in token_lists:
    df_final = []
    for path in all_paths:
            try:
                files = path / token / f'{time}'
                csv_files = files.glob('*.csv')
                dfs = []
                for file in tqdm(csv_files):
                    df = pd.read_csv(file, index_col=None, names=columns)
                    idx = df.index[df['open_time'] == 'open_time']
                    if idx.shape[0] > 0:
                        print(f'Number of incorrect rows: {idx.shape} for {file}')
                    # df = df.loc[idx].reset_index(drop=True)
                    for col in columns:
                        df[col] = df[col].astype(float)
                    dfs.append(df)
                print(len(dfs))
                df_all = pd.concat(dfs, axis=0, ignore_index=True)
                df_final.append(df_all)
            except Exception as e:
                print(f'Error for {token}: {e}')
    df_final = pd.concat(df_final, axis = 0)
    df_final = df_final.drop_duplicates(subset = ['open_time'])
    return df_final

In [4]:
# columns = ['open_time', 'open', 'high', 'low', 'close', 'volume',
#            'close_time', 'quote_asset_volume', 'number_of_trades',
#            'taker_buy_base_volume', 'taker_buy_quote_asset_volume', 'ignore']
#
# df_all = process_klines_data_spot(columns, 'BTCUSDT')

In [5]:
from ta import add_all_ta_features
def load_klines_data(universe, columns, save = True, time = '1m'):
    dfs = {}
    sample = {}
    for token in tqdm(universe):
        df = process_klines_data_spot(columns, token, time)
        df['open_time'] = pd.to_datetime(df['open_time'], unit = 'ms')
        df['close_time'] = pd.to_datetime(df['close_time'], unit = 'ms')
        df['token'] = token
        df = df.sort_values(by = 'open_time', ignore_index = True)
        df = add_all_ta_features(df, open = 'open', high = 'high', low = 'low',
                                 close = 'close', volume = 'volume', fillna = True)
        if save:
            df.to_feather(f'../data/processed_data/{token}_{time}_spot.feather')
        dfs[token] = df
    return dfs

In [6]:
import pandas as pd
import pathlib
import numpy as np
# from factor_util import *
import pandas as pd
import pathlib
# from factor_util import *
from joblib import dump, load
import bittensor as bt

def hullMA(x, n = 50):
    sma1 = x.rolling(n,  min_periods = 1).mean()
    sma2 = x.rolling(int(n/2),  min_periods = 1).mean()
    out = (2 * sma1 - sma2).rolling(int(np.sqrt(n)), min_periods = 1).mean()
    return x - out

def calculate_corr(df, ta_features = None, columns = None, groupby = True):
    if ta_features is None:
        skip_features = ['returns_5m', 'open_time', 'close_time', 'target_15m', 'ignore', 'token']
        features = [x for x in df.columns if x not in skip_features]
        ta_features = [x for x in df.columns if x not in skip_features and x not in columns]
    if groupby:
        tgt_corr = df.groupby(['token'])[ta_features + ['target_15m']].corr()
    else:
        tgt_corr = df[ta_features + ['target_15m']].corr()
    return tgt_corr

def calculate_vol_price_corr(df, windows = [5, 15, 30, 60, 120]):
    for window in windows:
        df[f'vol_price_corr_{window}'] = df['close'].rolling(window, min_periods = 1).corr(df['volume'])
    return df

def get_cols_for_corr(df, str_idx):
    return df.columns[df.columns.str.startswith(str_idx)].tolist()

def transform_time(df):
    day = 24 * 60
    hour_float = df['open_time'].dt.hour + df['open_time'].dt.minute/60
    df['sin_hour'] = np.sin(2.0 * np.pi * hour_float/24)
    df['cos_hour'] = np.cos(2.0 * np.pi * hour_float/24)
    df['Day_sin'] = np.sin(df['open_time'].dt.day * (2 * np.pi / 31))
    df['Day_cos'] = np.cos(df['open_time'].dt.day * (2 * np.pi / 31))
    df['month_sin'] = np.sin(df['open_time'].dt.month * (2 * np.pi / 12))
    df['month_cos'] = np.cos(df['open_time'].dt.month * (2 * np.pi / 12))
    return df

def calc_sma_diff_test(close, timeperiod_short, timeperiod_long):
    res_short = close.rolling(window = timeperiod_short, min_periods = 1).mean()
    res_long = close.rolling(window = timeperiod_long, min_periods = 1).mean()
    res = (res_long - res_short) / res_long
    return res

def load_metrics_data(ticker):
    df_metrics = pd.read_feather(f'../data/processed_metrics/{ticker}_1m.feather')
    df_metrics['create_time'] = pd.to_datetime(df_metrics['create_time'], format = 'mixed')
    return df_metrics

def gen_cross_features(x, lag = 60):
    """
    calculate cross features with other assets if any
    :param x:
    :param lag:
    :return:
    """
    lag_arr = np.ones(lag)
    conv_arr = np.convolve(x, lag_arr / lag, mode = 'valid')
    app_arr = np.append(conv_arr, np.ones(lag - 1))
    roll_arr = np.roll(app_arr, lag - 1)
    div_arr = np.log(x / roll_arr)
    return div_arr

def log_return_np(x):
    return np.log(x / x.shift(60)).fillna(0)


def generate_features(df, time : int):
    directory = '../data/processed_data/'

    # df = pd.concat([df_btc, df_eth], axis = 0, ignore_index = True)
    df = df.sort_values(by = ['open_time'], ignore_index = True)
    # calculate next 15min returns (ie: current open_time is 2020-01-01 00:00:00,
    # then return is from 2020-01-01 00:01:00 - 2020-01-01 00:16:00

    # bt.logging.debug(f'Computing sma')
    times = [(10, '10m'), (15, '15m'), (30, '30m'), (60, '60m'), (120, '120m'),
         (240, '240m'), (480, '480m'), (720, '720m'), (1440, '1440m'),
         (2880, '2880m')]
    # bt.logging.debug(f'Computing sma')
    lags = [60]
    for lag in lags:
        df[f'log_close/mean_{lag}'] = gen_cross_features(df['close'], lag=lag)
        df[f'log_return_{lag}'] = log_return_np(df['close'])
    df['mid_diff'] = (df['close'] - df['open']) / ((df['high'] - df['low']) + 0.001)

    sma_lags = [5, 15, 30, 60, 120, 240, 800]
    for sma_lag in sma_lags:
        df[f'sma{sma_lag}'] = (df['close'].rolling(sma_lag, min_periods=1).mean())
        df[f'sma{sma_lag}'] = (df[f'sma{sma_lag}'] / df['close']) - 1
        df[f'return{sma_lag}'] = df['close'].pct_change(sma_lag)
        df[f'volume_change_{sma_lag}'] = df['volume'].pct_change(sma_lag)

    hull_lags = [76, 240, 800]
    for hull_lag in hull_lags:
        df[f'hull_{hull_lag}'] = hullMA(df['close'], hull_lag)

    fibo_list = [55, 210, 340, 890, 3750]
    for num in fibo_list:
        df[f'log_return_{num}'] = np.log(df['close']).diff().rolling(num, min_periods=1).mean().ffill().bfill()

    df = transform_time(df)
    sma_diff_windows = [(12, 26), (12*4*4, 24*4*4), (12*4*4*4, 24*4*4*4), (12*4*4*4*4, 24*4*4*4*4), (12*4*4*4*4*4, 24*4*4*4*4*4)]

    for short_win, long_win in sma_diff_windows:
        df[f'sma_diff_{short_win}'] = calc_sma_diff_test(df['close'], int(short_win), int(long_win))

    df[f'sma_diff_vol_{12 * 4 * 4}'] = calc_sma_diff_test(df['volume'], 12 * 4 * 4, 24 * 4 * 4)

    return df

def generate_features_new(df, time : int):
    directory = '../data/processed_data/'

    # df = pd.concat([df_btc, df_eth], axis = 0, ignore_index = True)
    df = df.sort_values(by = ['open_time'], ignore_index = True)
    # calculate next 15min returns (ie: current open_time is 2020-01-01 00:00:00,
    # then return is from 2020-01-01 00:01:00 - 2020-01-01 00:16:00

    # bt.logging.debug(f'Computing sma')
    times = [(10, '10m'), (15, '15m'), (30, '30m'), (60, '60m'), (120, '120m'),
         (240, '240m'), (480, '480m'), (720, '720m'), (1440, '1440m'),
         (2880, '2880m')]
    # bt.logging.debug(f'Computing sma')
    lags = [(x // time, y) for x, y in times]
    for lag, label in lags:
        df[f'log_close/mean_{lag}'] = gen_cross_features(df['close'], lag=lag)
        df[f'log_return_{lag}'] = log_return_np(df['close'])
    df['mid_diff'] = (df['close'] - df['open']) / ((df['high'] - df['low']) + 0.001)

    for sma_lag, label in lags:
        df[f'sma{sma_lag}'] = (df['close'].rolling(sma_lag, min_periods=1).mean())
        df[f'sma{sma_lag}'] = (df[f'sma{sma_lag}'] / df['close']) - 1
        df[f'return{sma_lag}'] = df['close'].pct_change(sma_lag)
        df[f'volume_change_{sma_lag}'] = df['volume'].pct_change(sma_lag)

    for hull_lag, label in lags:
        df[f'hull_{hull_lag}'] = hullMA(df['close'], hull_lag)

    for num, label in lags:
        df[f'log_return_{num}'] = np.log(df['close']).diff().rolling(num, min_periods=1).mean().ffill().bfill()

    df = transform_time(df)
    sma_diff_windows = [(12, 26), (12*4*4, 24*4*4), (12*4*4*4, 24*4*4*4), (12*4*4*4*4, 24*4*4*4*4), (12*4*4*4*4*4, 24*4*4*4*4*4)]

    for short_win, long_win in sma_diff_windows:
        df[f'sma_diff_{short_win}'] = calc_sma_diff_test(df['close'], int(short_win), int(long_win))

    df[f'sma_diff_vol_{12 * 4 * 4}'] = calc_sma_diff_test(df['volume'], 12 * 4 * 4, 24 * 4 * 4)

    return df


In [17]:
universe = ['BTCUSDT']
columns_futures = ['open_time', 'open', 'high', 'low', 'close', 'volume',
           'close_time', 'quote_asset_volume', 'number_of_trades',
           'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume',
           'ignore']

columns_spot = ['open_time', 'open', 'high', 'low', 'close', 'volume',
           'close_time', 'quote_asset_volume', 'number_of_trades',
           'taker_buy_base_volume', 'taker_buy_quote_asset_volume', 'ignore']

path = pathlib.Path.cwd().parent / 'data' / 'processed_futures'
dfs = load_klines_data(universe, columns_spot, time = '1m')

  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
2it [00:00, 11.75it/s][A
4it [00:00, 13.75it/s][A
6it [00:00, 14.14it/s][A
8it [00:00, 14.04it/s][A
10it [00:00, 13.99it/s][A
12it [00:00, 14.05it/s][A
14it [00:01, 14.19it/s][A
16it [00:01, 14.53it/s][A
18it [00:01, 14.42it/s][A
20it [00:01, 14.60it/s][A
22it [00:01, 14.54it/s][A
24it [00:01, 15.03it/s][A
26it [00:01, 15.43it/s][A
28it [00:01, 15.32it/s][A
30it [00:02, 15.02it/s][A
32it [00:02, 15.13it/s][A
34it [00:02, 15.19it/s][A
37it [00:02, 16.53it/s][A
39it [00:02, 15.72it/s][A
41it [00:02, 15.32it/s][A
43it [00:02, 15.15it/s][A
45it [00:03, 15.25it/s][A
47it [00:03, 14.97it/s][A
49it [00:03, 15.01it/s][A
51it [00:03, 14.96it/s][A
53it [00:03, 14.69it/s][A
55it [00:03, 14.61it/s][A
57it [00:03, 14.88it/s][A
59it [00:03, 14.86it/s][A
61it [00:04, 15.07it/s][A
63it [00:04, 15.21it/s][A
65it [00:04, 15.42it/s][A
67it [00:04, 15.19it/s][A
69it [00:04, 15.30it/s][A
71it [00:04, 15.16it/s][A

75



0it [00:00, ?it/s][A
10it [00:00, 99.79it/s][A
29it [00:00, 101.69it/s][A


29


  if _pandas_api.is_sparse(col):
100%|██████████| 1/1 [12:21<00:00, 741.53s/it]


In [18]:
df_spot = dfs['BTCUSDT'].copy()

In [23]:
df_feature = generate_features(df_spot, time = 1)

In [30]:
features_to_remove = ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time',
                      'quote_asset_volume', 'ignore', 'taker_buy_base_volume', 'taker_buy_quote_asset_volume',
                      'token']

In [31]:
features = [x for x in df_feature.columns if x not in features_to_remove]

In [33]:
corrs = {}
freq = [(10, '10m'), (15, '15m'), (30, '30m'), (60, '60m'), (120, '120m'),
         (240, '240m'), (480, '480m')]
for time, label in tqdm(freq):
    df_feature[f'target_{label}'] = df_feature['close'].pct_change(-1)
    corr = df_feature[features + [f'target_{label}']].corr()[f'target_{label}']
    df_feature = df_feature.drop([f'target_{label}'], axis = 1)
    corrs[label] = corr.copy()

for k, v in corrs.items():
    v.to_csv(f'../output/feature_corr_1m/{k}_corr.csv')

100%|██████████| 7/7 [19:33<00:00, 167.59s/it]


In [43]:
corrs_all = pd.concat(list(corrs.values()), axis = 1)
labels = [f'target_{x[1]}' for x in freq]
keep_idx = [x for x in corrs_all.index if x not in labels]
mean_corr = corrs_all.mean(axis = 1).abs().loc[keep_idx].sort_values(ascending = False)
mean_corr.to_csv('../output/feature_corr_1m/mean_corr.csv')

In [27]:
df_feature.to_feather('../data/df_btc_with_features_1m_spot.feather')

  if _pandas_api.is_sparse(col):


# New Feature Generation

In [7]:
universe = ['BTCUSDT']
columns_futures = ['open_time', 'open', 'high', 'low', 'close', 'volume',
           'close_time', 'quote_asset_volume', 'number_of_trades',
           'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume',
           'ignore']

columns_spot = ['open_time', 'open', 'high', 'low', 'close', 'volume',
           'close_time', 'quote_asset_volume', 'number_of_trades',
           'taker_buy_base_volume', 'taker_buy_quote_asset_volume', 'ignore']

path = pathlib.Path.cwd().parent / 'data' / 'processed_futures'
dfs = load_klines_data(universe, columns_spot, time = '5m')

  0%|          | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
5it [00:00, 49.90it/s][A
11it [00:00, 53.43it/s][A
17it [00:00, 54.70it/s][A
23it [00:00, 56.69it/s][A
29it [00:00, 57.31it/s][A
35it [00:00, 57.90it/s][A
41it [00:00, 58.19it/s][A
48it [00:00, 60.56it/s][A
55it [00:00, 59.91it/s][A
61it [00:01, 59.67it/s][A
67it [00:01, 58.36it/s][A
75it [00:01, 58.19it/s][A


75



0it [00:00, ?it/s][A
29it [00:00, 189.19it/s][A


29


  if _pandas_api.is_sparse(col):
100%|██████████| 1/1 [02:30<00:00, 150.06s/it]


In [8]:
df_spot = dfs['BTCUSDT'].copy()

In [9]:
df_feature = generate_features_new(df_spot, time = 5)

In [10]:
features_to_remove = ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time',
                      'quote_asset_volume', 'ignore', 'taker_buy_base_volume', 'taker_buy_quote_asset_volume',
                      'token']
features = [x for x in df_feature.columns if x not in features_to_remove]

In [11]:
corrs = {}
freq = [(10, '10m'), (15, '15m'), (30, '30m'), (60, '60m'), (120, '120m'),
         (240, '240m'), (480, '480m')]

for time, label in tqdm(freq):
    df_feature[f'target_{label}'] = df_feature['close'].pct_change(-1)
    corr = df_feature[features + [f'target_{label}']].corr()[f'target_{label}']
    df_feature = df_feature.drop([f'target_{label}'], axis = 1)
    corrs[label] = corr.copy()

for k, v in corrs.items():
    v.to_csv(f'../output/feature_corr/{k}_corr_new_features.csv')

100%|██████████| 7/7 [04:02<00:00, 34.64s/it]


In [12]:
corrs_all = pd.concat(list(corrs.values()), axis = 1)
labels = [f'target_{x[1]}' for x in freq]
keep_idx = [x for x in corrs_all.index if x not in labels]
mean_corr = corrs_all.mean(axis = 1).abs().loc[keep_idx].sort_values(ascending = False)
mean_corr.to_csv('../output/feature_corr/mean_corr_new_features.csv')

In [17]:
mean_corr

sma3                0.059828
sma2                0.057268
others_dlr          0.056960
others_dr           0.056663
log_return_2        0.052969
                      ...   
Day_sin             0.000333
volume_obv          0.000241
month_sin           0.000201
volatility_kcp      0.000126
volume_change_48    0.000097
Length: 160, dtype: float64

In [13]:
df_feature.to_feather('../data/df_btc_with_features_5m_spot_new_features.feather')

  if _pandas_api.is_sparse(col):


In [3]:
from binance.spot import Spot
import time
from datetime import datetime, timedelta

exchange_key = 'HHwPdDKvCO3zxbLqQbjxcb0N2wDNDs2aD5aIz3M3GsuiIntsgfV0wWWIaqfnKriw'
end_time = round(time.time() * 1000)
end_datetime = datetime.fromtimestamp(end_time / 1000.0)


client = Spot(api_key = exchange_key)

In [5]:
btc_data = client.klines(symbol = 'BTCUSDT', interval = '5m', limit = 1000)

In [9]:
import pandas as pd
exchange_key = 'HHwPdDKvCO3zxbLqQbjxcb0N2wDNDs2aD5aIz3M3GsuiIntsgfV0wWWIaqfnKriw'
end_time = round(time.time() * 1000)
end_datetime = datetime.fromtimestamp(end_time / 1000.0)

client = Spot(api_key = exchange_key)

kline = client.klines("BTCUSDT", '5m', endTime = end_time, limit = 1000)

columns_spot = ['open_time', 'open', 'high', 'low', 'close', 'volume',
                'close_time', 'quote_asset_volume', 'number_of_trades',
                'taker_buy_base_volume', 'taker_buy_quote_asset_volume', 'ignore']

df = pd.DataFrame(kline, columns = columns_spot)
df['open_time'] = pd.to_datetime(df['open_time'], unit='ms')
df['close_time'] = pd.to_datetime(df['close_time'], unit='ms')

NameError: name 'pd' is not defined

In [11]:
df_prev = pd.read_feather('../data/df_btc_with_features_5m_spot.feather')

In [12]:
cols = pd.DataFrame(df_prev.columns)

In [13]:
cols

Unnamed: 0,0
0,open_time
1,open
2,high
3,low
4,close
...,...
174,target_420m
175,close_90lag
176,target_450m
177,close_96lag
