## Ensure stationarity of explanatory variables and merge with targets

In [1]:
import numpy as np
import pandas as pd
import warnings

from functions import unit_root_testing, HeskedTesting
# Ẩn tất cả cảnh báo RuntimeWarning
warnings.filterwarnings("ignore", category=RuntimeWarning)


In [2]:
btc_data = pd.read_parquet('btc_numeric_data.parquet.gzip')
eth_data = pd.read_parquet('eth_numeric_data.parquet.gzip')

### Check stationarity

#### Perform heteroskedasticity tests

In [3]:
btc_data_nonan = btc_data.dropna(axis=1, how='all')  # bỏ cột toàn NaN
HeskedTesting.run_all_tests(btc_data_nonan, conf=0.01, tabsize=58)

Results of White, Breusch-Pagan and Goldfeld-Quandt tests by column (p-values):

btc_total_volume --                                        White: [31m0.0000[0m,                                  Breusch-Pagan: [31m0.0000[0m,                           Goldfeld-Quandt: [31m0.0000[0m
btc_price_close --                                         White: [31m0.0000[0m,                                  Breusch-Pagan: [31m0.0000[0m,                           Goldfeld-Quandt: [31m0.0000[0m
btc_ETH_volumefrom --                                      White: [31m0.0000[0m,                                  Breusch-Pagan: [32m0.5173[0m,                           Goldfeld-Quandt: [32m0.0553[0m
btc_ETH_volumeto --                                        White: [31m0.0000[0m,                                  Breusch-Pagan: [32m0.8011[0m,                           Goldfeld-Quandt: [31m0.0000[0m
btc_USD_volumefrom --                                      White: [31m0.0000[0m,         

In [4]:
eth_data_nonan = eth_data.dropna(axis=1, how='all')  # bỏ cột toàn NaN
HeskedTesting.run_all_tests(eth_data_nonan, conf=0.01, tabsize=45)

Results of White, Breusch-Pagan and Goldfeld-Quandt tests by column (p-values):

eth_total_volume --                           White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0000[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_price_close --                            White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0000[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_BTC_volumefrom --                         White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0000[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_BTC_volumeto --                           White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0000[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_USD_volumefrom --                         White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0000[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_USD_volumeto --                           White: [31m0.0000[0m,          

#### Perform unit root tests

In [5]:
btc_data_nonan = btc_data.dropna(axis=1, how='all')  # bỏ cột toàn NaN
unit_root_testing(btc_data_nonan, conf=0.01, tabsize=58)

Results of ADF, PP and KPSS tests by column (p-values):

btc_total_volume --                                        ADF: [31m0.2022[0m,                                    PP: [32m0.0000[0m,                                      KPSS: [31m0.0001[0m
btc_price_close --                                         ADF: [31m0.6915[0m,                                    PP: [31m0.9086[0m,                                      KPSS: [31m0.0001[0m
btc_ETH_volumefrom --                                      ADF: [31m0.0230[0m,                                    PP: [32m0.0000[0m,                                      KPSS: [31m0.0001[0m
btc_ETH_volumeto --                                        ADF: [31m0.0425[0m,                                    PP: [32m0.0000[0m,                                      KPSS: [31m0.0001[0m
btc_USD_volumefrom --                                      ADF: [32m0.0000[0m,                                    PP: [32m0.0000[0m,                       

In [6]:
unit_root_testing(eth_data_nonan, conf=0.01, tabsize=45)

Results of ADF, PP and KPSS tests by column (p-values):

eth_total_volume --                           ADF: [31m0.0287[0m,                       PP: [32m0.0000[0m,                         KPSS: [31m0.0001[0m
eth_price_close --                            ADF: [31m0.2479[0m,                       PP: [31m0.4208[0m,                         KPSS: [31m0.0001[0m
eth_BTC_volumefrom --                         ADF: [31m0.0436[0m,                       PP: [32m0.0000[0m,                         KPSS: [31m0.0001[0m
eth_BTC_volumeto --                           ADF: [32m0.0054[0m,                       PP: [32m0.0000[0m,                         KPSS: [31m0.0001[0m
eth_USD_volumefrom --                         ADF: [31m0.0223[0m,                       PP: [32m0.0000[0m,                         KPSS: [31m0.0001[0m
eth_USD_volumeto --                           ADF: [32m0.0022[0m,                       PP: [32m0.0000[0m,                         KPSS: [31m0.0001[0m
e

### Log difference non-stationary variables

In [7]:
btc_gtrends_vars = [
    'gtrends_bitcoin_relative_change',
    'gtrends_cryptocurrency_relative_change',
    'gtrends_blockchain_relative_change',
    'gtrends_investing_relative_change',
]

btc_stationary_vars = [
    'btc_indicator_PSAR_down',
    'btc_indicator_PSAR_up',
]

eth_gtrends_vars = [
    'gtrends_ethereum_relative_change',
    'gtrends_cryptocurrency_relative_change',
    'gtrends_blockchain_relative_change',
    'gtrends_investing_relative_change',
]

eth_stationary_vars = [
    'eth_indicator_PSAR_down',
    'eth_indicator_PSAR_up',
]

In [8]:
def log_difference_dataframe(df: pd.DataFrame, gtrends_vars: list, stationary_vars: list):
    ''' Returns dataframe where all non-stationary variables are differenced once. '''
    
    # create empty dataframe
    df_differenced = pd.DataFrame(index=df.index)

    # add first order log difference of all non-stationary variables suffixed with '_d'
    for column in [i for i in list(df.columns) if (i not in gtrends_vars and i not in stationary_vars)]:
        with np.errstate(divide='ignore', invalid='ignore'):
            df_differenced[column + '_d'] = np.diff(np.log(df[column] + 0.01), prepend=float('nan'))

    # add log of Google Trends variables, since they're already represented by a relative difference
    for column in gtrends_vars:
        with np.errstate(divide='ignore', invalid='ignore'):
            df_differenced[column + '_d'] = np.log(1 + (df[column] / 100))
            
    # add variables that are already stationary in unchanged form
    for column in stationary_vars:
        df_differenced[column] = df[column]

    return df_differenced

In [9]:

btc_differenced_data = log_difference_dataframe(btc_data, btc_gtrends_vars, btc_stationary_vars)
eth_differenced_data = log_difference_dataframe(eth_data, eth_gtrends_vars, eth_stationary_vars)

  df_differenced[column] = df[column]
  df_differenced[column] = df[column]


### Re-check stationarity

#### Perform heteroskedasticity tests

In [10]:
btc_differenced_data_nonan = btc_differenced_data.dropna(axis=1, how='all')  # bỏ cột toàn NaN
HeskedTesting.run_all_tests(btc_differenced_data_nonan, conf=0.01, tabsize=60)

Results of White, Breusch-Pagan and Goldfeld-Quandt tests by column (p-values):

btc_total_volume_d --                                        White: [31m0.0000[0m,                                    Breusch-Pagan: [32m0.0825[0m,                             Goldfeld-Quandt: [31m0.0002[0m
btc_price_close_d --                                         White: [31m0.0000[0m,                                    Breusch-Pagan: [31m0.0000[0m,                             Goldfeld-Quandt: [31m0.0000[0m
btc_ETH_volumefrom_d --                                      White: [32m0.0432[0m,                                    Breusch-Pagan: [32m0.0127[0m,                             Goldfeld-Quandt: [31m0.0005[0m
btc_ETH_volumeto_d --                                        White: [32m0.6378[0m,                                    Breusch-Pagan: [32m0.3797[0m,                             Goldfeld-Quandt: [31m0.0030[0m
btc_USD_volumefrom_d --                                      White:

In [11]:
eth_differenced_data_nonan = eth_differenced_data.dropna(axis=1, how='all')  # bỏ cột toàn NaN
HeskedTesting.run_all_tests(eth_differenced_data_nonan, conf=0.01, tabsize=45)

Results of White, Breusch-Pagan and Goldfeld-Quandt tests by column (p-values):

eth_total_volume_d --                         White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0002[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_price_close_d --                          White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0000[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_BTC_volumefrom_d --                       White: [31m0.0000[0m,                     Breusch-Pagan: [32m0.0162[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_BTC_volumeto_d --                         White: [31m0.0000[0m,                     Breusch-Pagan: [32m0.0312[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_USD_volumefrom_d --                       White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0018[0m,              Goldfeld-Quandt: [32m0.5606[0m
eth_USD_volumeto_d --                         White: [31m0.0000[0m,          

#### Perform unit root tests

In [12]:

unit_root_testing(btc_differenced_data_nonan, conf=0.01, tabsize=60)

Results of ADF, PP and KPSS tests by column (p-values):

btc_total_volume_d --                                        ADF: [32m0.0000[0m,                                      PP: [32m0.0000[0m,                                        KPSS: [36mInfeasibleTestException[0m
btc_price_close_d --                                         ADF: [32m0.0000[0m,                                      PP: [32m0.0000[0m,                                        KPSS: [32m0.2532[0m
btc_ETH_volumefrom_d --                                      ADF: [32m0.0000[0m,                                      PP: [32m0.0000[0m,                                        KPSS: [32m0.2725[0m
btc_ETH_volumeto_d --                                        ADF: [32m0.0000[0m,                                      PP: [32m0.0000[0m,                                        KPSS: [32m0.3887[0m
btc_USD_volumefrom_d --                                      ADF: [32m0.0000[0m,                                    

In [13]:
unit_root_testing(eth_differenced_data_nonan, conf=0.01, tabsize=45)

Results of ADF, PP and KPSS tests by column (p-values):

eth_total_volume_d --                         ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [36mInfeasibleTestException[0m
eth_price_close_d --                          ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [32m0.0120[0m
eth_BTC_volumefrom_d --                       ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [36mInfeasibleTestException[0m
eth_BTC_volumeto_d --                         ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [36mInfeasibleTestException[0m
eth_USD_volumefrom_d --                       ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [36mInfeasibleTestException[0m
eth_USD_volumeto_d --                         ADF: [32m0.0000[0m,                       

### Re-re-check stationarity

#### Perform heteroskedasticity tests

In [20]:
btc_differenced_data_nonan = btc_differenced_data.dropna(axis=1, how='all')  # bỏ cột toàn NaN
HeskedTesting.run_all_tests(btc_differenced_data_nonan, conf=0.01, tabsize=60)

Results of White, Breusch-Pagan and Goldfeld-Quandt tests by column (p-values):

btc_total_volume_d --                                        White: [31m0.0000[0m,                                    Breusch-Pagan: [32m0.0825[0m,                             Goldfeld-Quandt: [31m0.0002[0m
btc_price_close_d --                                         White: [31m0.0000[0m,                                    Breusch-Pagan: [31m0.0000[0m,                             Goldfeld-Quandt: [31m0.0000[0m
btc_ETH_volumefrom_d --                                      White: [32m0.0432[0m,                                    Breusch-Pagan: [32m0.0127[0m,                             Goldfeld-Quandt: [31m0.0005[0m
btc_ETH_volumeto_d --                                        White: [32m0.6378[0m,                                    Breusch-Pagan: [32m0.3797[0m,                             Goldfeld-Quandt: [31m0.0030[0m
btc_USD_volumefrom_d --                                      White:

In [21]:
eth_differenced_data_nonan = eth_differenced_data.dropna(axis=1, how='all')  # bỏ cột toàn NaN
HeskedTesting.run_all_tests(eth_differenced_data_nonan, conf=0.01, tabsize=45)

Results of White, Breusch-Pagan and Goldfeld-Quandt tests by column (p-values):

eth_total_volume_d --                         White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0002[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_price_close_d --                          White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0000[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_BTC_volumefrom_d --                       White: [31m0.0000[0m,                     Breusch-Pagan: [32m0.0162[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_BTC_volumeto_d --                         White: [31m0.0000[0m,                     Breusch-Pagan: [32m0.0312[0m,              Goldfeld-Quandt: [31m0.0000[0m
eth_USD_volumefrom_d --                       White: [31m0.0000[0m,                     Breusch-Pagan: [31m0.0018[0m,              Goldfeld-Quandt: [32m0.5606[0m
eth_USD_volumeto_d --                         White: [31m0.0000[0m,          

#### Perform unit root tests

In [22]:
unit_root_testing(btc_differenced_data, conf=0.01, tabsize=60)

Results of ADF, PP and KPSS tests by column (p-values):

btc_total_volume_d --                                        ADF: [32m0.0000[0m,                                      PP: [32m0.0000[0m,                                        KPSS: [36mInfeasibleTestException[0m
btc_price_close_d --                                         ADF: [32m0.0000[0m,                                      PP: [32m0.0000[0m,                                        KPSS: [32m0.2532[0m
btc_ETH_volumefrom_d --                                      ADF: [32m0.0000[0m,                                      PP: [32m0.0000[0m,                                        KPSS: [32m0.2725[0m
btc_ETH_volumeto_d --                                        ADF: [32m0.0000[0m,                                      PP: [32m0.0000[0m,                                        KPSS: [32m0.3887[0m
btc_USD_volumefrom_d --                                      ADF: [32m0.0000[0m,                                    

In [23]:
unit_root_testing(eth_differenced_data, conf=0.01, tabsize=45)

Results of ADF, PP and KPSS tests by column (p-values):

eth_total_volume_d --                         ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [36mInfeasibleTestException[0m
eth_price_close_d --                          ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [32m0.0120[0m
eth_BTC_volumefrom_d --                       ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [36mInfeasibleTestException[0m
eth_BTC_volumeto_d --                         ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [36mInfeasibleTestException[0m
eth_USD_volumefrom_d --                       ADF: [32m0.0000[0m,                       PP: [32m0.0000[0m,                         KPSS: [36mInfeasibleTestException[0m
eth_USD_volumeto_d --                         ADF: [32m0.0000[0m,                       

### Remove specific outliers

In [26]:
btc_differenced_data.loc[1438905600, 'btc_ETH_volumefrom_d'] = float('nan')
btc_differenced_data.loc[1438905600, 'btc_ETH_volumeto_d'] = float('nan')
btc_differenced_data.loc[1315180800, 'btc_EUR_volumefrom_d'] = float('nan')
btc_differenced_data.loc[1315180800, 'btc_EUR_volumeto_d'] = float('nan')
btc_differenced_data.loc[1515715200, 'btc_exchange_Kraken_volumeto_d'] = float('nan')
btc_differenced_data.loc[1515715200, 'btc_exchange_Kraken_volumefrom_d'] = float('nan')
btc_differenced_data.loc[1515715200, 'btc_exchange_Kraken_volumetotal_d'] = float('nan')
btc_differenced_data.loc[1515801600, 'btc_exchange_Kraken_volumeto_d'] = float('nan')
btc_differenced_data.loc[1515801600, 'btc_exchange_Kraken_volumefrom_d'] = float('nan')
btc_differenced_data.loc[1515801600, 'btc_exchange_Kraken_volumetotal_d'] = float('nan')
btc_differenced_data.loc[1464307200, 'btc_exchange_Coinbase_volumeto_d'] = float('nan')
btc_differenced_data.loc[1647734400, 'btc_exchange_BTSE_volumeto_d'] = float('nan')
btc_differenced_data.loc[1647734400, 'btc_exchange_BTSE_volumefrom_d'] = float('nan')
btc_differenced_data.loc[1647734400, 'btc_exchange_BTSE_volumetotal_d'] = float('nan')
btc_differenced_data.loc[1518048000, 'btc_exchange_Binance_volumeto_d'] = float('nan')
btc_differenced_data.loc[1518048000, 'btc_exchange_Binance_volumefrom_d'] = float('nan')
btc_differenced_data.loc[1518048000, 'btc_exchange_Binance_volumetotal_d'] = float('nan')
btc_differenced_data.loc[1518134400, 'btc_exchange_Binance_volumeto_d'] = float('nan')
btc_differenced_data.loc[1518134400, 'btc_exchange_Binance_volumefrom_d'] = float('nan')
btc_differenced_data.loc[1518134400, 'btc_exchange_Binance_volumetotal_d'] = float('nan')
btc_differenced_data.loc[1502928000, 'btc_balance_distribution_from_100000.0_totalVolume_d'] = float('nan')
btc_differenced_data.loc[1503014400, 'btc_balance_distribution_from_100000.0_totalVolume_d'] = float('nan')
btc_differenced_data.loc[1502928000, 'btc_balance_distribution_from_100000.0_addressesCount_d'] = float('nan')
btc_differenced_data.loc[1503014400, 'btc_balance_distribution_from_100000.0_addressesCount_d'] = float('nan')

In [None]:
eth_differenced_data.loc[1666051200, 'eth_staking_rate_d2'] = float('nan')
eth_differenced_data.loc[1635379200, 'eth_staking_rate_d2'] = float('nan')
eth_differenced_data.loc[1512518400, 'eth_total_volume_d'] = float('nan')
eth_differenced_data.loc[1512604800, 'eth_total_volume_d'] = float('nan')
eth_differenced_data.loc[1498694400, 'eth_exchange_Bitfinex_volumeto_d'] = float('nan')
eth_differenced_data.loc[1511654400, 'eth_exchange_Bitfinex_volumefrom_d'] = float('nan')
eth_differenced_data.loc[1511740800, 'eth_exchange_Bitfinex_volumefrom_d'] = float('nan')
eth_differenced_data.loc[1457913600, 'eth_exchange_Bitfinex_volumetotal_d'] = float('nan')
eth_differenced_data.loc[1511654400, 'eth_exchange_Bitfinex_volumetotal_d'] = float('nan')
eth_differenced_data.loc[1511740800, 'eth_exchange_Bitfinex_volumetotal_d'] = float('nan')
eth_differenced_data.loc[1469577600, 'eth_exchange_Kraken_volumeto_d'] = float('nan')
eth_differenced_data.loc[1515801600, 'eth_exchange_Kraken_volumeto_d'] = float('nan')
eth_differenced_data.loc[1515801600, 'eth_exchange_Kraken_volumefrom_d'] = float('nan')
eth_differenced_data.loc[1515801600, 'eth_exchange_Kraken_volumetotal_d'] = float('nan')
eth_differenced_data.loc[1515715200, 'eth_exchange_Kraken_volumeto_d'] = float('nan')
eth_differenced_data.loc[1515715200, 'eth_exchange_Kraken_volumefrom_d'] = float('nan')
eth_differenced_data.loc[1515715200, 'eth_exchange_Kraken_volumetotal_d'] = float('nan')
eth_differenced_data.loc[1557532800, 'eth_exchange_Coinbase_volumeto_d'] = float('nan')
eth_differenced_data.loc[1678924800, 'eth_exchange_BTSE_volumeto_d'] = float('nan')
eth_differenced_data.loc[1678924800, 'eth_exchange_BTSE_volumefrom_d'] = float('nan')
eth_differenced_data.loc[1678924800, 'eth_exchange_BTSE_volumetotal_d'] = float('nan')
eth_differenced_data.loc[1647734400, 'eth_exchange_BTSE_volumefrom_d'] = float('nan')
eth_differenced_data.loc[1647734400, 'eth_exchange_BTSE_volumetotal_d'] = float('nan')
eth_differenced_data.loc[1501113600, 'eth_exchange_Binance_volumeto_d'] = float('nan')
eth_differenced_data.loc[1518134400, 'eth_exchange_Binance_volumeto_d'] = float('nan')
eth_differenced_data.loc[1518134400, 'eth_exchange_Binance_volumefrom_d'] = float('nan')
eth_differenced_data.loc[1518134400, 'eth_exchange_Binance_volumetotal_d'] = float('nan')
eth_differenced_data.loc[1518048000, 'eth_exchange_Binance_volumeto_d'] = float('nan')
eth_differenced_data.loc[1518048000, 'eth_exchange_Binance_volumefrom_d'] = float('nan')
eth_differenced_data.loc[1518048000, 'eth_exchange_Binance_volumetotal_d'] = float('nan')
eth_differenced_data.loc[1476921600, 'eth_new_addresses_d'] = float('nan')
eth_differenced_data.loc[1476230400, 'eth_new_addresses_d'] = float('nan')
eth_differenced_data.loc[1476921600, 'eth_active_addresses_d'] = float('nan')
eth_differenced_data.loc[1476230400, 'eth_active_addresses_d'] = float('nan')
eth_differenced_data.loc[1479945600, 'eth_active_addresses_d'] = float('nan')
eth_differenced_data.loc[1480550400, 'eth_active_addresses_d'] = float('nan')

### Limit timeframe to where all data is available

In [27]:
btc_differenced_data = btc_differenced_data.loc[1314662400:1763596800]
eth_differenced_data = eth_differenced_data.loc[1445472000:1763596800]

### Save to parquet

In [28]:
btc_differenced_data.to_parquet('btc_numeric_stationary_data.parquet.gzip',
                                compression='gzip')
eth_differenced_data.to_parquet('eth_numeric_stationary_data.parquet.gzip',
                                compression='gzip')