In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit
from OU import OU
import pickle
import talib

In [38]:
#loading data
bitcoin_futures_df = pd.read_csv('/Users/answer/Desktop/paper/DATA/bitcoin_futures_df.csv')
bitcoin_spot_df = pd.read_csv('/Users/answer/Desktop/paper/DATA/bitcoin_spot_df.csv')

In [39]:
bitcoin_futures_df.shape

(46790, 13)

In [40]:
bitcoin_spot_df.shape

(46790, 13)

In [41]:
feature_window = 5

In [42]:
#calculate EWMA index
def sma(prices, window):
    return prices.rolling(window).mean()[window:]

In [43]:
#calculate EWMA index
def ewma(prices, window):                   
    return pd.Series.ewm(prices, span=window).mean()[window:]

In [44]:
#calculate MFI index
def mfi(df, window):
    mf = (df['HIGH'] + df['LOW'] + df['CLOSE']) / 3
    i = 0
    pos_mf = [0]
    neg_mf = [0]
    
    # Find Positive Money FLOWs and Negative Money FLOWs
    while i < df.index[-1]:
        if mf[i + 1] > mf[i]:
            pos_mf.append(mf[i + 1] * df.loc[i + 1, 'VOLUME'])
            neg_mf.append(0)
        else:
            pos_mf.append(0)
            neg_mf.append(mf[i + 1] * df.loc[i + 1, 'VOLUME'])
        i = i + 1
        
    pos_mf = pd.Series(pos_mf)
    neg_mf = pd.Series(neg_mf)
    
    # Apply a cumulative sum over every window
    pos_sum = pd.Series(pos_mf.rolling(window).sum())
    neg_sum = pd.Series(neg_mf.rolling(window).sum())
    
    mfr = (window - pos_sum)/(window-neg_sum)
    mfi = abs(100 - (100 / (1+mfr)))
    
    return mfi[window:]

In [45]:
#calculate rsi index
def rsi(df, window):
    i = 0
    pos_period = [0]
    neg_period = [0]
    
    # Identify positive and negative closing periods
    while i < df.index[-1]:
        if df.at[i+1, 'CLOSE'] > df.at[i, 'CLOSE']:
            pos_period.append(df.at[i+1, 'CLOSE'])
            neg_period.append(0)
        else:
            pos_period.append(0)
            neg_period.append(df.at[i, 'CLOSE'])
        i = i + 1
        
    pos_period = pd.Series(pos_period)
    neg_period = pd.Series(neg_period)
    
    # Apply rolling sum over all of our windows
    pos_sum = pd.Series(pos_period.rolling(window).sum())
    neg_sum = pd.Series(neg_period.rolling(window).sum())
    
    # Normalize with window length to ensure that we don't have 0 in the denominator
    rs = (window - pos_sum)/(window-neg_sum)
    rsi = abs(100 - (100 / (1+rs)))
    
    return rsi[window:]

In [46]:
#set the rolling window to 5
feature_window = 5
# calculating indexes of two series
bitcoin_spot_df['sma'] = sma(bitcoin_spot_df['CLOSE'], feature_window).pct_change()
bitcoin_futures_df['sma'] = sma(bitcoin_futures_df['CLOSE'], feature_window).pct_change()

bitcoin_spot_df['ewma'] = ewma(bitcoin_spot_df['CLOSE'], feature_window).pct_change()
bitcoin_futures_df['ewma'] = ewma(bitcoin_futures_df['CLOSE'], feature_window).pct_change()

bitcoin_spot_df['mfi'] = mfi(bitcoin_spot_df, feature_window).pct_change()
bitcoin_futures_df['mfi'] = mfi(bitcoin_futures_df, feature_window).pct_change()

bitcoin_spot_df['rsi'] = rsi(bitcoin_spot_df, feature_window).pct_change()
bitcoin_futures_df['rsi'] = rsi(bitcoin_futures_df, feature_window).pct_change()

bitcoin_spot_df['price'] = bitcoin_spot_df['CLOSE'].pct_change()
bitcoin_futures_df['price'] = bitcoin_futures_df['CLOSE'].pct_change()

# remove NaNs from rolling features
btc_spot_df = bitcoin_spot_df[feature_window+1:].reset_index(drop=True)
btc_futures_df = bitcoin_futures_df[feature_window+1:].reset_index(drop=True)

In [55]:
btc_spot_df.to_csv('/Users/answer/Desktop/paper/DATA/btc_spot_df.csv')
btc_futures_df.to_csv('/Users/answer/Desktop/paper/DATA/btc_futures_df.csv')

In [47]:
#label data as 1 if the spread change exceeds the threshold
def create_label_func(threshold=0.001, window=5):

    def create_labels(residuals):
        min_value = residuals[::-1].rolling(window=window).min()[::-1]
        min_value.iloc[-window:] = residuals.iloc[-window:]

        a = (residuals-min_value) > threshold
        classes = a.astype(int)
        
        return classes
    
    return create_labels

In [58]:
#set threshold to 0.0003
threshold=0.0003
#set the trading window to 5
window=5

label_func = create_label_func(threshold=threshold, window=window)

In [59]:
btc_futures_df.shape

(46784, 18)

In [60]:
btc_spot_df.shape

(46784, 18)

In [61]:
OU_transform = OU(btc_futures_df, btc_spot_df)

In [62]:
OU_transform.split_slide(m_size=2000, e_size=100)

Sliding Window Split Successful.


In [63]:
import scipy

In [64]:
info = OU_transform.get_splits(['price', 'sma', 'ewma', 'mfi', 'rsi'], label_func=label_func, scale=True)

In [65]:
save_dir = "/Users/answer/Desktop/paper/DATA/info.npy"

np.save(save_dir, info)

In [66]:
labels = np.hstack(np.array([fold['test']['labels'].values for fold in info]))
np.bincount(labels)

  """Entry point for launching an IPython kernel.


array([33384, 11400])

In [67]:
plot = True

In [69]:
#plot the residuals to find a good threshold
if plot: 
    for fold, thing in enumerate(info):
        train = thing['train']
        test = thing['test']

        train_residuals = train['residuals_fit_price']
        test_residuals = test['residuals_transform_price']
        window=5

        min_value_train = train_residuals[::-1].rolling(window=window).min()[::-1]
        min_value_test = test_residuals[::-1].rolling(window=window).min()[::-1]

        fig = plt.figure(figsize=(8, 6))
        ax1 = plt.subplot(1, 2, 1)
        ax2 = plt.subplot(1, 2, 2, sharey=ax1)


        ax1.boxplot((train_residuals - min_value_train)[:-window])
        ax1.set_title('TRAIN')
        ax1.axhline(threshold, linestyle='dashed', color='black', alpha=0.7)
        ax1.grid()

        ax2.boxplot((test_residuals - min_value_test)[:-window])
        ax2.set_title('TEST')

        ax2.grid()
        ax2.axhline(threshold, linestyle='dashed', color='black', alpha=0.7)

        plt.suptitle('Fold %d' % fold)
        plt.savefig("/Users/answer/Desktop/paper/DATA/PLOT/RESIDUAL/fold_residual_%i" % fold)
        plt.close(fig)

In [89]:
def indicators(df, feature_window):
    df["MA"] = talib.MA(df["CLOSE"], timeperiod = feature_window).pct_change()
    df['EMA'] = talib.EMA(df["CLOSE"], timeperiod=feature_window).pct_change()
    df['RSI']= talib.RSI(df['CLOSE'], timeperiod = feature_window).pct_change()
    df['MFI'] = talib.MFI(df['HIGH'], df['LOW'], df['CLOSE'], df['VOLUME'], timeperiod = feature_window).pct_change()
    df['CHANGE_RATE'] = df['CLOSE'].pct_change()
    return df

In [90]:
btc_futures_df = indicators(bitcoin_futures_df, feature_window)[feature_window:]
btc_spot_df = indicators(bitcoin_spot_df, feature_window)[feature_window:]

In [91]:
btc_spot_df.tail()

Unnamed: 0,TIMESTAMP,TIMESTAMP.1,OPEN,HIGH,LOW,CLOSE,VOLUME,close_time,quote_volume,trades,taker_base_volue,taker_quote_volume,ignore,MA,EMA,RSI,MFI,CHANGE_RATE
59986,2020-08-08 22:16:00,08/08/2020 22:16,11733.09,11733.51,11732.0,11732.74,24.811627,1596920000000.0,291108.3625,401,16.478222,193335.772,0,0.000119,2.3e-05,-0.017887,-0.07548,-2.2e-05
59987,2020-08-08 22:17:00,08/08/2020 22:17,11732.73,11739.99,11732.73,11738.49,30.190282,1596920000000.0,354355.8473,450,14.426976,169332.3285,0,5.9e-05,0.000179,0.198539,0.050704,0.00049
59988,2020-08-08 22:18:00,08/08/2020 22:18,11738.45,11739.36,11736.37,11738.86,9.452045,1596920000000.0,110953.3551,351,5.567218,65351.43837,0,2.4e-05,0.00013,0.008684,-0.136442,3.2e-05
59989,2020-08-08 22:19:00,08/08/2020 22:19,11738.74,11739.81,11736.09,11736.34,12.996434,1596920000000.0,152555.3517,320,5.057146,59362.00922,0,8e-05,1.5e-05,-0.180799,0.140473,-0.000215
59990,2020-08-08 22:20:00,08/08/2020 22:20,11736.35,11738.48,11732.13,11735.71,41.03901,1596920000000.0,481607.8189,523,27.73185,325460.896,0,4.6e-05,-8e-06,-0.053478,-0.145385,-5.4e-05


In [92]:
btc_spot_df.to_csv('/Users/answer/Desktop/paper/DATA/btc_spot_df.csv')
btc_futures_df.to_csv('/Users/answer/Desktop/paper/DATA/btc_futures_df.csv')

In [93]:
def label(threshold=0.001, window=5):
    """
    Given the spreads of residuals between our pair, label as 1 if the spread change exceeds 
    the threshold, and 0 otherwise. 
    
    :residuals: spreads between the residuals of the pair
    :window: how far in the future in minutes we want to evaluate our position on
    :threshold: the percent change threshold that determines whether a trade is profitable or not. 
    
    :ret: list of labels of 1 or -1.
    """
    def create_labels(residuals):
        min_val = residuals[::-1].rolling(window=window).min()[::-1]
        min_val.iloc[-window:] = residuals.iloc[-window:]

        a = (residuals-min_val) > threshold
        labels = a.astype(int)
        
        return labels
    
    return create_labels

In [94]:
threshold=0.0005
window=5

label_func = label(threshold=threshold, window=window)

In [95]:
OU_BTC = OU(btc_futures_df, btc_spot_df)

In [96]:
OU_BTC.split_slide(m_size=2000, e_size=100)

Sliding Window Split Successful.


In [97]:
labels = OU_BTC.get_splits(['RSI'], label_func=label_func, scale=True)

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'