In [1]:
import pandas as pd
import numpy as np

In [37]:
# Read the Gemini BTC/USD bars (1-minute data, going by the file name) with
# the parsed 'Date' column as the index; only the OHLCV columns are loaded.
btc = pd.read_csv(
    'gemini_BTCUSD_2020_1min.csv',
    usecols=['Date', 'Open', 'High', 'Low', 'Close', 'Volume'],
    index_col='Date',
    parse_dates=['Date'],
)
# Roll the bars up to one row per calendar day: first open, highest high,
# lowest low, last close and summed volume.
btcusd_daily = (
    btc[['Open', 'High', 'Low', 'Close', 'Volume']]
    .resample('D')
    .agg({
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum',
    })
)
btcusd_daily.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01,7165.9,7231.65,7146.6,7175.2,776.641615
2020-01-02,7175.2,7186.65,6901.02,6944.9,984.141603
2020-01-03,6944.9,7398.0,6860.0,7333.0,1292.330594
2020-01-04,7333.0,7394.4,7258.53,7352.82,402.636843
2020-01-05,7352.82,7490.17,7313.53,7351.77,554.773602


In [110]:
# Keep the daily bars in chronological (ascending index) order. Rebinding the
# name to a sorted copy, rather than sorting with inplace=True, avoids mutating
# a frame object that an earlier cell has already displayed.
btcusd_daily = btcusd_daily.sort_index(ascending=True)

In [204]:
def series_to_supervised(data, y=None, n_in=1, n_out=1, dropnan=True, ascending=True):
    """
    Frame a time series as a supervised learning dataset.

    Each row of the result holds the current (t-0) observation together with
    the observations of the n_in preceding rows; the lagged columns are named
    '<column>(t-1)' ... '<column>(t-n_in)'.

    Arguments:
        data: Pandas DataFrame of observations. It is NOT modified; a copy
            sorted by ascending index is used internally so that shifting
            always looks back to earlier index values.
        y: Optional name of the t-0 column to keep as the target. When given,
            every other t-0 column is dropped from the result. A falsy value
            (the default, None) keeps all t-0 columns.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y). Currently unused; it is
            accepted only so existing calls keep working.
        dropnan: Boolean whether or not to drop rows with NaN values.
        ascending: Boolean; when False the result is sorted by descending
            index instead of ascending.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    Raises:
        ValueError: if y is given but is not a column of data.
    """
    # work on a sorted copy so the caller's frame keeps its original order
    frame = data.sort_index(ascending=True)
    # save off column names
    base_columns = list(frame.columns)
    # fail early, with a readable message, when the target column is unknown
    if y and y not in base_columns:
        raise ValueError(
            'y=' + repr(y) + ' is not a column of data; available columns: ' + repr(base_columns)
        )
    # the t-0 data comes first, followed by one shifted frame per lag
    frames = [frame]
    # input sequence (t-1, ... t-n_in)
    for lag in range(1, n_in + 1):
        # map every original column name to its t-n annotated name
        renamed = {col: str(col) + '(t-' + str(lag) + ')' for col in base_columns}
        frames.append(frame.shift(lag).rename(columns=renamed))
    # create one wide dataframe from the list of dataframes
    agg = pd.concat(frames, axis=1)
    # drop records that have NaN values (e.g. the first n_in shifted rows)
    if dropnan:
        agg = agg.dropna()
    # resort to descending if desired
    if not ascending:
        agg = agg.sort_index(ascending=False)
    # if a y variable has been chosen, drop all other t-0 columns
    if y:
        agg = agg.drop(columns=[col for col in base_columns if col != y])
    # return the finished dataframe
    return agg

In [203]:
# Frame the daily bars with four lagged rows per observation, keep 'Close' as
# the only t-0 column, retain rows with NaN lags and show newest rows first.
series_to_supervised(
    btcusd_daily,
    y='Close',
    n_in=4,
    dropnan=False,
    ascending=False,
)

Unnamed: 0_level_0,Close,Open(t-1),High(t-1),Low(t-1),Close(t-1),Volume(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),...,Open(t-3),High(t-3),Low(t-3),Close(t-3),Volume(t-3),Open(t-4),High(t-4),Low(t-4),Close(t-4),Volume(t-4)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-04-20,55541.69,56292.67,57624.66,54302.02,55691.79,2027.757058,60068.01,60443.42,50622.32,56292.67,...,61436.65,62578.83,59725.18,60068.01,930.943922,63237.66,63602.96,60055.14,61436.65,2584.431750
2021-04-19,55691.79,60068.01,60443.42,50622.32,56292.67,5392.236906,61436.65,62578.83,59725.18,60068.01,...,63237.66,63602.96,60055.14,61436.65,2584.431750,62972.24,63855.12,62045.00,63237.66,1034.427743
2021-04-18,56292.67,61436.65,62578.83,59725.18,60068.01,930.943922,63237.66,63602.96,60055.14,61436.65,...,62972.24,63855.12,62045.00,63237.66,1034.427743,63619.79,64900.00,61303.97,62972.24,3249.552490
2021-04-17,60068.01,63237.66,63602.96,60055.14,61436.65,2584.431750,62972.24,63855.12,62045.00,63237.66,...,63619.79,64900.00,61303.97,62972.24,3249.552490,59863.12,63727.00,59800.00,63619.79,1879.313409
2021-04-16,61436.65,62972.24,63855.12,62045.00,63237.66,1034.427743,63619.79,64900.00,61303.97,62972.24,...,59863.12,63727.00,59800.00,63619.79,1879.313409,59980.60,61197.09,59400.01,59863.12,1240.628691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-05,7351.77,7333.00,7394.40,7258.53,7352.82,402.636843,6944.90,7398.00,6860.00,7333.00,...,7175.20,7186.65,6901.02,6944.90,984.141603,7165.90,7231.65,7146.60,7175.20,776.641615
2020-01-04,7352.82,6944.90,7398.00,6860.00,7333.00,1292.330594,7175.20,7186.65,6901.02,6944.90,...,7165.90,7231.65,7146.60,7175.20,776.641615,,,,,
2020-01-03,7333.00,7175.20,7186.65,6901.02,6944.90,984.141603,7165.90,7231.65,7146.60,7175.20,...,,,,,,,,,,
2020-01-02,6944.90,7165.90,7231.65,7146.60,7175.20,776.641615,,,,,...,,,,,,,,,,


In [269]:
def series_to_supervised_returns(data, length, functions='return', drop_na=True, ascending=True):
    """
    Frame a single series together with its changes versus earlier rows.

    For every lag k in 1..length a column '<name>_t-k_return' is added that
    holds data - data.shift(k), i.e. the absolute difference to the value k
    rows earlier (a difference, not a percentage return).

    Arguments:
        data: Pandas Series of observations. It is NOT modified; a copy sorted
            by ascending index is used so that shifting looks back to earlier
            index values. An unnamed Series is labelled 'value'.
        length: Number of lagged difference columns to create.
        functions: Currently unused; accepted only so existing calls keep
            working.
        drop_na: Boolean whether or not to drop rows with NaN values.
        ascending: Boolean; when False the result is sorted by descending
            index instead of ascending.
    Returns:
        Pandas DataFrame whose first column is the series itself, followed by
        one difference column per lag.
    """
    # fall back to a generic label so unnamed series do not break the naming
    base_name = 'value' if data.name is None else data.name
    # sorted, named copy; the caller's series is left untouched
    series = data.rename(base_name).sort_index(ascending=True)

    # the series itself comes first, followed by one difference per lag
    frames = [series]
    for lag in range(1, length + 1):
        # str() keeps non-string names (e.g. integers) from raising TypeError
        column_name = str(base_name) + '_t-' + str(lag) + '_return'
        frames.append((series - series.shift(lag)).rename(column_name))
    agg = pd.concat(frames, axis=1)

    # drop records that have shifted nan values
    if drop_na:
        agg = agg.dropna()
    # resort to descending if desired
    if not ascending:
        agg = agg.sort_index(ascending=False)

    return agg

          

# Daily close together with its change versus one and two rows earlier.
series_to_supervised_returns(btcusd_daily['Close'], length=2)

Unnamed: 0_level_0,Close,Close_t-1_return,Close_t-2_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-03,7333.00,388.10,157.80
2020-01-04,7352.82,19.82,407.92
2020-01-05,7351.77,-1.05,18.77
2020-01-06,7762.72,410.95,409.90
2020-01-07,8158.50,395.78,806.73
...,...,...,...
2021-04-16,61436.65,-1801.01,-1535.59
2021-04-17,60068.01,-1368.64,-3169.65
2021-04-18,56292.67,-3775.34,-5143.98
2021-04-19,55691.79,-600.88,-4376.22


In [241]:
# Peek at the first rows of the daily 'Close' column.
btcusd_daily['Close'].head()

Date
2020-01-01    7175.20
2020-01-02    6944.90
2020-01-03    7333.00
2020-01-04    7352.82
2020-01-05    7351.77
Freq: D, Name: Close, dtype: float64

In [226]:
# The Series name is what series_to_supervised_returns uses as the prefix of
# the column names it generates.
btcusd_daily['Close'].name

'Close'