In [152]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader

## Multiple Time Periods

What if we took more than just the most recent pricing details into account when predicting the next close?
We will build a `torch.utils.data.Dataset` that groups multiple consecutive time periods into each sample, then test our model's ability to predict the `NextClose`.

In [153]:
# Load the hourly ETH pricing CSV, parse the timestamps, and order rows oldest-first.
df = (
    pd.read_csv('../data/processed/eth_hourly.csv')
    .assign(TimeStamp=lambda frame: pd.to_datetime(frame['TimeStamp']))
    .sort_values(by='TimeStamp', ascending=True)
)
df.head()

Unnamed: 0,TimeStamp,Open,High,Low,CurrentClose,Volume_USD,NextClose
0,2018-05-15 06:00:00,733.12,736.48,731.19,733.04,4246576.84,734.64
1,2018-05-15 07:00:00,733.04,735.99,731.7,734.64,2044880.32,731.32
2,2018-05-15 08:00:00,734.64,734.65,722.0,731.32,7891317.14,728.44
3,2018-05-15 09:00:00,731.32,732.0,728.44,728.44,2111099.12,735.21
4,2018-05-15 10:00:00,728.44,739.3,725.52,735.21,7197617.75,732.1


In [238]:
class PricingDataset(Dataset):
    """Sliding-window dataset over time-series pricing data.

    Each sample pairs ``time_periods_to_batch`` consecutive rows of feature
    columns with one scalar target read from the last column of the row that
    immediately follows the window.

    Args:
        pricing_dataframe: Pricing rows. The class does not sort them, so the
            caller is expected to pass them in the desired (chronological)
            order. Column names are lower-cased and a ``timestamp`` column, if
            present, is dropped; of the remaining columns every one except the
            last is a feature and the last is the target. The caller's frame
            is never modified.
        time_periods_to_batch: Number of consecutive rows in each feature
            window. Must be at least 1.

    Raises:
        ValueError: If ``time_periods_to_batch`` is smaller than 1.

    Attributes:
        pricing_data: 2-D array of the preprocessed frame's values.
        number_of_rows: Row count of ``pricing_data``.
        num_samples: Number of samples exposed by the dataset (never negative).
        features: List of ``(time_periods_to_batch, n_features)`` array windows.
        targets: List of scalar targets, aligned with ``features``.
    """

    def __init__(
        self,
        pricing_dataframe: pd.DataFrame,
        time_periods_to_batch: int = 6
    ):
        if time_periods_to_batch < 1:
            raise ValueError(
                f"time_periods_to_batch must be >= 1, got {time_periods_to_batch}"
            )
        self.time_periods_to_batch = time_periods_to_batch

        # rename()/drop() return new frames, so the DataFrame owned by the
        # caller keeps its original column names and its timestamp column.
        frame = pricing_dataframe.rename(columns=str.lower)
        frame = frame.drop(columns='timestamp', errors='ignore')

        self.pricing_data = frame.values
        self.number_of_rows = self.pricing_data.shape[0]
        # Clamp at zero: a frame shorter than one window would otherwise give a
        # negative count, which makes len() raise instead of reporting "empty".
        self.num_samples = max(
            self.number_of_rows - self.time_periods_to_batch - 1, 0
        )

        # NOTE(review): the target comes from row `upper_index` (the row right
        # after the window), not from the window's last row. If the last column
        # already holds a next-period value (e.g. a "NextClose" column), this
        # target is two periods past the window's last feature row -- confirm
        # that is intended. The sample count above also leaves the final row of
        # the frame unused as a target row; kept unchanged for compatibility.
        features = []
        targets = []
        for lower_index in range(self.num_samples):
            upper_index = lower_index + self.time_periods_to_batch
            # Every column but the last is a feature; the last is the target.
            features.append(self.pricing_data[lower_index:upper_index, :-1])
            targets.append(self.pricing_data[upper_index, -1])

        self.features = features
        self.targets = targets

    def __len__(self) -> int:
        """Return the number of (feature window, target) samples."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the ``(feature_window, target)`` pair stored at ``idx``."""
        return self.features[idx], self.targets[idx]

In [239]:
# Give the dataset its own copy of the frame so `df` keeps its original columns
# no matter how the constructor preprocesses its input.
eth_data = PricingDataset(pricing_dataframe=df.copy(), time_periods_to_batch=6)

In [240]:
# Number of (feature window, target) samples the dataset exposes.
len(eth_data)

24251

In [241]:
# Fetch the final sample by position relative to the dataset length instead of
# a hard-coded index that breaks whenever the data file changes size.
features, target = eth_data[len(eth_data) - 1]

In [242]:
# Show the feature window and target retrieved above.
features, target

(array([[1.90980000e+03, 1.92290000e+03, 1.90870000e+03, 1.92230000e+03,
         2.60850685e+06],
        [1.92230000e+03, 1.92273073e+03, 1.91430000e+03, 1.91720000e+03,
         3.40387443e+06],
        [1.91753724e+03, 1.93670000e+03, 1.91210000e+03, 1.92640000e+03,
         1.12592210e+07],
        [1.92620000e+03, 1.93780000e+03, 1.91729717e+03, 1.92220000e+03,
         3.17131398e+06],
        [1.92220000e+03, 1.94070000e+03, 1.91410000e+03, 1.93700000e+03,
         5.95130565e+06],
        [1.93690000e+03, 1.94884597e+03, 1.93510000e+03, 1.94450000e+03,
         5.80859868e+06]]),
 1937.5)