In [13]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-1.3.5-py3-none-any.whl (808 kB)
[K     |████████████████████████████████| 808 kB 3.0 MB/s eta 0:00:01
[?25hCollecting torchmetrics>=0.2.0
  Downloading torchmetrics-0.3.2-py3-none-any.whl (274 kB)
[K     |████████████████████████████████| 274 kB 56.1 MB/s eta 0:00:01
Collecting pyDeprecate==0.3.0
  Downloading pyDeprecate-0.3.0-py3-none-any.whl (10 kB)
Collecting tqdm>=4.41.0
  Downloading tqdm-4.61.1-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 12.9 MB/s  eta 0:00:01
[?25hCollecting PyYAML<=5.4.1,>=5.1
  Downloading PyYAML-5.4.1-cp39-cp39-manylinux2014_aarch64.whl (788 kB)
[K     |████████████████████████████████| 788 kB 23.2 MB/s eta 0:00:01
[?25hCollecting tensorboard!=2.5.0,>=2.2.0
  Downloading tensorboard-2.4.1-py3-none-any.whl (10.6 MB)
[K     |████████████████████████████████| 10.6 MB 24.9 MB/s eta 0:00:01    |█████████████████████▍          | 7.1 MB 24.9 MB/s eta 0:00:01
[?25h

In [22]:
import os
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader

from pytorch_lightning import LightningModule, LightningDataModule

## Multiple Time Periods

What if we took more than just the most recent pricing details into account when predicting the next close?
We will build a `torch.utils.data.Dataset` that groups our data into windows of multiple consecutive time periods, then test our model's ability to predict the `NextClose`.

In [31]:
# Load the hourly ETH pricing data and order it chronologically (oldest first).
eth_hourly_raw = pd.read_csv('../data/processed/eth_hourly.csv')
df = (
    eth_hourly_raw
    .assign(TimeStamp=pd.to_datetime(eth_hourly_raw['TimeStamp']))
    .sort_values(by='TimeStamp', ascending=True)
)
df.head()

Unnamed: 0,TimeStamp,Open,High,Low,CurrentClose,Volume_USD,NextClose
0,2018-05-15 06:00:00,733.12,736.48,731.19,733.04,4246576.84,734.64
1,2018-05-15 07:00:00,733.04,735.99,731.7,734.64,2044880.32,731.32
2,2018-05-15 08:00:00,734.64,734.65,722.0,731.32,7891317.14,728.44
3,2018-05-15 09:00:00,731.32,732.0,728.44,728.44,2111099.12,735.21
4,2018-05-15 10:00:00,728.44,739.3,725.52,735.21,7197617.75,732.1


In [24]:
class PricingDataset(Dataset):
    """Sliding-window dataset over time-series pricing data.

    Every sample pairs ``time_periods_to_batch`` consecutive rows of feature
    columns (all columns except the last) with one scalar target read from
    the last column.

    Args:
        pricing_dataframe: Source frame. Column names are matched
            case-insensitively; a ``timestamp`` column, if present, is
            discarded. The caller's frame is left untouched: all
            preprocessing happens on an internal copy.
        time_periods_to_batch: Number of consecutive rows in each sample.

    Attributes:
        pricing_data: 2-D array of the preprocessed frame's values.
        number_of_rows: Row count of ``pricing_data``.
        num_samples: Number of windows available (never negative).
        features: List of ``(time_periods_to_batch, n_columns - 1)`` arrays.
        targets: List of scalar targets, aligned with ``features``.
    """

    def __init__(
        self,
        pricing_dataframe: pd.DataFrame,
        time_periods_to_batch: int = 6
    ):
        self.time_periods_to_batch = time_periods_to_batch

        # Preprocess a copy: renaming/dropping columns on the caller's frame
        # would silently change it for every later notebook cell.
        frame = pricing_dataframe.copy()
        frame.columns = [c.lower() for c in frame.columns]
        frame = frame.drop(columns='timestamp', errors='ignore')

        self.pricing_data = frame.values
        self.number_of_rows = self.pricing_data.shape[0]
        # Clamp at zero so len() stays valid when the frame is shorter than
        # one full window plus its target row.
        self.num_samples = max(
            0, self.number_of_rows - self.time_periods_to_batch - 1
        )

        window = self.time_periods_to_batch
        self.features = [
            self.pricing_data[start:start + window, :-1]
            for start in range(self.num_samples)
        ]
        # NOTE(review): the target is the last-column value of the row
        # immediately AFTER the window. If the last column already holds the
        # *next* close, this is the close two periods past the window's final
        # row -- confirm that horizon is intended.
        self.targets = [
            self.pricing_data[start + window, -1]
            for start in range(self.num_samples)
        ]

    def __len__(self):
        """Return the number of windowed samples."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        feature = self.features[idx]
        target = self.targets[idx]
        return feature, target

In [25]:
# Build the windowed dataset from the hourly frame, six rows per sample.
eth_data = PricingDataset(pricing_dataframe=df, time_periods_to_batch=6)

In [26]:
# Number of windowed samples the dataset exposes.
len(eth_data)

24251

In [27]:
# Pull the first sample: one window of feature rows plus its scalar target.
features, target = eth_data[0]

In [28]:
# Display the first window and its target to sanity-check the windowing.
features, target

(array([[7.33120000e+02, 7.36480000e+02, 7.31190000e+02, 7.33040000e+02,
         4.24657684e+06],
        [7.33040000e+02, 7.35990000e+02, 7.31700000e+02, 7.34640000e+02,
         2.04488032e+06],
        [7.34640000e+02, 7.34650000e+02, 7.22000000e+02, 7.31320000e+02,
         7.89131714e+06],
        [7.31320000e+02, 7.32000000e+02, 7.28440000e+02, 7.28440000e+02,
         2.11109912e+06],
        [7.28440000e+02, 7.39300000e+02, 7.25520000e+02, 7.35210000e+02,
         7.19761775e+06],
        [7.35210000e+02, 7.36840000e+02, 7.30000000e+02, 7.32100000e+02,
         4.08949840e+06]]),
 729.61)

In [29]:
class PricingDataModule(LightningDataModule):
    """LightningDataModule serving sliding-window pricing samples from a CSV.

    The CSV's last column is used as the prediction target and every other
    column (except a ``timestamp`` column, matched case-insensitively) as a
    feature. Samples are split in row order: the first ``train_size``
    fraction is the training set, the next ``test_size`` fraction is the
    test set, and whatever remains is the validation set (possibly empty).

    Args:
        path_to_csv: Path to the pricing CSV; must exist.
        batch_size: Batch size used by every dataloader.
        time_periods_to_batch: Number of consecutive rows in each sample.
        train_size: Fraction of samples assigned to training.
        test_size: Fraction of samples assigned to testing.
    """

    def __init__(
        self,
        path_to_csv: str,
        batch_size: int,
        time_periods_to_batch: int = 6,
        train_size: float = 0.7,
        test_size: float = 0.2
    ):
        # The base class initialises internal state in its own __init__.
        super().__init__()

        assert os.path.isfile(path_to_csv), f"provided file `path_to_csv` does not exist: {path_to_csv}"
        self.path_to_csv = path_to_csv
        self.batch_size = batch_size
        self.time_periods_to_batch = time_periods_to_batch

        assert train_size + test_size <= 1, f"sum of train and test are greater than 1: train_size: {train_size}\ntest_size: {test_size}"
        self.train_size = train_size
        self.test_size = test_size

    def setup(self, stage=None):
        """Read the CSV, build the windowed samples and create the three splits.

        Args:
            stage: Accepted for compatibility with Lightning's hook signature;
                all three datasets are always built regardless of its value.

        Raises:
            ValueError: If the CSV has too few rows for a single sample.
        """
        pricing_data = pd.read_csv(self.path_to_csv)
        pricing_data.columns = [c.lower() for c in pricing_data.columns]

        if 'timestamp' in pricing_data.columns:
            # Windows are only meaningful on chronologically ordered rows, and
            # the timestamp itself is not a numeric feature: sort, then drop.
            pricing_data = (
                pricing_data
                .assign(timestamp=pd.to_datetime(pricing_data['timestamp']))
                .sort_values(by='timestamp', kind='mergesort')
                .drop(columns='timestamp')
            )

        # float32 matches the default dtype of torch module parameters.
        values = pricing_data.to_numpy(dtype=np.float32)
        self.number_of_rows = values.shape[0]
        self.num_samples = self.number_of_rows - self.time_periods_to_batch - 1
        if self.num_samples < 1:
            raise ValueError(
                f"not enough rows ({self.number_of_rows}) to build a sample of "
                f"{self.time_periods_to_batch} time periods"
            )

        window = self.time_periods_to_batch
        features = torch.from_numpy(
            np.stack([
                values[start:start + window, :-1]
                for start in range(self.num_samples)
            ])
        )
        # NOTE(review): the target is the last-column value of the row
        # immediately AFTER each window. If the last column already holds the
        # *next* close, this is the close two periods past the window's final
        # row -- confirm that horizon is intended.
        targets = torch.from_numpy(
            values[window:window + self.num_samples, -1].copy()
        )

        train_end = int(self.train_size * self.num_samples)
        test_end = train_end + int(self.test_size * self.num_samples)

        self.train_dataset = TensorDataset(features[:train_end], targets[:train_end])
        self.test_dataset = TensorDataset(
            features[train_end:test_end], targets[train_end:test_end]
        )
        # Slicing past the end yields empty tensors, so the validation set is
        # always defined even when train_size + test_size == 1.
        self.val_dataset = TensorDataset(features[test_end:], targets[test_end:])

    def train_dataloader(self):
        """Return a shuffled loader over the training split."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return an ordered loader over the validation split."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return an ordered loader over the test split."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [16]:
# batch_size is a required argument of PricingDataModule; 32 is an arbitrary
# starting value -- tune as needed.
pricing_dm = PricingDataModule(path_to_csv='../data/processed/eth_hourly.csv', batch_size=32)

array([[7.33120000e+02, 7.36480000e+02, 7.31190000e+02, 7.33040000e+02,
        4.24657684e+06],
       [7.33040000e+02, 7.35990000e+02, 7.31700000e+02, 7.34640000e+02,
        2.04488032e+06],
       [7.34640000e+02, 7.34650000e+02, 7.22000000e+02, 7.31320000e+02,
        7.89131714e+06],
       [7.31320000e+02, 7.32000000e+02, 7.28440000e+02, 7.28440000e+02,
        2.11109912e+06],
       [7.28440000e+02, 7.39300000e+02, 7.25520000e+02, 7.35210000e+02,
        7.19761775e+06],
       [7.35210000e+02, 7.36840000e+02, 7.30000000e+02, 7.32100000e+02,
        4.08949840e+06]])

In [32]:
# Display the hourly frame; the rendered output is truncated to its first and last rows.
df

Unnamed: 0,TimeStamp,Open,High,Low,CurrentClose,Volume_USD,NextClose
0,2018-05-15 06:00:00,733.12,736.480000,731.190000,733.04,4.246577e+06,734.64
1,2018-05-15 07:00:00,733.04,735.990000,731.700000,734.64,2.044880e+06,731.32
2,2018-05-15 08:00:00,734.64,734.650000,722.000000,731.32,7.891317e+06,728.44
3,2018-05-15 09:00:00,731.32,732.000000,728.440000,728.44,2.111099e+06,735.21
4,2018-05-15 10:00:00,728.44,739.300000,725.520000,735.21,7.197618e+06,732.10
...,...,...,...,...,...,...,...
24253,2021-02-18 19:00:00,1926.20,1937.800000,1917.297174,1922.20,3.171314e+06,1937.00
24254,2021-02-18 20:00:00,1922.20,1940.700000,1914.100000,1937.00,5.951306e+06,1944.50
24255,2021-02-18 21:00:00,1936.90,1948.845968,1935.100000,1944.50,5.808599e+06,1937.70
24256,2021-02-18 22:00:00,1944.40,1946.900000,1932.200000,1937.70,2.336445e+06,1937.50
