In [31]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.optim.adam import Adam

from pytorch_lightning import Trainer
from pytorch_lightning import LightningModule, LightningDataModule
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Multiple Time Periods

What if we used more than just the most recent pricing details into account when predicting the next close?
We will build a `torch.utils.data.Dataset` to batch our data into multiple time periods per batch then test our models ability to predict the `NextClose`.

In [2]:
df = pd.read_csv('../data/processed/eth_hourly.csv')
df.head()

Unnamed: 0,TimeStamp,open,high,low,CurrentClose,Volume_ETH,NextClose
0,2018-05-15 06:00:00,733.12,736.48,731.19,733.04,5782.27,734.64
1,2018-05-15 07:00:00,733.04,735.99,731.7,734.64,2785.61,731.32
2,2018-05-15 08:00:00,734.64,734.65,722.0,731.32,10826.46,728.44
3,2018-05-15 09:00:00,731.32,732.0,728.44,728.44,2889.59,735.21
4,2018-05-15 10:00:00,728.44,739.3,725.52,735.21,9822.41,732.1


In [12]:
class PricingDataModule(LightningDataModule):
    
    def __init__(
        self, 
        path_to_csv: str,
        batch_size: int, 
        time_periods_to_batch: int = 6, 
        train_size: float = 0.7, 
        test_size: float = 0.2
    ):
        super().__init__()
        
        self.path_to_csv = path_to_csv
        self.batch_size = batch_size
        self.time_periods_to_batch = time_periods_to_batch
        
        assert train_size + test_size <= 1, f"sum of train and test are greater than 1: train_size: {train_size}\ntest_size: {test_size}"
        self.train_size = train_size
        self.test_size = test_size
        
    def prepare_data(self):
        print('In prepare_data')
        
    def setup(self):
        
        dataframe = pd.read_csv(self.path_to_csv)
        try:
            dataframe.drop('TimeStamp', axis=1, inplace=True)
        except KeyError:
            pass
        pricing_data = dataframe.values
        self.number_of_rows = pricing_data.shape[0]
        self.num_samples = self.number_of_rows - self.time_periods_to_batch - 1
        
        train_samples = int(self.train_size * self.num_samples)
        test_samples = int(self.test_size * self.num_samples)
        
        features = []
        targets = []
        for step, index in enumerate(range(self.num_samples)):
            lower_index = index
            upper_index = lower_index + self.time_periods_to_batch
            pricing_features = pricing_data[lower_index:upper_index, :-1]
            next_close_price = pricing_data[upper_index, -1]
            
            features.append(pricing_features)
            targets.append(next_close_price)
        
        features = torch.FloatTensor(features)
        targets = torch.FloatTensor(targets)
        
        train_features = features[0:train_samples]
        train_targets = targets[0:train_samples]
        
        test_features = features[train_samples:train_samples+test_samples]
        test_targets = targets[train_samples:train_samples+test_samples]
        
        if train_samples + test_samples < self.num_samples:
            val_features = features[train_samples+test_samples:]
            val_targets = targets[train_samples+test_samples:]
        
        self.train_dataset = TensorDataset(train_features, train_targets)
        self.test_dataset = TensorDataset(test_features, test_targets)
        self.val_dataset = TensorDataset(val_features, val_targets)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)

In [13]:
class ConvModel(LightningModule):
    def __init__(self, in_channels: int = 6, out_channels: int = 6):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=3)
        self.linear1 = nn.Linear(in_features=out_channels, out_features=1)
        self.loss_fn = nn.L1Loss()
        
    def forward(self, x):
        x = F.gelu(F.max_pool1d(self.conv1(x), 1))
        x = F.gelu(F.max_pool1d(self.conv2(x), 1))
        x = x.squeeze(-1)
        x = self.linear1(x)
        return x
    
    def _shared_pass(self, batch):
        feature, target = batch
        model_prediction = self(feature)
        loss = self.loss_fn(model_prediction, target.reshape(-1, 1))
        return loss
    
    def training_step(self, batch, batch_idx):
        loss = self._shared_pass(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss = self._shared_pass(batch)
        self.log('validation_loss', loss, on_step=False, on_epoch=True)
        self.log('batch_size', batch[0].shape[0], on_epoch=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        loss = self._shared_pass(batch)
        self.log('test_loss', loss, on_step=False, on_epoch=True)
        return loss
        
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=3e-5)

In [22]:
class LinearModel(LightningModule):
    def __init__(self, in_features: int = 6, out_features: int = 1):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        
        self.linear1 = nn.Linear(in_features=in_features, out_features=64)
        self.linear2 = nn.Linear(in_features=64, out_features=64)
        self.linear3 = nn.Linear(in_features=64, out_features=64)
        self.linear4 = nn.Linear(in_features=64, out_features=out_features)
        self.loss_fn = nn.L1Loss()
        
    def forward(self, x):
        x = F.gelu(self.linear1(x))
        x = F.gelu(self.linear2(x))
        x = F.gelu(self.linear3(x))
        x = self.linear4(x)
        return x
    
    def _shared_pass(self, batch):
        feature, target = batch
        model_prediction = self(feature)
        loss = self.loss_fn(model_prediction, target.reshape(-1, 1))
        return loss
    
    def training_step(self, batch, batch_idx):
        loss = self._shared_pass(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss = self._shared_pass(batch)
        self.log('validation_loss', loss, on_step=False, on_epoch=True)
        self.log('batch_size', batch[0].shape[0], on_epoch=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        loss = self._shared_pass(batch)
        self.log('test_loss', loss, on_step=False, on_epoch=True)
        return loss
        
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=3e-5)

In [26]:
hparams = {
    'time_periods_to_batch': 1,
    'batch_size': 32,
    'epochs': 25
}

In [27]:
pricing_dm = PricingDataModule(
    path_to_csv='../data/processed/eth_hourly.csv', 
    batch_size=hparams['batch_size'], 
    time_periods_to_batch=hparams['time_periods_to_batch']
)
pricing_dm.setup()

linear_model = LinearModel(
    in_features=5,
    out_features=1
)

lr_monitor = LearningRateMonitor(logging_interval='step')

trainer = Trainer(
    max_epochs=hparams['epochs'], 
    auto_lr_find=True, 
    auto_scale_batch_size=True,
    callbacks=[lr_monitor]
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [28]:
trainer.fit(linear_model, pricing_dm)


  | Name    | Type   | Params
-----------------------------------
0 | linear1 | Linear | 384   
1 | linear2 | Linear | 4.2 K 
2 | linear3 | Linear | 4.2 K 
3 | linear4 | Linear | 65    
4 | loss_fn | L1Loss | 0     
-----------------------------------
8.8 K     Trainable params
0         Non-trainable params
8.8 K     Total params
0.035     Total estimated model params size (MB)


In prepare_data


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [29]:
trainer.test(linear_model, pricing_dm.test_dataloader(), )



Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 226.43081665039062}
--------------------------------------------------------------------------------


[{'test_loss': 226.43081665039062}]

In [32]:
def score_metrics(y_true, y_pred):
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'mse': mean_squared_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'r2': r2_score(y_true, y_pred)
    }

In [49]:
train_preds = []
train_true = []
for batch in pricing_dm.train_dataloader():
    features, targets = batch
    out = linear_model(features)
    train_preds.extend(out.reshape(-1).tolist())
    train_true.extend(targets.tolist())
    
y_preds = []
y_true = []
for batch in pricing_dm.test_dataloader():
    features, targets = batch
    out = linear_model(features)
    y_preds.extend(out.reshape(-1).tolist())
    y_true.extend(targets.tolist())

In [50]:
score_metrics(train_true, train_preds)

{'mae': 37.8759149904634,
 'mse': 2341.00168527853,
 'rmse': 48.38389902931067,
 'r2': 0.8202880559264211}

In [51]:
score_metrics(y_true, y_preds)

{'mae': 217.96808408849378,
 'mse': 72827.80784731387,
 'rmse': 269.86627771419285,
 'r2': 0.827981089052144}