In [1]:
import os
import argparse
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from utils import preprocess_data, SequenceDataset, train_model, score_model, log, get_predictions, plot_predictions
from model import LSTMRegression

In [2]:
# Load the MTA subway entry/exit table from parquet and report its
# (rows, columns) shape as the cell output.
parquet_path = "../data/mta_subway_221105_90wk_dbscan.parquet"
df = pd.read_parquet(parquet_path)
df.shape

(15120, 758)

In [3]:
# Preview the first rows of the raw frame: a TIME index with one `<station>_ent`
# and one `<station>_ex` column per station.
# NOTE(review): in the recorded output the first five hourly rows are identical;
# presumably coarser counts were spread over hourly timestamps upstream — confirm.
df.head()

Unnamed: 0_level_0,1 AV_ent,1 AV_ex,103 ST-CORONA_ent,103 ST-CORONA_ex,103 ST_ent,103 ST_ex,104 ST_ent,104 ST_ex,110 ST_ent,110 ST_ex,...,WOODLAWN_ent,WOODLAWN_ex,WORLD TRADE CTR_ent,WORLD TRADE CTR_ex,WTC-CORTLANDT_ent,WTC-CORTLANDT_ex,YORK ST_ent,YORK ST_ex,ZEREGA AV_ent,ZEREGA AV_ex
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-02-06 00:00:00,13.25,60.25,243.5,37.75,14.25,38.5,30.5,4.25,32.0,24.25,...,40.75,9.5,1.75,0.75,2.25,7.75,0.0,0.0,0.0,11.0
2021-02-06 01:00:00,13.25,60.25,243.5,37.75,14.25,38.5,30.5,4.25,32.0,24.25,...,40.75,9.5,1.75,0.75,2.25,7.75,0.0,0.0,0.0,11.0
2021-02-06 02:00:00,13.25,60.25,243.5,37.75,14.25,38.5,30.5,4.25,32.0,24.25,...,40.75,9.5,1.75,0.75,2.25,7.75,0.0,0.0,0.0,11.0
2021-02-06 03:00:00,13.25,60.25,243.5,37.75,14.25,38.5,30.5,4.25,32.0,24.25,...,40.75,9.5,1.75,0.75,2.25,7.75,0.0,0.0,0.0,11.0
2021-02-06 04:00:00,13.25,60.25,243.5,37.75,14.25,38.5,30.5,4.25,32.0,24.25,...,40.75,9.5,1.75,0.75,2.25,7.75,0.0,0.0,0.0,11.0


In [4]:
# Run configuration, gathered in one cell so every tunable is visible at a glance.

# Data windowing: forecast_lead is passed to preprocess_data, sequence_length to
# SequenceDataset, and batch_size to the DataLoaders.
forecast_lead = 15
sequence_length = 30
batch_size = 32

# Model / optimisation settings. They are not referenced in the cells visible
# here; presumably consumed by LSTMRegression and train_model (TODO confirm).
num_hidden_units = 16
num_layers = 1
dropout = 0
learning_rate = 5e-5
num_epochs = 2

In [5]:
# Sanity check: echo the batch size set in the configuration cell.
batch_size

32

In [6]:
# Split the raw frame into train/test partitions and get the feature column
# list, all from the project helper `preprocess_data`.
# NOTE(review): the df_train preview shows rescaled (negative) values, so the
# helper presumably also standardises the columns — confirm in utils.
df_train, df_test, features = preprocess_data(
    df, forecast_lead=forecast_lead, train_test_split=0.8
)

In [7]:
# Show the first two rows of the training partition returned by preprocess_data.
df_train.head(2)

Unnamed: 0_level_0,1 AV_ent,1 AV_ex,103 ST-CORONA_ent,103 ST-CORONA_ex,103 ST_ent,103 ST_ex,104 ST_ent,104 ST_ex,110 ST_ent,110 ST_ex,...,WOODLAWN_ent,WOODLAWN_ex,WORLD TRADE CTR_ent,WORLD TRADE CTR_ex,WTC-CORTLANDT_ent,WTC-CORTLANDT_ex,YORK ST_ent,YORK ST_ex,ZEREGA AV_ent,ZEREGA AV_ex
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-02-06 00:00:00,-1.288216,-1.425992,-0.553572,-1.148696,-0.407774,-0.126995,-0.620284,-0.999126,-1.110329,-1.498785,...,-1.005928,-1.396671,-0.991122,-1.176739,-1.125281,-1.220885,-1.11437,-1.28789,-1.066057,-1.1399
2021-02-06 01:00:00,-1.288216,-1.425992,-0.553572,-1.148696,-0.407774,-0.126995,-0.620284,-0.999126,-1.110329,-1.498785,...,-1.005928,-1.396671,-0.991122,-1.176739,-1.125281,-1.220885,-1.11437,-1.28789,-1.066057,-1.1399


In [8]:
# Wrap each partition in a SequenceDataset using the same feature list and
# window length, so the train and test datasets cannot drift apart in their
# construction arguments.
train_dataset, test_dataset = (
    SequenceDataset(frame, features=features, sequence_length=sequence_length)
    for frame in (df_train, df_test)
)

In [9]:
# Echo the dataset object; the recorded output is only its default repr.
train_dataset

<utils.SequenceDataset at 0x15089a460>

In [10]:
# Re-check the batch size right before it is handed to the DataLoaders.
batch_size

32

In [11]:
# Batch the two datasets for training and evaluation.
# The training loader shuffles, so it gets its own seeded generator: the batch
# order is then identical after every Restart & Run All, and re-running this
# cell resets the shuffle sequence. The test loader keeps dataset order.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=shuffle_generator,
)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [12]:
# Number of samples in the training dataset. Use the built-in len(), which
# dispatches to __len__, rather than calling the dunder method directly.
len(train_dataset)

12081

In [13]:
# Inspect one sample, which the recorded output shows as a pair of tensors.
# Use indexing syntax, which dispatches to __getitem__, rather than calling the
# dunder method directly.
# NOTE(review): the recorded output contains NaN entries in the second tensor.
# If those are training targets the loss will become NaN; check the affected
# columns in preprocess_data / the source data before training.
train_dataset[10]

(tensor([[-1.2882, -1.4260, -0.5536,  ..., -1.2879, -1.0661, -1.1399],
         [-1.2882, -1.4260, -0.5536,  ..., -1.2879, -1.0661, -1.1399],
         [-1.2882, -1.4260, -0.5536,  ..., -1.2879, -1.0661, -1.1399],
         ...,
         [-0.7061, -0.9319,  0.7230,  ..., -1.2879, -0.2886, -1.0221],
         [-0.7061, -0.9319,  0.7230,  ..., -1.2879, -0.0337, -0.6230],
         [-0.7061, -0.9319,  0.7230,  ..., -1.2879, -0.0337, -0.6230]]),
 tensor([-1.1435, -1.4537, -1.2560, -0.7454, -0.4076, -0.1266, -0.9918, -0.7330,
         -1.2583, -1.2755, -1.0135, -0.6077, -0.8909, -1.1293, -0.1318, -0.0877,
         -0.0650, -0.0647, -0.2796, -0.1172, -1.1372, -0.8063, -1.3722, -1.4253,
         -1.2036, -1.3495, -1.1534, -1.3218, -0.9894, -1.1328, -1.3557, -1.0924,
         -1.2627, -1.3370, -0.0460, -0.1108,     nan,     nan, -0.1086, -0.0602,
         -1.3403, -1.3698,     nan,     nan, -1.3480, -1.3139, -1.2688, -1.0955,
         -1.1393, -1.1879, -1.3210, -0.9056, -0.9762, -1.0440, -1.2889, 

In [None]:
# Echo the training DataLoader object (this cell has no recorded output).
train_loader

In [14]:
# Walk the whole training loader and print the input/target shapes of every
# batch, with a blank line after each pair.
# NOTE: X and y stay bound to the last batch once the loop finishes, and the
# cells below read them.
for X, y in train_loader:
    print(X.shape, y.shape, "", sep="\n")

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 758])
torch.Size([32, 758])

torch.Size([32, 30, 

In [None]:
# Display the target tensor left over from the batch loop (the last batch iterated).
y

In [None]:
# Shape of the input tensor left over from the batch loop (the last batch iterated).
X.shape

In [None]:
# Re-display the raw frame's (rows, columns) shape for comparison with the batch shapes.
df.shape

In [None]:
# Shape of the target tensor left over from the batch loop (the last batch iterated).
y.shape