In [1]:
import os
import json
import logging
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from utils import (
    preprocess_data, 
    SequenceDataset, 
    score_model, 
    predict,
    get_predictions,
    plot_predictions)

In [2]:
# Load the fully pickled LSTMRegression module saved by the training run.
# map_location="cpu" keeps every weight on the CPU regardless of the device the
# checkpoint was saved from; the cells below call `.numpy()` on model outputs
# directly, which only works for CPU tensors.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# checkpoints from a trusted source. Newer PyTorch releases default to
# `weights_only=True`, which rejects whole-module pickles like this one; confirm
# the installed version or pass `weights_only=False` explicitly.
saved_model_file = "models/LSTM_32unit_1layer_30seq.pt"
model = torch.load(saved_model_file, map_location="cpu")
model

LSTMRegression(
  (lstm): LSTM(704, 32, batch_first=True, dropout=0.5)
  (linear): Linear(in_features=32, out_features=704, bias=True)
)

In [3]:
# Read the ridership table and replace missing values with zero in one step,
# then show (rows, columns) as the cell output.
filename = "../data/mta_subway_221231_100wk_dbscan.parquet"
df = pd.read_parquet(filename).fillna(0)
df.shape

(16800, 704)

In [4]:
# Unpack the training frame, the test frame and the feature list returned by
# utils.preprocess_data (its split/normalisation rules are not shown in this
# notebook), then display the test frame's shape.
df_train, df_test, features = preprocess_data(df)
df_test.shape

(3360, 704)

In [5]:
class SequenceDataset(Dataset):
    """Sliding-window dataset pairing an input history with its future targets.

    Sample ``i`` is a tuple ``(x, y)``:

    * ``x`` -- the ``sequence_length`` feature rows ending at row ``i``. When
      fewer rows of history exist, the window is left-padded with copies of
      row 0 so ``x`` always has ``sequence_length`` rows.
    * ``y`` -- up to ``horizon_length`` rows starting ``forecast_lead`` rows
      after row ``i``. The final ``horizon_length - 1`` samples run off the end
      of the data and therefore carry fewer target rows.

    NOTE: this definition replaces the ``SequenceDataset`` imported from
    ``utils`` at the top of the notebook.

    Args:
        dataframe: source frame; only the ``features`` columns are used.
        features: column labels to convert to tensors.
        sequence_length: number of rows in every input window.
        horizon_length: maximum number of target rows per sample.
        forecast_lead: offset (in rows) between the last input row and the
            first target row; must be >= 0.

    Raises:
        ValueError: if ``forecast_lead`` is negative.
    """

    def __init__(self, dataframe, features, sequence_length=336, horizon_length=168, forecast_lead=1):
        if forecast_lead < 0:
            raise ValueError("forecast_lead must be non-negative")
        self.features = features
        self.forecast_lead = forecast_lead
        self.sequence_length = sequence_length
        self.horizon_length = horizon_length
        values = torch.tensor(dataframe[features].values).float()
        # Slice with an explicit stop instead of `[:-forecast_lead]`: a
        # negative-zero stop (`[:-0]`) would select nothing when the lead is 0.
        n_inputs = max(values.shape[0] - forecast_lead, 0)
        self.X = values[:n_inputs]
        self.y = values[forecast_lead:]

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        n_samples = len(self)
        if i < 0:
            # Standard Python semantics for negative positions.
            i += n_samples
        if not 0 <= i < n_samples:
            # Raising here is what lets sequence-style iteration over the
            # dataset stop instead of yielding empty slices forever.
            raise IndexError(f"index {i} is out of range for {n_samples} samples")

        i_start = i - self.sequence_length + 1
        if i_start >= 0:
            x = self.X[i_start:(i + 1), :]
        else:
            # Not enough history yet: repeat the first row to fill the window.
            padding = self.X[0].repeat(-i_start, 1)
            x = torch.cat((padding, self.X[0:(i + 1), :]), 0)
        return x, self.y[i:(i + self.horizon_length)]

In [6]:
# Short windows (10 input rows -> 2 target rows, one row ahead) keep this
# experiment quick to iterate on.
test_dataset = SequenceDataset(
    df_test,
    features=features,
    sequence_length=10,
    horizon_length=2,
    forecast_lead=1,
)

In [7]:
# batch_size=1 with shuffle=False: one window per step, in dataset order.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [8]:
def get_predictions(X, horizon_length, net=None):
    """Roll the model forward ``horizon_length`` steps, feeding predictions back in.

    At every step the model predicts the next row from the current window; the
    oldest row is then dropped and the new prediction appended, so the window
    keeps a constant length and contains each prediction exactly once.

    Args:
        X: input window of shape ``(batch, sequence_length, n_features)``. The
            notebook calls this with ``batch == 1``.
        horizon_length: number of steps to predict; must be >= 1.
        net: module to run. Defaults to the notebook-level ``model``.

    Returns:
        The per-step outputs concatenated along dim 0. For ``batch == 1`` and a
        model returning ``(1, n_features)`` this is ``(horizon_length, n_features)``.

    Raises:
        ValueError: if ``horizon_length`` is smaller than 1.
    """
    if horizon_length < 1:
        raise ValueError("horizon_length must be at least 1")

    net = model if net is None else net
    net.eval()

    window = X
    steps = []
    # Inference only: skip autograd bookkeeping for the whole roll-out.
    with torch.no_grad():
        for _ in range(horizon_length):
            step = net(window)
            steps.append(step)
            # Slide the window: drop the oldest row, append only the newest
            # prediction (as a length-1 time axis).
            window = torch.cat((window[:, 1:, :], step.unsqueeze(1)), 1)
    return torch.cat(steps, 0)

In [10]:
# Roll out a forecast for the first test window only: the `break` stops after
# one batch, so `X`, `y`, `y_out` and `y_pred` below all refer to that batch.
y_pred = None
for X, y in test_loader:
    y_out = get_predictions(X, y.shape[1])
    # Identity check: `== None` on a tensor goes through tensor comparison
    # rather than testing whether anything has been accumulated yet.
    if y_pred is None:
        y_pred = y_out
    else:
        y_pred = torch.cat((y_pred, y_out), 0)
    break

In [11]:
# Shape of the most recent roll-out returned by get_predictions.
y_out.shape

torch.Size([2, 704])

In [12]:
# Shape of the accumulated predictions (same as y_out, since the loop stops
# after a single batch).
y_pred.shape

torch.Size([2, 704])

In [49]:
# Debug check: shape of the predictions once a leading batch axis is added.
y_pred.unsqueeze(0).shape

In [50]:
# Shape of the last input batch drawn from test_loader.
X.shape

In [51]:
# Shape of the target batch that accompanies X.
y.shape

In [52]:
# Display the raw prediction tensor.
y_pred

In [57]:
# Wrap the predictions in a DataFrame labelled with the source frame's columns.
# NOTE(review): assumes df.columns matches `features` in number and order -- confirm.
y_pred_df = pd.DataFrame(y_pred.detach().numpy(), columns=df.columns)

In [58]:
# Drop only the batch axis (dim 0). A bare `squeeze()` would also remove the
# horizon axis whenever horizon_length is 1, leaving a 1-D array that cannot be
# labelled with one column per feature.
# NOTE(review): assumes df.columns matches `features` in number and order -- confirm.
y_df = pd.DataFrame(y.squeeze(0).numpy(), columns=df.columns)

In [62]:
# List the column labels available for plotting.
y_pred_df.columns

In [None]:
# Predicted values for a single column (the original line was cut off
# mid-string, which is a syntax error).
y_pred_df['42 ST-BRYANT PK_ent']

In [64]:
def plot_predictions(y_pred,y_true,station='7 AV'):
    """Overlay predicted and actual values of one station's ``_ent`` column.

    NOTE: this definition replaces the ``plot_predictions`` imported from
    ``utils`` at the top of the notebook.

    Args:
        y_pred: DataFrame of predictions containing the column ``f"{station}_ent"``.
        y_true: DataFrame of actual values containing the same column.
        station: column prefix selecting which series to draw.

    Raises:
        KeyError: if either frame lacks the ``f"{station}_ent"`` column.
    """
    ent = f"{station}_ent"
    fig, ax = plt.subplots(figsize=(15, 5))
    # Both series carry the same column name, so give each an explicit label;
    # otherwise the two lines are indistinguishable in the figure.
    y_pred[ent].plot(ax=ax, alpha=0.7, label="predicted")
    y_true[ent].plot(ax=ax, alpha=0.7, label="actual")
    ax.set_title(ent)
    ax.legend()
    plt.show()

plot_predictions(y_pred_df, y_df, station='42 ST-BRYANT PK')

In [None]:
# Score the model on the test loader with mean-squared-error loss.
# NOTE(review): score_model comes from utils; how it compares the model output
# with the multi-row targets produced by test_dataset is not visible here -- confirm.
loss_function = nn.MSELoss()
test_score = score_model(test_loader, model, loss_function)  
test_score

In [None]:
def unnormalize(df, stats_file="location_means_stds.json"):
    """Return a copy of ``df`` with every column mapped back to original units.

    The statistics file is a flat JSON object holding ``"<column>_mean"`` and
    ``"<column>_std"`` entries. Each column is restored as
    ``value * std + mean``.

    Args:
        df: DataFrame of normalized values. It is left untouched, so calling
            this twice on the same frame does not rescale it twice.
        stats_file: path to the JSON statistics file.

    Returns:
        A new DataFrame with the same index and columns as ``df``.

    Raises:
        KeyError: if a column of ``df`` has no mean or std entry in the file.
    """
    # Context manager guarantees the file handle is closed.
    with open(stats_file, "rb") as fh:
        feat_stats = json.load(fh)

    # Strip the suffix so the dictionaries are keyed by column name.
    means = {key[:-5]: value for key, value in feat_stats.items() if key.endswith("_mean")}
    stds = {key[:-4]: value for key, value in feat_stats.items() if key.endswith("_std")}

    restored = df.copy()
    for col in restored.columns:
        restored[col] = (restored[col] * stds[col]) + means[col]

    return restored

In [None]:
def get_predictions(data_loader, model, features):
    """Run ``predict`` over ``data_loader`` and return de-normalized predictions.

    NOTE: this reuses the name of the roll-out helper defined earlier in the
    notebook and replaces it from this cell onward.

    Args:
        data_loader: loader handed to ``predict``.
        model: model handed to ``predict``.
        features: column labels for the resulting frame.

    Returns:
        DataFrame of predictions after passing through ``unnormalize``.
    """
    predictions = pd.DataFrame(predict(data_loader, model), columns=features)
    # TODO: attach an hourly timestamp index to each row.
    return unnormalize(predictions)

df_out = get_predictions(test_loader, model, features)
df_out.head()

In [None]:
# Pass a copy so that de-normalizing can never alter `df_test` itself
# (re-running this cell would otherwise rescale already-rescaled values).
# The final row is dropped afterwards.
# NOTE(review): with forecast_lead=1 the dataset's targets are rows 1..n-1,
# whereas this keeps rows 0..n-2 -- confirm which rows the predictions should
# be compared against.
y_true = unnormalize(df_test.copy()).iloc[:-1,:]

In [None]:
# Reuse the ground-truth index so predictions and actuals share row labels.
# Both frames must have the same number of rows for this assignment to succeed.
df_out.index = y_true.index

In [None]:
# Plot the predicted series for the '7 AV_ent' column.
df_out['7 AV_ent'].plot(figsize=(10,5))

## Notes
- Currently, the model makes only one-sample (single-step) predictions, each based on actual values taken from the training or test dataset. It still needs to be set up to make stepwise predictions for an entire week, using the prior two weeks as model input.
    - This can be used for model inference.
    - It will give model scores closer to what would be seen in inference, because each predicted sample will be based on previously predicted samples rather than on the actual values supplied in the training dataset.

In [None]:
def get_predictions2(data_loader, model, features,output_length=168):
    """Draft of the stepwise predictor described in the notes above.

    For now this mirrors ``get_predictions``: it runs ``predict`` over the
    loader, labels the result with ``features`` and de-normalizes it.

    Args:
        data_loader: loader handed to ``predict``.
        model: model handed to ``predict``.
        features: column labels for the resulting frame.
        output_length: intended number of rows to forecast.
            TODO: not used yet -- the stepwise roll-out is still to be written.

    Returns:
        DataFrame of predictions after passing through ``unnormalize``.
    """
    y_pred = predict(data_loader, model)
    df_out = pd.DataFrame(y_pred,columns=features)
    # need to rescale prediction values
    df_out = unnormalize(df_out)
    # need to add hour timestamp index for each row
    return df_out

# Call the function defined in this cell (the original invoked the older
# `get_predictions`, so `get_predictions2` was never exercised).
df_out = get_predictions2(test_loader, model, features)
df_out.head()

In [None]:
def predict2(data_loader, model):
    """Run ``model`` over every batch of ``data_loader`` and stack the outputs.

    Args:
        data_loader: iterable yielding ``(inputs, targets)`` pairs; the targets
            are ignored.
        model: module called on each input batch, switched to eval mode first.

    Returns:
        All batch outputs concatenated along dim 0, or an empty tensor when the
        loader yields nothing.
    """
    model.eval()
    # Seed with an empty tensor so an empty loader still yields a tensor.
    pieces = [torch.tensor([])]
    with torch.no_grad():
        for inputs, _ in data_loader:
            pieces.append(model(inputs))
    return torch.cat(pieces, 0)