In [None]:
from datetime import datetime, timedelta

import torch
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from neuralforecast import NeuralForecast
from neuralforecast.models import NBEATS, NHITS, LSTM
from datasetsforecast.losses import mse, mae, rmse
from datasetsforecast.evaluation import accuracy

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Global plotting defaults: figure size and resolution first, then the ggplot style.
plt.rcParams.update(
    {
        "figure.figsize": [12, 8],
        "figure.dpi": 100,
    }
)
plt.style.use("ggplot")

In [None]:
# Show whether PyTorch can see a CUDA device (the boolean is the cell output).
torch.cuda.is_available()

In [None]:
# Forecast horizon in days: used as the models' `h` in the one-shot experiment
# and to derive the train/test cutoff date.
DAYS_TO_FORECAST = 30

### Data

In [None]:
# Load every parquet file matching the glob below into a single polars frame.
stockprices_df = pl.read_parquet("../data/stock_prices/**/*.parquet")

In [None]:
# Preview the first rows of the raw price data.
stockprices_df.head()

In [None]:
# Closing price over time for three example tickers (converted to pandas for seaborn).
filtered_pdf = (
    stockprices_df
    .filter(pl.col("Ticker").is_in(["AAPL", "AMZN", "NFLX"]))
    .to_pandas()
)

sns.lineplot(data=filtered_pdf, x="Date", y="Close", hue="Ticker")

### Preprocessing

In [None]:
# Reshape to the long format used downstream: one row per (ds, unique_id)
# with the closing price as the target column `y`.
preprocessed_df = stockprices_df.select(
    ds=pl.col("Date").cast(pl.Date),
    unique_id=pl.col("Ticker"),
    y=pl.col("Close"),
)

preprocessed_df.head()

In [None]:
# Chronological hold-out split: everything up to and including `cutoff_ds` is
# used for training, everything strictly after it is held out for evaluation.
max_ds = preprocessed_df.select(pl.max("ds")).item()
# NOTE(review): the `+ 1` leaves DAYS_TO_FORECAST + 1 calendar days after the
# cutoff, one more than the models' horizon - confirm this is intended.
cutoff_ds = max_ds - timedelta(days=DAYS_TO_FORECAST + 1)

train_df = preprocessed_df.filter(pl.col("ds") <= cutoff_ds)
# Strictly greater than the cutoff: with `>=` the cutoff day's rows would be
# present in both the training and the test set.
test_df = preprocessed_df.filter(pl.col("ds") > cutoff_ds)

### Forecast all the days from t=0

In [None]:
# Three models, each forecasting the whole DAYS_TO_FORECAST horizon in one shot
# from the end of the training data.
models = [
    LSTM(
        h=DAYS_TO_FORECAST,
        max_steps=1000,
        scaler_type="standard",
        encoder_hidden_size=64,
        decoder_hidden_size=64,
    ),
    NHITS(
        h=DAYS_TO_FORECAST,
        input_size=60,
        max_steps=1000,
        n_freq_downsample=[2, 1, 1],
    ),
    NBEATS(
        h=DAYS_TO_FORECAST,
        input_size=60,
        max_steps=1000,
        stack_types=["trend", "seasonality", "identity"],
        scaler_type="standard",
    ),
]
# "D" is the canonical pandas alias for calendar-day frequency; the lowercase
# "d" spelling is deprecated in recent pandas releases.
# NOTE(review): forecast timestamps are generated on a calendar-day grid -
# confirm that matches the dates present in the price data.
nf = NeuralForecast(models=models, freq="D")
nf.fit(df=train_df.to_pandas())

In [None]:
# Forecast the horizon configured on the models, starting after the training data.
predict_df = nf.predict()
# Depending on the installed neuralforecast version, `unique_id` comes back
# either as the index or as a regular column. Only reset the index in the
# former case; otherwise a spurious "index" column would be added and later be
# mistaken for a model column.
if "unique_id" not in predict_df.columns:
    predict_df = predict_df.reset_index()
predict_df.head()

In [None]:
# Pair each forecast with the held-out actual value for the same date and series.
# The inner join keeps only forecast rows that have a matching test observation.
test_validation_df = (
    pl.from_pandas(predict_df)
    .with_columns(pl.col("ds").cast(pl.Date))
    .join(test_df, on=["ds", "unique_id"], how="inner")
)
test_validation_df.head()

In [None]:
# Training history plus the evaluated forecasts in one frame; rows are aligned
# on the columns the two frames share.
timeseries_with_forecasts_df = pl.concat(
    [
        train_df,
        test_validation_df,
    ],
    how="align",
)

# NOTE: this preview is not rendered - only a cell's last expression is displayed.
timeseries_with_forecasts_df.head()

# Every column that is not an identifier or the target is a model's forecast.
# A list comprehension (rather than a set difference) keeps the frame's column
# order, so plots and aggregations are ordered the same way on every run.
model_names = [
    column_name
    for column_name in timeseries_with_forecasts_df.columns
    if column_name not in ("ds", "unique_id", "y")
]
model_names

In [None]:
# One figure per ticker, so the series do not pile up on a single shared Axes.
for ticker in ["AAPL", "AMZN", "NFLX"]:
    # Keep only this ticker, starting one week before the train/test cutoff.
    ticker_df = timeseries_with_forecasts_df.filter(
        pl.col("unique_id") == ticker
    ).filter(pl.col("ds") >= cutoff_ds - timedelta(days=7))

    # Index by date so the x-axis shows dates instead of row positions.
    ticker_pdf = ticker_df.sort("ds").to_pandas().set_index("ds")

    fig, ax = plt.subplots()
    sns.lineplot(data=ticker_pdf[["y", *model_names]], ax=ax)
    ax.set(title=f"{ticker}: actual vs. forecast", xlabel="Date", ylabel="Close")

In [None]:
# Score every model's forecast per series with MSE, MAE and RMSE.
# NOTE(review): `accuracy` receives a polars frame here while its result is
# handled as a pandas frame below - confirm the installed datasetsforecast
# version accepts polars input.
evaluation_pdf = accuracy(
    test_validation_df,
    [mse, mae, rmse],
    agg_by=["unique_id"],
)

# The best model for a row is the score column holding the smallest value.
model_scores_pdf = evaluation_pdf.drop(columns=["metric", "unique_id"])
evaluation_pdf["best_model"] = model_scores_pdf.idxmin(axis=1)

evaluation_df = pl.from_pandas(evaluation_pdf)
evaluation_df.head()

In [None]:
# Scores for the three example tickers only.
evaluation_df.filter(pl.col("unique_id").is_in(["AAPL", "AMZN", "NFLX"]))

In [None]:
# Average each model's score over all series, one row per metric.
evaluation_df.group_by("metric").agg(
    [pl.col(name).mean() for name in model_names]
)

### Forecast days incrementally

In [None]:
# Short-horizon (h=5) models for the incremental experiment. Early stopping is
# enabled, so fitting requires a validation set (`val_size` in `nf.fit`).
models = [
    # TODO: Fix to work with cross-validation
    #LSTM(
    #    h=5,
    #    max_steps=1000,
    #    scaler_type="standard",
    #    encoder_hidden_size=64,
    #    decoder_hidden_size=64,
    #    early_stop_patience_steps=2,
    #),
    NHITS(
        h=5,
        input_size=60,
        max_steps=1000,
        n_freq_downsample=[2, 1, 1],
        early_stop_patience_steps=2,
    ),
    NBEATS(
        h=5,
        input_size=30,
        max_steps=1000,
        stack_types=["trend", "seasonality", "identity"],
        scaler_type="standard",
        early_stop_patience_steps=2,
    ),
]
# "D" is the canonical pandas alias for calendar-day frequency; the lowercase
# "d" spelling is deprecated in recent pandas releases.
nf = NeuralForecast(models=models, freq="D")

In [None]:
# Walk forward through the test period:
#   1. fit on everything observed so far,
#   2. forecast the next few days,
#   3. keep the forecasts up to and including the next day that has actual data,
#   4. add that day's actuals to the training data and repeat.
temp_training_df = train_df
prediction_chunks = []

# Passed to `nf.fit` as `val_size`.
# NOTE(review): presumably the number of most recent time steps per series held
# out for validation / early stopping - confirm every series is long enough.
validation_set_size = 5_000

# NOTE(review): the loop assumes every series has a row on each date present in
# `test_df`; a series that stops earlier would keep `next_predict_ds` from
# advancing - verify against the data.
while True:
    nf.fit(df=temp_training_df.to_pandas(), val_size=validation_set_size)

    forecast_pdf = nf.predict()
    # Depending on the installed neuralforecast version, `unique_id` is either
    # the index or a regular column; only reset the index in the former case so
    # no spurious "index" column is created.
    if "unique_id" not in forecast_pdf.columns:
        forecast_pdf = forecast_pdf.reset_index()
    predict_df = pl.from_pandas(forecast_pdf).with_columns(
        pl.col("ds").cast(pl.Date)
    )

    next_predict_ds = predict_df.select(pl.col("ds").min()).item()
    # First date with actual data on or after the first forecast date. Using
    # `>=` (not `>`) matters: with a strict comparison the actuals of the first
    # forecast day are skipped, so that day is never added to the training data
    # and the following day is never forecast.
    next_data_ds = (
        test_df.filter(pl.col("ds") >= next_predict_ds)
        .select(pl.col("ds").min())
        .item()
    )
    print(f"Next predicted for {next_predict_ds}")
    print(f"Next available ds is for {next_data_ds}")

    # No actuals left to compare against or to learn from: stop.
    if next_data_ds is None:
        break

    # Keep the forecasts up to and including the next observed date, so every
    # date with actuals gets a forecast made before it was observed.
    prediction_chunks.append(predict_df.filter(pl.col("ds") <= next_data_ds))

    next_training_sample_df = test_df.filter(pl.col("ds") == next_data_ds)
    assert not next_training_sample_df.is_empty()
    temp_training_df = pl.concat([temp_training_df, next_training_sample_df], how="align")

# All chunks share one schema and cover disjoint dates, so a single vertical
# concat is enough (and avoids re-concatenating inside the loop).
predictions_df = pl.concat(prediction_chunks, how="vertical")
predictions_df.head()

In [None]:
# Pair each incremental forecast with the held-out actual for the same date and
# series; the inner join drops forecasts that have no matching observation.
test_validation_df = predictions_df.join(
    test_df,
    on=["ds", "unique_id"],
    how="inner",
)
test_validation_df.head()

In [None]:
# Training history plus the evaluated incremental forecasts in one frame; rows
# are aligned on the columns the two frames share.
timeseries_with_forecasts_df = pl.concat(
    [
        train_df,
        test_validation_df,
    ],
    how="align",
)

# NOTE: this preview is not rendered - only a cell's last expression is displayed.
timeseries_with_forecasts_df.head()

# Every column that is not an identifier or the target is a model's forecast.
# A list comprehension (rather than a set difference) keeps the frame's column
# order, so plots and aggregations are ordered the same way on every run.
model_names = [
    column_name
    for column_name in timeseries_with_forecasts_df.columns
    if column_name not in ("ds", "unique_id", "y")
]
model_names

In [None]:
# One figure per ticker, so the series do not pile up on a single shared Axes.
for ticker in ["AAPL", "AMZN", "NFLX"]:
    # Keep only this ticker, starting one week before the train/test cutoff.
    ticker_df = timeseries_with_forecasts_df.filter(
        pl.col("unique_id") == ticker
    ).filter(pl.col("ds") >= cutoff_ds - timedelta(days=7))

    # Index by date so the x-axis shows dates instead of row positions.
    ticker_pdf = ticker_df.sort("ds").to_pandas().set_index("ds")

    fig, ax = plt.subplots()
    sns.lineplot(data=ticker_pdf[["y", *model_names]], ax=ax)
    ax.set(
        title=f"{ticker}: actual vs. incremental forecast",
        xlabel="Date",
        ylabel="Close",
    )

In [None]:
# Score every model's incremental forecast per series with MSE, MAE and RMSE.
# NOTE(review): `accuracy` receives a polars frame here while its result is
# handled as a pandas frame below - confirm the installed datasetsforecast
# version accepts polars input.
evaluation_pdf = accuracy(
    test_validation_df,
    [mse, mae, rmse],
    agg_by=["unique_id"],
)

# The best model for a row is the score column holding the smallest value.
model_scores_pdf = evaluation_pdf.drop(columns=["metric", "unique_id"])
evaluation_pdf["best_model"] = model_scores_pdf.idxmin(axis=1)

evaluation_df = pl.from_pandas(evaluation_pdf)
evaluation_df.head()

In [None]:
# Average each model's score over all series, one row per metric.
evaluation_df.group_by("metric").agg(
    [pl.col(name).mean() for name in model_names]
)