In [2]:
from __future__ import annotations

import ast
import typing as T
import collections.abc as C

import numpy as np
import pandas as pd
import altair as alt
from prophet import Prophet
from prophet.plot import add_changepoints_to_plot

In [3]:
def load_dataset(filepath: str) -> pd.DataFrame:
    """Read a CSV file and expand its packed ``keys``/``values`` columns.

    The third CSV column (position 2) is parsed as dates and becomes the
    sorted index. Each row's ``keys`` and ``values`` cells hold the text of a
    Python literal; the first element of each literal is taken, and the two
    are zipped into one column per key. The string ``"null"`` becomes
    ``pd.NA``; every other value is converted with ``float``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A frame with the ``id`` column followed by one column per key, after
        ``convert_dtypes()``.
    """

    def parse_values(row: pd.Series) -> pd.Series:
        # Both cells are literal text; only the first element of each is used.
        names = ast.literal_eval(row.loc["keys"])[0]
        raw_items = ast.literal_eval(row.loc["values"])[0]
        # Kept as a generator so conversion stays lazy and stops with zip().
        converted = (
            pd.NA if item == "null" else float(item)
            for item in raw_items
        )
        return pd.Series(dict(zip(names, converted)))

    raw = pd.read_csv(filepath, parse_dates=[2], index_col=2).sort_index()
    expanded = raw.apply(parse_values, axis=1)
    combined = pd.concat([raw[["id"]], expanded], axis=1)
    return combined.convert_dtypes()

In [4]:
def fit_predict(
    df: pd.DataFrame, interval_width: float = 0.99, changepoint_range: float = 0.8
):
    """Fit a Prophet model on ``df`` and predict over the same rows.

    Args:
        df: Frame passed to ``Prophet.fit`` and ``Prophet.predict``; its ``y``
            column is also copied into the result.
        interval_width: Forwarded to ``Prophet(interval_width=...)``.
        changepoint_range: Forwarded to ``Prophet(changepoint_range=...)``.

    Returns:
        The forecast frame returned by ``predict`` with an extra ``fact``
        column holding the observed ``y`` values.
    """
    # Both settings used to be accepted but never handed to Prophet, so the
    # model always ran with the library defaults.
    model = Prophet(
        interval_width=interval_width,
        changepoint_range=changepoint_range,
    ).fit(df)
    forecast = model.predict(df)
    # Index is dropped so the observed values attach by position.
    forecast["fact"] = df["y"].reset_index(drop=True)
    return forecast

In [5]:
def detect_anomalies(forecast: pd.DataFrame):
    """Flag observations that fall outside the forecast band.

    Args:
        forecast: Frame containing at least ``ds``, ``trend``, ``yhat``,
            ``yhat_lower``, ``yhat_upper`` and ``fact``.

    Returns:
        A copy of those six columns plus:

        * ``anomaly`` -- ``1`` where ``fact > yhat_upper``, ``-1`` where
          ``fact < yhat_lower``, otherwise ``0``.
        * ``importance`` -- distance from the violated bound divided by
          ``fact`` for flagged rows, ``0.0`` elsewhere.
    """
    kept_columns = ["ds", "trend", "yhat", "yhat_lower", "yhat_upper", "fact"]
    out = forecast[kept_columns].copy()

    observed = out["fact"]
    upper = out["yhat_upper"]
    lower = out["yhat_lower"]

    # The lower-bound check runs last, so it wins if both conditions match.
    out["anomaly"] = 0
    out.loc[observed > upper, "anomaly"] = 1
    out.loc[observed < lower, "anomaly"] = -1

    # NOTE(review): the ratio divides by `fact`; a flagged row whose fact is 0
    # gives a non-finite importance -- confirm the inputs never hit that.
    out["importance"] = 0.0
    is_high = out["anomaly"] == 1
    is_low = out["anomaly"] == -1
    out.loc[is_high, "importance"] = (observed - upper) / observed
    out.loc[is_low, "importance"] = (lower - observed) / observed

    return out

In [6]:
def plot_anomalies(forecasted: pd.DataFrame):
    """Build a layered Altair chart of the forecast band and observations.

    Three layers are stacked in this order: the ``yhat_lower``..``yhat_upper``
    area, the rows with ``anomaly == 0`` as small black circles, and the
    remaining rows as red circles sized by ``importance``.

    Args:
        forecasted: Frame with ``ds``, ``fact``, ``yhat_lower``,
            ``yhat_upper``, ``anomaly`` and ``importance`` columns.

    Returns:
        The layered chart (870x450, title font size 20).
    """
    tooltip_fields = ("ds", "fact", "yhat_lower", "yhat_upper")

    # Layer 1: forecast band.
    band = alt.Chart(forecasted).mark_area(interpolate="basis", color="#7FC97F")
    band = band.encode(
        x=alt.X("ds:T", title="date"),
        y="yhat_upper",
        y2="yhat_lower",
        tooltip=list(tooltip_fields),
    )
    band = band.interactive().properties(title="Anomaly Detection")

    # Layer 2: observations inside the band.
    inside = forecasted[forecasted["anomaly"] == 0]
    regular = alt.Chart(inside).mark_circle(size=15, opacity=0.7, color="Black")
    regular = regular.encode(
        x="ds:T",
        y=alt.Y("fact", title="Sales"),
        tooltip=list(tooltip_fields),
    )
    regular = regular.interactive()

    # Layer 3: flagged observations, marker size driven by importance.
    outside = forecasted[forecasted["anomaly"] != 0]
    flagged = alt.Chart(outside).mark_circle(size=30, color="Red")
    flagged = flagged.encode(
        x="ds:T",
        y=alt.Y("fact", title="Sales"),
        tooltip=list(tooltip_fields),
        size=alt.Size("importance", legend=None),
    )
    flagged = flagged.interactive()

    layered = alt.layer(band, regular, flagged)
    return layered.properties(width=870, height=450).configure_title(fontSize=20)

In [8]:
# Load the training split; the path is relative to the working directory.
X_train = load_dataset("data/train999.csv")

# Two-column frame handed to Prophet: "ds" from the index, "y" from a single
# feature column ("meteo_humidity").
df = pd.DataFrame({
    "ds": X_train.index,
    "y": X_train["meteo_humidity"].values,
})

# Same fit / predict / attach-fact sequence as fit_predict, written inline
# with a default-configured Prophet model.
m = Prophet().fit(df)
forecast = m.predict(df)
# Observed values are attached by position (index dropped).
forecast["fact"] = df["y"].reset_index(drop=True)

# Flag points outside the forecast band; the chart is the last expression of
# the cell, so its value is what gets rendered.
pred = detect_anomalies(forecast)
plot_anomalies(pred)

# fig = m.plot(forecast)
# add_changepoints_to_plot(fig.gca(), m, forecast)

00:38:29 - cmdstanpy - INFO - Chain [1] start processing
00:38:29 - cmdstanpy - INFO - Chain [1] done processing


In [59]:
def run_all_cols(dataset: pd.DataFrame):
    """Run the Prophet anomaly check on every column except ``id``.

    For each column a default Prophet model is fitted on a frame whose ``ds``
    is the dataset index and whose ``y`` is the column's values. The resulting
    flags are stored as ``anomaly_<column>``, with ``-1`` and ``1`` both
    mapped to ``1``.

    Args:
        dataset: Input frame; it is copied, not modified.

    Returns:
        A copy of ``dataset`` with one ``anomaly_<column>`` column added per
        processed column.
    """
    result = dataset.copy()
    for column, values in dataset.items():
        if column != "id":
            frame = pd.DataFrame({
                "ds": dataset.index,
                "y": values.values,
            })
            model = Prophet().fit(frame)
            forecast = model.predict(frame)
            forecast["fact"] = frame["y"].reset_index(drop=True)
            pred = detect_anomalies(forecast)
            # NOTE(review): the chart built here is discarded -- it is neither
            # returned nor displayed. Confirm whether that is intended.
            plot_anomalies(pred)
            # Direction is dropped: both -1 and 1 are recorded as 1.
            flags = pred[['anomaly']].replace({-1: 1, 1: 1})
            result[[f"anomaly_{column}"]] = flags.values
    return result

In [55]:
def compile_results(df: pd.DataFrame):
    """Combine the per-column anomaly flags into one string column.

    Every column whose name starts with ``anomaly_`` is included. For each
    row the values are joined, in column order, into text of the form
    ``"[a, b, c]"``.

    Args:
        df: Frame with an ``id`` column and the ``anomaly_*`` columns. It is
            left unmodified.

    Returns:
        A new frame indexed by ``id`` with an added ``target`` column.
    """
    target_cols = [x for x in df.columns if x.startswith("anomaly_")]
    target = df[target_cols].agg(
        lambda x: "[%s]" % ", ".join(map(str, x)), axis=1
    )
    # assign() builds a new frame; assigning df["target"] directly would add
    # the column to the caller's DataFrame as a side effect.
    return df.assign(target=target).set_index("id")

In [60]:
# Run the per-column pipeline on the test split and collapse the flags into
# one "target" string per id.
X_test = load_dataset("data/test999.csv")
result = run_all_cols(X_test)
compiled = compile_results(result)
# Only the "target" column (indexed by id) is written out.
# NOTE(review): to_csv needs the out/prophet/ directory to exist already --
# confirm it is created elsewhere.
compiled["target"].to_csv("out/prophet/prophet-targets-test.csv")

00:31:08 - cmdstanpy - INFO - Chain [1] start processing
00:31:08 - cmdstanpy - INFO - Chain [1] done processing
[   0.3713487158734287,   0.34067137971470635,   0.29439792850142893,
  0.021779141067912677,   0.05248347325000158,   0.29965348337106174,
     0.590312409492675,   0.34028630668002796,   0.33814696797294025,
    0.3774000611493973,    0.3336237406410307,   0.31226493034092595,
    0.3249084304099979,   0.07739297205197393,    0.3789266307370658,
    0.3920245213168652,    0.3705968862018862,  0.010062914080300245,
  0.042316199637524865,   0.34127045131328876,   0.34197067754970445,
    0.5300731060347452,  0.049323228509148054, 0.0013251444117997613,
   0.33638275995443334,    0.5901035316037552,    0.3100044141940261,
      0.36395696312703,    0.5848880588715352,  0.004433685558121403,
    0.2854712360043276,   0.32945884018608734,    0.5845102920585601,
   0.28378379395805897,   0.33806012086518583,    0.3536644571590469,
   0.36799443800074094,     0.363097316456946, 