In [None]:
import mlflow
from mlflow.entities import ViewType
import pandas as pd
from datetime import datetime
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima.model import ARIMA
from pandas import DataFrame
from sklearn.metrics import mean_squared_error
from math import sqrt
import seaborn as sns

In [None]:
# Name of the MLflow experiment this notebook looks up (or creates) and then
# activates before logging runs.
EXPERIMENT_NAME = "ARIMA Shampoo"

In [None]:
# Point the MLflow client at the tracking server running on this machine.
# 0.0.0.0 is a wildcard *bind* address, not a valid destination: connecting to
# it happens to reach localhost on Linux/macOS but fails on other platforms
# (e.g. Windows), so the loopback address is used instead.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [None]:
# Resolve the id of the experiment named EXPERIMENT_NAME, creating the
# experiment when the lookup returns nothing, then make it the active one.
exp = mlflow.get_experiment_by_name(name=EXPERIMENT_NAME)
if exp is not None:
    # Experiment already exists on the tracking server: reuse its id.
    exp_id = exp.experiment_id
    print(f"Found {EXPERIMENT_NAME} with id {exp_id}")
else:
    print(f"Experiment {EXPERIMENT_NAME} does not exist, creating.")
    exp_id = mlflow.create_experiment(name=EXPERIMENT_NAME)

mlflow.set_experiment(experiment_id=exp_id)

In [None]:
def parser(x):
    """Parse a shortened ``Y-MM`` date label into a ``datetime``.

    The literal ``"190"`` is prepended to ``x`` and the result is parsed with
    the ``%Y-%m`` format, so ``"1-01"`` becomes ``datetime(1901, 1, 1)``.

    Args:
        x: Date label string (presumably the raw month column of the sales
            CSV -- confirm against the data file).

    Returns:
        The parsed ``datetime``.

    Raises:
        TypeError: If ``x`` is not a string.
        ValueError: If the prefixed string does not match ``%Y-%m``.
    """
    prefixed_label = "".join(("190", x))
    return datetime.strptime(prefixed_label, "%Y-%m")

In [None]:
# Apply seaborn's "darkgrid" theme to the plots drawn in the following cells.
sns.set_theme(style="darkgrid")

In [None]:
# Load the sales CSV with its first column as the index.
# `date_parser` is deprecated in pandas.read_csv (since pandas 2.0), so the
# index is read as raw strings and converted explicitly with `parser` instead.
df = pd.read_csv("../data/shampoo-sales.csv", header=0, index_col=0)
# pd.to_datetime guarantees a DatetimeIndex, which the later to_period() cell needs.
df.index = pd.to_datetime(df.index.map(parser))
# Collapse a single-column frame into a Series (despite the name `df`).
df = df.squeeze("columns")

In [None]:
# Preview the first rows of the loaded data as a quick sanity check.
df.head()

In [None]:
# Line plot of the loaded data drawn with seaborn.
sns.lineplot(data=df)

In [None]:
# Same data plotted through pandas' own plotting interface;
# pyplot.show() renders the figure.
df.plot()
pyplot.show()

In [None]:
# Autocorrelation plot of the series.
# NOTE(review): presumably used to choose the AR lag order for the ARIMA
# models fitted below -- confirm.
autocorrelation_plot(df)
pyplot.show()

In [None]:
# Convert the index to monthly periods.
# Guarded so the cell is idempotent: once converted, the index is a PeriodIndex,
# which has no to_period(), so an unguarded re-run would raise AttributeError.
# NOTE(review): presumably done so ARIMA receives an index with an explicit
# monthly frequency -- confirm.
if not isinstance(df.index, pd.PeriodIndex):
    df.index = df.index.to_period("M")

In [None]:
# Fit an ARIMA model with order (p, d, q) = (5, 1, 0) on the whole series.
# `model_fit` is read by the residual-analysis cells that follow.
model = ARIMA(df, order=(5,1,0))
model_fit = model.fit()
# summary of fit model
print(model_fit.summary())

In [None]:
# line plot of residuals
# The fitted model's residuals are wrapped in a DataFrame; `residuals` is
# reused by the density-plot and summary-statistics cells below.
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()

In [None]:
# density plot of residuals (kernel density estimate, via kind='kde')
residuals.plot(kind='kde')
pyplot.show()

In [None]:
# summary stats of residuals, as returned by DataFrame.describe()
print(residuals.describe())

In [None]:
# Optional: uncomment the line below to enable MLflow autologging for statsmodels.

# mlflow.statsmodels.autolog(log_models=True, disable=False, exclusive=False, disable_for_unsupported_versions=False, silent=False, registered_model_name=None)

In [None]:
# Rolling forecast
# Walk-forward evaluation: before each test observation an ARIMA(p, d, q) model
# is refit on everything seen so far, a one-step forecast is made, and the true
# observation is then appended to the history. The whole evaluation is recorded
# as a single MLflow run.
X = df.values
size = int(len(X) * 0.66)  # first 66% of the observations train, the rest test
train, test = X[0:size], X[size:]
history = list(train)  # mutable copy; grows by one observation per step
predictions = []
p, d, q = (6, 2, 0)
with mlflow.start_run() as run:
    for t in test:
        model = ARIMA(history, order=(p, d, q))
        model_fit = model.fit()
        output = model_fit.forecast()
        yhat = output[0]  # forecast() is called without arguments; take its first element
        predictions.append(yhat)
        history.append(t)
        print(f"predicted {yhat}, expected {t}")
    rmse = sqrt(mean_squared_error(test, predictions))
    print(f"Test RMSE: {rmse:.3f}")
    # Plot forecast against actual observations
    test_df = pd.DataFrame({"sale": test, "type": "actual"})
    pred_df = pd.DataFrame({"sale": predictions, "type": "predictions"})
    # After reset_index, "month" holds each row's position within the test window.
    df_both = pd.concat([test_df, pred_df]).reset_index().rename(columns={"index": "month"})
    # Draw on an explicitly created figure instead of relying on pyplot.gcf(),
    # which could return a stale figure left open by an earlier cell.
    forecast_actual_fig, forecast_actual_ax = pyplot.subplots()
    sns.lineplot(x="month", y="sale", data=df_both, hue="type", ax=forecast_actual_ax)
    forecast_actual_ax.set_title(f"Rolling ARIMA({p},{d},{q}) forecast vs. actual")
    # Log stuff
    # Note: the logged model is the fit from the final loop iteration.
    mlflow.statsmodels.log_model(model_fit, "model")
    mlflow.log_metrics({"rmse": rmse})
    mlflow.log_params({"p": p, "d": d, "q": q})
    mlflow.log_figure(figure=forecast_actual_fig, artifact_file="figures/forecast_actual.png")
    mlflow.log_text(text=model_fit.summary().as_text(), artifact_file="stats/summary.txt")
    mlflow.log_dict(dictionary=df_both.to_dict(orient="records"), artifact_file="data/shampoo-sales_forecast.json")