In [3]:
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

In [2]:
# Folder that holds the competition CSV files; the read and write paths below
# and in later cells are built from it.
comp_dir = Path("datasets")

# Holiday / event calendar, indexed by daily Period.
# `infer_datetime_format` is deliberately not passed: pandas deprecated it in
# 2.0 (it no longer has any effect there and only emits a warning), and the
# parsed values are the same without it.
holidays_events = (
    pd.read_csv(
        comp_dir / "holidays_events.csv",
        dtype={
            "type": "category",
            "locale": "category",
            "locale_name": "category",
            "description": "category",
            "transferred": "bool",
        },
        parse_dates=["date"],
    )
    .set_index("date")
    .to_period("D")
)

# Training sales: only the key columns plus `sales` are read from the archive.
store_sales = pd.read_csv(
    comp_dir / "train.csv.zip",
    usecols=["store_nbr", "family", "date", "sales"],
    dtype={
        "store_nbr": "category",
        "family": "category",
        "sales": "float32",
    },
    parse_dates=["date"],
    compression="zip",
)
# Convert timestamps to daily Periods before moving them into the index.
store_sales["date"] = store_sales["date"].dt.to_period("D")
store_sales = store_sales.set_index(["store_nbr", "family", "date"]).sort_index()

# Mean of `sales` per date over all (store_nbr, family) rows, squeezed to a
# Series and restricted to 2017.
average_sales = store_sales.groupby("date").mean().squeeze().loc["2017"]


In [4]:
# Target matrix: the (store_nbr, family) index levels are pivoted into columns,
# leaving one row per date; only 2017 is kept.
y = store_sales.unstack(["store_nbr", "family"]).loc["2017"]

# Create training data
# Monthly calendar Fourier terms (order 3) are supplied as additional terms on
# top of the constant, the order-2 trend and the seasonal terms.
fourier = CalendarFourier(freq="M", order=3)
dp = DeterministicProcess(
    additional_terms=[fourier],
    index=y.index,
    constant=True,
    order=2,
    seasonal=True,
    drop=True,
)
X = dp.in_sample()
# Optional New Year indicator, currently disabled:
# X["NewYear"] = X.index.dayofyear == 1

# The design matrix is built with constant=True, so the estimator's own
# intercept is switched off.
model = LinearRegression(fit_intercept=False).fit(X, y)

# In-sample fitted values, laid out exactly like `y`.
y_pred = pd.DataFrame(data=model.predict(X), columns=y.columns, index=X.index)


In [5]:
# Test rows, indexed by (store_nbr, family, date) like the training frame.
# `infer_datetime_format` is deliberately not passed: pandas deprecated it in
# 2.0 (it no longer has any effect there and only emits a warning), and the
# parsed values are the same without it.
df_test = pd.read_csv(
    comp_dir / "test.csv",
    dtype={
        "store_nbr": "category",
        "family": "category",
        "onpromotion": "uint32",
    },
    parse_dates=["date"],
)
# Convert timestamps to daily Periods before moving them into the index.
df_test["date"] = df_test["date"].dt.to_period("D")
df_test = df_test.set_index(["store_nbr", "family", "date"]).sort_index()

# Create features for test set
# Extends the deterministic terms 16 steps past the end of the training index.
# NOTE(review): 16 is presumably the number of days covered by test.csv — confirm.
X_test = dp.out_of_sample(steps=16)
# Name the index so it lines up with the `date` level of df_test when the
# predictions are joined to the test ids.
X_test.index.name = "date"
# Optional New Year indicator, currently disabled:
# X_test["NewYear"] = X_test.index.dayofyear == 1


In [6]:
# Build the submission table in one chain:
#   1. predict on the out-of-sample features, using the same column layout as `y`;
#   2. stack the store_nbr / family column levels back into the index (long format);
#   3. attach the test-set `id` by joining on the shared index levels;
#   4. keep only the `id` and `sales` columns, in that order.
# NOTE(review): a linear model can produce negative sales values — confirm
# whether predictions should be clipped at zero before submitting.
y_submit = (
    pd.DataFrame(
        data=model.predict(X_test),
        columns=y.columns,
        index=X_test.index,
    )
    .stack(["store_nbr", "family"])
    .join(df_test["id"])
    .reindex(columns=["id", "sales"])
)

# The index is not written; only the `id` and `sales` columns reach the file.
y_submit.to_csv(comp_dir / "submission.csv", index=False)
