In [218]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess


In [219]:
train = pd.read_csv(
    "datasets/train.csv.zip",
    compression="zip",
    parse_dates=["date"],
    usecols=["store_nbr", "family", "date", "sales"],
    dtype={"store_nbr": "category", "family": "category", "sales": "float32"},
    infer_datetime_format=True,
)
train.date = train.date.dt.to_period("D")
train = train.set_index(["store_nbr", "family", "date"]).sort_index()

In [220]:
oil = pd.read_csv("datasets/oil.csv", index_col="date", parse_dates=["date"])
oil["price"] = oil["dcoilwtico"]
oil.drop("dcoilwtico", inplace=True, axis="columns")
oil.price.fillna(method="bfill", inplace=True)


In [221]:
y = train.unstack(["store_nbr", "family"])
fourier = CalendarFourier(freq="M", order=3)
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=2,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)

In [222]:
X = dp.in_sample()
# X.join(oil, on="date", how="left", rsuffix="oil")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

In [223]:
# start_date = datetime(2013, 1, 1)
# train["time"] = (train.date - start_date).dt.days
# train["day_of_week"] = train.date.dt.dayofweek
# train["day_of_month"] = train.date.dt.day
# train["month"] = train.date.dt.month
# train["category"] = encoder.transform(train.family)
# train.drop(["price"], axis="columns", inplace=True, errors="ignore")
# train = train.join(oil, on="date", how="left", rsuffix="oil")
# train.price.fillna(method="bfill", inplace=True)


In [224]:
# X = train.drop(["sales", "date", "family"], axis="columns")
# y = train.sales

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

In [225]:
model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
# model = LinearRegression(normalize=True, n_jobs=-1)
model.fit(X_train, y_train)


RandomForestRegressor(n_jobs=-1)

In [226]:
y_pred = model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred)
best = 202496.51205666756
print(f"Error is {rmse}, which is {'worse' if best<rmse else 'better'}")


Error is 276385.66750825004, which is worse


In [227]:
# test["time"] = (test.date - start_date).dt.days
# test["category"] = encoder.transform(test.family)
# test["day_of_week"] = test.date.dt.dayofweek
# test["day_of_month"] = test.date.dt.day
# test["month"] = test.date.dt.month
# test.drop(["price"], axis="columns", inplace=True, errors="ignore")
# test = test.join(oil, on="date", how="left", rsuffix="oil")
# test.price.fillna(method="bfill", inplace=True)
# X_test = test.drop(["date", "family"], axis="columns")
model.fit(X, y)
X_test = dp.out_of_sample(16)
# X_test.join(oil, how="left", rsuffix="oil")
X_test.index.name = "date"

In [228]:
test = pd.read_csv(
    "datasets/test.csv",
    parse_dates=["date"],
    dtype={
        "store_nbr": "category",
        "family": "category",
        "onpromotion": "uint32",
    },
    infer_datetime_format=True,
)
test.date = test.date.dt.to_period("D")
test = test.set_index(["store_nbr", "family", "date"]).sort_index()

In [229]:
y_test = model.predict(X_test)
submission = pd.DataFrame(y_test, index=X_test.index, columns=y.columns)
submission = submission.stack(["store_nbr", "family"])
submission = submission.join(test.id).reindex(columns=["id", "sales"])
submission.to_csv("datasets/submission.csv", index=False)
