In [0]:
!pip install -q mlflow lightgbm

import pandas as pd
import numpy as np
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
%matplotlib inline

# Directory holding the competition CSV files read below.
R = "/kaggle/input/store-sales-time-series-forecasting"
# MLflow experiment name (passed to mlflow.set_experiment).
ex = "x_expt_01"
# Name later passed to mlflow.register_model for the trained model.
mdl = "mdl_store_sales_demo"

# Track runs in a local file store under the Kaggle working directory.
mlflow.set_tracking_uri("file:///kaggle/working/mlruns")
mlflow.set_experiment(ex)

# Raw inputs: tr/ts = train/test rows, o = oil prices, h = holidays and events,
# st = store metadata, tn = transactions per store and date.
tr = pd.read_csv(f"{R}/train.csv")
ts = pd.read_csv(f"{R}/test.csv")
o  = pd.read_csv(f"{R}/oil.csv")
h  = pd.read_csv(f"{R}/holidays_events.csv")
st = pd.read_csv(f"{R}/stores.csv")
tn = pd.read_csv(f"{R}/transactions.csv")
# NOTE(review): ss is loaded but not referenced anywhere else in the visible script.
ss = pd.read_csv(f"{R}/sample_submission.csv")




# Convert the "date" column of every frame that carries one to datetime64.
for frame in (tr, ts, o, h, tn):
    frame["date"] = pd.to_datetime(frame["date"])

# Calendar features for train and test:
#   dow - day of week (Monday=0 ... Sunday=6)
#   wek - 1 on Saturday/Sunday, else 0
for frame in (tr, ts):
    frame["dow"] = frame["date"].dt.dayofweek
    frame["wek"] = frame["dow"].isin([5, 6]).astype(int)

# Build a gap-free daily oil price series spanning the first train date
# through the last test date.
fd = pd.date_range(start=tr["date"].min(), end=ts["date"].max(), freq="D")
o = o.set_index("date").reindex(fd)
o.index.name = "date"

# Fill missing days from the previous known price, then back-fill any
# leading gap; the median is a last-resort fill for anything still missing.
oil_filled = o["dcoilwtico"].ffill().bfill()
o["dcoilwtico"] = oil_filled.fillna(oil_filled.median())

o = o.reset_index()
o = o.rename(columns={"dcoilwtico": "oil"})

# hol = 1 for every holidays/events row whose type is anything other than
# "Work Day"; h1 keeps one row per date with the maximum flag for that date.
h["hol"] = h["type"].ne("Work Day").astype(int)
h1 = h.groupby("date", as_index=False)["hol"].max()

def f(d):
    """Return 1 if date *d* is a payday, else 0.

    A payday is the 15th or the last day of the month. The previous check
    (`day in [15, 30]`) never flagged the month end in February or in 31-day
    months, and flagged the 30th when the month actually ends on the 31st.
    Per the Store Sales competition data notes, public-sector wages are paid
    on the 15th and on the last day of the month.

    Args:
        d: A date-like value exposing `.day` and supporting addition of a
            `pd.Timedelta` (e.g. `pd.Timestamp`, `datetime.date`).

    Returns:
        int: 1 for a payday, 0 otherwise (including `NaT`).
    """
    if d.day == 15:
        return 1
    # The day after the last day of a month is always the 1st.
    return int((d + pd.Timedelta(days=1)).day == 1)

# sal: per-row flag produced by f() from the row's date.
for frame in (tr, ts):
    frame["sal"] = frame["date"].map(f)

# eq: 1 for dates within 15 days either side of 2016-04-16 (inclusive).
# NOTE(review): presumably the April 2016 earthquake window - confirm.
e = pd.to_datetime("2016-04-16")
half_window = pd.Timedelta(15, "D")
window_start = e - half_window
window_end = e + half_window
for frame in (tr, ts):
    after_start = frame["date"] >= window_start
    before_end = frame["date"] <= window_end
    frame["eq"] = (after_start & before_end).astype(int)

# Attach the daily oil price and the per-date holiday flag to both frames.
tr = tr.merge(o, on="date", how="left").merge(h1, on="date", how="left")
ts = ts.merge(o, on="date", how="left").merge(h1, on="date", how="left")

# Dates absent from the holiday table are treated as non-holidays.
for frame in (tr, ts):
    frame["hol"] = frame["hol"].fillna(0).astype(int)






# Label-encode the store metadata columns: each distinct value (as a string)
# gets an integer code in order of first appearance.
for col in ("city", "state", "type", "cluster"):
    as_text = st[col].astype(str)
    levels = as_text.unique()
    code_of = dict(zip(levels, range(len(levels))))
    st[col] = as_text.map(code_of).astype(int)

# Bring the encoded store attributes onto every train/test row.
tr = tr.merge(st, on="store_nbr", how="left")
ts = ts.merge(st, on="store_nbr", how="left")

# Join transaction counts by store and date; rows with no match get 0.
# NOTE(review): if transactions.csv has no rows for the test dates, this
# feature is 0 for every row of ts while being populated in tr - confirm.
txn_keys = ["store_nbr", "date"]

tr = tr.merge(tn, on=txn_keys, how="left")
tr["transactions"] = tr["transactions"].fillna(0)

ts = ts.merge(tn, on=txn_keys, how="left")
ts["transactions"] = ts["transactions"].fillna(0)

# Zero-sales streak features, computed per (store_nbr, family) series in
# date order:
#   z  - 1 when the row's sales are exactly 0
#   zr - length of the current run of consecutive zero-sales rows, counting
#        this row; 0 on any row with non-zero sales
#   sc - 1 when that run is at least 14 rows long
#
# Vectorised replacement for the previous row-by-row iterrows() loop, which
# is extremely slow on a frame of this size. Results are identical.
tr = tr.sort_values(["store_nbr", "family", "date"])
is_zero = tr["sales"] == 0
tr["z"] = is_zero.astype(int)

series_keys = [tr["store_nbr"], tr["family"]]
# Every non-zero row starts a new "run id" inside its series; the zero rows
# that follow share that id, so a cumulative sum of z within
# (series, run id) yields the running streak length.
run_id = (~is_zero).astype(int).groupby(series_keys, sort=False).cumsum()
tr["zr"] = tr["z"].groupby(series_keys + [run_id], sort=False).cumsum()

tr["sc"] = (tr["zr"] >= 14).astype(int)
# NOTE(review): zr counts the current row, so sc is derived from the same
# row's sales (the target), while the test frame gets a constant 0 here.
# This looks like target leakage into the "sc" feature - confirm intent.
ts["sc"] = 0

# Encode the product family as an integer shared by train and test:
# codes follow order of first appearance across train then test.
for frame in (tr, ts):
    frame["family"] = frame["family"].astype(str)

all_families = pd.concat([tr["family"], ts["family"]]).unique()
fmap = dict(zip(all_families, range(len(all_families))))

for frame in (tr, ts):
    frame["fam"] = frame["family"].map(fmap)

# Model input columns, in the order fed to LightGBM.
fc = [
    "fam", "store_nbr", "city", "state", "type", "cluster",
    "oil", "hol", "sal", "eq", "transactions", "sc", "dow", "wek",
]

# Training features/target and the test feature matrix.
X = tr[fc]
y = tr["sales"].astype(float)
Xt = ts[fc]

# Time-based hold-out: the final 28 days of the training data form the
# validation set, everything up to and including `cut` is used for fitting.
cut = tr["date"].max() - pd.Timedelta(28, "D")
m1 = tr["date"] <= cut
m2 = tr["date"] > cut

Xtr, ytr = X[m1], y[m1]
Xv, yv = X[m2], y[m2]

# The model is trained on log1p(sales).
ytrlog = np.log1p(ytr)
yvlog = np.log1p(yv)



# Let MLflow capture LightGBM params, metrics and the fitted model.
mlflow.lightgbm.autolog()

with mlflow.start_run(run_name="r1") as rr:
    # Hyper-parameters for the gradient-boosted regressor.
    p = dict(
        n_estimators=1000,
        learning_rate=0.03,
        num_leaves=64,
        min_data_in_leaf=50,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=3,
        random_state=42,
    )
    mdl1 = lgb.LGBMRegressor(**p)

    # Fit on log1p(sales); RMSE is tracked on both the train and the
    # validation split.
    mdl1.fit(
        Xtr,
        ytrlog,
        eval_set=[(Xtr, ytrlog), (Xv, yvlog)],
        eval_metric="rmse",
    )

    # Back-transform validation predictions to the sales scale and floor
    # them at zero before scoring.
    vp_log = mdl1.predict(Xv)
    vp = np.expm1(vp_log).clip(0, None)

    # Validation RMSLE, logged explicitly alongside the autologged metrics.
    sc = np.sqrt(mean_squared_log_error(yv, vp))
    mlflow.log_metric("rmsle", sc)
    print("rmsle", sc)

    # URI of the model artifact logged for this run.
    rid = rr.info.run_id
    muri = f"runs:/{rid}/model"

# Best-effort registration of the run's model. On failure the script falls
# back to loading the model straight from the run artifacts.
rv = None
try:
    x = mlflow.register_model(muri, mdl)
except Exception as exc:
    # Previously a bare `except:`, which also swallowed KeyboardInterrupt /
    # SystemExit and hid the reason for the failure. Registration remains
    # optional, but the cause is now reported.
    print("no registry", repr(exc))
else:
    rv = x.version
    print("reg", rv)

# Prefer the registered version when there is one, else the run artifact.
loadu = f"models:/{mdl}/{rv}" if rv else muri

# Round-trip check: reload the model through the generic pyfunc interface
# and compare its validation predictions with the in-memory model's.
m2load = mlflow.pyfunc.load_model(loadu)
vp2_log = m2load.predict(Xv)
vp2 = np.expm1(vp2_log).clip(0, None)
mean_abs_diff = np.abs(vp - vp2).mean()
print("diff", mean_abs_diff)

# Validation rows with the model's prediction attached as column "p".
vv = tr[m2].copy()
vv["p"] = vp

# Total actual vs predicted sales per day.
d1 = vv.groupby("date", as_index=False)[["sales", "p"]].sum()

# Daily totals: actual ("sales") drawn first, then predicted ("p").
plt.figure(figsize=(12, 5))
for series_name in ("sales", "p"):
    plt.plot(d1["date"], d1[series_name])
plt.title("daily")
plt.grid()
plt.show()

# Daily residual (actual minus predicted) over time.
d1["r"] = d1["sales"] - d1["p"]
plt.figure(figsize=(12, 4))
plt.plot(d1["date"], d1["r"])
plt.axhline(0, color="black")
plt.title("res")
plt.grid()
plt.show()

# Distribution of the daily residuals.
plt.hist(d1["r"], bins=50)
plt.title("hist")
plt.grid()
plt.show()

# Bucket validation rows by the start timestamp of their weekly period and
# compare weekly totals of actual vs predicted sales.
weekly_period = vv["date"].dt.to_period("W")
vv["wk"] = weekly_period.dt.start_time
d2 = vv.groupby("wk", as_index=False)[["sales", "p"]].sum()

plt.figure(figsize=(12, 4))
for series_name in ("sales", "p"):
    plt.plot(d2["wk"], d2[series_name])
plt.title("wk")
plt.grid()
plt.show()

# The six product families with the largest total validation sales.
family_totals = vv.groupby("family")["sales"].sum()
top = family_totals.nlargest(6).index

# One chart per family: daily actual vs predicted totals.
for fml in top:
    family_rows = vv[vv["family"] == fml]
    per_day = family_rows.groupby("date", as_index=False)[["sales", "p"]].sum()
    plt.figure(figsize=(10, 3))
    for series_name in ("sales", "p"):
        plt.plot(per_day["date"], per_day[series_name])
    plt.title(str(fml))
    plt.grid()
    plt.show()

# Feature importances of the fitted model, sorted ascending so the largest
# bar ends up at the top of the horizontal chart.
imp = pd.DataFrame({"f": fc, "i": mdl1.feature_importances_})
imp = imp.sort_values("i")

plt.figure(figsize=(8, 6))
plt.barh(imp["f"], imp["i"])
plt.title("imp")
plt.grid(axis="x")
plt.show()

# Score the test set with the reloaded model, back-transform from log1p
# space and floor at zero.
tp_log = m2load.predict(Xt)
tp = np.expm1(tp_log).clip(0, None)

# One prediction per (date, store, family) row, written to the working dir.
key_columns = ["date", "store_nbr", "family"]
g = ts[key_columns].copy()
g["predicted_sales"] = tp
g.to_csv("gold_store_family_day_predictions.csv", index=False)

print("saved gold")
print("done")
