In [None]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [None]:
# Column dtypes used when reading calendar.csv.  Columns typed "category" are
# converted to integer codes inside create_dt.
CAL_DTYPES = {
    "event_name_1": "category",
    "event_name_2": "category",
    "event_type_1": "category",
    "event_type_2": "category",
    "weekday": "category",
    "wm_yr_wk": "int16",
    "wday": "int16",
    "month": "int16",
    "year": "int16",
    "snap_CA": "float32",
    "snap_TX": "float32",
    "snap_WI": "float32",
}

# Column dtypes used when reading sell_prices.csv.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

# Number of the last day column (d_<tr_last>) read from the sales file.
tr_last = 1941

In [None]:
def create_dt(is_train = True, nrows = None, first_day = 1):
    """Build the long-format sales table joined with calendar and price data.

    Parameters
    ----------
    is_train : bool
        When True, day columns are read starting at ``first_day``.  When False,
        reading starts no earlier than day ``tr_last - 57`` and 28 empty (NaN)
        day columns are appended after ``tr_last`` to be filled with forecasts.
    nrows : int or None
        Number of rows of the sales file to read; ``None`` reads the whole file.
    first_day : int
        Earliest ``d_<n>`` column to read.

    Returns
    -------
    pandas.DataFrame
        One row per (id, day) with a ``sales`` column plus the calendar and
        price columns.  Both merges use the pandas default (inner) join.
    """
    prices = pd.read_csv("../input/m5-forecasting-accuracy/sell_prices.csv", dtype = PRICE_DTYPES)
    # Remember how each key column of the price table was encoded so the sales
    # table can reuse exactly the same label -> code mapping.  Encoding the two
    # tables independently only agrees when both contain the same label set,
    # which is not the case when `nrows` restricts the sales file.
    key_encoding = {}
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            codes = prices[col].cat.codes.astype("int16")
            offset = codes.min()
            key_encoding[col] = (prices[col].cat.categories, offset)
            prices[col] = codes - offset

    cal = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            # Missing values have category code -1; subtracting the minimum
            # makes the smallest code 0.
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()

    start_day = max(1 if is_train  else tr_last-57, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv("../input/m5-forecasting-accuracy/sales_train_evaluation.csv",
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    for col in catcols:
        if col == "id":
            continue
        if col in key_encoding:
            # Merge key: encode with the categories (and offset) of the price table.
            categories, offset = key_encoding[col]
            dt[col] = dt[col].cat.set_categories(categories).cat.codes.astype("int16") - offset
        else:
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()

    if not is_train:
        # Placeholder columns for the 28 days to be forecast.
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan

    # Wide (one column per day) -> long (one row per id and day).
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [None]:
%%time

# Build the full training table from day 1 onwards.
df = create_dt(is_train=True, first_day= 1) # merge everything into one table
df.shape

In [None]:
# Preview the first rows of the merged table.
df.head()

In [None]:
# Column dtypes and memory footprint of the merged table.
df.info()

# 可视化

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

In [None]:
# Revenue per row: units sold multiplied by the selling price.
# NOTE(review): this column is computed from `sales`, so it presumably has to be
# kept out of any feature set used to predict `sales` — confirm.
df['sales_volume'] = df['sales'] * df['sell_price']
df

In [None]:
# id: one item in one store
df_id=df[df['id'] == "HOBBIES_1_008_CA_1_evaluation"]
# df_id
# Daily revenue of that single id.
df_id.plot(x='d', y='sales_volume',kind='line',figsize=(30,15))

In [None]:
# item_id: one item, summed over all stores
# numeric_only=True keeps the string `id` and datetime `date` columns out of the
# sum (summing them is slow or raises, depending on the pandas version).
df_item_id = df[df['item_id'] == 107].groupby('d').sum(numeric_only=True)

# The group labels look like 'd_<n>'; extract <n> so the rows can be ordered
# chronologically instead of lexicographically ('d_10' < 'd_2').
df_item_id['d_index'] = df_item_id.index.astype(str)
df_item_id['num'] = df_item_id['d_index'].str[2:].astype(int)
df_item_id = df_item_id.sort_values(by='num', ascending=True)

df_item_id.plot(y='sales_volume',kind='line',figsize=(30,15))

In [None]:
# dept_id: one product department
# numeric_only=True keeps the string `id` and datetime `date` columns out of the
# sum (summing them is slow or raises, depending on the pandas version).
df_dept_id = df[df['dept_id'] == 1].groupby('d').sum(numeric_only=True)

# The group labels look like 'd_<n>'; extract <n> so the rows can be ordered
# chronologically instead of lexicographically ('d_10' < 'd_2').
df_dept_id['d_index'] = df_dept_id.index.astype(str)
df_dept_id['num'] = df_dept_id['d_index'].str[2:].astype(int)
df_dept_id = df_dept_id.sort_values(by='num', ascending=True)

df_dept_id.plot(y='sales_volume',kind='line',figsize=(30,15))

In [None]:
# store_id: one store
# numeric_only=True keeps the string `id` and datetime `date` columns out of the
# sum (summing them is slow or raises, depending on the pandas version).
df_store_id = df[df['store_id'] == 1].groupby('d').sum(numeric_only=True)

# The group labels look like 'd_<n>'; extract <n> so the rows can be ordered
# chronologically instead of lexicographically ('d_10' < 'd_2').
df_store_id['d_index'] = df_store_id.index.astype(str)
df_store_id['num'] = df_store_id['d_index'].str[2:].astype(int)
df_store_id = df_store_id.sort_values(by='num', ascending=True)

df_store_id.plot(y='sales_volume',kind='line',figsize=(30,15))

In [None]:
# cat_id: one product category
def cat(cat_id):
    """Return per-day totals for one product category, sampled every 28th day.

    Rows of the global ``df`` whose ``cat_id`` equals ``cat_id`` are summed per
    ``d`` label (numeric columns only), ordered by the day number contained in
    the label, and every 28th row is returned.

    Parameters
    ----------
    cat_id : int
        Encoded category to select.

    Returns
    -------
    pandas.DataFrame
        Indexed by the ``d`` label, with the summed numeric columns plus
        ``d_index`` (the label as a string) and ``num`` (the day number).
    """
    # numeric_only=True keeps string/datetime columns out of the sum; summing
    # them is slow or raises, depending on the pandas version.
    df_cat_id = df[df['cat_id'] == cat_id].groupby('d').sum(numeric_only=True)

    # Labels look like 'd_<n>'; sort on <n> because the labels themselves sort
    # lexicographically ('d_10' < 'd_2').
    df_cat_id['d_index'] = df_cat_id.index.astype(str)
    df_cat_id['num'] = df_cat_id['d_index'].str[2:].astype(int)
    df_cat_id = df_cat_id.sort_values(by='num', ascending=True)

    return df_cat_id[::28]

# Revenue of the three product categories (codes 0, 1, 2) drawn on one set of axes.
df_cat_id_0, df_cat_id_1, df_cat_id_2 = (cat(code) for code in (0, 1, 2))

ax = df_cat_id_0.plot(y='sales_volume', kind='line', figsize=(30,15), color='r')
for frame, colour in ((df_cat_id_1, 'g'), (df_cat_id_2, 'b')):
    frame.plot(ax=ax, y='sales_volume', kind='line', figsize=(30,15), color=colour)
plt.ylim(0,150000)

In [None]:
# state_id: one state
def state(state_id):
    """Return per-day totals for one state, sampled every 28th day.

    Rows of the global ``df`` whose ``state_id`` equals ``state_id`` are summed
    per ``d`` label (numeric columns only), ordered by the day number contained
    in the label, and every 28th row is returned.

    Parameters
    ----------
    state_id : int
        Encoded state to select.

    Returns
    -------
    pandas.DataFrame
        Indexed by the ``d`` label, with the summed numeric columns plus
        ``d_index`` (the label as a string) and ``num`` (the day number).
    """
    # numeric_only=True keeps string/datetime columns out of the sum; summing
    # them is slow or raises, depending on the pandas version.
    df_state_id = df[df['state_id'] == state_id].groupby('d').sum(numeric_only=True)

    # Labels look like 'd_<n>'; sort on <n> because the labels themselves sort
    # lexicographically ('d_10' < 'd_2').
    df_state_id['d_index'] = df_state_id.index.astype(str)
    df_state_id['num'] = df_state_id['d_index'].str[2:].astype(int)
    df_state_id = df_state_id.sort_values(by='num', ascending=True)

    return df_state_id[::28]

# Revenue of the three states (codes 0, 1, 2) drawn on one set of axes.
df_state_id_0, df_state_id_1, df_state_id_2 = (state(code) for code in (0, 1, 2))

ax = df_state_id_0.plot(y='sales_volume', kind='line', figsize=(30,15), color='r')
for frame, colour in ((df_state_id_1, 'g'), (df_state_id_2, 'b')):
    frame.plot(ax=ax, y='sales_volume', kind='line', figsize=(30,15), color=colour)
plt.ylim(0,100000)

In [None]:
# Totals over all items and stores
# numeric_only=True keeps the string `id` and datetime `date` columns out of the
# sum (summing them is slow or raises, depending on the pandas version).
df_all = df.groupby('d').sum(numeric_only=True)

# The group labels look like 'd_<n>'; extract <n> so the rows can be ordered
# chronologically instead of lexicographically ('d_10' < 'd_2').
df_all['d_index'] = df_all.index.astype(str)
df_all['num'] = df_all['d_index'].str[2:].astype(int)
# Keep every 28th day only.
df_all = df_all.sort_values(by='num', ascending=True)[::28]

df_all.plot(y='sales_volume',kind='line',figsize=(30,15))

In [None]:
# Re-number the weekday code: shift by 5 modulo 7, with 0 reported as 7.
def f(row):
    """Map a weekday code to ``(row + 5) % 7``, returning 7 instead of 0.

    For the codes 1..7 this yields 6, 7, 1, 2, 3, 4, 5 (presumably turning the
    calendar's numbering into Monday=1 ... Sunday=7 — confirm against calendar.csv).
    """
    shifted = (row + 5) % 7
    return shifted if shifted else 7

In [None]:
# Replace the weekday codes by their remapped values from f.
# NOTE: the column is overwritten in place, so running this cell a second time
# applies the shift again to already-shifted values.
df['wday'] = df['wday'].apply(f)
df

In [None]:
# Total units sold per calendar date (all items, all stores).
A = df.groupby("date")["sales"].sum()

In [None]:
#total sales with trend
import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL
# LOWESS smoother from statsmodels; also reused by the per-state trend cells.
lowess = sm.nonparametric.lowess
# lowess returns (x, fitted) pairs; column 1 holds the fitted values.
smoothed = lowess(A, A.index, frac=1./3.)
yest = smoothed[:, 1]

fig = plt.figure(figsize=(14, 8))
# Raw daily totals first, then the smoothed trend on top.
for series, line_style in ((A, {}), (yest, {"color": 'orange'})):
    plt.plot(A.index, series, **line_style)
plt.title("Total sales")
plt.savefig('Total_with_trend.jpg')
plt.show()

In [None]:
import seaborn as sns
# Pass a parameter dict (figure size 12x8) to seaborn's plotting context.
sns.set_context({"figure.figsize":(12,8)})

In [None]:
# One entry per weekday code 1..7, each holding the per-month sales totals.
values = [
    df[df['wday']==weekday].groupby("month")['sales'].sum().to_list()
    for weekday in range(1,8)
]

In [None]:
# Heatmap of sales by weekday (rows) and month (columns), values divided by 1000.
x_ticks = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul','Aug','Sep','Oct','Nov','Dec']
y_ticks = ['Mon', 'Tue', 'Wed','Thur','Fri','Sat','Sun']  # custom axis tick labels
scaled_rows = [[total/1000 for total in weekday_totals] for weekday_totals in values]
ax = sns.heatmap(scaled_rows, cmap="YlGnBu", xticklabels=x_ticks, yticklabels=y_ticks)
ax.set_title('Heatmap of sales')  # figure title
ax.set_xlabel('months')  # x-axis label
ax.set_ylabel('weekdays')  # y-axis label
plt.savefig('heatmap.jpg')
plt.show()


In [None]:
# Print the first ids of every store code to see which store each code stands for.
for i in range(10):
    df_tmp = df[df['store_id']==i]['id']
    print(i,end = " ")
    # head must be called; printing the bare attribute only shows the bound method.
    print(df_tmp.head())

In [None]:
#0-3 CA
#4-6 TX
#7-9 WI

In [None]:
#state sales with trend
from sklearn import preprocessing
# Store codes belonging to each state, in the order CA, TX, WI.
# (Previously bound to the name `dict`, which shadowed the builtin for the rest
# of the notebook.)
state_store_ids = [[0,1,2,3],[4,5,6],[7,8,9]]

# California: daily unit sales and their LOWESS trend.
df_CA = df[df['store_id'].isin(state_store_ids[0])]
C = df_CA.groupby('date')['sales'].sum()
yest1 = lowess(C, C.index, frac=1./3.)[:,1]
fig = plt.figure(figsize=(14, 8))
plt.plot(C.index,C,color = 'green')
plt.plot(C.index,yest1,color = 'black')
plt.title("CA sales")
plt.savefig('CA_with_trend.jpg')
plt.show()


    #break
    

In [None]:
# Texas (store codes 4-6): daily unit sales and their LOWESS trend.
is_tx_store = (df['store_id']>=4)&(df['store_id']<7)
df_TX = df[is_tx_store]
T = df_TX.groupby('date')['sales'].sum()
# Column 1 of the lowess result holds the fitted values.
yest1 = lowess(T, T.index, frac=1./3.)[:,1]

fig = plt.figure(figsize=(14, 8))
for series, colour in ((T, 'blue'), (yest1, 'black')):
    plt.plot(T.index, series, color=colour)
plt.title("TX sales")
plt.savefig('TX_with_trend.jpg')
plt.show()

In [None]:
# Wisconsin (store codes 7 and above): daily unit sales and their LOWESS trend.
is_wi_store = (df['store_id']>=7)
df_WI = df[is_wi_store]
W = df_WI.groupby('date')['sales'].sum()
# Column 1 of the lowess result holds the fitted values.
yest1 = lowess(W, W.index, frac=1./3.)[:,1]

fig = plt.figure(figsize=(14, 8))
for series, colour in ((W, 'grey'), (yest1, 'black')):
    plt.plot(W.index, series, color=colour)
plt.title("WI sales")
plt.savefig('WI_with_trend.jpg')
plt.show()

In [None]:
#sales go with weekdays
from sklearn import preprocessing

# Standardised (zero mean, unit variance) weekday sales totals, one line per state.
x_axis = ['Mon', 'Tue', 'Wed','Thur','Fri','Sat','Sun']
CA_W, TX_W, WI_W = (
    preprocessing.scale(state_df.groupby('wday')['sales'].sum())
    for state_df in (df_CA, df_TX, df_WI)
)

fig = plt.figure(figsize=(14, 8))
for scaled, colour, state_name in zip((CA_W, TX_W, WI_W), ('green', 'blue', 'grey'), ('CA', 'TX', 'WI')):
    plt.plot(x_axis, scaled, color=colour, label=state_name)
plt.title("Total Sales by Weekdays")
plt.legend()
plt.savefig('Total_by_wday.jpg')
plt.show()

In [None]:
#sales go with months the followings are similar
# Standardised (zero mean, unit variance) monthly sales totals, one line per state.
x_axis = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul','Aug','Sep','Oct','Nov','Dec']
CA_M, TX_M, WI_M = (
    preprocessing.scale(state_df.groupby('month')['sales'].sum())
    for state_df in (df_CA, df_TX, df_WI)
)

fig = plt.figure(figsize=(14, 8))
for scaled, colour, state_name in zip((CA_M, TX_M, WI_M), ('green', 'blue', 'grey'), ('CA', 'TX', 'WI')):
    plt.plot(x_axis, scaled, color=colour, label=state_name)
plt.title("Total Sales by Months")
plt.legend()
plt.savefig('Total_by_mon.jpg')
plt.show()

In [None]:
# Standardised weekday sales totals of category code 0, one line per state.
x_axis = ['Mon', 'Tue', 'Wed','Thur','Fri','Sat','Sun']
CA_W, TX_W, WI_W = (
    preprocessing.scale(state_df[state_df['cat_id']==0].groupby('wday')['sales'].sum())
    for state_df in (df_CA, df_TX, df_WI)
)

fig = plt.figure(figsize=(14, 8))
for scaled, colour, state_name in zip((CA_W, TX_W, WI_W), ('green', 'blue', 'grey'), ('CA', 'TX', 'WI')):
    plt.plot(x_axis, scaled, color=colour, label=state_name)
plt.title("Hobbies Sales by Weekdays")
plt.legend()
plt.savefig('Hobbies_by_wday.jpg')
plt.show()

In [None]:
# Standardised weekday sales totals of category code 1, one line per state.
x_axis = ['Mon', 'Tue', 'Wed','Thur','Fri','Sat','Sun']
CA_W, TX_W, WI_W = (
    preprocessing.scale(state_df[state_df['cat_id']==1].groupby('wday')['sales'].sum())
    for state_df in (df_CA, df_TX, df_WI)
)

fig = plt.figure(figsize=(14, 8))
for scaled, colour, state_name in zip((CA_W, TX_W, WI_W), ('green', 'blue', 'grey'), ('CA', 'TX', 'WI')):
    plt.plot(x_axis, scaled, color=colour, label=state_name)
plt.title("Household Sales by Weekdays")
plt.legend()
plt.savefig('Household_by_wday.jpg')
plt.show()

In [None]:
# Standardised weekday sales totals of category code 2, one line per state.
x_axis = ['Mon', 'Tue', 'Wed','Thur','Fri','Sat','Sun']
CA_W, TX_W, WI_W = (
    preprocessing.scale(state_df[state_df['cat_id']==2].groupby('wday')['sales'].sum())
    for state_df in (df_CA, df_TX, df_WI)
)

fig = plt.figure(figsize=(14, 8))
for scaled, colour, state_name in zip((CA_W, TX_W, WI_W), ('green', 'blue', 'grey'), ('CA', 'TX', 'WI')):
    plt.plot(x_axis, scaled, color=colour, label=state_name)
plt.title("Foods Sales by Weekdays")
plt.legend()
plt.savefig('Foods_by_wday.jpg')
plt.show()

In [None]:
# Standardised monthly sales totals of category code 0, one line per state.
x_axis = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul','Aug','Sep','Oct','Nov','Dec']
CA_M, TX_M, WI_M = (
    preprocessing.scale(state_df[state_df['cat_id']==0].groupby('month')['sales'].sum())
    for state_df in (df_CA, df_TX, df_WI)
)

fig = plt.figure(figsize=(14, 8))
for scaled, colour, state_name in zip((CA_M, TX_M, WI_M), ('green', 'blue', 'grey'), ('CA', 'TX', 'WI')):
    plt.plot(x_axis, scaled, color=colour, label=state_name)
plt.title("Hobbies Sales by Months")
plt.legend()
plt.savefig('Hobbies_by_month.jpg')
plt.show()

In [None]:
# Standardised monthly sales totals of category code 1, one line per state.
x_axis = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul','Aug','Sep','Oct','Nov','Dec']
CA_M, TX_M, WI_M = (
    preprocessing.scale(state_df[state_df['cat_id']==1].groupby('month')['sales'].sum())
    for state_df in (df_CA, df_TX, df_WI)
)

fig = plt.figure(figsize=(14, 8))
for scaled, colour, state_name in zip((CA_M, TX_M, WI_M), ('green', 'blue', 'grey'), ('CA', 'TX', 'WI')):
    plt.plot(x_axis, scaled, color=colour, label=state_name)
plt.title("Household Sales by Months")
plt.legend()
plt.savefig('Household_by_month.jpg')
plt.show()

In [None]:
# Standardised monthly sales totals of category code 2, one line per state.
x_axis = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul','Aug','Sep','Oct','Nov','Dec']
CA_M, TX_M, WI_M = (
    preprocessing.scale(state_df[state_df['cat_id']==2].groupby('month')['sales'].sum())
    for state_df in (df_CA, df_TX, df_WI)
)

fig = plt.figure(figsize=(14, 8))
for scaled, colour, state_name in zip((CA_M, TX_M, WI_M), ('green', 'blue', 'grey'), ('CA', 'TX', 'WI')):
    plt.plot(x_axis, scaled, color=colour, label=state_name)
plt.title("Foods Sales by Months")
plt.legend()
plt.savefig('Foods_by_month.jpg')
plt.show()

In [None]:
# state_id: 0 = CA, 1 = TX, 2 = WI
# cat_id:   0 = HOBBIES, 1 = HOUSEHOLD, 2 = FOODS
# Effect of calendar events on the sales of each category and of each state:
# sales are averaged per day within every category and within every state.

# Rows whose two event codes are both 0 are treated as "no event" days
# (presumably 0 is the code of a missing event name — confirm against create_dt).
event_cols = ["id", "state_id", "cat_id", "date", "sales"]
no_event_mask = (df["event_name_1"]==0) & (df["event_name_2"]==0)
df_event_1 = df.loc[no_event_mask, event_cols]
df_event_2 = df.loc[~no_event_mask, event_cols]

# as_index=False keeps the group keys as ordinary columns, so the results are
# flat DataFrames that can be filtered and plotted by column name.
df_event_cat = df_event_1.groupby(["cat_id","date"], as_index=False)["sales"].mean()
df_event_state = df_event_1.groupby(["state_id","date"], as_index=False)["sales"].mean()
df_event_cat_withevent = df_event_2.groupby(["cat_id","date"], as_index=False)["sales"].mean()
df_event_state_withevent = df_event_2.groupby(["state_id","date"], as_index=False)["sales"].mean()


def _snap_sales_by_state(state_code, snap_col):
    """Mean daily sales of one state together with that state's SNAP flag."""
    subset = df.loc[df["state_id"]==state_code, ["id", "state_id", "date", "sales", snap_col]]
    daily = subset.groupby(["state_id","date",snap_col], as_index=False)["sales"].mean()
    daily[snap_col] = daily[snap_col].astype(int)
    return daily


def _snap_sales_by_category(cat_code):
    """Mean daily sales of one category plus the number of states with SNAP that day."""
    snap_cols = ["snap_CA","snap_TX","snap_WI"]
    # .copy() so that adding a column does not write into a slice of df.
    subset = df.loc[df["cat_id"] == cat_code, ["date","sales"] + snap_cols].copy()
    subset["is_snap_day"] = subset["snap_CA"]+subset["snap_TX"]+subset["snap_WI"]
    # A list selects several columns; tuple-style selection on a groupby is no
    # longer accepted by pandas.
    return subset.groupby("date", as_index=False)[["sales","is_snap_day"]].mean()


# SNAP to different state sales
# NOTE: these assignments replace the per-state row subsets that earlier cells
# stored under the same names.
df_CA = _snap_sales_by_state(0, "snap_CA")
df_TX = _snap_sales_by_state(1, "snap_TX")
df_WI = _snap_sales_by_state(2, "snap_WI")

# SNAP to different category sales
df_cat_snap_0 = _snap_sales_by_category(0)
df_cat_snap_1 = _snap_sales_by_category(1)
df_cat_snap_2 = _snap_sales_by_category(2)

In [None]:
# plotly express is used for the line charts below but was never imported.
import plotly.express as px


def _event_frame(grouped, is_event_day):
    """Return mean-sales data as a flat DataFrame tagged with ``is_event_day``.

    Accepts either a grouped Series (group keys in the index) or a DataFrame
    that already has the keys as columns; the input is not modified.
    """
    frame = grouped.reset_index() if isinstance(grouped, pd.Series) else grouped.copy()
    frame["date"] = pd.to_datetime(frame["date"])
    frame["is_event_day"] = is_event_day
    return frame


df_event_cat_0 = _event_frame(df_event_cat, 0)
df_event_cat_1 = _event_frame(df_event_cat_withevent, 1)
df_event_state_0 = _event_frame(df_event_state, 0)
df_event_state_1 = _event_frame(df_event_state_withevent, 1)

# Stack event and non-event days and order them by date so the lines are drawn
# chronologically.  The combined frames get their own names, which keeps this
# cell re-runnable (the inputs are not overwritten).
df_event_cat_all = pd.concat([df_event_cat_0, df_event_cat_1], ignore_index=True).sort_values(by = "date")
df_event_state_all = pd.concat([df_event_state_0, df_event_state_1], ignore_index=True).sort_values(by = "date")

df_event_cat_all_0 = df_event_cat_all[df_event_cat_all["cat_id"]==0]
df_event_cat_all_1 = df_event_cat_all[df_event_cat_all["cat_id"]==1]
df_event_cat_all_2 = df_event_cat_all[df_event_cat_all["cat_id"]==2]
df_event_state_all_0 = df_event_state_all[df_event_state_all["state_id"]==0]
df_event_state_all_1 = df_event_state_all[df_event_state_all["state_id"]==1]
df_event_state_all_2 = df_event_state_all[df_event_state_all["state_id"]==2]

# One chart per category, then one per state; colour separates event days.
for frame, title in (
    (df_event_cat_all_0, "HOBBIES"),
    (df_event_cat_all_1, "HOUSEHOLD"),
    (df_event_cat_all_2, "FOODS"),
    (df_event_state_all_0, "CA"),
    (df_event_state_all_1, "TX"),
    (df_event_state_all_2, "WI"),
):
    fig = px.line(frame, x='date', y="sales", color = "is_event_day", title = title)
    fig.show()

In [None]:
# plotly express is used for the line charts below but was never imported.
import plotly.express as px


def _flat_by_date(data):
    """Return ``data`` as a DataFrame with a datetime ``date`` column, sorted by date.

    Group keys stored in the index (grouped Series or DataFrame) are moved back
    into columns first; the input is not modified.
    """
    if "date" in getattr(data, "columns", []):
        flat = data.copy()
    else:
        flat = data.reset_index()
    flat["date"] = pd.to_datetime(flat["date"])
    return flat.sort_values("date")


df_snap_cat0 = _flat_by_date(df_cat_snap_0)
df_snap_cat1 = _flat_by_date(df_cat_snap_1)
df_snap_cat2 = _flat_by_date(df_cat_snap_2)

# Mean daily sales per state, coloured by that state's SNAP flag.
for state_data, snap_col, title in (
    (df_CA, "snap_CA", "CA"),
    (df_TX, "snap_TX", "TX"),
    (df_WI, "snap_WI", "WI"),
):
    state_frame = _flat_by_date(state_data)
    state_frame[snap_col] = state_frame[snap_col].astype(int)
    fig = px.line(state_frame, x='date', y="sales", color = snap_col, title = title)
    fig.show()

# Mean daily sales per category, coloured by the "is_snap_day" column (the
# colour column was previously misspelled "is_snap_day1", which does not exist).
for cat_frame, title in (
    (df_snap_cat0, "HOBBIES"),
    (df_snap_cat1, "HOUSEHOLD"),
    (df_snap_cat2, "FOODS"),
):
    fig = px.line(cat_frame, x='date', y="sales", color = "is_snap_day", title = title)
    fig.show()

# 训练

In [None]:
# Drop every row that contains a missing value in any column (modifies df in place).
df.dropna(inplace = True)
df.shape

In [None]:
# Columns passed to LightGBM as categorical features.
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
# Columns that must not be used as features.  "sales_volume" was added during
# the visual exploration as sales * sell_price: it is computed from the target
# (leakage) and does not exist in the frame built for prediction.
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "sales_volume"]
train_cols = df.columns[~df.columns.isin(useless_cols)]
X_train = df[train_cols]
y_train = df["sales"]

In [None]:
%%time

# Fix the seed so the random hold-out below is reproducible.
np.random.seed(5013)

# Random (not time-based) hold-out: 2,000,000 row labels are drawn without
# replacement for validation; all remaining rows are used for training.
fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)

In [None]:
# Release the large intermediate objects; only the two lgb.Dataset objects
# (and train_cols) are needed from here on.
del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

In [None]:
# LightGBM training parameters.
params = {
    "objective" : "poisson",
    # "metric" used to be listed twice in this literal; the later entry silently
    # replaced the earlier one, so only that value is kept.
    "metric": ["rmse"],
    "force_row_wise" : True,
    "learning_rate" : 0.075,
    "sub_row" : 0.75,
    "bagging_freq" : 1,
    "lambda_l2" : 0.1,
    'verbosity': 1,
    'num_iterations' : 1500,
    'num_leaves': 128,
    "min_data_in_leaf": 100,
}

In [None]:
%%time

# Log the validation metric every 20 iterations.  The `verbose_eval` argument is
# no longer accepted by newer LightGBM releases, which provide the
# log_evaluation callback instead; fall back to the old argument when the
# callback does not exist in the installed version.
train_kwargs = {"valid_sets": [fake_valid_data]}
if hasattr(lgb, "log_evaluation"):
    train_kwargs["callbacks"] = [lgb.log_evaluation(period=20)]
else:
    train_kwargs["verbose_eval"] = 20

m_lgb = lgb.train(params, train_data, **train_kwargs)

m_lgb.save_model("model.lgb")

# 测试

In [None]:
%%time

# First day to forecast: 28 days after 2016-04-25.
fday = datetime(2016,4, 25) + timedelta(days= 28)
# Multipliers applied to the predictions and the blend weight of each run.
alphas = [1]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    # The training frame had its weekday codes remapped through f before the
    # model was fitted; apply the same mapping here so the feature has the same
    # meaning at prediction time.
    te["wday"] = te["wday"].apply(f)
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # The model uses only the columns in train_cols, all of which are
        # already present in `te`, so each day is predicted directly.  (The
        # previous call to `create_fea` referred to a function that is not
        # defined anywhere in this notebook.)
        day_mask = te.date == day
        te.loc[day_mask, "sales"] = alpha*m_lgb.predict(te.loc[day_mask, train_cols])

    # Reshape the forecasts to one row per id with columns F1..F28.
    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    # Weighted sum of the individual runs.
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)

# Rows for the "validation" ids: the known sales of days 1914..1941 are copied
# into F1..F28.
sub2 = pd.read_csv("../input/m5-forecasting-accuracy/sales_train_evaluation.csv", usecols = ["id"]+ [f"d_{i}" for i in range(1914, 1914+28)])
sub2.rename(columns = {f"d_{i}": f'F{i-1913}' for i in range(1914, 1914+28)}, inplace=True)
sub2["id"] = sub2["id"].str.replace("evaluation", "validation")

sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)
print(sub.shape)

In [None]:
# Preview the first rows of the final submission table.
sub.head(10)