In [2]:
import pandas as pd
import numpy as np
import subprocess, time, random, gc

from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping
from sklearn.preprocessing import LabelEncoder

def get_grid_name(grid):
    """Return the first module-level name that is bound to ``grid`` itself.

    The lookup is by identity (``is``), scanning ``globals()`` in its
    iteration order. An ``IndexError`` is raised when no global refers to
    the given object.
    """
    namespace = globals()
    matches = []
    for candidate in namespace:
        if namespace[candidate] is grid:
            matches.append(candidate)
    # Indexing (rather than returning inside the loop) keeps the IndexError
    # behaviour for objects that are not bound to any global name.
    return matches[0]

def reduce_mem_usage(grid, verbose=True, allow_float16=True):
    """Downcast numeric columns of ``grid`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    grid : pandas.DataFrame
        Frame whose int16/int32/int64 and float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are left as is.
    verbose : bool, default True
        Print the resulting memory footprint and the relative reduction.
    allow_float16 : bool, default True
        When True (the historical behaviour) float columns may be stored as
        float16. float16 keeps only about three significant decimal digits, so
        pass False to stop at float32 for columns such as prices.

    Returns
    -------
    pandas.DataFrame
        The same ``grid`` object, with downcast columns.
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = grid.memory_usage().sum() / 1024**2
    for col in grid.columns:
        col_type = str(grid[col].dtype)
        if col_type not in numerics:
            continue
        # min()/max() skip NaN; for an empty or all-NaN column they are NaN and
        # every range test below is False.
        c_min = grid[col].min()
        c_max = grid[col].max()
        if col_type.startswith('int'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose extreme value is exactly
                # 127 / -128 (etc.) still fits the narrower type.
                if limits.min <= c_min and c_max <= limits.max:
                    grid[col] = grid[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    grid[col] = grid[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or the range is NaN): keep full precision.
                grid[col] = grid[col].astype(np.float64)
    end_mem = grid.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(' Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return grid

def submit_to_kaggle(competition_name, submission_file, message):
    """Submit ``submission_file`` to a Kaggle competition through the kaggle CLI.

    Parameters
    ----------
    competition_name : str
        Value passed to the CLI's ``-c`` option.
    submission_file : str
        Path of the file to upload (``-f``).
    message : str
        Submission message (``-m``).

    Returns
    -------
    subprocess.CompletedProcess
        Result of the CLI call, so callers can inspect ``returncode``.
        A non-zero exit status is reported on stdout but does not raise.
    """
    import os
    import shutil

    default_path = "/root/miniconda3/envs/lightgbm/bin/kaggle"
    if os.path.exists(default_path):
        kaggle_path = default_path
    else:
        # Fall back to whatever `kaggle` is on PATH; if none is found keep the
        # default so the failure mode (FileNotFoundError) is unchanged.
        kaggle_path = shutil.which("kaggle") or default_path

    result = subprocess.run([kaggle_path, "competitions", "submit", "-c", competition_name, "-f", submission_file, "-m", message])
    if result.returncode != 0:
        print(f'kaggle submit exited with status {result.returncode}')
    return result

# Keyword arguments handed to LGBMRegressor(**lgb_params) in the training cell.
lgb_params = dict(
    # objective / evaluation
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    # row subsampling
    subsample=0.5,
    device_type='cpu',
    subsample_freq=1,
    min_child_weight=1,
    learning_rate=0.03,
    # tree size: 2047 leaves, at least 4095 rows per leaf
    num_leaves=(1 << 11) - 1,
    min_data_in_leaf=(1 << 12) - 1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=1400,
    boost_from_average=False,
    verbosity=-1,
)

In [7]:
# Build the long-format grid (one row per id and day) from the raw CSVs,
# add calendar/price/date features and cache it as 'data/grid_no_features.pkl'.
# read the data
d_cols = [f'd_{i}' for i in range(1, 1942)]
d_types = {col: np.int16 for col in d_cols}
grid_dtypes = {
    'id': 'str',
    'item_id': 'str',
    'dept_id': 'str',
    'cat_id': 'str',
    'store_id': 'str',
    'state_id': 'str'
}
grid_dtypes.update(d_types)
calendar = pd.read_csv('data/calendar.csv')
calendar[['snap_CA','snap_TX','snap_WI']] = calendar[['snap_CA','snap_TX','snap_WI']].astype(bool)
sell_prices = pd.read_csv('data/sell_prices.csv')
submission = pd.read_csv('data/sample_submission.csv')
grid = pd.read_csv('data/sales_train_evaluation.csv', dtype=grid_dtypes)
grid = reduce_mem_usage(grid)

##### REMOVE RANDOM ID ONCE HAPPY ########
# Debug switch: uncomment to run the rest of the cell on one random series.
# rnd_id = (random.sample(list(grid['id'].unique()), 1)[0]) # FOODS_1_058_WI_2_evaluation
# grid = grid[grid['id'] == rnd_id]

sub_cols = ['id'] + [f'd_{i}' for i in range(1942, 1970)]
submission.columns = sub_cols
# The sample submission is only used as a template for the 28 forecast-day
# columns. Blank out its placeholder values so future days carry NaN sales
# (as in the base.pkl cell) instead of being read as observed sales.
submission[sub_cols[1:]] = np.nan
training_days = [f'd_{i}' for i in range(1200, 1942)]
req_cols = grid.columns[:6]
cols_to_keep = req_cols.tolist() + training_days

grid = grid[cols_to_keep]
grid = grid.join(submission.set_index('id'), on='id')
grid = grid.melt(id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                var_name='d', value_name='sales')
grid = grid.merge(calendar.drop(['weekday','year','wday','month'], axis=1), on = ['d'], how = 'left')
grid = grid.join(sell_prices.set_index(['store_id','item_id','wm_yr_wk']), on=['store_id', 'item_id', 'wm_yr_wk'])
grid['sell_price'] = grid['sell_price'].astype(np.float32)
print(grid['sell_price'].isna().sum())
# Carry the last known price forward within each id; rows that still have no
# price after that get the sentinel -1.
grid['sell_price'] = grid.groupby(['id'])['sell_price'].ffill()
grid['sell_price'] = grid['sell_price'].fillna(-1)

le = LabelEncoder()
cat_vars = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 
            'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

for var in cat_vars:
    grid[var] = le.fit_transform(grid[var])
    grid[var] = grid[var].astype('int16')

grid['d'] = grid['d'].str.replace('d_', '').astype('int16')    

#print(grid[grid['id'] == 'HOUSEHOLD_2_467_WI_3_evaluation'][['id', 'd', 'sales']].sort_values(by=['d']).tail(50))
grid['date'] = grid['date'].astype('datetime64[ns]')
grid['tm_d'] = grid['date'].dt.day.astype(np.int8)
# Week of the year. `dt.weekday` here would just duplicate `tm_dw` below.
grid['tm_w'] = grid['date'].dt.isocalendar().week.astype(np.int8)
grid['tm_m'] = grid['date'].dt.month.astype(np.int8)
grid['tm_y'] = grid['date'].dt.year
grid['tm_y'] = (grid['tm_y'] - grid['tm_y'].min()).astype(np.int8)
grid['tm_dw'] = grid['date'].dt.dayofweek.astype(np.int8)
grid['tm_w_end'] = (grid['tm_dw'] >= 5).astype(np.int8)
grid = grid.drop(['wm_yr_wk', 'date'], axis=1)
grid = grid.sort_values(by=['id','d'], ascending=[True,True])
grid = reduce_mem_usage(grid)
grid.to_pickle('data/grid_no_features.pkl')

 Mem. usage decreased to 96.13 Mb (15.9% reduction)
479059
 Mem. usage decreased to 917.98 Mb (47.4% reduction)


In [3]:
# Build the long-format base grid (history + 28 empty forecast days per id),
# add calendar/price/date features and cache it as 'data/base.pkl'.
eva = pd.read_csv('data/sales_train_evaluation.csv')
# Keep the 6 id columns plus the day columns from positional index 1200 on
# (with 6 leading id columns that is d_1195 onwards).
eva = eva.iloc[:,np.r_[0:6, 1200:len(eva.columns)]] #
eva = reduce_mem_usage(eva)
price = pd.read_csv('data/sell_prices.csv')
calendar = pd.read_csv('data/calendar.csv')

index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
TARGET = 'sales'         # Our main target
END_TRAIN = 1913+28      # Last day in train set
MAIN_INDEX = ['id','d']  # We can identify item by these columns

grid = pd.melt(eva, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)

print('Train rows:', len(eva), len(grid))

# One NaN-target row per id for each of the 28 days after END_TRAIN.
# The id frame is de-duplicated once and the 28 copies are concatenated in a
# single call instead of growing a frame inside the loop.
future_ids = eva[index_columns].drop_duplicates()
add_grid = pd.concat(
    [future_ids.assign(**{'d': 'd_' + str(END_TRAIN + i), TARGET: np.nan})
     for i in range(1, 29)]
)

del eva
grid = pd.concat([grid,add_grid])
del add_grid, future_ids
grid = grid.reset_index(drop=True)
grid = grid.merge(calendar.drop(['weekday','year','wday','month'], axis=1), on = ['d'], how = 'left')
grid = grid.merge(price, on = ['store_id','item_id','wm_yr_wk'], how = 'left')
print(grid['sell_price'].isna().sum())
grid['date'] = grid['date'].astype('datetime64[ns]')
grid['tm_d'] = grid['date'].dt.day.astype(np.int8)
# Week of the year. `dt.weekday` here would just duplicate `tm_dw` below.
grid['tm_w'] = grid['date'].dt.isocalendar().week.astype(np.int8)
grid['tm_m'] = grid['date'].dt.month.astype(np.int8)
grid['tm_y'] = grid['date'].dt.year
grid['tm_y'] = (grid['tm_y'] - grid['tm_y'].min()).astype(np.int8)
grid['tm_dw'] = grid['date'].dt.dayofweek.astype(np.int8)
grid['tm_w_end'] = (grid['tm_dw'] >= 5).astype(np.int8)
grid['d'] = grid['d'].str.replace('d_', '').astype('int16')
grid = grid.drop(['wm_yr_wk', 'date'], axis=1)
cat_vars = ['item_id','dept_id','cat_id','store_id','state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
# Cast to str first so missing event values become the literal 'nan' category.
grid[cat_vars] = grid[cat_vars].astype(str)
le = LabelEncoder()

for cat in cat_vars:
    grid[cat] = le.fit_transform(grid[cat])
grid = reduce_mem_usage(grid)
grid.to_pickle('data/base.pkl')

 Mem. usage decreased to 34.60 Mb (80.2% reduction)
Train rows: 30490 22776030
492042
 Mem. usage decreased to 743.66 Mb (74.2% reduction)


In [4]:
# Feature engineering on the cached base grid: per-id rolling statistics, lags,
# zero-sale counts and price statistics. Result is cached as 'data/feats.pkl'.
# All sales-based features are computed on same-day sales here; the training
# cell shifts them by the forecast step before they are used.
#grid = pd.read_pickle('data/grid_no_features.pkl')
grid = pd.read_pickle('data/base.pkl')

# One groupby over `sales` is reused for every sales-based feature; the frame
# only gains columns in between, so the grouping stays valid.
grp = grid.groupby(['id'], group_keys=False)['sales']

print('************ ROLLING MEANS ************')
for roll in [7,14,30,60,180]:
    grid['rm_' + str(roll)] = grp.apply(lambda x: x.rolling(roll).mean())
   
print('************ ROLLING STATS ************')
for roll in [7,14,30,60,180]:
    grid['max_' + str(roll)] = grp.apply(lambda x: x.rolling(roll).max())
    grid['std_' + str(roll)] = grp.apply(lambda x: x.rolling(roll).std())

print('************ DIFF MEANS ************')
for window in [7,56,140]:
    grid['diff_rm_' + str(window)] = grp.apply(lambda x : x.diff().rolling(window).mean()) 

print('************ LAGS ************')
for lag in [0,1,2,3,4,5,6]:
    grid['lag_' + str(lag)] = grp.apply(lambda x: x.shift(lag))

print('************ ROLLING ZEROS ************')
# The zero-sale indicator does not depend on the window, so build it once
# (vectorised; NaN sales compare unequal to 0 and count as 0) and drop it once.
grid['is_zero'] = (grid['sales'] == 0).astype(np.int64)
grp = grid.groupby(['id'], group_keys=False)['is_zero']
for roll in [7,56,140]:
    grid['rolling_zero_' + str(roll)] = grp.apply(lambda x : x.rolling(roll).sum())
grid = grid.drop('is_zero', axis = 1)   

grid = reduce_mem_usage(grid)
grid['sell_price'] = grid['sell_price'].astype(np.float32)
print('************ ROLLING PRICES ************')
# If we remove expanding all row values for a given id become the same
grp = grid.groupby(['id'], group_keys=False)['sell_price']
grid['price_max'] = grp.apply(lambda x : x.expanding().max())
grid['price_min'] = grp.apply(lambda x : x.expanding().min())
grid['price_std'] = grp.apply(lambda x : x.expanding().std())
grid['price_mean'] = grp.apply(lambda x : x.expanding().mean())
grid['price_norm'] = grid['sell_price']/grid['price_max']
grid['price_nunique'] = grid.groupby(['store_id','item_id'])['sell_price'].transform('nunique')
grid['item_nunique'] = grid.groupby(['store_id','sell_price'])['item_id'].transform('nunique')

# Report the columns that still contain missing values.
na_cols=grid.isna().sum()
print(na_cols[na_cols>0])
grid = reduce_mem_usage(grid)
#grid.to_pickle('data/grid_features.pkl')
grid.to_pickle('data/feats.pkl')

************ ROLLING MEANS ************
************ ROLLING STATS ************
************ DIFF MEANS ************
************ LAGS ************
************ ROLLING ZEROS ************
 Mem. usage decreased to 2005.62 Mb (58.6% reduction)
************ ROLLING PRICES ************
sales                853720
sell_price           492042
rm_7                1036660
rm_14               1250090
rm_30               1737930
rm_60               2652630
rm_180              6311430
max_7               1036660
std_7               1036660
max_14              1250090
std_14              1250090
max_30              1737930
std_30              1737930
max_60              2652630
std_60              2652630
max_180             6311430
std_180             6311430
diff_rm_7           1067150
diff_rm_56          2561160
diff_rm_140         5122320
lag_0                853720
lag_1                853720
lag_2                853720
lag_3                853720
lag_4                853720
lag_5            

In [3]:
if __name__ == "__main__":
    # Train one LightGBM model per store and per forecast step, predict the
    # matching 7-day slice of the 28-day horizon, then assemble and submit.

    horizon = 28
    # Load the feature table once; per-store frames are sliced from it below
    # instead of re-reading the pickle for every store/step combination.
    feats = pd.read_pickle('data/feats.pkl')
    print(feats.shape)
    STEPS = [7,14,21,28] # Training/Prediction every 7 days (Compromise between very granular and long time horizon)
    TARGET = ['sales']
    VAL_DAYS, TEST_DAYS = STEPS[0], STEPS[0]
    STORES = feats.store_id.unique()
    DEPTS = feats.dept_id.unique()

    train_start = feats.d.min()
    train_end = 1941 - horizon
    first_val_day = train_end + 1
    last_val_day = 1941
    first_pred_day = 1941 + 1

    remove_colums = ['id', 'store_id', 'state_id', 'd', 'sales']
    train_columns = feats.columns[~feats.columns.isin(remove_colums)]
    lags_columns = ['rm_7', 'rm_14', 'rm_30', 'rm_60',
    'rm_180', 'max_7', 'std_7', 'max_14', 'std_14', 'max_30', 'std_30',
    'max_60', 'std_60', 'max_180', 'std_180', 'diff_rm_7', 'diff_rm_56',
    'diff_rm_140', 'lag_0', 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5',
    'lag_6', 'rolling_zero_7', 'rolling_zero_56', 'rolling_zero_140']

    print(f'train columns: {train_columns}')
    print(feats.shape)

    # Per-model prediction frames, concatenated once after the loops.
    prediction_parts = []

    for store in STORES:
        print(f'************ Training Store {store+1} ************')
        store_feats = feats[feats['store_id'] == store]
        #for dept in DEPTS:
        for step in STEPS:

            # Work on a private copy so the shifted features of one step do
            # not leak into the next one (and no write hits a filtered view).
            grid = store_feats.copy()# & (grid['dept_id'] == dept)]
            # Shift every sales-derived feature by `step` days within each id,
            # so a row only sees information that is at least `step` days old.
            grid[lags_columns] = grid.groupby(['id'], observed=False)[lags_columns].shift(step)
            mask = (grid['d'] <= 1941) & np.isnan(grid['sales'])
            grid = grid.loc[~mask]

            val_start = first_val_day + step - VAL_DAYS
            val_end = first_val_day + step - 1
            pred_start = first_pred_day + step - VAL_DAYS 
            pred_end = first_pred_day + step - 1
            print(f'Val start: {val_start}. Val end {val_end}. Test start {pred_start} Pred end {pred_end}')

            # Row selectors for the three windows (bounds are inclusive).
            in_train = grid['d'] <= train_end
            in_val = (grid['d'] >= val_start) & (grid['d'] <= val_end)
            in_pred = (grid['d'] >= pred_start) & (grid['d'] <= pred_end)

            trainX = grid[in_train][train_columns]
            trainY = grid[in_train][TARGET]
            valX = grid[in_val][train_columns]
            valY = grid[in_val][TARGET]
            testX = grid[in_pred][train_columns]
            print(f'Train shape: {trainX.shape}. Val shape: {valX.shape}. Test shape: {testX.shape}')
            # rnd_id = (random.sample(list(grid['id'].unique()), 1)[0])
            # print(grid[(grid['id'] == rnd_id) & (grid['d'] <= 1969)][['id', 'd', 'sales', 'lag_0', 'lag_1', 'lag_3', 'rm_7', 'rm_14']].sort_values(by=['d']).tail(35))

            # Train
            lgbm = LGBMRegressor(**lgb_params)
            callbacks = [early_stopping(stopping_rounds=50, first_metric_only=False)]

            lgbm.fit(trainX, trainY,
                     eval_set=[(valX, valY)],
                     eval_metric='rmse',
                     callbacks=callbacks)

            # Predict
            yhat = lgbm.predict(testX, num_iteration=lgbm.best_iteration_)
            preds = grid[in_pred][['id', 'd']]
            preds['sales'] = yhat
            prediction_parts.append(preds)

            # Optional diagnostics: feature importance by gain.
            # gain_importances = lgbm.booster_.feature_importance(importance_type='gain')
            # sorted_indices = gain_importances.argsort()[::-1]
            # for index in sorted_indices:
            #     print(f"{trainX.columns[index]}: {gain_importances[index]}")

    predictions = pd.concat(prediction_parts, axis=0)

    # Submission
    predictions.to_pickle('submissions/store_dpt_4days.pkl')
    submission = pd.read_csv('data/sample_submission.csv')
    # Wide format: one row per id, one column per predicted day, relabelled
    # with the sample submission's column names.
    predictions = predictions.pivot(index='id', columns='d', values='sales').reset_index()
    predictions.columns = submission.columns
    # Ids present in the sample submission but without predictions are filled with 1.
    predictions = submission[['id']].merge(predictions, on='id', how='left').fillna(1)
    submission_file = "submissions/submission.csv"
    predictions.to_csv(submission_file, index=False)
    message = "Automated submission"
    competition_name = "m5-forecasting-accuracy"
    submit_to_kaggle(competition_name, submission_file, message)

(23629750, 57)
train columns: Index(['item_id', 'dept_id', 'cat_id', 'event_name_1', 'event_type_1',
       'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
       'sell_price', 'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_dw', 'tm_w_end',
       'rm_7', 'rm_14', 'rm_30', 'rm_60', 'rm_180', 'max_7', 'std_7', 'max_14',
       'std_14', 'max_30', 'std_30', 'max_60', 'std_60', 'max_180', 'std_180',
       'diff_rm_7', 'diff_rm_56', 'diff_rm_140', 'lag_0', 'lag_1', 'lag_2',
       'lag_3', 'lag_4', 'lag_5', 'lag_6', 'rolling_zero_7', 'rolling_zero_56',
       'rolling_zero_140', 'price_max', 'price_min', 'price_std', 'price_mean',
       'price_norm', 'price_nunique', 'item_nunique'],
      dtype='object')
(23629750, 57)
************ Training Store 1 ************
Val start: 1914. Val end 1920. Test start 1942 Pred end 1948
Train shape: (2192231, 52). Val shape: (21343, 52). Test shape: (21343, 52)
Training until validation scores don't improve for 50 rounds
Early stopping, best iter

Traceback (most recent call last):
  File "/root/miniconda3/envs/lightgbm/lib/python3.10/site-packages/urllib3/connection.py", line 203, in _new_conn
    sock = connection.create_connection(
  File "/root/miniconda3/envs/lightgbm/lib/python3.10/site-packages/urllib3/util/connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/root/miniconda3/envs/lightgbm/lib/python3.10/socket.py", line 955, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -3] Temporary failure in name resolution

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/root/miniconda3/envs/lightgbm/lib/python3.10/site-packages/urllib3/connectionpool.py", line 790, in urlopen
    response = self._make_request(
  File "/root/miniconda3/envs/lightgbm/lib/python3.10/site-packages/urllib3/connectionpool.py", line 491, in _make_re

In [5]:
# Spot-check one series and the stored dtypes of a cached feature pickle.
# NOTE(review): 'data/grid_features.pkl' is only written when the commented-out
# to_pickle line in the feature cell is enabled; the active pipeline writes
# 'data/feats.pkl' — confirm which file is meant to be inspected here.
rvw_grid = pd.read_pickle('data/grid_features.pkl')
rvw_cols = ['id','d','sales','snap_CA', 'snap_TX', 'snap_WI','lag_0','lag_1','lag_2','rm_7','rm_14']
# Only the last expression of a cell is displayed, so print the tail explicitly
# instead of computing it and throwing the result away.
print(rvw_grid[rvw_grid['id']=='FOODS_1_002_WI_3_evaluation'][rvw_cols].tail(28+14))
rvw_grid.dtypes

id               object
item_id           int16
dept_id            int8
cat_id             int8
store_id           int8
state_id           int8
d                 int16
sales             int16
event_name_1       int8
event_type_1       int8
event_name_2       int8
event_type_2       int8
snap_CA            int8
snap_TX            int8
snap_WI            int8
sell_price      float16
tm_d               int8
tm_w               int8
tm_m               int8
tm_y               int8
tm_dw              int8
tm_w_end           int8
rm_7            float16
rm_14           float16
rm_30           float16
rm_60           float16
rm_180          float16
lag_0             int16
lag_1           float16
lag_2           float16
lag_3           float16
lag_4           float16
lag_5           float16
lag_6           float16
dtype: object

### Things to try
1. Try to use -1 for NAs
2. Do nothing with NAs
3. Leave lags+rms and try more steps --> 2 steps > 1 step > 7 steps (4 steps missing)
4. Leave lags+rms and try store+dept --> almost negligible improvement (bug still exists)
5. Get more data --> 0.2 improvement (bug still there)
6. Remove categorical variables ()