In [1]:
import numpy as np
import pandas as pd
import os
import time
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import pickle
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

In [6]:
# Read the pre-processed table from its pickle.
process_df = pd.read_pickle('processed.pkl')

# Columns that are never passed to the models. Prediction is run separately
# per store and per category, so 'store_id' and 'cat_id' are constant within
# each slice and carry no signal. 'dept_id' is deliberately NOT listed here:
# it is kept as a feature.
unused_features = [
    'id', 'state_id', 'store_id', 'cat_id',
    'date', 'wm_yr_wk', 'd',
    'sales', 'revenue',
]

# Model inputs: every column of process_df that is not excluded above.
is_unused = process_df.columns.isin(unused_features)
used_features = process_df.columns[~is_unused]

# Rows with d below FIRST_DAY are dropped by prepare_data.
FIRST_DAY = 0

# The store/category grid that is iterated over when predicting.
STORES = [
    'CA_1', 'CA_2', 'CA_3', 'CA_4',
    'TX_1', 'TX_2', 'TX_3',
    'WI_1', 'WI_2', 'WI_3',
]
CATEGORIES = ['HOBBIES', 'HOUSEHOLD', 'FOODS']

def prepare_data(df, store, category):
    """Select the rows of ``df`` for one store/category pair.

    Args:
        df: DataFrame with 'd', 'store_id' and 'cat_id' columns.
        store: value matched (by equality) against 'store_id'.
        category: value matched (by equality) against 'cat_id'.

    Returns:
        The subset of ``df`` whose 'd' is at least the module-level
        FIRST_DAY and whose store and category match; the original row
        index labels are kept.
    """
    from_first_day = df['d'] >= FIRST_DAY
    same_store = df['store_id'] == store
    same_category = df['cat_id'] == category
    return df[from_first_day & same_store & same_category]

# Directory the per-slice models are loaded from; each file is a pickle
# named '<store>_<category>.bin'.
model_dir = './models_store_cat/'

In [7]:
# Score every (store, category) slice with its own pre-trained model and
# collect the results. Each per-slice frame keeps the row index of
# process_df so the predictions can be joined back onto it by index.
predictions = []
for store in STORES:
    for category in CATEGORIES:
        print('starting:',store,category)

        # Test rows for this slice: everything dated after 2016-04-24.
        slice_df = prepare_data(process_df, store, category)
        x_test = slice_df[slice_df['date'] > '2016-04-24']

        # Load the model saved for this slice. The context manager closes
        # the file handle after each load instead of leaking one per slice.
        # NOTE: pickle.load can execute arbitrary code -- only load model
        # files from a trusted source.
        model_name = store+'_'+category+'.bin'
        with open(os.path.join(model_dir, model_name), 'rb') as model_file:
            model = pickle.load(model_file)

        # Predict on the feature columns only and keep the row labels.
        pred = model.predict(x_test[used_features])
        prediction = pd.DataFrame({'pred': pred}, index=x_test.index)
        predictions.append(prediction)

        # Release the per-slice frames before moving to the next slice.
        del slice_df, x_test

# Single frame with one 'pred' column covering all slices.
pred_df = pd.concat(predictions)

starting: CA_1 HOBBIES
starting: CA_1 HOUSEHOLD
starting: CA_1 FOODS
starting: CA_2 HOBBIES
starting: CA_2 HOUSEHOLD
starting: CA_2 FOODS
starting: CA_3 HOBBIES
starting: CA_3 HOUSEHOLD
starting: CA_3 FOODS
starting: CA_4 HOBBIES
starting: CA_4 HOUSEHOLD
starting: CA_4 FOODS
starting: TX_1 HOBBIES
starting: TX_1 HOUSEHOLD
starting: TX_1 FOODS
starting: TX_2 HOBBIES
starting: TX_2 HOUSEHOLD
starting: TX_2 FOODS
starting: TX_3 HOBBIES
starting: TX_3 HOUSEHOLD
starting: TX_3 FOODS
starting: WI_1 HOBBIES
starting: WI_1 HOUSEHOLD
starting: WI_1 FOODS
starting: WI_2 HOBBIES
starting: WI_2 HOUSEHOLD
starting: WI_2 FOODS
starting: WI_3 HOBBIES
starting: WI_3 HOUSEHOLD
starting: WI_3 FOODS


In [9]:
# Post-process predictions into the wide submission layout:
# one row per id, one column per forecast day (F1..F28).
def _to_submission_wide(long_df, horizon=28):
    """Pivot long (id, d, pred) rows into one row per id with F1..F<horizon>.

    Relies on the pivot returning the 'd' columns in ascending order and on
    ``long_df`` containing exactly ``horizon`` distinct 'd' values; the
    column assignment raises if the column count does not match.
    """
    wide = long_df.pivot(index='id', columns='d', values='pred').reset_index()
    wide.columns = ['id'] + ['F' + str(i + 1) for i in range(horizon)]
    return wide

# Attach predictions to the test rows by row index (inner join, so only rows
# that received a prediction are kept).
x_test = process_df[(process_df['date'] > '2016-04-24')]
x_test = pd.merge(x_test,pred_df,left_index=True,right_index=True)

# Days below 1942 form the validation block; days from 1942 on form the
# evaluation block.
validation = _to_submission_wide(x_test[(x_test['d'] < 1942)])
# Validation rows are emitted under '..._validation' ids.
# NOTE(review): presumably the ids in process_df carry the 'evaluation'
# suffix -- confirm against the preprocessing step.
validation['id'] = validation['id'].apply(lambda x: x.replace('evaluation', 'validation'))

evaluation = _to_submission_wide(x_test[(x_test['d'] >= 1942)])

# Stack validation rows on top of evaluation rows with a fresh 0..n-1 index.
output = pd.concat([validation, evaluation]).reset_index(drop=True)
output.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.868429,0.718662,0.692986,0.660371,1.073689,0.993414,1.019892,1.192062,0.948195,...,0.870443,1.237073,1.015062,1.132092,0.918498,0.918498,0.846197,0.956245,1.161541,1.082484
1,FOODS_1_001_CA_2_validation,0.781524,0.764359,0.764359,0.731553,0.77119,0.878091,0.713753,0.689797,0.689504,...,0.718925,0.799269,2.382983,1.259099,1.399119,1.229053,1.737313,1.647683,2.02205,1.565781
2,FOODS_1_001_CA_3_validation,1.586742,1.460268,1.465666,1.408763,1.331178,1.040066,1.104111,1.000917,0.949795,...,1.14888,1.107104,2.505461,1.568891,1.38674,1.379256,1.379256,1.398266,1.737964,1.120408
3,FOODS_1_001_CA_4_validation,0.606219,0.51555,0.471472,0.471472,0.471472,0.468293,0.45293,0.464048,0.411311,...,0.255704,0.265226,0.265648,0.440197,0.35111,0.35111,0.35111,0.409133,0.498162,0.523558
4,FOODS_1_001_TX_1_validation,0.525399,0.497885,0.467689,0.467689,0.436409,0.510066,0.590989,0.465476,0.492538,...,0.404612,0.434402,0.488394,0.38006,0.358222,0.356112,0.354612,0.36214,0.431311,0.461425


In [10]:
# Write the submission file; the row index is a plain 0..n-1 range after the
# reset above, so it is left out of the CSV.
output.to_csv(path_or_buf='submission_test_8.csv', index=False)