In [1]:
import numpy as np
import pandas as pd
import os
import time
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import pickle
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the preprocessed table that holds both features and bookkeeping columns.
process_df = pd.read_pickle('processed.pkl')

# Columns excluded from the model inputs. Models are handled per store and
# department, so store_id, cat_id and dept_id carry no information for them.
unused_features = [
    'id', 'state_id', 'store_id', 'cat_id', 'dept_id',
    'date', 'wm_yr_wk', 'd',
    'sales', 'revenue',
]

# Model inputs: every remaining column, kept in the table's column order.
_feature_mask = ~process_df.columns.isin(unused_features)
used_features = process_df.columns[_feature_mask]

# Rows whose 'd' value is below FIRST_DAY are dropped by prepare_data.
FIRST_DAY = 0
STORES = [
    'CA_1', 'CA_2', 'CA_3', 'CA_4',
    'TX_1', 'TX_2', 'TX_3',
    'WI_1', 'WI_2', 'WI_3',
]
DEPTS = [
    'HOBBIES_1', 'HOBBIES_2',
    'HOUSEHOLD_1', 'HOUSEHOLD_2',
    'FOODS_1', 'FOODS_2', 'FOODS_3',
]

def prepare_data(df, store, state):
    """Return the rows of ``df`` for one store/department pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns 'd', 'store_id' and 'dept_id'.
    store : str
        Value to match against the 'store_id' column.
    state : str
        Value to match against the 'dept_id' column (despite its name, this
        is a department id, not a state).

    Returns
    -------
    pandas.DataFrame
        Rows with ``d >= FIRST_DAY`` that belong to the requested store and
        department, with their original index and row order.
    """
    recent_enough = df['d'] >= FIRST_DAY
    same_store = df['store_id'] == store
    same_dept = df['dept_id'] == state
    return df[recent_enough & same_store & same_dept]

# Directory holding the pickled models, one file per store/department pair
# named '<store>_<dept>.bin'.
model_dir = './models_store_dept/'

In [7]:
# Predict the forecast window for every store/department pair and collect
# the results in one frame indexed like process_df.
predictions = []
for store in STORES:
    for dept in DEPTS:
        print('starting:', store, dept)

        # Rows of this store/department that fall in the forecast window.
        store_dept_df = prepare_data(process_df, store, dept)
        x_test = store_dept_df[store_dept_df['date'] > '2016-04-24']

        # Load the model saved for this pair. The context manager closes the
        # file handle right away instead of leaking one per iteration.
        # NOTE: pickle.load can execute arbitrary code -- only load model
        # files that come from a trusted source.
        model_path = os.path.join(model_dir, store + '_' + dept + '.bin')
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)

        # Predict and keep the source row index so every prediction stays
        # aligned with the row it was computed from.
        pred = model.predict(x_test[used_features])
        prediction = pd.DataFrame({'pred': pred})
        prediction.index = x_test.index
        predictions.append(prediction)

        # Release the per-pair slices before the next iteration.
        del store_dept_df, x_test
pred_df = pd.concat(predictions)

starting: CA_1 HOBBIES_1
starting: CA_1 HOBBIES_2
starting: CA_1 HOUSEHOLD_1
starting: CA_1 HOUSEHOLD_2
starting: CA_1 FOODS_1
starting: CA_1 FOODS_2
starting: CA_1 FOODS_3
starting: CA_2 HOBBIES_1
starting: CA_2 HOBBIES_2
starting: CA_2 HOUSEHOLD_1
starting: CA_2 HOUSEHOLD_2
starting: CA_2 FOODS_1
starting: CA_2 FOODS_2
starting: CA_2 FOODS_3
starting: CA_3 HOBBIES_1
starting: CA_3 HOBBIES_2
starting: CA_3 HOUSEHOLD_1
starting: CA_3 HOUSEHOLD_2
starting: CA_3 FOODS_1
starting: CA_3 FOODS_2
starting: CA_3 FOODS_3
starting: CA_4 HOBBIES_1
starting: CA_4 HOBBIES_2
starting: CA_4 HOUSEHOLD_1
starting: CA_4 HOUSEHOLD_2
starting: CA_4 FOODS_1
starting: CA_4 FOODS_2
starting: CA_4 FOODS_3
starting: TX_1 HOBBIES_1
starting: TX_1 HOBBIES_2
starting: TX_1 HOUSEHOLD_1
starting: TX_1 HOUSEHOLD_2
starting: TX_1 FOODS_1
starting: TX_1 FOODS_2
starting: TX_1 FOODS_3
starting: TX_2 HOBBIES_1
starting: TX_2 HOBBIES_2
starting: TX_2 HOUSEHOLD_1
starting: TX_2 HOUSEHOLD_2
starting: TX_2 FOODS_1
starting

In [9]:
# postprocess predictions
FORECAST_HORIZON = 28        # number of forecast columns per row (F1..F28)
EVALUATION_FIRST_DAY = 1942  # rows with d below this go to the validation part


def _pivot_to_submission(long_preds):
    """Reshape long-format predictions into one row per id.

    ``long_preds`` needs the columns 'id', 'd' and 'pred'. The result has an
    'id' column followed by F1..F<FORECAST_HORIZON>, one column per distinct
    'd' value in ascending order. Assigning the column names raises a
    ValueError if the number of distinct days differs from FORECAST_HORIZON.
    """
    wide = long_preds.pivot(index='id', columns='d', values='pred').reset_index()
    wide.columns = ['id'] + ['F' + str(i + 1) for i in range(FORECAST_HORIZON)]
    return wide


# Attach the predictions to their source rows; both frames share the
# process_df row index.
x_test = process_df[(process_df['date'] > '2016-04-24')]
x_test = pd.merge(x_test, pred_df, left_index=True, right_index=True)

# Validation part: ids are relabelled from '...evaluation' to '...validation'.
validation = _pivot_to_submission(x_test[x_test['d'] < EVALUATION_FIRST_DAY])
validation['id'] = validation['id'].apply(lambda x: x.replace('evaluation', 'validation'))

# Evaluation part: ids are kept as they are.
evaluation = _pivot_to_submission(x_test[x_test['d'] >= EVALUATION_FIRST_DAY])

output = pd.concat([validation, evaluation]).reset_index(drop=True)
output.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_evaluation,0.960049,0.941398,0.916143,0.902786,1.002268,1.016012,1.009466,0.876869,0.974318,...,1.020311,1.088799,1.07227,0.928562,0.928562,0.917426,0.919192,0.991947,0.957871,0.927896
1,FOODS_1_001_CA_2_evaluation,1.000035,1.148915,1.088142,1.067266,1.118961,1.361792,1.344284,1.239497,1.028131,...,1.155126,1.227324,1.219188,1.0454,0.912316,0.87961,0.905048,0.951888,1.058583,1.053538
2,FOODS_1_001_CA_3_evaluation,1.131685,1.129222,1.158137,1.155616,1.322831,1.445827,1.44268,1.297841,1.301602,...,1.132886,1.271607,1.388356,1.243192,1.172656,1.181681,1.179109,1.209036,1.335693,1.335693
3,FOODS_1_001_CA_4_evaluation,0.41641,0.417793,0.415622,0.415622,0.416258,0.415737,0.415888,0.414774,0.415514,...,0.414842,0.414826,0.414826,0.414468,0.414468,0.412181,0.41345,0.411786,0.411786,0.411786
4,FOODS_1_001_TX_1_evaluation,0.390817,0.390817,0.458002,0.456844,0.480519,0.484805,0.500836,0.491022,0.492315,...,0.542616,0.531306,0.531306,0.536336,0.729625,0.729093,0.752975,0.77409,0.79225,0.817328


In [10]:
# Write the combined validation + evaluation table to CSV, omitting the row index.
output.to_csv('submission_test_6.csv', index = False)