In [None]:
from  datetime import datetime, timedelta
import pandas as pd
import pickle
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from downcast import reduce
from numba import jit
import time
import gc
from downcast import reduce
import random
import warnings
from math import ceil
from tqdm import trange
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
warnings.filterwarnings("ignore")

In [None]:
# The dataset is large, so numeric columns are shrunk to smaller dtypes to save
# memory. Two methods are available: the `downcast` package or manual casting.
# NOTE: this function is deliberately NOT decorated with numba's @jit. It works on
# pandas objects, which numba cannot compile; the decorator either fell back to
# plain-Python object mode (older numba) or failed outright (newer numba).
def reduce_mem_usage(df, verbose=True, method='downcast'):
    """Shrink the numeric columns of ``df`` to narrower dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. With the manual method its columns are re-assigned
        in place, so the caller's frame is modified as well as returned.
    verbose : bool, default True
        Print the resulting size, the relative reduction and the elapsed time.
    method : str, default 'downcast'
        ``'downcast'`` delegates to ``downcast.reduce``; any other value uses the
        manual min/max based casting implemented below.

    Returns
    -------
    pandas.DataFrame
        The frame with reduced dtypes.
    """
    start = time.time()
    # int8 is absent on purpose: such columns are already as small as they can be.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    if method == 'downcast':
        df = reduce(df)
    else:
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type not in numerics:
                continue
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Pick the narrowest integer type whose range contains the data.
                # The bounds are inclusive, so e.g. a maximum of exactly 127
                # still fits int8.
                for int_type in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(int_type)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(int_type)
                        break
            else:
                # NOTE(review): float16 keeps only ~3 significant decimal digits;
                # confirm that this precision loss is acceptable for every column.
                # An all-NaN or infinite column fails every comparison and
                # therefore ends up as float64.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard against a zero-sized frame so the report cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction) and time is '.format(end_mem, reduction)
              + str(time.time() - start))
    return df

In [None]:
# Earliest day index kept for training; rows with an earlier `d` are excluded so
# that only the most recent ~3 years of data are used.
FIRST_DAY = 846

# Columns that are never passed to the model as features.
# 'item_id' is intentionally NOT listed, so it remains a model feature.
remove_feature = ['id',
                  'state_id',
                  'store_id',
                  'dept_id',
                  'cat_id',
                  'date', 'wm_yr_wk', 'd', 'sales']

# Candidate categorical columns, minus the ones removed above. The filter keeps
# the declaration order; a set difference would give an arbitrary order that
# changes from one kernel session to the next.
cat_var = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
cat_var = [name for name in cat_var if name not in remove_feature]

In [None]:
# Groups of feature-column names, built from their shared naming patterns.
# NOTE(review): presumably these mirror the columns of the pre-computed feature
# pickles; nothing in the visible notebook reads them, so confirm before relying on them.

# Price-related columns.
grid2_colnm = (
    ['sell_price']
    + [f'price_{stat}' for stat in ('max', 'min', 'std', 'mean', 'norm', 'nunique')]
    + ['item_nunique']
    + ['price_momentum' + suffix for suffix in ('', '_m', '_y')]
)

# Event, SNAP and date-part columns.
grid3_colnm = (
    [f'event_{kind}_{slot}' for slot in (1, 2) for kind in ('name', 'type')]
    + [f'snap_{st}' for st in ('CA', 'TX', 'WI')]
    + [f'tm_{part}' for part in ('d', 'w', 'm', 'y', 'wm', 'dw', 'w_end')]
)

# Sales lags 28..42 followed by rolling mean/std over four window lengths.
lag_colnm = (
    [f'sales_lag_{lag}' for lag in range(28, 43)]
    + [f'rolling_{stat}_{window}' for window in (7, 14, 30, 60) for stat in ('mean', 'std')]
)

# Mean/std encodings for two grouping keys.
mean_enc_colnm = [
    f'enc_{group}_{stat}'
    for group in ('store_id_dept_id', 'item_id_state_id')
    for stat in ('mean', 'std')
]

In [None]:
########################### Model params
#################################################################################
# LightGBM hyper-parameters shared by every model trained in this notebook.
lgb_params = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    # Row bagging: half of the rows, re-drawn at every iteration.
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.015,
    # 255 leaves per tree and at least 255 rows per leaf.
    num_leaves=2 ** 8 - 1,
    min_data_in_leaf=2 ** 8 - 1,
    # Column sampling: half of the features per tree.
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=3000,
    boost_from_average=False,
    verbose=-1,
)

In [None]:
# Cross-validation windows to run; each name is a key of `validation` below.
cvs = ['public']

# Store ids: four stores for CA, three each for TX and WI.
STORES = [
    f'{state_code}_{number}'
    for state_code, store_count in (('CA', 4), ('TX', 3), ('WI', 3))
    for number in range(1, store_count + 1)
]

CATS = ['HOBBIES', 'HOUSEHOLD', 'FOODS']

# Department ids: two each for HOBBIES and HOUSEHOLD, three for FOODS.
DEPTS = [
    f'{category}_{number}'
    for category, dept_count in zip(CATS, (2, 2, 3))
    for number in range(1, dept_count + 1)
]

# Window name -> [last training day, last validation day].
validation = dict(
    cv=[1885, 1913],
    public=[1913, 1941],
    private=[1941, 1969],
)

In [None]:
%%time

traindata = pd.read_pickle("E:/traindata.pkl").reset_index(drop=True)

In [None]:
########################### Predict 
#################################################################################
# For each cross-validation window in `cvs`, train one LightGBM model per
# (store, department) slice of `traindata`, pickle the model, and write that
# slice's predictions for the validation window and for the days after it to CSV.

rmsse_bycv = dict()  # initialised here; this cell does not fill it in


def _save_wide_predictions(id_day_rows, predictions, csv_path):
    """Write predictions to ``csv_path`` with one row per id and one column per day.

    id_day_rows -- rows of the id/day lookup frame; must contain 'id' and 'd'
    predictions -- frame with a single 'y_pred' column, joined to ``id_day_rows``
                   on the index
    csv_path    -- destination CSV file
    """
    combined = pd.concat([id_day_rows, predictions], axis=1)
    combined.pivot(index='id', columns='d', values='y_pred').to_csv(csv_path)


for cv in cvs:
    print('cv : day', validation[cv])
    train_end, valid_end = validation[cv]

    for store in STORES:
        for dept in DEPTS:

            print(store, dept, 'start')
            start = time.time()
            grid_df = traindata[(traindata['store_id'] == store) & (traindata['dept_id'] == dept)]

            # Every column that is not listed in remove_feature is a model input.
            model_var = grid_df.columns[~grid_df.columns.isin(remove_feature)]

            # Train on FIRST_DAY..train_end, validate on the following window,
            # and forecast everything after the validation window.
            tr_mask = (grid_df['d'] <= train_end) & (grid_df['d'] >= FIRST_DAY)
            vl_mask = (grid_df['d'] > train_end) & (grid_df['d'] <= valid_end)
            pr_mask = (grid_df['d'] > valid_end)

            train_data = lgb.Dataset(grid_df.loc[tr_mask, model_var],
                                     label=grid_df.loc[tr_mask, 'sales'])

            valid_data = lgb.Dataset(grid_df.loc[vl_mask, model_var],
                                     label=grid_df.loc[vl_mask, 'sales'])

            # NOTE(review): lgb_params also sets 'n_estimators', which LightGBM
            # treats as an alias for the number of boosting rounds -- confirm
            # whether that value or num_boost_round below is meant to apply.
            m_lgb = lgb.train(lgb_params, train_data, valid_sets = [valid_data, train_data], verbose_eval=50, 
                              num_boost_round = 1000, early_stopping_rounds = 50)
            print('time: ' + str(time.time() - start))
            model_name = 'non_recur_model_'+store+'_'+dept+'.bin'
            # Context manager so the file handle is closed once the model is written.
            with open(model_name, 'wb') as model_file:
                pickle.dump(m_lgb, model_file)

            # Keep the original row labels so the predictions can be matched
            # with their id/day rows further down.
            indice_valid = grid_df.index[vl_mask.to_numpy()].tolist()
            indice_pred = grid_df.index[pr_mask.to_numpy()].tolist()
            prediction_valid = pd.DataFrame({'y_pred': m_lgb.predict(grid_df.loc[vl_mask, model_var])},
                                            index=indice_valid)
            prediction_pred = pd.DataFrame({'y_pred': m_lgb.predict(grid_df.loc[pr_mask, model_var])},
                                           index=indice_pred)

            # Release the training objects before the lookup frame is loaded.
            del grid_df, train_data, valid_data, m_lgb, tr_mask, vl_mask, pr_mask
            gc.collect()

            # The row labels are used as positions here.
            # NOTE(review): assumes grid_part_1.pkl has the same row order as
            # traindata and a default 0..n-1 index -- TODO confirm.
            grid_1 = pd.read_pickle('E:/grid_part_1.pkl')
            _save_wide_predictions(grid_1.iloc[indice_valid], prediction_valid,
                                   f'submission_storeandcat_{store}_{dept}_{cv}_valid.csv')
            # The department is part of the file name: without it the seven
            # departments of a store overwrite each other's forecast file. The
            # 'onlystore' tag is kept because the submission cell selects files
            # by that substring. Files left over from earlier runs under the old
            # name (without the department) should be deleted before aggregating.
            _save_wide_predictions(grid_1.iloc[indice_pred], prediction_pred,
                                   f'submission_onlystore_{store}_{dept}_{cv}_pred.csv')

            del grid_1
            gc.collect()



In [None]:
import os

# Assemble the final submission: start from the sample submission, add every
# per-model prediction file found in the working directory, and write the result.

SAMPLE_SUBMISSION_PATH = 'D:/HKUST/MAFS6010Z AI in Fintech/project3/sample_submission.csv'
# Rows in the first ("valid") half of the sample submission; the remaining rows
# form the second ("pred") half.
N_SERIES = 30490
# Substrings that identify prediction files. The training cell writes its
# validation files with the 'storeandcat' tag and its forecast files with the
# 'onlystore' tag, so both must be accepted here.
PREDICTION_FILE_TAGS = ('onlystore', 'storeandcat')

# Read the template once and split it into its two halves.
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_PATH).set_index('id')
submission_pred = sample_submission.iloc[N_SERIES:]
submisssion_valid = sample_submission.iloc[:N_SERIES]
sub_id_valid = pd.DataFrame({'id':submisssion_valid.index.tolist()})
sub_id_pred = pd.DataFrame({'id':submission_pred.index.tolist()})

# Sorted so the files are always processed in the same order.
pri = sorted(a for a in os.listdir() if any(tag in a for tag in PREDICTION_FILE_TAGS))
valid = [a for a in pri if 'valid' in a]
pred = [a for a in pri if 'pred' in a]

fcol = [f'F{i}' for i in range(1,29)]


def _sum_prediction_files(files, template, id_frame):
    """Return a copy of ``template`` with the values of every file in ``files`` added.

    files    -- CSV paths; the first column holds the id, the next 28 the daily values
    template -- frame indexed by id with columns F1..F28
    id_frame -- one-column ('id') frame listing the template's ids in order
    Ids that a file does not contain contribute 0.
    """
    total = template.copy()
    for file in files:
        temp = pd.read_csv(file).iloc[:, :len(fcol) + 1]
        temp.columns = ['id'] + fcol
        # The left merge keeps the template's row order; missing ids become NaN -> 0.
        total += id_frame.merge(temp, how='left', on='id').set_index('id').fillna(0)
    total.columns = fcol
    return total


# An empty list would silently leave that half at the template values, so say so.
for label, files in (('valid', valid), ('pred', pred)):
    if not files:
        print(f'No {label} prediction files found in {os.getcwd()}')

sub_copy = _sum_prediction_files(valid, submisssion_valid, sub_id_valid)
sub_copy1 = _sum_prediction_files(pred, submission_pred, sub_id_pred)

df = pd.concat([sub_copy, sub_copy1])
df.to_csv('E:/submission_nonrecursive_store_cat.csv')