In [0]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
base_pkl = '/content/drive/Shared drives/MyShare/M5/pkl/' # fixed input directory: the precomputed feature pickles are read from here
base_m5 = '/content/drive/Shared drives/MyShare/M5/'
Stores_path = '/content/drive/Shared drives/MyShare/M5/yk/Stores/'
Stores_H_path = '/content/drive/Shared drives/MyShare/M5/yk/Stores_Holiday/'
Model_path = '/content/drive/Shared drives/MyShare/M5/yk/Model/' # directory holding the trained model files (e.g. the CA_1 model)
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's legacy global generator.

    :param seed: integer seed handed to both generators (default 0).
    """
    # Same order as before: the stdlib generator first, then NumPy.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


## 此处做一下修改，看看是否能够提高速度
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Apply ``func`` to every element of ``t_split`` in a process pool.

    :param func: callable taking one element of ``t_split`` and returning a
        pandas object; it is sent to worker processes by ``Pool.map``.
    :param t_split: sequence of work items, one task per item.
    :return: the per-item results concatenated column-wise (``axis=1``).
    """
    # Never start more workers than there are tasks.
    num_cores = int(np.min([N_CORES, len(t_split)]))
    pool = Pool(num_cores)
    try:
        df = pd.concat(pool.map(func, t_split), axis=1)
    finally:
        # Always release the workers, even when a task or the concat fails;
        # previously an exception left the pool's processes running.
        pool.close()
        pool.join()
    return df



########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    """Assemble the feature grid for a single store.

    Reads the BASE, PRICE and CALENDAR pickles side by side, keeps the rows
    whose ``store_id`` equals ``store``, then appends the mean-encoding and
    lag feature columns for those same rows.

    :param store: store identifier compared against the ``store_id`` column.
    :return: ``(grid, features)`` where ``grid`` holds ``id``, ``d``, TARGET
        and the feature columns for days ``d >= START_TRAIN`` (index reset),
        and ``features`` is the list of feature column names.
    """
    # Base grid plus price and calendar columns; the first two columns of the
    # price and calendar pickles are skipped.
    grid = pd.concat(
        [
            pd.read_pickle(BASE),
            pd.read_pickle(PRICE).iloc[:, 2:],
            pd.read_pickle(CALENDAR).iloc[:, 2:],
        ],
        axis=1,
    )
    grid = grid[grid['store_id'] == store]

    # Mean-encoding features, restricted to this store's rows.
    enc_part = pd.read_pickle(MEAN_ENC)[mean_features]
    enc_part = enc_part[enc_part.index.isin(grid.index)]

    # Lag features (first three columns skipped), restricted the same way.
    lag_part = pd.read_pickle(LAGS).iloc[:, 3:]
    lag_part = lag_part[lag_part.index.isin(grid.index)]

    # Append one part at a time and drop it right away to limit peak memory.
    grid = pd.concat([grid, enc_part], axis=1)
    del enc_part

    grid = pd.concat([grid, lag_part], axis=1)
    del lag_part

    # An earlier experiment merged calendar dates and California earthquake
    # disaster declarations ('us_disasters_m5.csv') in at this point; it is
    # disabled.

    # Every column not explicitly excluded is a model feature.
    features = [name for name in grid.columns if name not in remove_features]
    grid = grid[['id', 'd', TARGET] + features]

    # Skip the first days if START_TRAIN asks for it.
    grid = grid[grid['d'] >= START_TRAIN].reset_index(drop=True)

    return grid, features

# Recombine Test set after training
def get_base_test():
    """Load and stack the saved per-store test frames.

    For every store in STORES_IDS the pickle ``test_<store>.pkl`` is read,
    tagged with a ``store_id`` column and appended to the result.

    :return: one DataFrame with all stores stacked row-wise and a fresh
        0..n-1 index; an empty DataFrame when STORES_IDS is empty.
    """
    # NOTE(review): the training loop writes test_<store>.pkl under
    # pkl_bin_path ('.../yk/pkl_bin/'), while this reads from the parent
    # '.../yk/' directory. Confirm which copy is meant to be used.
    test_dir = '/content/drive/Shared drives/MyShare/M5/yk/'

    frames = []
    for store_id in STORES_IDS:
        # Saved slice covering days END_TRAIN-100 .. 1941 (per the training cell).
        temp_df = pd.read_pickle(test_dir + 'test_' + store_id + '.pkl')
        temp_df['store_id'] = store_id
        frames.append(temp_df)

    if not frames:
        return pd.DataFrame()

    # One concat instead of growing the frame inside the loop.
    return pd.concat(frames).reset_index(drop=True)

########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    """Build one lag feature of TARGET from the global ``base_test`` frame.

    :param LAG_DAY: number of rows to shift TARGET by within each ``id``.
    :return: single-column DataFrame named ``sales_lag_<LAG_DAY>`` (float16),
        indexed like ``base_test``.
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # GroupBy.shift is the vectorised equivalent of
    # transform(lambda x: x.shift(LAG_DAY)), and building the result directly
    # avoids assigning a new column into a slice of base_test.
    lagged = base_test.groupby(['id'])[TARGET].shift(LAG_DAY).astype(np.float16)
    return lagged.to_frame(col_name)

## 递归特征
def make_lag_roll(LAG_DAY):
    """Build one shifted rolling-mean feature of TARGET from ``base_test``.

    :param LAG_DAY: pair ``[shift_day, roll_wind]``; TARGET is shifted by
        ``shift_day`` rows within each ``id`` and then averaged over a rolling
        window of ``roll_wind`` rows.
    :return: single-column DataFrame named
        ``rolling_mean_tmp_<shift_day>_<roll_wind>``, indexed like ``base_test``.
    """
    shift_day, roll_wind = LAG_DAY[0], LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)

    def _shifted_rolling_mean(series):
        # Shift first so the window only sees values at least shift_day rows back.
        return series.shift(shift_day).rolling(roll_wind).mean()

    rolled = base_test.groupby(['id'])[TARGET].transform(_shifted_rolling_mean)
    return rolled.to_frame(col_name)



import lightgbm as lgb

#  {'bagging_freq': 1,
#   'feature_fraction': 0.551,
#   'lambda_l2': 0.1268,
#   'learning_rate': 0.1985,
#   'max_bin': 149,
#   'min_data_in_leaf': 90790,
#   'n_estimators': 1200,
#   'num_leaves': 13670,
#   'sub_feature': 0.9796,
#   'sub_row': 0.9214,
#   'subsample': 0.884,
#   'subsample_freq': 2,
#   'tweedie_variance_power': 1.059},
# }


# {'params': {'bagging_freq': 0.11416346707885938,
#   'feature_fraction': 0.8129989099850855,
#   'lambda_l2': 0.13468152930048094,
#   'learning_rate': 0.03649707676236676,
#   'max_bin': 52.13500997821065,
#   'min_data_in_leaf': 8187.166548008022,
#   'n_estimators': 1027.8127261566087,
#   'num_leaves': 4067.4440080524473,
#   'sub_feature': 0.9174117673348705,
#   'sub_row': 0.9374157326506196,
#   'subsample': 0.9245883127404966,
#   'subsample_freq': 3.3207725360640246,
#   'tweedie_variance_power': 1.1230964763663194},
#  'target': -1.862147734953776}

# LightGBM training parameters. 'seed' is overwritten with SEED and
# 'n_estimators' may be reduced further down in this cell.
lgb_params = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.03,
    num_leaves=2 ** 11 - 1,
    min_data_in_leaf=2 ** 12 - 1,
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=1400,
    boost_from_average=False,
    verbose=-1,
    seed=42,
)





                
########################### Vars
#################################################################################
VER = 1                          # Model version, embedded in the saved model file name
SEED = 42                        # One seed shared by Python, NumPy and LightGBM
seed_everything(SEED)            # Make the run as deterministic as possible
lgb_params['seed'] = SEED
N_CORES = psutil.cpu_count()     # Available CPU cores

# Limits and constants
TARGET      = 'sales'            # Target column
START_TRAIN = 0                  # First day kept in the grid (skip NaN rows / train faster)
# END_TRAIN (last training day) is set in a later cell.
P_HORIZON   = 28                 # Prediction horizon in days

# Columns excluded from the model features: identifiers, the target, and
# columns that lead to overfitting or have values absent from the test set.
remove_features = ['id','state_id','store_id','date','wm_yr_wk','d',TARGET]

mean_features   = ['enc_cat_id_mean','enc_cat_id_std','enc_dept_id_mean','enc_dept_id_std','enc_item_id_mean','enc_item_id_std']

# Feature pickles
BASE     = base_pkl + 'grid_part_1.pkl'
PRICE    = base_pkl + 'grid_part_2.pkl'
# The calendar part is the holiday-augmented variant stored outside base_pkl
# (the plain base_pkl + 'grid_part_3.pkl' was used before).
CALENDAR = '/content/drive/Shared drives/MyShare/M5/Model_holiday/grid_part_3_holidays.pkl'
LAGS     = base_pkl + 'lags_df_28.pkl'
MEAN_ENC = base_pkl + 'mean_encoding_df.pkl'

# STORES_IDS is set in a later cell.

# Splits for lag creation
SHIFT_DAY  = 28
N_LAGS     = 15
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY + N_LAGS))   # 28 .. 42
# [shift, window] pairs for the recursive rolling means; comprehensions keep
# the loop variables out of the notebook namespace.
ROLS_SPLIT = [[shift, window] for shift in [1, 7, 14] for window in [7, 14, 30, 60]]

########################### Aux Models
# Training can be skipped in favour of pretrained models; in that case only a
# 2-round dummy training is run. USE_AUX is defined once here (it used to be
# set to True above and then overridden to False at this point).
USE_AUX = False                  # Use pretrained models or not
if USE_AUX:
    lgb_params['n_estimators'] = 2

In [0]:
# Mount Google Drive at /content/drive; every data and model path in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Stores iterated by the training and prediction loops.
STORES_IDS = ['CA_1']
# Last day number ('d') included in the training mask.
END_TRAIN = 1913

In [0]:
# !rm /content/drive/Shared\ drives/MyShare/M5/TEST_model/* -rf
pkl_bin_path = '/content/drive/Shared drives/MyShare/M5/yk/pkl_bin/'
########################### Train Models #################################################################################
for store_id in STORES_IDS:
    print('Train', store_id)

    # Feature grid for the current store.
    grid_df, features_columns = get_data_by_store(store_id)

    # Earlier experiments (an earthquake 'incident_type' feature and
    # look-ahead 'event_name_*_win' features built from calendar.csv) were
    # tried here and are disabled.

    # Per-store holiday-effect table. The file name now follows store_id
    # instead of being hard-coded to CA_1, so the loop works for any store.
    calendar_win = pd.read_csv(Stores_H_path + 'Holiday_Effect_' + store_id + '.csv')
    # 'd' arrives as strings such as 'd_1'; keep the number and store it as int16.
    calendar_win['d'] = calendar_win['d'].apply(lambda x: x.split('_',1)[1]).astype(np.int16)
    calendar_win['Holiday_Effect'] = calendar_win['Holiday_Effect'].astype('category')

    # Attach the holiday effect to the grid by day.
    grid_df = grid_df.merge(calendar_win, on='d', how='left')
    features_columns = features_columns + ['Holiday_Effect']

    train_mask = grid_df['d'] <= END_TRAIN                             # days up to END_TRAIN
    # NOTE(review): the validation rows are a subset of the training rows
    # (last P_HORIZON training days), so the reported rmse is in-sample.
    valid_mask = train_mask & (grid_df['d'] > (END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d'] > (END_TRAIN - 100)                      # tail kept for the recursive prediction loop

    train_data = lgb.Dataset(grid_df.loc[train_mask, features_columns], label = grid_df.loc[train_mask, TARGET])
    # Dump the training set in LightGBM's binary format, one file per store.
    train_data.save_binary(pkl_bin_path + 'train_data_' + store_id + '.bin')
    valid_data = lgb.Dataset(grid_df.loc[valid_mask, features_columns], label = grid_df.loc[valid_mask, TARGET])

    # Save the prediction slice without the '_tmp_' columns: those features
    # are recomputed recursively at prediction time.
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]

    # NOTE(review): get_base_test() reads test_<store>.pkl from the parent
    # '.../yk/' directory, not from pkl_bin_path. Confirm the intended location.
    grid_df.to_pickle(pkl_bin_path +'test_'+store_id+'.pkl')
    del grid_df

    seed_everything(SEED)  # re-seed so runs are comparable
    estimator = lgb.train(lgb_params, train_data, valid_sets = [valid_data], verbose_eval = 100)  # log every 100 rounds

    # Persist the trained model; the context manager closes the file handle
    # (it was previously left open). Read it back with open(model_name, 'rb').
    model_name = Model_path+'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    with open(model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    # Free memory before the next store. The author notes that a leftover
    # train_data_*.bin file from a previous run causes an error; remove it
    # manually before re-running.
    del train_data, valid_data, estimator
    gc.collect()

    MODEL_FEATURES = features_columns  # kept for the prediction cell
    print(MODEL_FEATURES)

Train CA_1
[100]	valid_0's rmse: 2.02733
[200]	valid_0's rmse: 2.00334
[300]	valid_0's rmse: 1.99298
[400]	valid_0's rmse: 1.98457
[500]	valid_0's rmse: 1.97814
[600]	valid_0's rmse: 1.97181
[700]	valid_0's rmse: 1.96635
[800]	valid_0's rmse: 1.9609
[900]	valid_0's rmse: 1.95654
[1000]	valid_0's rmse: 1.95149
[1100]	valid_0's rmse: 1.94612
[1200]	valid_0's rmse: 1.94165
[1300]	valid_0's rmse: 1.93671
[1400]	valid_0's rmse: 1.93218
['item_id', 'dept_id', 'cat_id', 'release', 'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean', 'price_norm', 'price_nunique', 'item_nunique', 'price_momentum', 'price_momentum_m', 'price_momentum_y', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'Holiday', 'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_wm', 'tm_dw', 'tm_w_end', 'enc_cat_id_mean', 'enc_cat_id_std', 'enc_dept_id_mean', 'enc_dept_id_std', 'enc_item_id_mean', 'enc_item_id_std', 'sales_lag_28', 'sales_lag_29', 'sales_lag_30', 'sales_lag_31', 

In [0]:
# Feature columns in the exact order printed by the training cell; the
# prediction cell selects columns with this list, so the order must not change.
MODEL_FEATURES = (
    [
        'item_id', 'dept_id', 'cat_id', 'release',
        'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean',
        'price_norm', 'price_nunique', 'item_nunique',
        'price_momentum', 'price_momentum_m', 'price_momentum_y',
        'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
        'snap_CA', 'snap_TX', 'snap_WI', 'Holiday',
        'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_wm', 'tm_dw', 'tm_w_end',
        'enc_cat_id_mean', 'enc_cat_id_std',
        'enc_dept_id_mean', 'enc_dept_id_std',
        'enc_item_id_mean', 'enc_item_id_std',
    ]
    # sales_lag_28 .. sales_lag_42
    + ['sales_lag_' + str(lag) for lag in range(28, 43)]
    # rolling_mean_<w>, rolling_std_<w> for each window
    + ['rolling_' + stat + '_' + str(window)
       for window in (7, 14, 30, 60, 180) for stat in ('mean', 'std')]
    # recursive rolling means rolling_mean_tmp_<shift>_<window>
    + ['rolling_mean_tmp_' + str(shift) + '_' + str(window)
       for shift in (1, 7, 14) for window in (7, 14, 30, 60)]
    + ['Holiday_Effect']
)

In [0]:
# Switch to the saved model for the prediction cell below, which reads USE_AUX
# when choosing the model path.
USE_AUX = True
# Last training day; prediction day k maps to d == END_TRAIN + k.
END_TRAIN = 1913

In [0]:
########################### Predict #################################################################################
#################################### 注意特征的摆放顺序不能随意更改  ###########################################
all_preds = pd.DataFrame()  # one row per id, one F<k> column per predicted day

# Tail of the training data plus the days to predict; the recursive rolling
# features are rebuilt from it every day.
base_test = get_base_test()

main_time = time.time()  # timer for the whole prediction run

# Load every store's model once instead of unpickling it again on each of the
# predicted days. Models are always read from Model_path, where the training
# cell saves them; previously the path was only defined when USE_AUX was True.
# This keeps all store models in memory at once.
# NOTE: pickle.load can execute arbitrary code - only load model files you trust.
estimators = {}
for store_id in STORES_IDS:
    model_bin = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    with open(Model_path + model_bin, 'rb') as model_file:
        estimators[store_id] = pickle.load(model_file)

for PREDICT_DAY in range(1, P_HORIZON + 1):
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Recompute the rolling lag features from base_test, which already holds
    # the predictions of the previous days. The widest feature is shift 14 /
    # window 60, which is why ~100 days of history are kept in base_test.
    grid_df = pd.concat([base_test, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)

    # Rows of the day being predicted (END_TRAIN+1 .. END_TRAIN+P_HORIZON).
    day_mask = base_test['d'] == (END_TRAIN+PREDICT_DAY)

    for store_id in STORES_IDS:
        # Rows of this store on this day.
        mask = day_mask & (base_test['store_id']==store_id)
        # Write through .loc: the former chained form base_test[TARGET][mask] = ...
        # may assign into a temporary and is a no-op under pandas Copy-on-Write.
        # The feature columns must be passed in training order (MODEL_FEATURES).
        base_test.loc[mask, TARGET] = estimators[store_id].predict(grid_df.loc[mask, MODEL_FEATURES])

    # Collect this day's predictions as column F<PREDICT_DAY>.
    temp_df = base_test.loc[day_mask, ['id', TARGET]].rename(columns={TARGET: 'F'+str(PREDICT_DAY)})
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()  # first day: start the table

    print('#'*10, '%0.2f min round |' % ((time.time() - start_time) / 60),
            '%0.2f min total |' % ((time.time() - main_time) / 60),
            '%0.2f day sales |' % (temp_df['F' + str(PREDICT_DAY)].sum()))  # total predicted sales of the day
    del temp_df

all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
########## 0.49 min round | 0.49 min total | 4116.56 day sales |
Predict | Day: 2
########## 0.48 min round | 0.97 min total | 3738.67 day sales |
Predict | Day: 3
########## 0.49 min round | 1.46 min total | 3663.38 day sales |
Predict | Day: 4
########## 0.48 min round | 1.94 min total | 3798.94 day sales |
Predict | Day: 5
########## 0.48 min round | 2.42 min total | 4526.33 day sales |
Predict | Day: 6
########## 0.46 min round | 2.89 min total | 6043.89 day sales |
Predict | Day: 7
########## 0.48 min round | 3.37 min total | 6808.56 day sales |
Predict | Day: 8
########## 0.48 min round | 3.85 min total | 4928.41 day sales |
Predict | Day: 9
########## 0.48 min round | 4.33 min total | 4609.27 day sales |
Predict | Day: 10
########## 0.48 min round | 4.81 min total | 4458.10 day sales |
Predict | Day: 11
########## 0.48 min round | 5.30 min total | 4290.37 day sales |
Predict | Day: 12
########## 0.48 min round | 5.78 min total | 4965.39 day sales |
Predict | Day

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.858026,0.736424,0.710501,0.750910,0.903622,1.030827,1.142052,0.870518,0.916098,0.822756,0.740306,0.817063,1.092705,0.797860,0.776322,0.824381,0.775560,0.756738,0.748712,1.062020,0.942604,0.846022,0.770023,0.785697,0.846538,0.848323,1.037180,0.998146
1,HOBBIES_1_002_CA_1_validation,0.213502,0.206127,0.171992,0.194366,0.207437,0.275446,0.274132,0.233109,0.225528,0.221375,0.220344,0.237391,0.327178,0.266936,0.193383,0.188430,0.175660,0.171889,0.195687,0.240199,0.233123,0.169337,0.164292,0.166675,0.172863,0.174989,0.250113,0.278019
2,HOBBIES_1_003_CA_1_validation,0.380922,0.380822,0.402524,0.383088,0.577067,0.705437,0.693148,0.478950,0.444713,0.460816,0.445194,0.542801,0.708476,0.572568,0.410054,0.429019,0.437200,0.442711,0.549182,0.699388,0.691388,0.496594,0.429684,0.430646,0.475944,0.572817,0.677177,0.676065
3,HOBBIES_1_004_CA_1_validation,1.564048,1.228647,1.279002,1.485199,1.962663,2.986018,3.189570,1.626414,1.416182,1.560751,1.564980,1.718888,2.852906,2.468059,1.675410,1.325957,1.325845,1.288900,1.911911,2.499243,3.245875,1.558003,1.403850,1.268063,1.333348,1.946371,2.992751,3.436288
4,HOBBIES_1_005_CA_1_validation,0.894509,0.836756,0.817500,0.843557,1.041383,1.496693,1.642327,0.978472,0.972878,0.980333,0.903330,1.046560,1.363357,1.117480,0.924529,0.995643,0.968533,0.962426,1.162740,1.450604,1.560491,1.006538,0.820391,0.873971,0.882312,1.094219,1.434185,1.464892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,FOODS_3_823_CA_1_validation,1.071945,0.904212,0.925663,0.951932,1.224675,1.477422,2.125632,1.394462,1.274183,1.197756,1.262224,1.446943,1.845973,1.397031,1.412849,1.238523,1.159745,1.145631,1.181510,1.571044,1.573944,1.180153,1.139459,1.079889,1.157656,1.330482,1.425382,1.454897
3045,FOODS_3_824_CA_1_validation,0.777027,0.728018,0.712062,0.684982,0.751586,0.966542,1.091016,0.951573,0.852642,0.828789,0.862087,0.838562,1.061265,0.781460,0.750874,0.774328,0.763552,0.705511,0.705186,0.785791,0.836114,0.657607,0.623024,0.677717,0.718749,0.705201,0.934587,0.985465
3046,FOODS_3_825_CA_1_validation,0.937415,0.777227,0.737070,0.695209,0.903903,1.227019,1.681742,1.107249,1.003291,1.095152,0.836842,1.097842,1.529910,1.158279,1.148889,1.092691,0.863092,0.835825,1.067846,1.288878,1.500102,1.017786,0.845272,0.773811,0.734452,0.917902,1.248705,1.260537
3047,FOODS_3_826_CA_1_validation,1.177371,1.021021,1.083338,1.101680,1.207514,1.649624,1.729284,1.545890,1.452185,1.341978,1.238678,1.636893,1.752969,1.160708,1.643339,1.331563,1.226184,1.203379,1.515954,1.747070,1.668864,1.513406,1.352802,1.272309,1.202869,1.362558,1.478327,1.572113


In [0]:
CA1 = all_preds # keep the predictions produced above under the name CA1
CA1.to_csv(Stores_path +'CA1.csv',index=False) # index=False: the row index is NOT written, so the CSV gets no extra leading index column

In [0]:
# Inspect My_pre.csv. The printed first column name shows the file
# carries a leftover index column, 'Unnamed: 0'.
My_test = pd.read_csv(base_m5 + 'My_pre.csv') 
print(My_test.columns[0])
print(My_test)
# Same table without the leftover index column.
a = My_test.drop(columns= ['Unnamed: 0'])
a

Unnamed: 0
       Unnamed: 0                             id  ...       F27       F28
0               0  HOBBIES_1_001_CA_1_validation  ...  1.106234  1.074874
1               1  HOBBIES_1_002_CA_1_validation  ...  0.297041  0.277692
2               2  HOBBIES_1_003_CA_1_validation  ...  0.686396  0.651168
3               3  HOBBIES_1_004_CA_1_validation  ...  2.854923  3.324280
4               4  HOBBIES_1_005_CA_1_validation  ...  1.547502  1.531163
...           ...                            ...  ...       ...       ...
30485       30485    FOODS_3_823_WI_3_validation  ...  0.427658  0.452884
30486       30486    FOODS_3_824_WI_3_validation  ...  0.324765  0.326271
30487       30487    FOODS_3_825_WI_3_validation  ...  0.855007  0.770190
30488       30488    FOODS_3_826_WI_3_validation  ...  1.076613  1.154267
30489       30489    FOODS_3_827_WI_3_validation  ...  2.046243  2.058829

[30490 rows x 30 columns]


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.841742,0.757436,0.728389,0.761665,1.000136,1.165551,1.208051,0.863785,0.957409,0.854942,0.791007,0.895135,1.075710,0.912531,0.849813,0.865339,0.797540,0.737261,0.867501,1.077549,1.094876,0.867376,0.856909,0.832007,0.831587,0.986544,1.106234,1.074874
1,HOBBIES_1_002_CA_1_validation,0.203211,0.189806,0.173399,0.199822,0.230208,0.313719,0.312664,0.251374,0.229473,0.189454,0.183435,0.220875,0.266777,0.252461,0.236547,0.203273,0.205744,0.213234,0.227807,0.287858,0.289828,0.218054,0.202842,0.193763,0.194609,0.207611,0.297041,0.277692
2,HOBBIES_1_003_CA_1_validation,0.454542,0.445043,0.445441,0.441526,0.577491,0.763762,0.703281,0.472421,0.453774,0.482746,0.452602,0.537261,0.711261,0.587494,0.439546,0.441157,0.389276,0.436310,0.528160,0.681939,0.653393,0.475834,0.434501,0.428781,0.449918,0.568355,0.686396,0.651168
3,HOBBIES_1_004_CA_1_validation,1.585677,1.316821,1.288205,1.562391,1.993648,3.072055,3.293523,1.607361,1.322533,1.328573,1.584307,1.706179,2.863095,2.423045,1.571796,1.453240,1.351362,1.423681,1.923382,2.477563,3.110499,1.612639,1.451468,1.308784,1.395521,1.843039,2.854923,3.324280
4,HOBBIES_1_005_CA_1_validation,0.880313,0.749764,0.771451,0.849284,1.119681,1.400065,1.621155,0.945945,0.923281,0.983670,0.897931,1.002446,1.462214,1.067597,0.938336,0.962994,0.848742,0.914662,1.034251,1.449809,1.501207,0.963205,0.830177,0.876086,0.870730,1.103279,1.547502,1.531163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,0.343249,0.337044,0.312435,0.305590,0.369307,0.439830,0.390787,0.440919,0.429687,0.325614,0.441604,0.429158,0.526028,0.581849,0.475181,0.375862,0.475694,0.458055,0.448120,0.542798,0.605173,0.451962,0.401201,0.353336,0.340917,0.379655,0.427658,0.452884
30486,FOODS_3_824_WI_3_validation,0.304118,0.271227,0.249701,0.248959,0.296205,0.351925,0.316012,0.395658,0.392394,0.292009,0.343717,0.354665,0.416399,0.420076,0.410827,0.334240,0.369710,0.364893,0.333732,0.493928,0.525711,0.361954,0.291856,0.256410,0.241357,0.239221,0.324765,0.326271
30487,FOODS_3_825_WI_3_validation,0.634179,0.526229,0.479149,0.489881,0.620834,0.704025,0.807710,1.195724,1.164279,0.783981,1.082302,1.254445,1.167874,1.212325,1.310124,0.806256,1.155137,1.193137,0.890896,1.283364,1.512414,1.007699,0.734683,0.753961,0.666271,0.680109,0.855007,0.770190
30488,FOODS_3_826_WI_3_validation,0.918011,0.905366,0.792682,0.828906,0.983269,1.187842,1.036656,1.132715,1.153527,0.863996,1.200988,1.077779,1.245104,1.204511,1.174640,0.916391,1.027401,1.083202,0.961620,1.354991,1.374730,0.974573,0.873504,0.830928,0.777575,0.926294,1.076613,1.154267


## 本地验证

In [0]:
CA1 = pd.read_csv(Stores_path +'CA1.csv')

# My_pre.csv was saved with its index, so it carries an 'Unnamed: 0' column;
# keep only the id and the 28 forecast columns.
My_test = pd.read_csv(base_m5 + 'My_pre.csv')
My_test_28 = My_test[['id'] + [f'F{i}' for i in range(1,29)]]

store = ['CA_1','CA_2','CA_3','CA_4','TX_1','TX_2','TX_3','WI_1','WI_2','WI_3']

# CA_1 comes from the freshly trained model (CA1.csv); every other store is
# taken from My_pre.csv, in the order of `store`. One comprehension replaces
# the ten copy-pasted per-store filters.
other_store_preds = [
    My_test_28[My_test_28.id.str.endswith(store_id + '_validation')].copy()
    for store_id in store
    if store_id != 'CA_1'
]

# Stack the stores and renumber the rows so the original indices do not matter.
all_preds1 = pd.concat([CA1] + other_store_preds, axis=0).reset_index(drop = True)
all_preds1

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.858026,0.736424,0.710501,0.750910,0.903622,1.030827,1.142052,0.870518,0.916098,0.822756,0.740306,0.817063,1.092705,0.797860,0.776322,0.824381,0.775560,0.756738,0.748712,1.062020,0.942604,0.846022,0.770023,0.785697,0.846538,0.848323,1.037180,0.998146
1,HOBBIES_1_002_CA_1_validation,0.213502,0.206127,0.171992,0.194366,0.207437,0.275446,0.274132,0.233109,0.225528,0.221375,0.220344,0.237391,0.327178,0.266936,0.193383,0.188430,0.175660,0.171889,0.195687,0.240199,0.233123,0.169337,0.164292,0.166675,0.172863,0.174989,0.250113,0.278019
2,HOBBIES_1_003_CA_1_validation,0.380922,0.380822,0.402524,0.383088,0.577067,0.705437,0.693148,0.478950,0.444713,0.460816,0.445194,0.542801,0.708476,0.572568,0.410054,0.429019,0.437200,0.442711,0.549182,0.699388,0.691388,0.496594,0.429684,0.430646,0.475944,0.572817,0.677177,0.676065
3,HOBBIES_1_004_CA_1_validation,1.564048,1.228647,1.279002,1.485199,1.962663,2.986018,3.189570,1.626414,1.416182,1.560751,1.564980,1.718888,2.852906,2.468059,1.675410,1.325957,1.325845,1.288900,1.911911,2.499243,3.245875,1.558003,1.403850,1.268063,1.333348,1.946371,2.992751,3.436288
4,HOBBIES_1_005_CA_1_validation,0.894509,0.836756,0.817500,0.843557,1.041383,1.496693,1.642327,0.978472,0.972878,0.980333,0.903330,1.046560,1.363357,1.117480,0.924529,0.995643,0.968533,0.962426,1.162740,1.450604,1.560491,1.006538,0.820391,0.873971,0.882312,1.094219,1.434185,1.464892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,0.343249,0.337044,0.312435,0.305590,0.369307,0.439830,0.390787,0.440919,0.429687,0.325614,0.441604,0.429158,0.526028,0.581849,0.475181,0.375862,0.475694,0.458055,0.448120,0.542798,0.605173,0.451962,0.401201,0.353336,0.340917,0.379655,0.427658,0.452884
30486,FOODS_3_824_WI_3_validation,0.304118,0.271227,0.249701,0.248959,0.296205,0.351925,0.316012,0.395658,0.392394,0.292009,0.343717,0.354665,0.416399,0.420076,0.410827,0.334240,0.369710,0.364893,0.333732,0.493928,0.525711,0.361954,0.291856,0.256410,0.241357,0.239221,0.324765,0.326271
30487,FOODS_3_825_WI_3_validation,0.634179,0.526229,0.479149,0.489881,0.620834,0.704025,0.807710,1.195724,1.164279,0.783981,1.082302,1.254445,1.167874,1.212325,1.310124,0.806256,1.155137,1.193137,0.890896,1.283364,1.512414,1.007699,0.734683,0.753961,0.666271,0.680109,0.855007,0.770190
30488,FOODS_3_826_WI_3_validation,0.918011,0.905366,0.792682,0.828906,0.983269,1.187842,1.036656,1.132715,1.153527,0.863996,1.200988,1.077779,1.245104,1.204511,1.174640,0.916391,1.027401,1.083202,0.961620,1.354991,1.374730,0.974573,0.873504,0.830928,0.777575,0.926294,1.076613,1.154267


In [0]:
################################## 本地wrmse #############################################################
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
import gc

# 加载前面预先计算好的各个权重
# Precomputed inputs of the metric: per-series arrays s, w, sw and the
# roll-up matrix, all read from pickles in file_pass.
file_pass = '/content/drive/Shared drives/MyShare/M5/Metric/'

sw_df = pd.read_pickle(file_pass+'sw_df.pkl')
S, W, SW = sw_df['s'].values, sw_df['w'].values, sw_df['sw'].values

roll_mat_df = pd.read_pickle(file_pass+'roll_mat_df.pkl')
roll_index = roll_mat_df.index
# Sparse copy of the roll-up matrix, used by rollup().
roll_mat_csr = csr_matrix(roll_mat_df.values)


def rollup(v):
    """Multiply ``v`` by the global roll-up matrix ``roll_mat_csr``.

    :param v: 2-D array whose first axis lines up with the columns of
        ``roll_mat_csr``.
    :return: the product, with one row per row of ``roll_mat_csr``.
    """
    # Work on the transposed operands, then transpose back.
    rolled_transposed = v.T * roll_mat_csr.T
    return rolled_transposed.T

# 计算 WRMSSE 评估指标
def wrmsse(preds, y_true, score_only=False, s = S, w = W, sw=SW):
    '''
    Score predictions against true values on the rolled-up series.

    preds  - predictions: pd.DataFrame (per the original notes, 30490 rows x N day columns)
    y_true - true values: pd.DataFrame of the same shape, rows in the same order
    score_only - when True return only the scalar score
    s, w, sw - per-series arrays (per the original notes, length 42840);
               ``sw`` multiplies the per-series RMSE on the score_only path,
               while ``w`` (squared) and ``s`` weight the squared errors otherwise

    Returns the score, or ``(score, score_matrix)`` when score_only is False.
    '''
    # Squared error of every rolled-up series for every day.
    squared_error = np.square(rollup(preds.values-y_true.values))

    if score_only:
        series_rmse = np.sqrt(np.mean(squared_error, axis=1))
        # 1/12: presumably averages over the 12 M5 aggregation levels - TODO confirm.
        return np.sum(series_rmse * sw)*(1/12)

    score_matrix = (squared_error * np.square(w)[:, None])  / s[:, None]
    score = np.sum(np.sqrt(np.mean(score_matrix,axis=1)))*(1/12)
    return score, score_matrix

In [0]:
## 预测值
cols = [f'F{i}' for i in range(1,29)]

## Ground truth: days 1914..1941 of the evaluation file
true = pd.read_csv(base_m5 + 'sales_train_evaluation.csv')
col2 = [f'd_{i}' for i in range(1914, 1942)]
true_lable_1 = true[col2]

## Predictions, put in the same row order as the ground truth.
# The two tables used to be paired purely by row position, which silently
# gives a wrong score if the stacked predictions are ordered differently.
# Prediction ids end in '_validation', so a possible '_evaluation' suffix
# in the truth file is mapped before aligning.
expected_ids = true['id'].str.replace('_evaluation', '_validation', regex=False)
pred = all_preds1.set_index('id')[cols].reindex(expected_ids)  # forecast columns only
if pred.isna().any().any():
    raise ValueError('predictions are missing (or NaN) for some ids of the evaluation file')

## Score (CA_1 retrained with the extra holiday-effect feature); arguments in
## the order of the signature: predictions first, true values second.
wrmsse(pred, true_lable_1, score_only=True)

0.4721442767751499

In [0]:
# cp /content/drive/Shared\ drives/MyShare/M5/Ten_stores/CA3.ipynb /content/drive/Shared\ drives/MyShare/M5/Ten_stores/CA4.ipynb -rf 
# # Linux cp命令主要用于复制文件或目录。