## 回帰モデルによる学習と予測
実行にあたって入力が必須の項目は、該当セルの冒頭と、該当行の上または右側に `***入力必須***` と記載している  
入力必須項目が存在する部分は "# =========" で囲っている

### 1. ライブラリの読み込み

In [6]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import datetime
import sys
import re
import gc
import glob
from tqdm import tqdm

import os
HOME = os.path.expanduser('~')
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
# 下記は堀越の自分用便利ライブラリなのでググっても出てこない
import utils
from utils import logger_func, get_categorical_features, get_numeric_features, parallel_process
sys.path.append(f'{HOME}/kaggle/data_analysis')
from model.lightgbm_ex import lightgbm_ex as lgb_ex
from model.xray_wrapper import Xray_Cal
# Create the logger only on the first run of this cell; on re-runs the
# existing `logger` object in the notebook namespace is reused.
if 'logger' not in globals():
    logger = logger_func()

# Use fully-qualified option names. The short forms ('max_columns',
# 'max_rows') are resolved by pattern matching and raise an OptionError when
# more than one pandas option matches the pattern.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

2018-12-14 02:27:06,603 utils 366 [INFO]    [logger_func] start 


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 2. データセットの読み込み
カラム名に日本語を使うのは基本NG（エラーにはならないが、X-Rayの挙動がおかしくなる可能性がある）

In [7]:
# ***INPUT REQUIRED (入力必須)***

#========================================================================
# Name of the model to build * choose something that identifies the model, e.g. the SKU name
model_name = 'asahi_granmild' # ***INPUT REQUIRED (入力必須)***
# Load the dataset
df = pd.read_csv('../input/1213_142_sku_granmild_350_1_syoken_2.csv') # ***INPUT REQUIRED (入力必須)***
#========================================================================

#### データセットの確認

In [8]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,store_name,distributor_code,store_business_type,weekly_date,target_weekly_date,latest_week_no,quantity_latest_year,house_size_49_under_ratio,house_size_50_100_ratio,house_size_100_over_ratio,woman_20s_ratio,woman_30s_ratio,woman_40s_ratio,woman_50s_ratio,woman_60s_ratio,household_1_ratio,household_2_ratio,household_3_ratio,household_4_ratio,household_5_ratio,household_6_over_ratio,week_no,holiday_num,goods_name,quantity_week
0,ダイエー 月島店,51020702,(空白),2018-02-05,2018-05-07,21,118639.0,0.389936,0.553252,0.056812,0.135548,0.272773,0.281161,0.170651,0.139867,0.48274,0.267231,0.146408,0.083799,0.016591,0.003231,6,2,ｱｻﾋ ｸﾞﾗﾝﾏｲﾙﾄﾞ C350ML,19
1,ダイエー 月島店,51020702,(空白),2018-01-29,2018-04-30,22,118831.0,0.389936,0.553252,0.056812,0.135548,0.272773,0.281161,0.170651,0.139867,0.48274,0.267231,0.146408,0.083799,0.016591,0.003231,5,2,ｱｻﾋ ｸﾞﾗﾝﾏｲﾙﾄﾞ C350ML,16
2,ダイエー 月島店,51020702,(空白),2018-01-22,2018-04-23,23,118777.0,0.389936,0.553252,0.056812,0.135548,0.272773,0.281161,0.170651,0.139867,0.48274,0.267231,0.146408,0.083799,0.016591,0.003231,4,2,ｱｻﾋ ｸﾞﾗﾝﾏｲﾙﾄﾞ C350ML,33
3,ダイエー 月島店,51020702,(空白),2018-01-15,2018-04-16,24,118815.0,0.389936,0.553252,0.056812,0.135548,0.272773,0.281161,0.170651,0.139867,0.48274,0.267231,0.146408,0.083799,0.016591,0.003231,3,2,ｱｻﾋ ｸﾞﾗﾝﾏｲﾙﾄﾞ C350ML,4
4,ダイエー 武蔵村山店,51520190,(空白),2018-01-15,2018-04-16,24,142828.0,0.316473,0.547294,0.136233,0.134099,0.207807,0.224569,0.173823,0.259701,0.302873,0.299789,0.20013,0.149651,0.036195,0.011362,3,2,ｱｻﾋ ｸﾞﾗﾝﾏｲﾙﾄﾞ C350ML,30


### 3. 使用するカラムの選択
手調整しやすいように、べた打ちできるようにしておく  
使用する説明変数のカラム名を指定するのではなく, 学習に含めないカラム名をリストに入れる

In [31]:
# Show the dataset's column names
print(df.columns)

Index(['store_name', 'distributor_code', 'store_business_type', 'weekly_date',
       'target_weekly_date', 'latest_week_no', 'quantity_latest_year',
       'house_size_49_under_ratio', 'house_size_50_100_ratio',
       'house_size_100_over_ratio', 'woman_20s_ratio', 'woman_30s_ratio',
       'woman_40s_ratio', 'woman_50s_ratio', 'woman_60s_ratio',
       'household_1_ratio', 'household_2_ratio', 'household_3_ratio',
       'household_4_ratio', 'household_5_ratio', 'household_6_over_ratio',
       'week_no', 'holiday_num', 'goods_name', 'quantity_week'],
      dtype='object')


In [32]:
# ***INPUT REQUIRED (入力必須)***

#========================================================================
# Column name of the target variable
target = 'quantity_week' # ***INPUT REQUIRED (入力必須)***
#========================================================================


#========================================================================
# Column used to partition the data during training
# -> to be changed to a concatenated chain_store_code + distributor_code column
group_col_name='store_name' # ***INPUT REQUIRED (入力必須)***
#========================================================================

#========================================================================
# Columns that must NOT be used for training.
# Listing a column name that is absent from the dataset does not cause an error.
# ***Any column not listed here WILL be used as a feature, so be careful***
# ***INPUT REQUIRED (入力必須)***
ignore_list = [
    target,
    'store_name',
    'distributor_code',
#     'store_business_type',
    'weekly_date',
    'target_weekly_date',
    'latest_week_no',
    'goods_name'
]
#========================================================================

# Every dataset column that is not explicitly ignored becomes a feature.
feature_list = [f for f in df.columns if f not in ignore_list]

# Print the selection as a 1-based numbered list. A plain loop is used
# because the prints are side effects, not values to collect.
print("説明変数:")
for feat_no, feat_name in enumerate(feature_list, start=1):
    print(f"{feat_no}: {feat_name}")
print(f"目的変数: {target}")

説明変数:
1: store_business_type
2: quantity_latest_year
3: house_size_49_under_ratio
4: house_size_50_100_ratio
5: house_size_100_over_ratio
6: woman_20s_ratio
7: woman_30s_ratio
8: woman_40s_ratio
9: woman_50s_ratio
10: woman_60s_ratio
11: household_1_ratio
12: household_2_ratio
13: household_3_ratio
14: household_4_ratio
15: household_5_ratio
16: household_6_over_ratio
17: week_no
18: holiday_num
目的変数: quantity_week


### 4. 機械学習モデルのセットアップ

#### メモ
ここには色んな引数があるが、基本はDataRobotにある高度なオプション的な位置づけ。  
必須の設定は全てしてあるので、いじらなくても問題ないはず。

In [33]:
# Timestamp of this run (YYYYMMDD_HHMMSS); slices of it are used in output file names.
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

# LGBM Args
seed = 1212
model_type = 'lgb'
# 0: R^2 score, 1: RMSE
# NOTE(review): in plain LightGBM 'l2' denotes mean squared error; the R^2
# reading presumably comes from how lgb_ex interprets it - confirm.
metric = ['l2', 'rmse'][0]
early_stopping_rounds = 100
num_boost_round = 3000
learning_rate = 0.02
params = {
    'num_threads': -1,
    'num_leaves': 31,
    'objective':'regression',
    "boosting": "gbdt",
    'max_depth': -1,
    "min_child_samples": 20,
    "bagging_freq": 1,
    "subsample": 0.9 ,
    "colsample_bytree": 0.9,
    "lambda_l1": 0.1,
    "verbosity": -1,
    'random_seed': seed,
    'bagging_seed':seed,
    'feature_fraction_seed':seed,
    'data_random_seed':seed
}
# The key must be the parameter NAME. Using the variable itself as the key
# would store the value under the float 0.02 and leave 'learning_rate' unset.
params['learning_rate'] = learning_rate
fold=5          # number of cross-validation folds
fold_type='group'  # passed to lgb_ex together with group_col_name
dummie=0
oof_flg=False   # forwarded to cross_prediction

### 5. モデルの学習と予測結果の出力
#### 出力される結果の見方
出力される内容は堀越の自分用ライブラリによるものなので、中身はいじれないと思っておいた方がよい  
CV score -> 交差検証の結果　※引数のmetricで指定した指標に応じる（今回はR2かRMSEのみに対応）  
Validation No ~ : metric (score) -> 各validation(検証)でのスコア. DataRobotでいうところの交差検証の左側に出ているスコアの1つ(DataRobotはこれが1つしか表示されない。ちなみに交差検証スコアはこの各validationスコアの平均)

In [34]:
# ***INPUT REQUIRED (入力必須)***

#========================================================================
# To also output predictions for the SKU, assign the dataset to `test` below
# (its column names must match the training dataset exactly).
# To stop after training and validation (accuracy check only), set test=[]
# ***One of the two is required. Comment out the other one***
test = [] # no prediction
# test = pd.read_csv('../input/~.csv') # dataset to predict on
#========================================================================

# `key` argument shared by cross_validation and cross_prediction. It is
# defined here so the prediction branch no longer hits an undefined name.
# NOTE(review): '' is the value the validation-only path passes; confirm
# whether cross_prediction expects an ID column name instead.
key = ''

# Restrict the dataset to the columns used for training.
# Keep the column order of the train and test sets aligned.
if group_col_name in feature_list:
    train = df[feature_list+[target]]
else:
    # The group column is not a feature but is still needed for fold splitting.
    train = df[feature_list+[group_col_name, target]]
if len(test)>0:
    test = test[feature_list]

# Prefix columns with a running number so that column sorting stays consistent
# train.columns = [ f"{100+i}_{col}" if col not in ignore_list else col  for i, col in enumerate(train.columns)]
# if len(test)>0:
#     test.columns = [ f"{100+i}_{col}" if col not in ignore_list else col  for i, col in enumerate(test.columns)]

# Get the LightGBM wrapper object (project-local class from model.lightgbm_ex)
LGBM = lgb_ex(logger=logger, metric=metric, model_type=model_type, ignore_list=ignore_list, seed=seed)

# Prepare for categorical encoding: remember the raw values of each
# categorical column before data_check replaces them with labels.
# NOTE(review): the captured log of this cell lists numeric-looking columns
# (e.g. quantity_latest_year, *_ratio) as CATEGORICAL - check the dtypes of df.
category_map_dict = {}
cat_feature_list = get_categorical_features(df=train, ignore_list=ignore_list)
for cat_feat in cat_feature_list:
    category_map_dict[cat_feat] = train[cat_feat].to_frame()

# Data check -> encodes categorical variables and drops zero-variance features
train, _, _ = LGBM.data_check(train=train, test=[], target=target, encode='ordinal')

# Build, per categorical feature, a dict that decodes label -> original value
for cat_feat in cat_feature_list:
    category_map_dict[cat_feat]['label'] = train[cat_feat]
    category_map_dict[cat_feat] = category_map_dict[cat_feat].drop_duplicates().set_index('label').to_dict()[cat_feat]

if len(test)==0:
    # Training and validation only; no prediction
    LGBM = LGBM.cross_validation(
        train=train
        ,key=key
        ,target=target
        ,fold_type=fold_type
        ,fold=fold
        ,group_col_name=group_col_name
        ,params=params
        ,num_boost_round=num_boost_round
        ,early_stopping_rounds=early_stopping_rounds
    )
else:
    # Train, then feed the test data to the fitted models to get predictions
    # NOTE(review): fit_transform re-fits on test; if LGBM.decoder is the
    # encoder fitted on train by data_check, test labels may not line up with
    # train labels - confirm against the lgb_ex implementation.
    test = LGBM.decoder.fit_transform(test)
    LGBM = LGBM.cross_prediction(
        train=train
        ,test=test
        ,key=key
        ,target=target
        ,fold_type=fold_type
        ,fold=fold
        ,group_col_name=group_col_name
        ,params=params
        ,num_boost_round=num_boost_round
        ,early_stopping_rounds=early_stopping_rounds
        ,oof_flg=oof_flg
    )

# Save the cross-validated feature importance; the file name records the run
# time, model name/type, number of features used, CV score and learning rate.
LGBM.cv_feim.to_csv(f'../valid/{start_time[4:12]}_{model_name}_{model_type}_feat{len(LGBM.use_cols)}_CV{LGBM.cv_score}_lr{learning_rate}.csv', index=False)

2018-12-14 03:37:33,370 utils 213 [INFO]    [data_check] 
# DATA CHECK START
# CATEGORICAL FEATURE: ['store_business_type', 'quantity_latest_year', 'house_size_49_under_ratio', 'house_size_50_100_ratio', 'house_size_100_over_ratio', 'woman_20s_ratio', 'woman_30s_ratio', 'woman_40s_ratio', 'woman_50s_ratio', 'woman_60s_ratio', 'household_1_ratio', 'household_2_ratio', 'household_3_ratio', 'household_4_ratio', 'household_5_ratio', 'household_6_over_ratio']
# DATETIME FEATURE   : []
# CAT ENCODE         : ordinal
# ignore_list        : ['quantity_week', 'store_name', 'distributor_code', 'weekly_date', 'target_weekly_date', 'latest_week_no', 'goods_name']
         


KeyboardInterrupt: 

In [15]:
# %load_ext autoreload
# %autoreload 2
# from model.xray_wrapper import Xray_Cal
xray = Xray_Cal(logger=logger, ignore_list=ignore_list)

# Compute the X-ray per fold and join the per-fold results side by side.
# Only the first two folds are processed (range(2) bounds the zip).
xray_result = pd.DataFrame()
for fold_num, (trn_idx, val_idx) in zip(tqdm(range(2)), list(LGBM.kfold)):
    xray.model = LGBM.fold_model_list[fold_num]
    fold_train = train[LGBM.use_cols].iloc[trn_idx, :]
    xray, tmp_result = xray.get_xray(base_xray=fold_train, fold_num=fold_num, parallel=False)
    # Give this fold's value column a distinct name so the folds can be merged.
    tmp_result.rename(columns={'xray' : f'xray_{fold_num}'}, inplace=True)

    if len(xray_result) == 0:
        # Nothing accumulated yet: this fold's result is the starting point.
        xray_result = tmp_result
    else:
        xray_result = xray_result.merge(tmp_result, how='inner', on=['N', 'feature', 'value'])

# Average the X-ray over folds (this is the main quantity to visualise)
xray_cols = [col for col in xray_result.columns if 'xray' in col]
xray_result['xray_avg'] = xray_result[xray_cols].mean(axis=1)

# Restore the original values of the encoded categorical features
for cat_feat in cat_feature_list:
    decoded_col = f"{cat_feat}_x"
    xray_result[decoded_col] = None
    is_this_feature = xray_result.feature == cat_feat
    xray_result.loc[is_this_feature, decoded_col] = (
        xray_result.loc[is_this_feature, 'value']
        .map(lambda x: category_map_dict[cat_feat][int(x)])
    )

  0%|          | 0/2 [00:00<?, ?it/s]2018-12-14 02:28:46,997 utils 182 [INFO]    [get_xray] FOLD: 0 
 50%|█████     | 1/2 [00:59<00:59, 59.88s/it]2018-12-14 02:29:43,577 utils 182 [INFO]    [get_xray] FOLD: 1 
100%|██████████| 2/2 [01:56<00:00, 58.89s/it]


In [21]:
# Normalise the feature importance and export it together with the X-ray result.
# cv_feim is bound to the object held in LGBM.cv_feim, so the columns are added on it in place.
cv_feim = LGBM.cv_feim
cv_feim['goods_name'] = df['goods_name'].values[0]
# Scale so that the most important feature has importance 1.
cv_feim['avg_importance_norm'] = cv_feim['avg_importance'].div(cv_feim['avg_importance'].max())
viz_result = pd.merge(xray_result, LGBM.cv_feim, how='inner', on='feature')
output_path = f'../output/{start_time[4:12]}_asahi_{model_name}.csv'
viz_result.to_csv(output_path, index=False)

In [None]:
# NOTE(review): `result_stack` and `category_map` are not defined in any of
# the visible cells; this cell looks carried over from another notebook -
# confirm before running.
def _decode_label(encoded):
    """Look up the original category for an encoded label (map keys are offset by +1)."""
    return category_map[encoded + 1]

# Decode the actual and the predicted labels.
for _label_col in (target, 'prediction'):
    result_stack[_label_col] = result_stack[_label_col].map(_decode_label)

# Column labels whose type name contains 'int' are decoded too; others are kept.
_decoded_columns = []
for col in result_stack.columns:
    if str(type(col)).count('int'):
        _decoded_columns.append(category_map[int(col)+1])
    else:
        _decoded_columns.append(col)
result_stack.columns = _decoded_columns

result_stack.to_csv(f'../output/{start_time[:11]}_jrw_result_stack_CV{LGBM.cv_score}.csv', index=False)