In [22]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [101]:
import numpy as np
import pandas as pd
from pathlib import Path

import sys
sys.path.append('../src')

import matplotlib.pyplot as plt
from tqdm import tqdm

import utils
from dataset import M5Dataset
import preprocessing
import features
from evaluater import WRMSSEEvaluator
import metrics
from runner import Runner

In [30]:
def reduce_mem_usage_without_date(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    excludes = ['date']
    for col in [col for col in df.columns if col not in ['date']]:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
X = utils.load_pickle(utils.FEATURE_DIR / 'baseline_features.pkl')

In [None]:
X = reduce_mem_usage_without_date(X)

In [4]:
model_config = utils.load_yaml(utils.CONFIG_DIR / 'model_configs' / '000_lgbm_baseline.yml')

In [74]:
feature_config = utils.load_yaml(utils.CONFIG_DIR / 'feature_configs' / '003_features_list.yml')

In [6]:
fold_indices = utils.load_pickle(utils.FEATURE_DIR / 'fold_indices.pkl')

In [75]:
all_features = feature_config['features']['original'] + \
        feature_config['features']['generated']
TARGET_COL = 'demand'

In [23]:
result_dir = Path('../results/20200524013848')

In [33]:
X_train = X[(X['date'] <= '2016-04-24')]
X_test = X[(X['date'] > '2016-04-24')]

In [24]:
runner = Runner(run_name='train_cv',
                x=X_train[all_features],
                y=X_train[TARGET_COL],
                model_cls=model_config['model_class'],
                params=model_config,
                metrics=metrics.rmse,
                save_dir=result_dir,
                fold_indices=fold_indices
                )

In [34]:
preds = runner.run_predict_all(X_test[all_features])

In [94]:
X.query('date >= "2016-4-25"')['date'].unique()

array(['2016-04-25T00:00:00.000000000', '2016-04-26T00:00:00.000000000',
       '2016-04-27T00:00:00.000000000', '2016-04-28T00:00:00.000000000',
       '2016-04-29T00:00:00.000000000', '2016-04-30T00:00:00.000000000',
       '2016-05-01T00:00:00.000000000', '2016-05-02T00:00:00.000000000',
       '2016-05-03T00:00:00.000000000', '2016-05-04T00:00:00.000000000',
       '2016-05-05T00:00:00.000000000', '2016-05-06T00:00:00.000000000',
       '2016-05-07T00:00:00.000000000', '2016-05-08T00:00:00.000000000',
       '2016-05-09T00:00:00.000000000', '2016-05-10T00:00:00.000000000',
       '2016-05-11T00:00:00.000000000', '2016-05-12T00:00:00.000000000',
       '2016-05-13T00:00:00.000000000', '2016-05-14T00:00:00.000000000',
       '2016-05-15T00:00:00.000000000', '2016-05-16T00:00:00.000000000',
       '2016-05-17T00:00:00.000000000', '2016-05-18T00:00:00.000000000',
       '2016-05-19T00:00:00.000000000', '2016-05-20T00:00:00.000000000',
       '2016-05-21T00:00:00.000000000', '2016-05-22

In [96]:
test_dates = ['2016-04-25', '2016-04-26', '2016-04-27', '2016-04-28',
              '2016-04-29', '2016-04-30', '2016-05-01', '2016-05-02',
              '2016-05-03', '2016-05-04', '2016-05-05', '2016-05-06',
              '2016-05-07', '2016-05-08',
               '2016-05-09', '2016-05-10','2016-05-11', '2016-05-12',
               '2016-05-13', '2016-05-14', '2016-05-15', '2016-05-16',
               '2016-05-17', '2016-05-18', '2016-05-19', '2016-05-20', '2016-05-21', '2016-05-22']

In [103]:
agg_init_date = '2015-10-01'
# init_test_date = '2016-04-25'

In [98]:
org_feature = feature_config['features']['original'] + ['id', 'all_id', 'date', 'demand']

In [105]:
tmp_df = X[agg_init_date < X['date']]

In [None]:
for test_date in tqdm(test_dates):
    pred_tmp_df = tmp_df[tmp_df['date']<= test_date][org_feature].copy()
    tmp_feat = features.generate_features(pred_tmp_df)
    pred_tmp_df = pd.concat([pred_tmp_df, tmp_feat], axis=1)
    preds = runner.run_predict_all(pred_tmp_df[pred_tmp_df['date'] == test_date][all_features])
#     tmp_df[tmp_df['date']==test_date]['demand'] = preds
    row_indexer = tmp_df[tmp_df['date']==test_date].index
    tmp_df.loc[row_indexer, 'demand'] = preds

  0%|          | 0/28 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:13<00:26, 13.29s/it][A
 67%|██████▋   | 2/3 [00:28<00:13, 13.73s/it][A
100%|██████████| 3/3 [00:42<00:00, 14.07s/it]

  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [01:00<02:00, 60.16s/it][A
 67%|██████▋   | 2/3 [02:04<01:01, 61.54s/it][A
100%|██████████| 3/3 [03:17<00:00, 65.91s/it]

  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [02:53<08:39, 173.04s/it][A
 50%|█████     | 2/4 [05:53<05:50, 175.19s/it][A
 75%|███████▌  | 3/4 [09:08<03:01, 181.17s/it][A
100%|██████████| 4/4 [12:41<00:00, 190.40s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
  4%|▎         | 1/28 [18:55<8:30:46, 1135.06s/it]
  0%|          |

In [79]:
tmp_feat = features.generate_features(tmp_df)

100%|██████████| 3/3 [00:52<00:00, 17.38s/it]
100%|██████████| 3/3 [04:40<00:00, 93.64s/it]
100%|██████████| 4/4 [16:35<00:00, 248.79s/it]


In [80]:
tmp_df = pd.concat([tmp_df, tmp_feat], axis=1)

In [85]:
preds = runner.run_predict_all(tmp_df[tmp_df['date'] == '2016-4-24'][all_features])

In [91]:
tmp_df[tmp_df['date']=='2016-4-24']['demand'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [93]:
row_indexer = tmp_df[tmp_df['date']=='2016-4-24'].index
tmp_df.loc[row_indexer, 'demand'] = preds