# M5 Forecasting

## Google Colab Setup
Commands to mount Google Drive and copy the project data from Drive into the Colab environment.

In [None]:
# Mount Google Drive under /content/drive so the files stored there can be
# reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# Copy every pickle file from the Drive "m5" folder into /content.
!cp /content/drive/My\ Drive/m5/*.pickle /content/

# Feature Engineering
Ideas:
- Calculate daily sales in USD (the unit-sales column in the store data is `sold`):
  `data['sale_usd'] = data['sold'] * data['sell_price']`

# Library
Functions and classes required for the modeling pipeline

## Save Model
Save model to disk, used by the logger to create checkpoints. 

Important because we will be running many notebook kernels in parallel.

In [4]:
import pickle

def save_model(model, fname):
    """Serialize a model to disk with pickle.

    Args:
        model (serializable): Any picklable object to save.
        fname (str): Path of the file to write; opened in 'wb' mode, so an
            existing file is overwritten.
    Returns:
        None
    """
    # The context manager closes the handle even when pickling raises, so the
    # bytes are flushed to disk before this function returns.
    with open(fname, 'wb') as fh:
        pickle.dump(model, fh)

## Load Store Data
Load all of the data associated with the given store number.

In [7]:
import pandas as pd
def load_store(store_id):
  """Read the pickled data for one store.

  The file is looked up relative to the current working directory (the
  project root) under the name ``StoreID_<store_id>.pickle``.

  Args:
      store_id: Identifier interpolated into the file name.
  Returns:
      Whatever object ``pd.read_pickle`` restores from that file.
  """
  pickle_path = f"./StoreID_{store_id}.pickle"
  store_data = pd.read_pickle(pickle_path)
  return store_data

### Example
Read in DataFrame for store '0'

In [8]:
# Load the frame for store id '0' (note: the variable name says "01") and
# preview its first rows.
store_01_df = load_store('0')
store_01_df.head()

Unnamed: 0_level_0,id,item_id,dept_id,cat_id,store_id,state_id,sold,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2011-01-29,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,0,11101,1,1,2011,30,4,4,2,0,0,0,
2011-01-29,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,0,11101,1,1,2011,30,4,4,2,0,0,0,
2011-01-29,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,0,11101,1,1,2011,30,4,4,2,0,0,0,
2011-01-29,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,0,11101,1,1,2011,30,4,4,2,0,0,0,
2011-01-29,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,0,11101,1,1,2011,30,4,4,2,0,0,0,


## WRMSSE Scoring
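The M5 competition uses a dedicated metric, the Weighted Root Mean Squared Scaled Error (WRMSSE): each series' error is scaled by the historical variability of that series and weighted by its dollar sales over the last 28 days of the training data.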

These functions are from the [WRMSSE Evaluator with extra features](https://www.kaggle.com/dhananjay3/wrmsse-evaluator-with-extra-features) notebook.


In [13]:
from typing import Union

import numpy as np
import pandas as pd
from tqdm.auto import tqdm as tqdm

class WRMSSEEvaluator(object):
    """Weighted RMSSE scorer over the twelve aggregation levels in ``group_ids``.

    Construction aggregates the bottom-level series up to every level,
    derives one weight and one scale per aggregated series, and then drops
    the large inputs. ``score`` can afterwards be called repeatedly with
    different predictions for the validation columns.
    """

    # Each entry is a groupby key: a single column name or a list of columns.
    group_ids = ( 'all_id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
        ['state_id', 'cat_id'],  ['state_id', 'dept_id'], ['store_id', 'cat_id'],
        ['store_id', 'dept_id'], ['item_id', 'state_id'], ['item_id', 'store_id'])

    def __init__(self,
                 train_df: pd.DataFrame,
                 valid_df: pd.DataFrame,
                 calendar: pd.DataFrame,
                 prices: pd.DataFrame):
        """Aggregate the series and precompute weights and scales.

        Args:
            train_df: Wide frame with id columns plus one ``d_*`` column per
                training day.
            valid_df: Wide frame with the ``d_*`` columns to score against;
                the id columns are taken from ``train_df`` when missing.
            calendar: Frame with at least the ``d`` and ``wm_yr_wk`` columns.
            prices: Frame with ``item_id``, ``store_id``, ``wm_yr_wk`` and
                ``sell_price`` columns.
        """
        self.calendar = calendar
        self.prices = prices
        self.valid_df = valid_df
        self.train_target_columns = [i for i in train_df.columns if i.startswith('d_')]
        # Weights use the last 28 *day* columns. Selecting them from the day
        # columns (rather than the last 28 columns of the frame) keeps the
        # choice right even when non-day columns trail the frame.
        self.weight_columns = self.train_target_columns[-28:]

        # assign() returns a new frame, so the caller's train_df is left
        # untouched (no surprise 'all_id' column, no SettingWithCopyWarning
        # when a slice is passed in). This costs one temporary copy.
        self.train_df = train_df.assign(all_id="all")

        self.id_columns = [i for i in self.train_df.columns if not i.startswith('d_')]
        self.valid_target_columns = [i for i in self.valid_df.columns if i.startswith('d_')]

        if not all([c in self.valid_df.columns for c in self.id_columns]):
            # Rows are matched on the index, so valid_df must share
            # train_df's index.
            self.valid_df = pd.concat([self.train_df[self.id_columns], self.valid_df],
                                      axis=1,
                                      sort=False)
        self.train_series = self.trans_30490_to_42840(self.train_df,
                                                      self.train_target_columns,
                                                      self.group_ids)
        self.valid_series = self.trans_30490_to_42840(self.valid_df,
                                                      self.valid_target_columns,
                                                      self.group_ids)
        self.weights = self.get_weight_df()
        self.scale = self.get_scale()
        # Only the aggregated validation series, weights and scales are
        # needed for scoring; release the rest.
        self.train_series = None
        self.train_df = None
        self.prices = None
        self.calendar = None

    def get_scale(self):
        """Return one scaling factor per aggregated training series.

        The factor is the mean squared day-to-day difference of the series
        after its leading zeros are dropped.

        NOTE: for an all-zero series ``np.argmax`` returns 0, nothing is
        trimmed and the scale is 0, which makes ``get_rmsse`` divide by zero.
        """
        scales = []
        for i in tqdm(range(len(self.train_series))):
            series = self.train_series.iloc[i].values
            # argmax of the boolean mask is the position of the first non-zero.
            series = series[np.argmax(series!=0):]
            scale = ((series[1:] - series[:-1]) ** 2).mean()
            scales.append(scale)
        return np.array(scales)

    def get_name(self, i):
        """Turn a groupby key into the string that names its series.

        Scalar keys (str, int, numpy integer) become ``str(key)``. Composite
        keys (the tuples produced by multi-column groupbys) are joined with
        ``"--"``; each part is stringified first so integer-encoded ids work.
        """
        if isinstance(i, (str, int, np.integer)):
            return str(i)
        return "--".join(str(part) for part in i)

    def get_weight_df(self) -> pd.DataFrame:
        """Return the weight of every aggregated series as a one-column frame.

        A series' weight is its share of dollar sales (units * sell_price)
        over ``self.weight_columns`` within its aggregation level, divided by
        the number of levels.
        """
        day_to_week = self.calendar.set_index("d")["wm_yr_wk"].to_dict()
        weight_df = self.train_df[["item_id", "store_id"] + self.weight_columns].set_index(
            ["item_id", "store_id"]
        )
        # Long format: one row per (item, store, day).
        weight_df = (
            weight_df.stack().reset_index().rename(columns={"level_2": "d", 0: "value"})
        )
        weight_df["wm_yr_wk"] = weight_df["d"].map(day_to_week)
        weight_df = weight_df.merge(
            self.prices, how="left", on=["item_id", "store_id", "wm_yr_wk"]
        )
        # Units sold -> dollar sales.
        weight_df["value"] = weight_df["value"] * weight_df["sell_price"]
        # Back to wide format, one column per day.
        weight_df = weight_df.set_index(["item_id", "store_id", "d"]).unstack(level=2)[
            "value"
        ]
        # Restore train_df's row order so the positional concat below lines up.
        weight_df = weight_df.loc[
            zip(self.train_df.item_id, self.train_df.store_id), :
        ].reset_index(drop=True)
        weight_df = pd.concat(
            [self.train_df[self.id_columns], weight_df], axis=1, sort=False
        )
        weights_map = {}
        for group_id in tqdm(self.group_ids, leave=False):
            lv_weight = weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            lv_weight = lv_weight / lv_weight.sum()
            for key, value in lv_weight.items():
                weights_map[self.get_name(key)] = np.array([value])
        weights = pd.DataFrame(weights_map).T / len(self.group_ids)

        return weights

    def trans_30490_to_42840(self, df, cols, group_ids, dis=False):
        """Aggregate the bottom-level series up to every level in ``group_ids``.

        Args:
            df: Frame holding the id columns and the value columns ``cols``.
            cols: Value columns to sum within each group.
            group_ids: Iterable of groupby keys, one per aggregation level.
            dis: When True the progress bar is disabled.
        Returns:
            Frame with one row per aggregated series, indexed by the name
            from ``get_name`` and with positional (0..n-1) columns.
        """
        series_map = {}
        for group_id in tqdm(group_ids, leave=False, disable=dis):
            tr = df.groupby(group_id)[cols].sum()
            for key, row in zip(tr.index, tr.values):
                series_map[self.get_name(key)] = row
        return pd.DataFrame(series_map).T

    def get_rmsse(self, valid_preds) -> pd.Series:
        """Return the RMSSE of every aggregated series.

        Args:
            valid_preds: Aggregated predictions laid out like
                ``self.valid_series``.
        """
        score = ((self.valid_series - valid_preds) ** 2).mean(axis=1)
        rmsse = (score / self.scale).map(np.sqrt)
        return rmsse

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        """Return the WRMSSE of bottom-level predictions.

        Args:
            valid_preds: Predictions with the same shape as the validation
                target columns; an ndarray is given those column names.
        Returns:
            Sum over all aggregated series of weight * RMSSE. The per-series
            values are kept on ``self.rmsse`` and ``self.contributors``.
        """
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        # Attach the id columns so the predictions can be aggregated like
        # the actuals.
        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds],
                                axis=1,
                                sort=False)
        valid_preds = self.trans_30490_to_42840(valid_preds,
                                                self.valid_target_columns,
                                                self.group_ids,
                                                True)
        self.rmsse = self.get_rmsse(valid_preds)
        # Aligns weights and errors on the series name, then multiplies them.
        self.contributors = pd.concat([self.weights, self.rmsse],
                                      axis=1,
                                      sort=False).prod(axis=1)
        return np.sum(self.contributors)

### Example

In [14]:
# Copy the competition archive from Drive into /content.
!cp /content/drive/My\ Drive/m5/m5-forecasting-accuracy.zip /content

In [15]:
# Extract the CSV files; the next cell reads them from the working directory.
!unzip /content/m5-forecasting-accuracy.zip

Archive:  /content/m5-forecasting-accuracy.zip
  inflating: calendar.csv            
  inflating: sales_train_evaluation.csv  
  inflating: sales_train_validation.csv  
  inflating: sample_submission.csv   
  inflating: sell_prices.csv         


In [16]:
%%time

# Raw competition tables extracted from the archive.
train_df = pd.read_csv('./sales_train_validation.csv')
calendar = pd.read_csv('./calendar.csv')
prices = pd.read_csv('./sell_prices.csv')

# Hold out the last 28 columns as the validation fold; everything before
# them (id columns and earlier days) is the training fold.
train_fold_df = train_df.iloc[:, :-28]
valid_fold_df = train_df.iloc[:, -28:].copy()

# Building the evaluator precomputes the aggregated series, weights and scales.
e = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
# Free the large inputs; valid_fold_df is kept because a later cell uses its shape.
del train_fold_df, train_df, calendar, prices

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=42840.0), HTML(value='')))


CPU times: user 48 s, sys: 3.1 s, total: 51.1 s
Wall time: 51 s


In [18]:
# Smoke test: score random integer forecasts drawn from {0, 1, 2, 3}.
# NOTE: no random seed is set, so the score differs from run to run.
valid_preds = np.random.randint(4, size=valid_fold_df.shape)
e.score(valid_preds)

2.509591594484603

## Model Log
Save a pickle of the model and enter the details into the log.

In [None]:
from datetime import datetime
import os

def log_model(model, score, model_name, description):
    """Pickle a model and append an entry to its 'model_log.log'.

    Both the pickle and the log live in './<model_name>/', which is created
    when missing. Each log line has the form
    '<timestamp>, <description>, <score>, <pickle path>'.

    Args:
        model (serializable): Model to save.
        score: Score already computed by the caller; it is written as-is
            into the pickle's file name and the log entry.
        model_name (str): Name to associate with model; also the directory
            name.
        description (str): Any special information to associate with the
            model. A comma in it adds an extra field to the log line.
    Returns:
        None
    """
    timestamp = datetime.now()

    path = f'./{model_name}'
    fname = f'{path}/{timestamp.date()}-{timestamp.time()}-{score}.pickle'

    # exist_ok makes this safe when several kernels create the same directory
    # at once, and real failures (e.g. permissions) still surface.
    os.makedirs(path, exist_ok=True)

    save_model(model, fname)

    with open(f'./{model_name}/model_log.log', 'a') as log:
        entry = f"{timestamp}, {description}, {score}, {fname}\n"
        log.write(entry)

### Example
We will load the iris dataset into a stock logistic regression model from sklearn.

Then we can fit it and score its predictions before we try to log it. 

The log function will pickle the model and append an entry to its log file, which we then read back into a data frame.

In [None]:
# Small built-in dataset, used only to have something to fit and log.
from sklearn import datasets
test_data = datasets.load_iris()

In [None]:
# Feature matrix and target labels of the example dataset.
X = test_data.data 
y = test_data.target

In [None]:
# Fit with default settings and score on the same data the model was fitted
# on; the number only feeds the logging demo below.
# NOTE(review): with the defaults the solver stops at its iteration limit (see
# the warning in the output); passing max_iter or scaling X would address it.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
score = model.score(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Pickle the fitted model under ./logistic_regression/ and append a log entry.
log_model(model, score, 'logistic_regression', 'score works')

In [None]:
# The log file has no header row, so the column names are supplied here.
import pandas as pd
log_df = pd.read_csv('./logistic_regression/model_log.log', names=['timestamp', 'description', 'score', 'filename'])

In [None]:
# Preview the first log entries.
log_df.head()

Unnamed: 0,timestamp,description,score,filename
0,2020-06-19 20:00:39.306699,initial fit,,./logistic_regression/2020-06-19-20:00:39.306...
1,2020-06-19 21:54:12.846319,initial fit,0.9733333333333334,./logistic_regression/2020-06-19-21:54:12.846...
2,2020-06-19 21:54:54.015912,score works,0.9733333333333334,./logistic_regression/2020-06-19-21:54:54.015...
