# M5 Data Set

> Unfortunately, the license forbids redistribution of the M5 data set, so go to Kaggle and [download it](https://www.kaggle.com/c/m5-forecasting-accuracy/data).

The download should give you 5 files:
 - calendar.csv - Contains information about the dates on which the products are sold.
 - sales_train_validation.csv - Contains the historical daily unit sales data per product and store [d_1 - d_1913]
 - sample_submission.csv - The correct format for submissions. Reference the Evaluation tab for more info.
 - sell_prices.csv - Contains information about the price of the products sold per store and date.
 - sales_train_evaluation.csv - Includes sales [d_1 - d_1941] (labels used for the Public leaderboard)

```
# Example transfer of the files to a GCS bucket.
# Use gsutil to create the bucket (create it in the region where you will use Vertex AI)
gsutil mb -l us-central1 -p lowes-vf gs://lowes-gcp-m5-vf
# Copy the M5 files to the bucket. -j expects a list of file extensions to
# gzip in transit, so the extension has to be given before the file glob;
# otherwise the first CSV file name is consumed as the extension list.
gsutil cp -j csv *.csv gs://lowes-gcp-m5-vf
```

# Setup

* Install the Vertex AI SDK
* Authenticate
* Upload the dataset to your GCS bucket

In [None]:
import os

# Ask the gcloud CLI for the active project. The `!` capture is indexed as a
# list of output lines, so the first line is taken as the project ID.
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
# Look up the project number for that project ID; the capture is first bound
# to PROJECT_NUM and then replaced by its first output line.
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
REGION = 'us-central1'

# Echo the resolved settings so they can be checked before continuing.
print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"REGION: {REGION}")

In [None]:
from google.cloud import aiplatform as vertex_ai
from google.cloud import bigquery

# Cloud Storage location of the M5 files: the bucket, its gs:// URI, and the
# folder inside the bucket that the CSV files are read from and written to.
BUCKET_NAME = "vertex-forecast-m5-rdf"
BUCKET_URI = f"gs://{BUCKET_NAME}"
GCS_SUBFOLDER = 'kaggle-data'

# Point the Vertex AI SDK at the project and use the bucket for staging.
vertex_ai.init(project=PROJECT_ID, staging_bucket=BUCKET_URI)

# BigQuery client used by the dataset-creation and query cells.
bigquery_client = bigquery.Client(project=PROJECT_ID)

# Load sales data
* The original data is in wide format
* Convert it to long format using pandas' `melt()`

In [None]:
import numpy as np
import pandas as pd

# Read the M5 evaluation sales file from the GCS folder into a DataFrame.
train_sales = pd.read_csv(
    f'gs://{BUCKET_NAME}/{GCS_SUBFOLDER}/sales_train_evaluation.csv'
)

In [None]:
# Reshape from wide to long: the six identifier columns are kept as-is and
# every other column becomes one row, with the column name in `d` and its
# value in `sales`.
sales = train_sales.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='sales',
)

# The wide frame is no longer needed once the long copy exists; drop the
# reference to free memory.
del train_sales

In [None]:
# Write the long-format frame to GCS (without the index column) so that it
# can be imported into BigQuery from there.
sales.to_csv(
    f'gs://{BUCKET_NAME}/{GCS_SUBFOLDER}/sales_melted.csv',
    index=False,
)

In [None]:
# Template of the load command: replace the REPLACE_W_YOUR_* placeholders
# with your own BigQuery dataset and GCS bucket/folder before running.
!bq load --autodetect \
    --source_format=CSV \
    $REPLACE_W_YOUR_BQ_DATASET.calendar \
    gs://{REPLACE_W_YOUR_BUCKET/FOLDER}/calendar.csv

### Create a new dataset to put the tables into

In [None]:
# Name of the BigQuery dataset that will hold the M5 tables, and the
# location that is assigned to it when it is created.
BQ_DATASET = 'm5_us'
BQ_LOCATION = "US"

In [None]:
# Create the BigQuery dataset that the M5 tables are loaded into.

# Dataset() takes a plain "project.dataset_id" string. Backticks are SQL
# quoting and must not be part of the ID.
dataset = bigquery.Dataset(f"{PROJECT_ID}.{BQ_DATASET}")

dataset.location = BQ_LOCATION

# Send the create request for the Dataset object built above and keep the
# dataset object that the API returns.
dataset = bigquery_client.create_dataset(dataset, timeout=30)

print("Created dataset {}.{}".format(bigquery_client.project, dataset.dataset_id))

### Import the data saved to GCS into BigQuery with the `bq` tool

In [None]:
# Load calendar.csv from the GCS folder into the `calendar` table of the
# dataset, with schema auto-detection enabled.
!bq load --autodetect \
    --source_format=CSV \
    $BQ_DATASET.calendar \
    gs://$BUCKET_NAME/$GCS_SUBFOLDER/calendar.csv

In [None]:
# Load sell_prices.csv from the GCS folder into the `sell_prices` table of
# the dataset, with schema auto-detection enabled.
!bq load --autodetect \
    --source_format=CSV \
    $BQ_DATASET.sell_prices \
    gs://$BUCKET_NAME/$GCS_SUBFOLDER/sell_prices.csv

In [None]:
# Load the long-format sales_melted.csv from the GCS folder into the
# `sales_melted` table of the dataset, with schema auto-detection enabled.
!bq load --autodetect \
    --source_format=CSV \
    $BQ_DATASET.sales_melted \
    gs://$BUCKET_NAME/$GCS_SUBFOLDER/sales_melted.csv

## Create Activity Tables

In [None]:
# Build the full activity table: each sales row (item_id/store_id renamed to
# product_id/location_id, sales to gross_quantity) joined to its calendar
# row on `d`, and to the sell price for the same item, store and wm_yr_wk.
# Both joins are inner joins, so sales rows without a matching calendar or
# price row are not included.
json_extract_query = f"""CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.activity_all` AS
                            SELECT sal.item_id as product_id,
                                   sal.store_id as location_id,
                                   sal.sales as gross_quantity,
                                   cal.*,
                                   price.sell_price
                              FROM `{PROJECT_ID}.{BQ_DATASET}.sales_melted` sal
                            JOIN `{PROJECT_ID}.{BQ_DATASET}.calendar` cal ON sal.d = cal.d
                            JOIN `{PROJECT_ID}.{BQ_DATASET}.sell_prices` price
                              ON (price.item_id = sal.item_id AND 
                                  price.store_id = sal.store_id AND 
                                  price.wm_yr_wk = cal.wm_yr_wk) """

# Submit the statement and block until it has finished.
bigquery_client.query(json_extract_query).result()

### Smaller activity table for testing, `activity_fold1`

In [None]:
# Carve a smaller, split-labelled table out of activity_all: rows dated
# 2015-01-01 through 2016-03-27, with `date` exposed as a TIMESTAMP column
# named `datetime` and `ml_use` set to TRAIN / VALIDATE / TEST according to
# the date range each row falls in.
json_extract_query = f"""CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.activity_fold1`
        AS (SELECT 
            table_a.product_id,
            table_a.location_id,
            table_a.gross_quantity,
            TIMESTAMP(table_a.date) as datetime,
            table_a.weekday,
            table_a.wday,
            table_a.month,
            table_a.year,
            table_a.event_name_1,
            table_a.event_type_1,
            table_a.event_name_2,
            table_a.event_type_2,
            table_a.snap_CA,
            table_a.snap_TX,
            table_a.snap_WI,
            table_a.sell_price,
            CASE 
                WHEN table_a.date BETWEEN '2015-01-01' AND '2016-01-17' THEN 'TRAIN'
                WHEN table_a.date BETWEEN '2016-01-18' AND '2016-02-28' THEN 'VALIDATE'
                WHEN table_a.date BETWEEN '2016-02-29' AND '2016-03-27' THEN 'TEST' 
            END AS ml_use
            FROM `{PROJECT_ID}.{BQ_DATASET}.activity_all` as table_a
            WHERE table_a.date BETWEEN '2015-01-01' AND '2016-03-27'
            )
        ; """

# Submit the statement and block until it has finished.
bigquery_client.query(json_extract_query).result()

## Create Locations Table

In [None]:
# Locations table: the distinct (store_id, state_id) pairs found in
# sales_melted, with store_id exposed as location_id.
json_extract_query = f"""CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.locations`
                        AS (
                        SELECT 
                            DISTINCT store_id as location_id,
                            state_id
                        FROM 
                         `{PROJECT_ID}.{BQ_DATASET}.sales_melted`); """

# Submit the statement and block until it has finished.
bigquery_client.query(json_extract_query).result()

## Create Products Table

In [None]:
# Products table: the distinct (item_id, dept_id, cat_id) combinations found
# in sales_melted, with item_id exposed as product_id.
json_extract_query = f"""CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.products`
AS (SELECT 
    DISTINCT item_id as product_id,
    dept_id,
    cat_id
    FROM `{PROJECT_ID}.{BQ_DATASET}.sales_melted`); """

# Submit the statement and block until it has finished.
bigquery_client.query(json_extract_query).result()

## Create Plan Table

### smaller Plan Table

In [None]:
# Plan table for the small fold, built as a two-statement script:
#   1. CREATE: the activity_all rows dated 2016-03-28 through 2016-04-24,
#      all tagged 'HOLDOUT' in `ml_use`.
#   2. UPDATE: set gross_quantity (the prediction target) to NULL. The
#      filter `datetime < '2016-04-25'` covers every row selected in step 1.
json_extract_query = f"""CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.plan_fold1`
AS (SELECT 
    table_a.product_id,
    table_a.location_id,
    table_a.gross_quantity,
    TIMESTAMP(table_a.date) as datetime,
    table_a.weekday,
    table_a.wday,
    table_a.month,
    table_a.year,
    table_a.event_name_1,
    table_a.event_type_1,
    table_a.event_name_2,
    table_a.event_type_2,
    table_a.snap_CA,
    table_a.snap_TX,
    table_a.snap_WI,
    table_a.sell_price,
    'HOLDOUT' as ml_use
    FROM `{PROJECT_ID}.{BQ_DATASET}.activity_all` as table_a
    WHERE table_a.date BETWEEN '2016-03-28' AND '2016-04-24');
    -- NULL prediction target
    UPDATE `{PROJECT_ID}.{BQ_DATASET}.plan_fold1`
    SET gross_quantity = NULL WHERE datetime < '2016-04-25'; """

# Submit the script and block until it has finished.
bigquery_client.query(json_extract_query).result()

### larger Plan Table

In [None]:
# Larger plan table, built as a two-statement script:
#   1. CREATE: the activity_all rows dated 2016-01-01 through 2016-04-24.
#   2. UPDATE: set gross_quantity (the prediction target) to NULL for the
#      forecast horizon.
# The horizon starts on 2016-03-28: that is the first day of plan_fold1 and
# the day after the last TEST date in activity_fold1, giving a 28-day window
# (2016-03-28 .. 2016-04-24). Starting at 2016-03-29 would leave the actual
# value for 2016-03-28 in the table.
json_extract_query = f"""CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.plan_table_all`
AS (
    SELECT 
        table_a.product_id,
        table_a.location_id,
        table_a.gross_quantity,
        TIMESTAMP(table_a.date) as datetime,
        table_a.weekday,
        table_a.wday,
        table_a.month,
        table_a.year,
        table_a.event_name_1,
        table_a.event_type_1,
        table_a.event_name_2,
        table_a.event_type_2,
        table_a.snap_CA,
        table_a.snap_TX,
        table_a.snap_WI,
        table_a.sell_price,
    FROM 
        `{PROJECT_ID}.{BQ_DATASET}.activity_all` as table_a
    WHERE 
        table_a.date BETWEEN '2016-01-01' AND '2016-04-24');
    -- NULL prediction target
    UPDATE 
        `{PROJECT_ID}.{BQ_DATASET}.plan_table_all`
    SET 
        gross_quantity = NULL WHERE datetime >= '2016-03-28';"""

# Submit the script and block until it has finished.
bigquery_client.query(json_extract_query).result()