# Probabilistic Inference

In [None]:
# !pip3 install {USER_FLAG} google-cloud-aiplatform kfp google-cloud-pipeline-components --upgrade
# !pip3 install --no-cache-dir {USER_FLAG} PyYAML==5.3.1 

In [None]:
!python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
!python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

## Load notebook config

> use the prefix defined in 00-env-setup

In [None]:
CREATE_NEW_ASSETS = False

In [None]:
# naming convention for all cloud resources
VERSION        = "v1"              # TODO
PREFIX         = f'forecast-refresh-{VERSION}'   # TODO

print(f"PREFIX = {PREFIX}")

In [None]:
# staging GCS
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# ! gcloud config set project $PROJECT_ID

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-gcs'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)

In [None]:
# For a list of available model metrics, go here:
!gsutil ls $BUCKET_URI

## Imports

In [None]:
EXPERIMENT_NAME = f"{PREFIX}-v1"

print(EXPERIMENT_NAME)

In [None]:
# Import required modules
import json
import datetime
from typing import Any, Dict, List, Optional

from google.cloud import aiplatform, storage, bigquery

# from google_cloud_pipeline_components.types.artifact_types import VertexDataset
from google_cloud_pipeline_components.preview.automl.forecasting import \
    utils as automl_forecasting_utils


# Construct a BigQuery client object.
bq_client = bigquery.Client(project=PROJECT_ID)
aiplatform.init(experiment=EXPERIMENT_NAME, project=PROJECT_ID, location=REGION)

import sys
sys.path.append("..")
from src import helpers

In [None]:
if CREATE_NEW_ASSETS:
    ds = bigquery.Dataset(f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}")
    ds.location = BQ_LOCATION
    ds = bqclient.create_dataset(dataset = ds, exists_ok = False)
    # print(ds.full_dataset_id)
else:
    ds = bigquery.Dataset(f"{PROJECT_ID}.{BIGQUERY_DATASET_NAME}")
    
ds

## prepare train job

In [9]:
# Dataflow's fully qualified subnetwork name, when empty the default subnetwork will be used.
dataflow_subnetwork = None 

# Specifies whether Dataflow workers use public IP addresses.
dataflow_use_public_ips = True

# NOW = datetime.datetime.now().strftime("%d %H:%M:%S.%f").replace(" ","").replace(":","_").replace(".","_")
NOW = '2106_14_34_399161' # tmp

print(NOW)

2106_14_34_399161


In [10]:
ROOT_DIR                      = f"{BUCKET_URI}/automl_forecasting_pipeline/{EXPERIMENT_NAME}/run-{NOW}"
time_column                   = "date"
time_series_identifier_column = "store_name"
target_column                 = "sale_dollars"
data_source_csv_filenames     = None

print(f"ROOT_DIR              = {ROOT_DIR}")

ROOT_DIR              = gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/forecast-refresh-v1-v1/run-2106_14_34_399161


In [11]:
data_source_bigquery_table_path = (
    "bq://bigquery-public-data.iowa_liquor_sales_forecasting.2020_sales_train"
)

training_fraction = 0.8
validation_fraction = 0.1
test_fraction = 0.1

predefined_split_key = None
if predefined_split_key:
    training_fraction = None
    validation_fraction = None
    test_fraction = None

weight_column = None

features = [
    time_column,
    target_column,
    "city",
    "zip_code",
    "county",
]

available_at_forecast_columns = [time_column]
unavailable_at_forecast_columns = [target_column]
time_series_attribute_columns = ["city", "zip_code", "county"]

forecast_horizon = 150
context_window = 150

print(f"available_at_forecast_columns    = {available_at_forecast_columns}")
print(f"unavailable_at_forecast_columns  = {unavailable_at_forecast_columns}")
print(f"time_series_attribute_columns    = {time_series_attribute_columns}")

available_at_forecast_columns    = ['date']
unavailable_at_forecast_columns  = ['sale_dollars']
time_series_attribute_columns    = ['city', 'zip_code', 'county']


In [13]:
# transformations = helpers.generate_auto_transformation(features)
transformations = helpers.generate_transformation(auto_column_names=features)

# TRANSFORM_CONFIG_PATH = f"{ROOT_DIR}/transform_config_{NOW}.json"
TRANSFORM_CONFIG_PATH = "gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/run-28ec73a7-646e-420b-b883-2aa16ea2e518/transform_config_40ac07bd-c92b-4914-beda-18a382062acd.json"

print(f"transformations       = {transformations}\n")
print(f"TRANSFORM_CONFIG_PATH = {TRANSFORM_CONFIG_PATH}")

# helpers.write_to_gcs(TRANSFORM_CONFIG_PATH, json.dumps(transformations))

transformations       = {'auto': ['date', 'sale_dollars', 'city', 'zip_code', 'county'], 'numeric': [], 'categorical': [], 'text': [], 'timestamp': []}

TRANSFORM_CONFIG_PATH = gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/run-28ec73a7-646e-420b-b883-2aa16ea2e518/transform_config_40ac07bd-c92b-4914-beda-18a382062acd.json


In [14]:
# For a list of available model metrics, go here:
!gsutil ls $BUCKET_URI/

gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/
gs://forecast-refresh-v1-hybrid-vertex-gcs/config/


# Probabilistic Training

In [15]:
# Number of weak models in the final ensemble model.
num_selected_trials = 5
train_budget_milli_node_hours = 500  # 30 minutes

optimization_objective = "minimize-quantile-loss"

RUN_EVALUATION = False

JOB_ID = f"prob-infer-{EXPERIMENT_NAME}"
print(f"JOB_ID = {JOB_ID}")

JOB_ID = prob-infer-forecast-refresh-v1-v1


In [16]:
(
    template_path,
    parameter_values,
) = automl_forecasting_utils.get_time_series_dense_encoder_forecasting_pipeline_and_parameters(
    project=PROJECT_ID,
    location=REGION,
    root_dir=ROOT_DIR,
    target_column=target_column,
    optimization_objective=optimization_objective,
    transformations=transformations,
    train_budget_milli_node_hours=train_budget_milli_node_hours,
    data_source_csv_filenames=data_source_csv_filenames,
    data_source_bigquery_table_path=data_source_bigquery_table_path,
    weight_column=weight_column,
    predefined_split_key=predefined_split_key,
    training_fraction=training_fraction,
    validation_fraction=validation_fraction,
    test_fraction=test_fraction,
    num_selected_trials=num_selected_trials,
    time_column=time_column,
    time_series_identifier_columns=[time_series_identifier_column],
    time_series_attribute_columns=time_series_attribute_columns,
    available_at_forecast_columns=available_at_forecast_columns,
    unavailable_at_forecast_columns=unavailable_at_forecast_columns,
    forecast_horizon=forecast_horizon,
    context_window=context_window,
    dataflow_subnetwork=dataflow_subnetwork,
    dataflow_use_public_ips=dataflow_use_public_ips,
    run_evaluation=RUN_EVALUATION,                          # set True to eval on test/valid set
    # evaluated_examples_bigquery_path=f'bq://{PROJECT_ID}.eval',
    enable_probabilistic_inference=True,
    # quantile forecast
    quantiles=[0.25, 0.5, 0.9],
)

# job_id = "tide-forecasting-probabilistic-inference-{}".format(uuid.uuid4())
job = aiplatform.PipelineJob(
    display_name=JOB_ID,
    location=REGION,  # launches the pipeline job in the specified region
    template_path=template_path,
    job_id=JOB_ID,
    pipeline_root=ROOT_DIR,
    parameter_values=parameter_values,
    enable_caching=False,
)

# job.run(sync=False,experiment=EXPERIMENT_NAME)
job.submit(
    experiment=EXPERIMENT_NAME,
    # sync=False,
    service_account=VERTEX_SA,
)

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/prob-infer-forecast-refresh-v1-v1
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/prob-infer-forecast-refresh-v1-v1')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/prob-infer-forecast-refresh-v1-v1?project=934903580331
Associating projects/934903580331/locations/us-central1/pipelineJobs/prob-infer-forecast-refresh-v1-v1 to Experiment: forecast-refresh-v1-v1


In [18]:
pipeline_task_details = job.task_details

for task_deets in pipeline_task_details:
    print(task_deets.task_name)

automl-tabular-finalizer
condition-5
split-materialized-data
automl-forecasting-stage-1-tuner
condition-4
set-optional-inputs
get-prediction-image-uri-2
training-configurator-and-validator
prob-infer-forecast-refresh-v1-v1
model-upload-2
calculate-training-parameters-2
feature-transform-engine
automl-forecasting-ensemble-2
condition-2
exit-handler-1
get-or-create-model-description-2
string-not-empty


# Batch Prediction job

> You can enable the batch explain feature by simply setting `generate_explanation=True` in the `batch_predict` API.


> TODO

In [43]:
# BIGQUERY_DATASET_NAME = PREFIX.replace("-","_")

# ds = bigquery.Dataset(f"{PROJECT_ID}.{PREFIX_TEMP}")
# ds.location = BQ_LOCATION
# ds = bq_client.create_dataset(dataset = ds, exists_ok = False)

# print(ds.full_dataset_id)

'forecast_refresh_v1'

### get trained model

In [30]:
stage_1_tuner_task = helpers.get_task_detail(
    pipeline_task_details, "automl-forecasting-stage-1-tuner"
)
stage_1_tuning_result_artifact_uri = (
    stage_1_tuner_task.outputs["tuning_result_output"].artifacts[0].uri
)
print(f"stage_1_tuning_result_artifact_uri: {stage_1_tuning_result_artifact_uri}")

# get uploaded model
upload_model_task = helpers.get_task_detail(
    pipeline_task_details, "model-upload-2"
)

forecasting_mp_model_artifact = (
    upload_model_task.outputs["model"].artifacts[0]
)

forecasting_mp_model = aiplatform.Model(forecasting_mp_model_artifact.metadata['resourceName'])
print(f"forecasting_mp_model: {forecasting_mp_model}")

stage_1_tuning_result_artifact_uri: gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/forecast-refresh-v1-v1/run-2106_14_34_399161/934903580331/prob-infer-forecast-refresh-v1-v1/automl-forecasting-stage-1-tuner_-3853266124770639872/tuning_result_output
forecasting_mp_model: <google.cloud.aiplatform.models.Model object at 0x7f70006955a0> 
resource name: projects/934903580331/locations/us-central1/models/1896206758146211840


### confirm predict dataset.table URI

In [38]:
public_ds_name = "bigquery-public-data.iowa_liquor_sales_forecasting"

# tables = bq_client.list_tables(public_ds_name)
# # tables

# print("Tables contained in '{}':".format(public_ds_name))
# for table in tables:
#     print("{}.{}.{}".format(table.project, table.dataset_id, table.table_id))

Tables contained in 'bigquery-public-data.iowa_liquor_sales_forecasting':
bigquery-public-data.iowa_liquor_sales_forecasting.2020_sales_train
bigquery-public-data.iowa_liquor_sales_forecasting.2021_sales_predict


**bigquery_destination_prefix**
* The BigQuery URI to a project or table, up to 2000 characters long.
* when only the project is specified, the Dataset and Table is created.
* When the full table reference is specified, the Dataset *must* exist and table *must not exist*. 
* Accepted forms: 

> `bq://projectId` or `bq://projectId.bqDatasetId`

In [50]:
batch_predict_bq_output_uri_prefix = f"bq://{PROJECT_ID}.{BIGQUERY_DATASET_NAME}" #.prob_eval_{VERSION}"

PREDICTION_DATASET_BQ_PATH = (
    "bq://bigquery-public-data:iowa_liquor_sales_forecasting.2021_sales_predict"
)

print(f"Running Batch prediction for model: {forecasting_mp_model.display_name}")
print(f"batch_predict_bq_output_uri_prefix: {batch_predict_bq_output_uri_prefix}")

Running Batch prediction for model: automl-forecasting-model-upload-2748858509155106816-1911341398263595008
batch_predict_bq_output_uri_prefix: bq://hybrid-vertex.forecast_refresh_v1


### Choosing machine_type and replica count

**CPU-only machines**
* To get the best throughput, choose the smallest machine types (e.g. 2 cores, although RAM requirements vary) with as many replicas as can be kept full.
* Scaling horizontally by increasing the number of replicas improves throughput in a linear and predictable way. 
* Scaling vertically by using bigger machine types does not always improve throughput linearly.

For cost-effectiveness, choose replica count such that the batch prediction job runs for at least 10 minutes. 
* This is because you are billed per replica node hour, which includes the approximately 5 minutes it takes for each replica to start up. 
* It is not cost-effective to process for only a few seconds and then shut down.

The variables you need to calculate the number of replicas to use are as follows:

* **N**: The number of batches in the job. For example, 1 million instances / 100 batch size = 10,000 batches.
* **T**: desired time for the batch prediction job. For example, 10 minutes.
* **Tb**: time in seconds it takes for a replica to process a single batch. For example, 1 second per batch on a 2-core machine type.

Then the number of replicas is **N** / (**T** * (**60** / **Tb**)). 

> 10,000 batches / (10 minutes * (60 / 1s)) ~= 17 replicas.

See [docs](https://cloud.google.com/vertex-ai/docs/predictions/get-batch-predictions#aiplatform_batch_predict_custom_trained-python_vertex_ai_sdk) for more details

In [51]:
MACHINE_TYPE           = "n2-standard-4"
ACCELERATOR_COUNT      = None
ACCELERATOR_TYPE       = None
STARTING_REPLICA_COUNT = 4
MAX_REPLICA_COUNT      = 12

In [52]:
batch_prediction_job = forecasting_mp_model.batch_predict(
    job_display_name=f"{PREFIX}-bpj",
    bigquery_source=PREDICTION_DATASET_BQ_PATH,
    instances_format="bigquery",
    bigquery_destination_prefix=batch_predict_bq_output_uri_prefix, # "projectId.bqDatasetId.bqTableId" (?)
    predictions_format="bigquery",
    machine_type=MACHINE_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    accelerator_type=ACCELERATOR_TYPE,
    starting_replica_count=STARTING_REPLICA_COUNT,
    max_replica_count=MAX_REPLICA_COUNT,
    generate_explanation=False,
    sync=False,
)

print(batch_prediction_job)

<google.cloud.aiplatform.jobs.BatchPredictionJob object at 0x7f6ffbb29300> is waiting for upstream dependencies to complete.
Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/934903580331/locations/us-central1/batchPredictionJobs/4541316380896526336
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/934903580331/locations/us-central1/batchPredictionJobs/4541316380896526336')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/4541316380896526336?project=934903580331
BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/4541316380896526336 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/4541316380896526336 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/934903580331/locations/us-central1/batchPredictionJobs/4541316380896526336 

In [65]:
BPJ_OUTPUT_DICT = batch_prediction_job.to_dict()

trained_forecast = aiplatform.Model(BPJ_OUTPUT_DICT['model'])

BPJ_OUTPUT_DICT

{'name': 'projects/934903580331/locations/us-central1/batchPredictionJobs/4541316380896526336', 'displayName': 'forecast-refresh-v1-bpj', 'model': 'projects/934903580331/locations/us-central1/models/1896206758146211840', 'inputConfig': {'instancesFormat': 'bigquery', 'bigquerySource': {'inputUri': 'bq://hybrid-vertex.vertex_feature_transform_engine_us.dlt_output_table_4541316380896526336'}}, 'outputConfig': {'predictionsFormat': 'bigquery', 'bigqueryDestination': {'outputUri': 'bq://hybrid-vertex.forecast_refresh_v1'}}, 'dedicatedResources': {'machineSpec': {'machineType': 'n2-standard-4'}, 'startingReplicaCount': 4, 'maxReplicaCount': 12}, 'manualBatchTuningParameters': {}, 'outputInfo': {'bigqueryOutputDataset': 'bq://hybrid-vertex.forecast_refresh_v1', 'bigqueryOutputTable': 'predictions_2023_12_21T04_39_15_619Z_334'}, 'state': 'JOB_STATE_SUCCEEDED', 'completionStats': {'successfulCount': '1721', 'successfulForecastPointCount': '5869'}, 'createTime': '2023-12-21T12:39:15.654897Z', '

In [70]:
batch_predict_bq_output_uri = "{}.{}".format(
    batch_prediction_job.output_info.bigquery_output_dataset,
    batch_prediction_job.output_info.bigquery_output_table
)

def _sanitize_bq_uri(bq_uri):
    if bq_uri.startswith("bq://"):
        bq_uri = bq_uri[5:]
    
    return bq_uri.replace(":", ".")

cleaned_bq_output_uri = _sanitize_bq_uri(
    batch_predict_bq_output_uri
)

print(batch_predict_bq_output_uri)
print(f"batch_predict_bq_output_uri : {batch_predict_bq_output_uri}")
print(f"cleaned_bq_output_uri       : {cleaned_bq_output_uri}")

bq://hybrid-vertex.forecast_refresh_v1.predictions_2023_12_21T04_39_15_619Z_334
batch_predict_bq_output_uri : bq://hybrid-vertex.forecast_refresh_v1.predictions_2023_12_21T04_39_15_619Z_334
cleaned_bq_output_uri       : hybrid-vertex.forecast_refresh_v1.predictions_2023_12_21T04_39_15_619Z_334


## View the batch prediction results

**Working with quantiles / prediction intervals**
* see this guide for details: [Example batch prediction output for a quantile-loss optimized model](https://cloud.google.com/vertex-ai/docs/tabular-data/tabular-workflows/forecasting-batch-predictions#example_batch_prediction_output_for_a_quantile-loss_optimized_model)
* `predicted_sales.quantile_values` will give the quantiles, i.e. `[0.1, 0.3, 0.5, 0.7, 0.9]`
* `predicted_sales.quantile_predictions` will be an array of the same length with matching predictions
* There is also a field `predicted_sales.value` which is just the prediction for the 0.5 quantile (median)


**Different statistics can be estimated from the quantiles, including statistics that minimize:**

* RMSE (weighted mean of quantile values)
* MAPE (median weighted by 1/value)
* MAE (median)

Use the BigQuery Python client to query the destination table and return results as a Pandas dataframe.

In [None]:
TARGET_COLUMN = "sale_dollars"

query = f"""
SELECT
 *EXCEPT(predicted_{TARGET_COLUMN}),
 predicted_{TARGET_COLUMN}.value AS predicted_sales_mean,
 predicted_{TARGET_COLUMN}.quantile_predictions[OFFSET(0)] AS predicted_{TARGET_COLUMN}_p25,
 predicted_{TARGET_COLUMN}.quantile_predictions[OFFSET(1)] AS predicted_{TARGET_COLUMN}_p50,
 predicted_{TARGET_COLUMN}.quantile_predictions[OFFSET(2)] AS predicted_{TARGET_COLUMN}_p90,
FROM
 `{cleaned_bq_output_uri}`
 LIMIT 100
"""
print(query)

In [None]:
# qs_eval['date'] = qs_eval["date"].astype("datetime64[ns]")
# qs_eval['predicted_sales_mean'].dtype

qs_eval = bq_client.query(query).to_dataframe()

qs_eval['date'] = qs_eval["date"].astype("datetime64[ns]")

qs_eval

# Model evaluation pipeline
* see [Model evaluation components](https://cloud.google.com/vertex-ai/docs/pipelines/model-evaluation-component#models) for details
* `model.evaluate()` - API [src](https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform/models.py#L5143)
  * only "regression" and "classifcation" available at this time
  

**The pipeline uses the following components:**

`GetVertexModelOp`
* Gets a Vertex AI Model artifact

`EvaluationDataSamplerOp` 
* Randomly downsamples an input dataset to a specified size for computing Vertex Explainable AI feature attributions for AutoML Tabular and custom models
* Creates a Dataflow job with Apache Beam to downsample the dataset

`TargetFieldDataRemoverOp` 
* Removes the target field from the input dataset for supporting unstructured AutoML models and custom models for Vertex AI batch prediction

`ModelBatchPredictOp`
* Creates a Vertex AI batch prediction job and waits for it to complete

`ModelEvaluationFeatureAttributionOp` 
* Compute feature attribution on a trained model’s batch explanation results
* Creates a Dataflow job with Apache Beam and TFMA to compute feature attributions

`ModelImportEvaluationOp`: 
* Imports a model evaluation artifact to an existing Vertex AI model with `ModelService.ImportModelEvaluation`


`ModelEvaluationForecastingOp`
* Computes a `google.ForecastingMetrics` Artifact, containing evaluation metrics given a model's prediction results.
* Creates a Dataflow job with Apache Beam and TFMA to compute evaluation metrics.
* Supports point forecasting and quantile forecasting for tabular data.
* check here for [src code](https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/v1/model_evaluation/forecasting_component.py#L27https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/v1/model_evaluation/forecasting_component.py#L27)

In [84]:
import kfp

# from kfp.v2 import compiler, dsl
# from kfp.v2.dsl import (

from kfp import compiler, dsl

from kfp.dsl import (
    component, 
    pipeline, 
    Artifact, 
    # ClassificationMetrics, 
    Input, 
    Output, 
    Model, 
    Metrics
)

from typing import NamedTuple

In [82]:
EVAL_SUBDIR = "evals"
PIPELINE_TAG = 'tide-qs'
EVAL_PIPE_DIR = f"{BUCKET_URI}/automl_forecasting_pipeline/{EXPERIMENT_NAME}/{EVAL_SUBDIR}/{PIPELINE_TAG}"
print(f'EVAL_PIPE_DIR: {EVAL_PIPE_DIR}')

PIPELINE_NAME = f'eval-{PIPELINE_TAG}-{EXPERIMENT_NAME}'.replace('_', '-')
print(f"PIPELINE_NAME: {PIPELINE_NAME}")

EVAL_PIPE_DIR: gs://forecast-refresh-v1-hybrid-vertex-gcs/automl_forecasting_pipeline/forecast-refresh-v1-v1/evals/tide-quantiles
PIPELINE_NAME: eval-tide-quantiles-forecast-refresh-v1-v1


### Create custom component

In [93]:
# REPO_DOCKER_PATH_PREFIX = 'src'
!pwd

/home/jupyter/jt_repo/vertex-forecas-repo/05-tabular-workflows


In [126]:
%%writefile {REPO_DOCKER_PATH_PREFIX}/create_bq_dataset.py

import kfp
from typing import NamedTuple
from kfp.dsl import (
    # Artifact, 
    # Dataset, 
    # Input, InputPath, 
    # Model, Output, OutputPath, 
    component, 
    Metrics
)

@component(
  base_image='python:3.9',
  packages_to_install=['google-cloud-bigquery==3.14.1'],
)
def create_bq_dataset(
    project: str,
    # vertex_dataset: str,
    new_bq_dataset: str,
    bq_location: str
) -> NamedTuple('Outputs', [
    ('bq_dataset_name', str),
    ('bq_dataset_uri', str),
]):
    
    from google.cloud import bigquery

    bq_client = bigquery.Client(project=project, location='US') # bq_location)
    (
      bq_client.query(f'CREATE SCHEMA IF NOT EXISTS `{project}.{new_bq_dataset}`')
      .result()
    )
    
    return (
        f'{new_bq_dataset}',
        f'bq://{project}:{new_bq_dataset}',
    )

Overwriting src/create_bq_dataset.py


In [128]:
from src import create_bq_dataset

from google_cloud_pipeline_components.v1.batch_predict_job import ModelBatchPredictOp
from google_cloud_pipeline_components.v1.model_evaluation import ModelEvaluationForecastingOp

from google_cloud_pipeline_components.preview.model_evaluation import ModelEvaluationFeatureAttributionOp

from google_cloud_pipeline_components._implementation.model import GetVertexModelOp
from google_cloud_pipeline_components._implementation.model_evaluation import (
    ModelImportEvaluationOp, 
    TargetFieldDataRemoverOp, 
    EvaluationDataSamplerOp,
)

@dsl.pipeline(
  name=PIPELINE_NAME
)
def pipeline(
    vertex_project: str,
    location: str,
    version: str,
    new_bq_dataset_name: str,
    batch_predict_machine_type: str,
    # gcs_root_dir: str,
    target_column: str,
    model_name: str,
    new_bq_dataset: str,
    batch_predict_instances_format: str,
    prediction_dataset_bq_path: str,
):
    """An eval pipeline."""

    # create BQ dataset
    create_dataset_op = (
      create_bq_dataset.create_bq_dataset(
          project=vertex_project,
          vertex_dataset="tmp",
          new_bq_dataset=new_bq_dataset_name,
          bq_location=location
      )
    )
    
    get_model_task = GetVertexModelOp(model_name=model_name)

    # ======================================
    # Model Eval Workflow
    # ======================================

    # Run Data-sampling task
    data_sampler_task = (
        EvaluationDataSamplerOp(
            project=vertex_project,
            location=location,
            # root_dir=gcs_root_dir,
            bigquery_source_uri=prediction_dataset_bq_path,
            instances_format=batch_predict_instances_format,
            sample_size=3000,
            # dataflow_subnetwork=None,
            dataflow_use_public_ips=True,
        )
    )
    
    # Run Target field-removal task
    target_remover_task = (
        TargetFieldDataRemoverOp(
            project=vertex_project,
            location=location,
            # root_dir=gcs_root_dir,
            bigquery_source_uri=data_sampler_task.outputs["bigquery_output_table"],
            instances_format=batch_predict_instances_format,
            target_field_name=target_column,
            # dataflow_subnetwork=None,
            dataflow_use_public_ips=True,
        )
    )

    # Run Batch Explanations
    batch_predict_task = (
        ModelBatchPredictOp(
            project=vertex_project,
            location=location,
            model=get_model_task.outputs['model'],
            job_display_name=f"bpj-{PIPELINE_NAME}",
            bigquery_source_input_uri=target_remover_task.outputs["bigquery_output_table"],
            instances_format=batch_predict_instances_format,
            predictions_format=batch_predict_instances_format,
            bigquery_destination_output_uri=create_dataset_op.outputs["bq_dataset_uri"], 
            machine_type=batch_predict_machine_type,
            starting_replica_count=4,
            max_replica_count=12,
            # Set the explanation parameters
            generate_explanation=False,
            # explanation_parameters=batch_predict_explanation_parameters,
            # explanation_metadata=batch_predict_explanation_metadata,
        )
    )

    # Run evaluation based on prediction type and feature attribution component.
    # After, import the model evaluations to the Vertex model.
    model_eval_task = (
        ModelEvaluationForecastingOp(
            project=vertex_project,
            location=location,
            target_field_name=target_column,
            predictions_bigquery_source=batch_predict_task.outputs["bigquery_output_table"],
            predictions_format=batch_predict_instances_format,
            model=get_model_task.outputs['model'],
            # prediction_score_column="prediction.scores",
            forecasting_type="quantile", #"point",
            forecasting_quantiles=[0.10, 0.25, 0.5, 0.75, .90],
            ground_truth_bigquery_source=data_sampler_task.outputs["bigquery_output_table"],
            ground_truth_format=batch_predict_instances_format,
        )
    )

    # Import the evaluation results to the model resource
    model_import_task = (
        ModelImportEvaluationOp(
            problem_type="forecasting",
            forecasting_metrics=model_eval_task.outputs["evaluation_metrics"],
            # feature_attributions=feature_attribution_task.outputs["feature_attributions"],
            model=get_model_task.outputs['model'],
        )
    )

In [129]:
PIPELINE_JSON_SPEC_LOCAL = "custom_pipeline_spec.json"

! rm -f $PIPELINE_JSON_SPEC_LOCAL

compiler.Compiler().compile(
    pipeline_func=pipeline, 
    package_path=PIPELINE_JSON_SPEC_LOCAL
)

In [130]:
!gsutil cp $PIPELINE_JSON_SPEC_LOCAL $EVAL_PIPE_DIR/$PIPELINE_JSON_SPEC_LOCAL

Copying file://custom_pipeline_spec.json [Content-Type=application/json]...
/ [1 files][ 69.6 KiB/ 69.6 KiB]                                                
Operation completed over 1 objects/69.6 KiB.                                     


In [132]:
job = aiplatform.PipelineJob(
    display_name=PIPELINE_NAME,
    template_path=f"{EVAL_PIPE_DIR}/{PIPELINE_JSON_SPEC_LOCAL}",
    pipeline_root=EVAL_PIPE_DIR,
    enable_caching=True,
    failure_policy='fast', # slow | fast
    parameter_values={
        'vertex_project': PROJECT_ID,
        'location': LOCATION,
        'version': VERSION,
        "batch_predict_instances_format": 'bigquery',
        "target_column": target_column,
        "model_name": BPJ_OUTPUT_DICT['model'],
        "batch_predict_machine_type": "n2-standard-4",
        # "gcs_root_dir": EVAL_PIPE_DIR,
        # "data_source_dataset": f'forecast_eval_{VERSION}_us',
        "prediction_dataset_bq_path" : PREDICTION_DATASET_BQ_PATH,
        "new_bq_dataset_name" : f"a_fresh_eval_{VERSION}_us"
    }   
)

job.run(
    sync=True,
    service_account=VERTEX_SA,
    # network=f'projects/{PROJECT_NUM}/global/networks/{VPC_NETWORK_NAME}'
)

Creating PipelineJob


InvalidArgument: 400 Some input parameters of the PipelineSpec.root are missing both defaultValue and default value from PipelineJob.runtimeConfig.parameter_values: ([new_bq_dataset])

In [120]:
PREDICTION_DATASET_BQ_PATH

'bq://bigquery-public-data:iowa_liquor_sales_forecasting.2021_sales_predict'

In [None]:
# BQ dataset for source data source
DATA_SOURCE_DATASET = f'forecast_eval_{VERSION}_us'

bigquery_source_uri = PREDICTION_DATASET_BQ_PATH

# 'bq://hybrid-vertex.forecast_refresh_v1'

"batch_predict_instances_format": 'bigquery',

parameter_values={
    'vertex_project': PROJECT_ID,
    'location': LOCATION,
    'version': VERSION,
    "batch_predict_instances_format": 'bigquery',
    "target_column": target_column,
    "model_name": BPJ_OUTPUT_DICT['model'],
    "batch_predict_machine_type": "n1-standard-4",
    "gcs_root_dir": XXXXXX,
}

In [None]:
# if RUN_EVALUATION:
#     forecast_EVALS = forecasting_mp_model.list_model_evaluations()
    
#     for model_evaluation in forecast_EVALS:
#         pprint(model_evaluation.to_dict())

In [68]:
# Get evaluations
model_evaluations = trained_forecast.list_model_evaluations()

# Print the evaluation metrics
for evaluation in model_evaluations:
    evaluation = evaluation.to_dict()
    print("Model's evaluation metrics from training:\n")
    metrics = evaluation["metrics"]
    for metric in metrics.keys():
        print(f"metric: {metric}, value: {metrics[metric]}\n")

[]

In [78]:
# qs_eval['date'] = qs_eval["date"].astype("datetime64[ns]")

qs_eval['predicted_sales_mean'].dtype

dtype('float64')

In [None]:
# View the results as a dataframe
# df_output = batch_prediction_job.iter_outputs(bq_max_results=1000).to_dataframe()

# Convert the dates to the datetime64 datatype
# df_output["date"] = df_output["date"].astype("datetime64[ns]")

# Extract the predicted sales and convert to floats
# df_output["pred_median"] = (
#     df_output["predicted_sales"].apply(lambda x: x["value"]).astype(float)
# )

# df_output.head()

### Compare predictions vs ground truth

> TODO

Plot the predicted values vs the ground truth

In [None]:
import matplotlib.pyplot as plt

# Create a shared dataframe to plot predictions vs ground truth
df_output["sales_comparison"] = df_output["predicted_sales"]
df_output["is_ground_truth"] = False
df_test_horizon_actual["sales_comparison"] = df_test_horizon_actual["sales"]
df_test_horizon_actual["is_ground_truth"] = True
df_prediction_comparison = pd.concat([df_output, df_test_horizon_actual])

# Plot sales
fig = plt.gcf()
fig.set_size_inches(24, 12)

sns.relplot(
    data=df_prediction_comparison,
    x="date",
    y="sales_comparison",
    hue="product_at_store",
    style="store",
    row="is_ground_truth",
    height=5,
    aspect=4,
    kind="line",
    ci=None,
)

# Archive

In [None]:
# trained_forecast = aiplatform.Model(
#     model_name=BPJ_OUTPUT_DICT['model']
# )
# my_evaluation_job = trained_forecast.evaluate(
#     prediction_type="classification",
#     target_field_name="type",
#     data_source_uris=["gs://sdk-model-eval/my-prediction-data.csv"],
#     staging_bucket="gs://my-staging-bucket/eval_pipeline_root",
# )
# my_evaluation_job.wait()
# my_evaluation = my_evaluation_job.get_model_evaluation()
# my_evaluation.metrics

In [None]:
# # from google_cloud_pipeline_components.aiplatform import ModelBatchPredictOp
# from google_cloud_pipeline_components.v1.batch_predict_job import ModelBatchPredictOp

# from google_cloud_pipeline_components.v1.model_evaluation import ModelEvaluationForecastingOp

# from google_cloud_pipeline_components.preview.model_evaluation import ModelEvaluationFeatureAttributionOp

# from google_cloud_pipeline_components._implementation.model_evaluation import (ModelImportEvaluationOp, TargetFieldDataRemoverOp)

# preview.model_evaluation.ModelEvaluationFeatureAttributionOp
# from google_cloud_pipeline_components.experimental.evaluation import (
#     # EvaluationDataSamplerOp, 
#     # GetVertexModelOp,
#     # ModelEvaluationForecastingOp, 
#     # ModelEvaluationFeatureAttributionOp,
#     # ModelImportEvaluationOp, 
#     # TargetFieldDataRemoverOp
# )