In [1]:
#@title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Training pipeline, v2

In this step, we will create a training pipeline that will result in gradually improved models over successive runs.

1. Create a new dataset from the old + new (from batch predict) data.
   + This training data contains bounding boxes on vertical & horizontal lines rather than cells
1. Train two new models: an online `CLOUD` model and a high-accuracy Edge model
1. Display metrics about each model.

In [1]:
! pip install google-cloud-aiplatform google-cloud-storage google-cloud-pipeline-components kfp



In [3]:
import kfp
from google.cloud import storage
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.google.client import AIPlatformClient

## Create the simple image object detection pipeline

This code is based upon the notebook [here](https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/official/pipelines/google-cloud-pipeline-components_automl_images.ipynb).

In [6]:
import os
from datetime import datetime

# Google Cloud project, region and bucket used by every later cell.
PROJECT_ID = "fantasymaps-334622"
LOCATION = "us-central1"
GCS_BUCKET_NAME = "fantasy-maps"

# A previously written manifest; its object name follows the
# `Manifests/map_training_<timestamp>.jsonl` pattern that the metadata
# component produces.
GCS_TRAINING_SOURCE = (
    f"gs://{GCS_BUCKET_NAME}/Manifests/map_training_20220311212048.jsonl"
)

# Evaluated once when this cell runs; used to make resource names unique.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
PIPELINE_ROOT = f"gs://{GCS_BUCKET_NAME}/pipeline_root"

# The component definition and the pipeline compiler both write files under
# `artifacts/`; create it up front so a fresh "Restart & Run All" does not
# fail on a missing directory.
os.makedirs("artifacts", exist_ok=True)

In [7]:
@component(
    output_component_file="artifacts/metadata-from-fs.yaml",
    base_image="python:3.9",
    packages_to_install=["google-cloud-storage", "google-cloud-firestore"],
)
def create_training_metadata_from_firestore_op(
    collection_name: str,
    timestamp_str: str,
    project_id: str,
    bucket_name: str
) -> str:
    """Write a JSON Lines bounding-box manifest built from Firestore map documents.

    Streams every document in ``collection_name`` whose ``source`` field equals
    ``"TrainingData"``, converts its ``vtt`` grid description into thin
    bounding boxes labelled ``vline`` / ``hline``, and uploads one JSON object
    per image to ``Manifests/map_training_<timestamp_str>.jsonl`` in
    ``bucket_name``.

    Args:
        collection_name: Firestore collection to read documents from.
        timestamp_str: Suffix that makes the manifest object name unique.
        project_id: Google Cloud project used for the Firestore and Storage clients.
        bucket_name: Cloud Storage bucket that receives the manifest.

    Returns:
        The ``gs://`` URI of the uploaded manifest.

    Raises:
        ValueError: If a document's cell or image dimension is not positive.
    """
    # Only this function's source is shipped to the component container, so
    # all imports and helpers have to live inside the function body.
    import json

    from google.cloud import firestore
    from google.cloud import storage

    def line_bboxes(label, offset, step, extent, is_vertical):
        """Return 2-pixel-wide boxes around each grid line along one axis.

        Lines sit at ``offset + k * step`` for k = 1, 2, ...; a box is kept only
        if it overlaps the image, and its coordinates are clamped so the
        normalised values stay within [0.0, 1.0].
        """
        if step <= 0 or extent <= 0:
            # A non-positive step would never advance the loop below.
            raise ValueError(
                f"Cell size and image size must be positive "
                f"(got cell size {step}, image size {extent})"
            )

        boxes = []
        position = offset + step
        while position - 1 < extent:
            low = max(position - 1, 0) / extent
            high = min(position + 1, extent) / extent
            if high > low:  # skip boxes that fall entirely before the image
                if is_vertical:
                    boxes.append({
                        "displayName": label,
                        "xMin": low,
                        "xMax": high,
                        "yMin": 0.0,
                        "yMax": 1.0
                    })
                else:
                    boxes.append({
                        "displayName": label,
                        "xMin": 0.0,
                        "xMax": 1.0,
                        "yMin": low,
                        "yMax": high
                    })
            position += step
        return boxes

    def vtt_to_bboxes(vtt):
        """Convert one ``vtt`` grid description into bounding-box annotations.

        Each returned entry has the form:
        {
            "displayName": "",
            "xMax": #.#,
            "xMin": #.#,
            "yMax": #.#,
            "yMin": #.#
        }
        Vertical-line boxes come first, followed by horizontal-line boxes.
        """
        vertical = line_bboxes(
            "vline", vtt["cellOffsetX"], vtt["cellWidth"], vtt["imageWidth"], True
        )
        horizontal = line_bboxes(
            "hline", vtt["cellOffsetY"], vtt["cellHeight"], vtt["imageHeight"], False
        )
        return vertical + horizontal

    gs_training_data_uri = f"Manifests/map_training_{timestamp_str}.jsonl"

    print(f"Project ID: {project_id}")

    firestore_client = firestore.Client(project=project_id)
    collection_ref = firestore_client.collection(collection_name)

    training_data = []

    # Get all of the non-test training data from collection
    docs = (collection_ref
                .where("source", "==", "TrainingData")
                .select(["gcsURI", "vtt"])
                .stream())
    for doc in docs:
        doc_dict = doc.to_dict()

        training_data.append({
            "imageGcsUri": doc_dict["gcsURI"],
            "boundingBoxAnnotations": vtt_to_bboxes(doc_dict["vtt"])
        })

    # TODO: define splits manually to verify differences between
    # enhanced and unenhanced scraped data

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(bucket_name)

    # One JSON document per line. json.dumps is required here: str(dict)
    # yields a Python repr with single quotes, which is not valid JSON.
    manifest_text = "\n".join(json.dumps(datum) for datum in training_data)
    file_blob = bucket.blob(gs_training_data_uri)
    file_blob.upload_from_string(manifest_text)

    return f"gs://{bucket_name}/{gs_training_data_uri}"

## Define the pipeline

In [8]:
@kfp.dsl.pipeline(name="dnd-maps-training-and-deployment")
def pipeline(
    collection_name: str = "FantasyMaps",
    timestamp_str: str = TIMESTAMP,
    project: str = PROJECT_ID,
    bucket_name: str = GCS_BUCKET_NAME,
):
    """Build the manifest, create the dataset, train two models and deploy one.

    Steps:
      1. Generate the bounding-box manifest from Firestore (never cached).
      2. Create an image dataset from that manifest.
      3. Train a ``CLOUD_HIGH_ACCURACY_1`` model and a
         ``MOBILE_TF_HIGH_ACCURACY_1`` model on the same dataset.
      4. Deploy the ``CLOUD_HIGH_ACCURACY_1`` model.

    Display names are stamped with the notebook-level ``TIMESTAMP`` captured
    when the configuration cell ran.
    """
    manifest_task = create_training_metadata_from_firestore_op(
        collection_name,
        timestamp_str,
        project,
        bucket_name,
    )
    manifest_task.set_caching_options(False)

    dataset_task = gcc_aip.ImageDatasetCreateOp(
        project=project,
        display_name=f"dnd-maps-lines-{TIMESTAMP}",
        gcs_source=manifest_task.output,
        import_schema_uri=aiplatform.schema.dataset.ioformat.image.bounding_box,
    )
    lines_dataset = dataset_task.outputs["dataset"]

    # Settings common to both training jobs: same dataset, same 70/20/10 split.
    shared_training_args = dict(
        project=project,
        prediction_type="object_detection",
        dataset=lines_dataset,
        training_fraction_split=0.7,
        validation_fraction_split=0.2,
        test_fraction_split=0.1,
    )

    online_training_task = gcc_aip.AutoMLImageTrainingJobRunOp(
        display_name=f"train-dnd-maps-lines-model-online-{TIMESTAMP}",
        model_type="CLOUD_HIGH_ACCURACY_1",
        model_display_name=f"dnd-maps-lines-model-online-{TIMESTAMP}",
        budget_milli_node_hours=40000,
        **shared_training_args,
    )

    gcc_aip.AutoMLImageTrainingJobRunOp(
        display_name=f"train-dnd-maps-lines-model-exportable-{TIMESTAMP}",
        model_type="MOBILE_TF_HIGH_ACCURACY_1",
        model_display_name=f"dnd-maps-lines-model-exportable-{TIMESTAMP}",
        budget_milli_node_hours=90000,  # 100000 is the maximum for an exportable TF model
        **shared_training_args,
    )

    # Only the online model is deployed.
    gcc_aip.ModelDeployOp(
        model=online_training_task.outputs["model"],
    )

## Compile the pipeline and run it

In [9]:
# Compile the pipeline function into a job-spec JSON file under artifacts/.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=pipeline,
    package_path="artifacts/dnd_maps_lines_pipeline.json",
)



In [10]:
# Client used below to submit the compiled pipeline job.
api_client = AIPlatformClient(
    project_id=PROJECT_ID,
    region=LOCATION,
)



In [11]:
# Submit the compiled job spec; caching is disabled so every step re-executes.
response = api_client.create_run_from_job_spec(
    job_spec_path="artifacts/dnd_maps_lines_pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={"project": PROJECT_ID},
    enable_caching=False,
)

In [49]:
# Exploratory lookup of the submit method's signature and docstring;
# nothing later in the visible notebook depends on this cell.
help(api_client.create_run_from_job_spec)

Help on method create_run_from_job_spec in module kfp.v2.google.client.client:

create_run_from_job_spec(job_spec_path: str, job_id: Union[str, NoneType] = None, pipeline_root: Union[str, NoneType] = None, parameter_values: Union[Mapping[str, Any], NoneType] = None, enable_caching: Union[bool, NoneType] = None, cmek: Union[str, NoneType] = None, service_account: Union[str, NoneType] = None, network: Union[str, NoneType] = None, labels: Union[Mapping[str, str], NoneType] = None) -> dict method of kfp.v2.google.client.client.AIPlatformClient instance
    Runs a pre-compiled pipeline job on AIPlatformPipelines service.
    
    Args:
      job_spec_path: The path of PipelineJob JSON file. It can be a local path
        or a GS URI.
      job_id: Optionally, the user can provide the unique ID of the job run. If
        not specified, pipeline name + timestamp will be used.
      pipeline_root: Optionally the user can override the pipeline root
        specified during the compile time.
   