# Training pipeline, v1

In this step, we will create a training pipeline that will result in gradually improved models over successive runs.

1. Run batch prediction with the LKG / MVP of the online model. Batch prediction will identify cells in maps contained within an "input" GCS bucket.

  + Results are processed--cell coordinates are determined both in DnD format and in image training format.
  + Maps from input are moved into a dataset GCS bucket.
  + Contents of input maps bucket are deleted.

2. Create a new dataset from the old + new (from batch predict) data.

3. Train two new models: an online `CLOUD` model and a high-accuracy Edge model

4. Display metrics about each model.

In [1]:
! pip install google-cloud-aiplatform google-cloud-storage google-cloud-pipeline-components kfp



In [2]:
import kfp
from google.cloud import storage
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.google.client import AIPlatformClient

## Create the simple image object detection pipeline

This code is based upon the notebook [here](https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/official/pipelines/google-cloud-pipeline-components_automl_images.ipynb).

In [18]:
from datetime import datetime

# Google Cloud project and region used by the cells below.
PROJECT_ID = "fantasymaps-334622"
LOCATION = "us-central1"

# Bucket that holds the training manifests and the pipeline root.
GCS_BUCKET_NAME = "fantasy-maps"

# URI of a previously generated manifest.
# NOTE(review): not referenced by the other cells visible here -- confirm
# whether it is still needed.
GCS_TRAINING_SOURCE = "gs://fantasy-maps/Manifests/map_training_20220311212048.jsonl"

# Local wall-clock time as YYYYmmddHHMMSS, evaluated once when this cell runs.
TIMESTAMP = format(datetime.now(), "%Y%m%d%H%M%S")

# Root GCS location handed to the pipeline run.
PIPELINE_ROOT = "gs://{}/pipeline_root".format(GCS_BUCKET_NAME)

In [26]:
@component(
    output_component_file="artifacts/metadata-from-fs.yaml",
    base_image="python:3.9",
    packages_to_install=["google-cloud-storage", "google-cloud-firestore"],
)
def create_training_metadata_from_firestore_op(
    collection_name: str,
    timestamp_str: str,
    project_id: str,
    bucket_name: str
) -> str:
    """Build a JSON Lines training manifest from Firestore and upload it to GCS.

    Documents whose ``source`` field is ``"TrainingData"`` are read first,
    followed by those tagged ``"ScrapedData"``. Each document contributes one
    manifest line holding its ``gcsURI`` (as ``imageGcsUri``) and its
    ``computedBBoxes`` (as ``boundingBoxAnnotations``).

    Args:
        collection_name: Firestore collection to query.
        timestamp_str: Suffix embedded in the manifest file name.
        project_id: Project used for both the Firestore and Storage clients.
        bucket_name: Bucket that receives the manifest.

    Returns:
        The ``gs://`` URI of the uploaded manifest.

    Raises:
        KeyError: If a matching document lacks ``gcsURI`` or
            ``computedBBoxes``.
    """
    # Imports stay inside the body, as in the original: the component
    # function must be self-contained to run from its own base image.
    import json

    from google.cloud import firestore
    from google.cloud import storage

    gs_training_data_uri = f"Manifests/map_training_{timestamp_str}.jsonl"

    print(f"Project ID: {project_id}")

    firestore_client = firestore.Client(project=project_id)
    collection_ref = firestore_client.collection(collection_name)

    training_data = []

    # Curated (non-test) training data first, then user-submitted and scraped
    # data; the order of the manifest lines follows this order.
    for source in ("TrainingData", "ScrapedData"):
        docs = (collection_ref
                .where("source", "==", source)
                .select(["gcsURI", "computedBBoxes"])
                .stream())
        for doc in docs:
            doc_dict = doc.to_dict()
            training_data.append({
                "imageGcsUri": doc_dict["gcsURI"],
                "boundingBoxAnnotations": doc_dict["computedBBoxes"],
            })

    # TODO: define splits manually to verify differences between
    # enhanced and unenhanced scraped data

    # Serialize with json.dumps rather than str(): str() on a dict emits
    # Python repr (single quotes, True/None), which is not valid JSON and so
    # not a valid .jsonl line.
    # NOTE(review): assumes computedBBoxes holds only JSON-serializable
    # values (dicts, lists, strings, numbers) -- confirm against the schema.
    manifest_body = "\n".join(json.dumps(datum) for datum in training_data)

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(bucket_name)
    file_blob = bucket.blob(gs_training_data_uri)
    file_blob.upload_from_string(manifest_body)

    return f"gs://{bucket_name}/{gs_training_data_uri}"

## Define the pipeline

In [27]:
@kfp.dsl.pipeline(name="dnd-maps-training-and-deployment")
def pipeline(
    collection_name: str = "FantasyMaps",
    timestamp_str: str = TIMESTAMP,
    project: str = PROJECT_ID,
    bucket_name: str = GCS_BUCKET_NAME,
):
    """Create a dataset from Firestore metadata, train two models, deploy one.

    Steps, in order: write the training manifest, create an image dataset from
    it with the bounding-box import schema, run an AutoML object-detection job
    for the online (cloud) model and another for the exportable (mobile TF)
    model, then deploy the online model.

    Display names embed the module-level ``TIMESTAMP`` rather than the
    ``timestamp_str`` parameter; ``timestamp_str`` is only passed to the
    manifest step.
    """
    manifest_task = create_training_metadata_from_firestore_op(
        collection_name,
        timestamp_str,
        project,
        bucket_name,
    )

    dataset_task = gcc_aip.ImageDatasetCreateOp(
        project=project,
        display_name=f"dnd-maps-max-grid-{TIMESTAMP}",
        gcs_source=manifest_task.output,
        import_schema_uri=aiplatform.schema.dataset.ioformat.image.bounding_box,
    )

    # Settings shared by both training jobs: same dataset, same task type and
    # the same 70/20/10 train/validation/test split.
    shared_training_args = dict(
        project=project,
        prediction_type="object_detection",
        dataset=dataset_task.outputs["dataset"],
        training_fraction_split=0.7,
        validation_fraction_split=0.2,
        test_fraction_split=0.1,
    )

    # Online (cloud-hosted) model; this is the one deployed below.
    online_training_task = gcc_aip.AutoMLImageTrainingJobRunOp(
        display_name=f"train-dnd-maps-full-grid-model-online-{TIMESTAMP}",
        model_type="CLOUD_HIGH_ACCURACY_1",
        model_display_name=f"dnd-maps-full-grid-model-online-{TIMESTAMP}",
        budget_milli_node_hours=40000,
        **shared_training_args,
    )

    # Exportable model. 100000 is the maximum budget for an exportable TF
    # model, so 90000 stays under that ceiling.
    gcc_aip.AutoMLImageTrainingJobRunOp(
        display_name=f"train-dnd-maps-full-grid-model-exportable-{TIMESTAMP}",
        model_type="MOBILE_TF_HIGH_ACCURACY_1",
        model_display_name=f"dnd-maps-full-grid-model-exportable-{TIMESTAMP}",
        budget_milli_node_hours=90000,
        **shared_training_args,
    )

    # Deploy only the online model.
    gcc_aip.ModelDeployOp(
        model=online_training_task.outputs["model"],
    )

## Compile the pipeline and run it

In [28]:
# Compile the pipeline function and write the result to a local JSON file.
compiler.Compiler().compile(
    package_path="artifacts/dnd_maps_pipeline.json",
    pipeline_func=pipeline,
)

In [29]:
# Client bound to the configured project and region.
api_client = AIPlatformClient(region=LOCATION, project_id=PROJECT_ID)

In [30]:
# Submit the compiled job spec. Only `project` is passed explicitly, so the
# remaining pipeline parameters keep the defaults from the pipeline signature.
response = api_client.create_run_from_job_spec(
    "artifacts/dnd_maps_pipeline.json",
    parameter_values={"project": PROJECT_ID},
    pipeline_root=PIPELINE_ROOT,
)