# Training pipeline, v1

In this step, we will create a training pipeline that will result in gradually improved models over successive runs.

1. Run batch prediction with the LKG / MVP of the online model. Batch prediction will identify cells in maps contained within an "input" GCS bucket.

  + Results are processed--cell coordinates are determined both in DnD format and in image training format.
  + Maps from input are moved into a dataset GCS bucket.
  + Contents of input maps bucket are deleted.

2. Create a new dataset from the old + new (from batch predict) data.

3. Train two new models: an online `CLOUD` model and a high-accuracy Edge model

4. Display metrics about each model.

In [None]:
import kfp
from google.cloud import storage
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient

## Create the simple image object detection pipeline

This code is based upon the notebook [here](https://github.com/GoogleCloudPlatform/ai-platform-samples/blob/master/ai-platform-unified/notebooks/official/pipelines/google-cloud-pipeline-components_automl_images.ipynb).

In [None]:
from datetime import datetime

# Google Cloud project and region used by the cells below.
PROJECT_ID = "fantasymaps-334622"
LOCATION = "us-central1"

# Bucket that holds both the training data manifest and the pipeline root.
GCS_BUCKET_NAME = "fantasy-maps"
GCS_TRAINING_SOURCE = f"gs://{GCS_BUCKET_NAME}/map_training_data.jsonl"
PIPELINE_ROOT = f"gs://{GCS_BUCKET_NAME}/pipeline_root"

# Second-resolution timestamp, used as a suffix in resource display names.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

## Define the pipeline

In [None]:
@kfp.dsl.pipeline(name="dnd-maps-training-and-deployment")
def pipeline(project: str = PROJECT_ID):
    # Pipeline steps, in order:
    #   1. create a bounding-box image dataset from GCS_TRAINING_SOURCE,
    #   2. train an online (CLOUD_HIGH_ACCURACY_1) object-detection model,
    #   3. train an exportable (MOBILE_TF_HIGH_ACCURACY_1) object-detection model,
    #   4. deploy the online model.
    # `project` is the Google Cloud project every step is created in.
    # Comments (not a docstring) are used on purpose so the function's
    # metadata seen by the KFP compiler stays exactly as before.
    maps_dataset = gcc_aip.ImageDatasetCreateOp(
        display_name=f"dnd-maps-max-grid-{TIMESTAMP}",
        project=project,
        import_schema_uri=aiplatform.schema.dataset.ioformat.image.bounding_box,
        gcs_source=GCS_TRAINING_SOURCE,
    )

    # Settings shared by both AutoML training jobs: same dataset, same
    # 70/20/10 train/validation/test split, no base model.
    common_training_args = {
        "project": project,
        "prediction_type": "object_detection",
        "base_model": None,
        "dataset": maps_dataset.outputs["dataset"],
        "training_fraction_split": 0.7,
        "validation_fraction_split": 0.2,
        "test_fraction_split": 0.1,
    }

    # Online model: the one that gets deployed below.
    online_training = gcc_aip.AutoMLImageTrainingJobRunOp(
        display_name=f"train-dnd-maps-full-grid-model-online-{TIMESTAMP}",
        model_display_name=f"dnd-maps-full-grid-model-online-{TIMESTAMP}",
        model_type="CLOUD_HIGH_ACCURACY_1",
        budget_milli_node_hours=40000,
        **common_training_args,
    )

    # Exportable model: trained but not deployed by this pipeline.
    gcc_aip.AutoMLImageTrainingJobRunOp(
        display_name=f"train-dnd-maps-full-grid-model-exportable-{TIMESTAMP}",
        model_display_name=f"dnd-maps-full-grid-model-exportable-{TIMESTAMP}",
        model_type="MOBILE_TF_HIGH_ACCURACY_1",
        budget_milli_node_hours=90000,  # 100000 is the maximum for an exportable TF model
        **common_training_args,
    )

    gcc_aip.ModelDeployOp(
        project=project,
        model=online_training.outputs["model"],
    )

## Compile the pipeline and run it

In [None]:
from kfp.v2 import compiler  # noqa: F811

# Compile the `pipeline` function into a job spec file, dnd_maps_pipeline.json,
# which the run-submission cell further down reads by the same file name.
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="dnd_maps_pipeline.json"
)



In [7]:
from kfp.v2.google.client import AIPlatformClient  # noqa: F811

# Pipelines client bound to PROJECT_ID / LOCATION; used below to submit the run.
api_client = AIPlatformClient(project_id=PROJECT_ID, region=LOCATION)



In [13]:
# Submit the compiled job spec as a pipeline run, with PIPELINE_ROOT as the
# pipeline root and the `project` pipeline parameter set to PROJECT_ID.
response = api_client.create_run_from_job_spec(
    "dnd_maps_pipeline.json",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={"project": PROJECT_ID},
)

## Create the simple batch prediction component

Start with a simple, one-component batch prediction pipeline. First we need to create a batch prediction input file.

In [146]:
from google.cloud import storage

storage_client = storage.Client(project=PROJECT_ID)

# The source is split as "<bucket>/<prefix>/...". Tolerate an optional
# "gs://" scheme so the first path segment is always the bucket name
# (otherwise the split would yield "gs:" and "" instead).
bp_source_path = batch_prediction_gcs_source
if bp_source_path.startswith("gs://"):
    bp_source_path = bp_source_path[len("gs://"):]
bp_bucket_name, bp_prefix_name = bp_source_path.split("/")[0:2]

print(bp_bucket_name)
print(bp_prefix_name)

# List all of the maps in the batch prediction bucket
maps_bucket = storage_client.bucket(bp_bucket_name)

print(maps_bucket)
# Let Cloud Storage filter by prefix instead of listing the whole bucket and
# substring-matching every object name on the client.
items = maps_bucket.list_blobs(prefix=bp_prefix_name)

maps_in_bp_bucket = []

for item in items:
    # Match on the file extension; a bare substring test for "jpg" would also
    # accept names that merely contain those letters.
    if item.name.endswith(".jpg"):
        print(item)
        maps_in_bp_bucket.append(item.name)

# Need to create the batch prediction input file


video-erschmid
DnD-batch-predict-input
<Bucket: video-erschmid>
<Blob: video-erschmid, DnD-batch-predict-input/GRIDDED-The-Potato-Thief-Lovers-social.jpg, 1627076479591472>
<Blob: video-erschmid, DnD-batch-predict-input/VTT-runeport-docks-GRID-social.jpg, 1627076480969298>
<Blob: video-erschmid, DnD-batch-predict-input/forest-river-social-2.jpg, 1627076480173411>
<Blob: video-erschmid, DnD-batch-predict-input/forest-tracks.jpg, 1630445535492248>
<Blob: video-erschmid, DnD-batch-predict-input/hard-landing.jpg, 1630445447750400>
<Blob: video-erschmid, DnD-batch-predict-input/small-cemetary.jpg, 1630445606511548>
<Blob: video-erschmid, DnD-batch-predict-input/social-SNOW-FOREST-ROAD.jpg, 1627076480234330>


In [None]:
@kfp.dsl.pipeline(name="batch-prediction-data")
def pipeline(project: str = PROJECT_ID, model: str = model_id):
    # Work-in-progress pipeline with a single batch-prediction step that reads
    # its input from `batch_prediction_gcs_source`.
    #
    # NOTE(review): this definition reuses the name `pipeline`, so it replaces
    # the training pipeline function defined earlier in the notebook.
    # NOTE(review): the model lookup below is built from the module-level
    # `model_id`, not from the `model` pipeline parameter -- confirm intent.

    # aiplatform.init() is called with keyword arguments, and the file's
    # LOCATION constant is used for the region.
    aiplatform.init(project=PROJECT_ID, location=LOCATION)

    # Bound to a separate name so the `model` parameter is not shadowed.
    model_resource = aiplatform.Model(
        f"projects/{PROJECT_ID}/locations/{LOCATION}/models/{model_id}"
    )

    batch_predict_op = gcc_aip.ModelBatchPredictOp(
        project=project,
        job_display_name="batch-prediction-data-",
        gcs_source=batch_prediction_gcs_source,
        # TODO: remaining arguments are not filled in yet (presumably the
        # model and an output destination -- confirm against the component's
        # documentation before running).
    )

