# Complete Fantasy Maps pipeline


The completed, self-bootstrapping FantasyMaps pipeline should look something like the following:

1.  Scrape images from Reddit (and Pinterest [p2], Quora [p2])
    *   Inputs: The sources to scrape maps from
    *   Outputs: Array of image URLs; Saved image files in GCS (implicit)
1.  Send new, scraped images out for batch prediction
    *   Inputs: Array of image URLs, current model resource name
    *   Outputs: URI of JSONL of batch prediction results
1.  Upload batch prediction results to Firestore
    *   Inputs: URI of JSONL of batch prediction results
    *   Outputs: None
1.  Train new online & exportable model
    *   Inputs: Firestore collection name with training data
    *   Outputs: Online model and exportable model
1.  Deploy online model (not necessary for batch prediction)
    *   Inputs: Online model resource name
    *   Outputs: Endpoint resource name for deployed model


## Data states

1.  Initial image data
    *  Image URI (original source URL, not a GCS URI)
    *  VTT grid data (for images provided by contributor)
1.  Prediction data
    *  URI on GCS
    *  Predicted bounding boxes
1.  Known image
    *  URI on GCS
    *  Identified VTT coordinates
1.  Training data
    *  URI on GCS
    *  Identified VTT coordinates & bounding boxes

Scraped data flow:
Initial => predicted => known => training data

Contributed data flow:
Initial => known => training data

In [1]:
! pip install google-cloud-aiplatform google-cloud-firestore google-cloud-storage kfp google-cloud-pipeline-components



In [7]:
import kfp
from google.cloud import storage
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
from kfp import dsl
from kfp.v2.dsl import component
from kfp.v2.google.client import AIPlatformClient
from typing import NamedTuple

In [106]:
from datetime import datetime

# Ask gcloud for the active project. This is an IPython `!` shell capture, so
# the result is a list of output lines rather than a plain string.
PROJECT_ID = !gcloud config get-value project
# Keep only the first output line.
PROJECT_ID = PROJECT_ID[0]
print(PROJECT_ID)

# Cloud region name.
LOCATION = "us-central1"
# Bucket used for the pipeline root, the SDK staging area and the manifests.
GCS_BUCKET_NAME = "fantasy-maps"
# Training manifest path on GCS.
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
GCS_TRAINING_SOURCE = "gs://fantasy-maps/map_training_data.jsonl"
# Root folder under which pipeline runs store their artifacts.
PIPELINE_ROOT = f"gs://{GCS_BUCKET_NAME}/pipeline_root"

fantasymaps-334622


## Create training metadata component

Adapted from the demonstration code in the Vertex AI samples notebook [pipelines_intro_kfp.ipynb](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/pipelines_intro_kfp.ipynb).

In [96]:
# Initialise the Vertex AI SDK for this session. The region is passed
# explicitly (LOCATION was defined but never used) so that the SDK does not
# fall back to its implicit default location.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=f"gs://{GCS_BUCKET_NAME}",
)

In [110]:
# Run identifier in YYYYmmddHHMMSS form (naive local time); it is used to name
# the manifest, dataset, model and pipeline job created by this run.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
print(TIMESTAMP)

@component(
    output_component_file="metadata-from-fs.yaml",
    base_image="python:3.9",
    packages_to_install=["google-cloud-storage", "google-cloud-firestore"],
)
def create_training_metadata_from_firestore_op(
    collection_name: str,
    timestamp_str: str,
    project_id: str,
    bucket_name: str
) -> str:
    """Export Firestore training documents to a JSON Lines manifest on GCS.

    Reads every document in ``collection_name`` whose ``source`` field equals
    ``"TrainingData"``, turns each one into a manifest record with the keys
    ``imageGcsUri`` and ``boundingBoxAnnotations``, and uploads the records,
    one JSON object per line, to
    ``gs://<bucket_name>/Manifests/map_training_<timestamp_str>.jsonl``.

    Args:
        collection_name: Firestore collection to read from.
        timestamp_str: Suffix that makes the manifest file name unique.
        project_id: Project used for both the Firestore and Storage clients.
        bucket_name: Bucket (name only, no ``gs://``) receiving the manifest.

    Returns:
        The full ``gs://`` URI of the uploaded manifest.

    Raises:
        KeyError: If a matching document lacks ``gcsURI`` or ``computedBBoxes``.
    """
    # Imports stay inside the function: the component body runs on its own in
    # the base image declared in the decorator, not in this notebook.
    import json

    from google.cloud import firestore
    from google.cloud import storage

    gs_training_data_uri = f"Manifests/map_training_{timestamp_str}.jsonl"

    print(f"Project ID: {project_id}")

    firestore_client = firestore.Client(project=project_id)
    collection_ref = firestore_client.collection(collection_name)

    # Fetch only the two fields the manifest needs from the training documents.
    docs = (collection_ref
                .where("source", "==", "TrainingData")
                .select(["gcsURI", "computedBBoxes"])
                .stream())

    training_data = []
    for doc in docs:
        doc_dict = doc.to_dict()
        training_data.append({
            "imageGcsUri": doc_dict["gcsURI"],
            "boundingBoxAnnotations": doc_dict["computedBBoxes"],
        })

    # TODO: collect user submitted and scraped data

    # Serialise with json.dumps, not str(): str() emits Python repr (single
    # quotes, None/True), which is not valid JSON and breaks the .jsonl file.
    # NOTE(review): assumes computedBBoxes holds only JSON-serialisable values
    # (numbers, strings, lists, dicts) - confirm against the collection.
    input_str = "\n".join(json.dumps(d) for d in training_data)

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(bucket_name)
    file_blob = bucket.blob(gs_training_data_uri)
    file_blob.upload_from_string(input_str)

    full_uri = f"gs://{bucket_name}/{gs_training_data_uri}"

    return full_uri

20220401222313


In [107]:
@dsl.pipeline(
    name="training-from-firestore",
    description="Converts collection data from Firestore into training data",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
    collection_name: str = "FantasyMaps",
    timestamp_str: str = TIMESTAMP,
    project_id: str = PROJECT_ID,
    bucket_name:str = GCS_BUCKET_NAME
):
    """Build a manifest from Firestore, import it as a dataset, and train.

    Steps:
      1. ``create_training_metadata_from_firestore_op`` writes a JSON Lines
         manifest to GCS and outputs its URI.
      2. ``ImageDatasetCreateOp`` imports that manifest as a bounding-box
         image dataset.
      3. ``AutoMLImageTrainingJobRunOp`` trains an object-detection model on
         the dataset.

    Args:
        collection_name: Firestore collection holding the training documents.
        timestamp_str: Suffix appended to the manifest, dataset, job and model
            names.
        project_id: Project in which every step runs.
        bucket_name: Bucket that receives the generated manifest.
    """
    # The manifest step used to be disabled inside a string literal, with a
    # fixed manifest URI and project id assigned in its place; that left
    # collection_name, project_id and bucket_name without any effect.
    # Keyword arguments are used because component factories do not accept
    # positional arguments in every KFP release.
    manifest_uri_op = create_training_metadata_from_firestore_op(
        collection_name=collection_name,
        timestamp_str=timestamp_str,
        project_id=project_id,
        bucket_name=bucket_name,
    )

    manifest_uri = manifest_uri_op.output

    dataset_op = gcc_aip.ImageDatasetCreateOp(
        project=project_id,
        display_name=f"fantasy-maps-max-grid-{timestamp_str}",
        gcs_source=manifest_uri,
        import_schema_uri=aiplatform.schema.dataset.ioformat.image.bounding_box,
    )

    training_job_online_run_op = gcc_aip.AutoMLImageTrainingJobRunOp(
        project=project_id,
        display_name=f"train-fantasy-maps-online-model-{timestamp_str}",
        prediction_type="object_detection",
        model_type="CLOUD_HIGH_ACCURACY_1",
        base_model=None,
        dataset=dataset_op.outputs["dataset"],
        model_display_name=f"fantasy-maps-online-model-{timestamp_str}",
        training_fraction_split=0.7,
        validation_fraction_split=0.2,
        test_fraction_split=0.1,
        budget_milli_node_hours=40000,
    )
        

In [108]:
# Compile the pipeline function into the job spec file that the PipelineJob
# below loads through template_path.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=pipeline,
    package_path="firestore_data.json",
)

In [109]:
# Submit the compiled spec as a pipeline run named after this notebook run.
DISPLAY_NAME = f"firestore_dataset_{TIMESTAMP}"

job = aiplatform.PipelineJob(
    template_path="firestore_data.json",
    pipeline_root=PIPELINE_ROOT,
    display_name=DISPLAY_NAME,
)

# Raises RuntimeError if the job fails (see the captured output below).
job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/733537716875/locations/us-central1/pipelineJobs/training-from-firestore-20220401220220
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/733537716875/locations/us-central1/pipelineJobs/training-from-firestore-20220401220220')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/training-from-firestore-20220401220220?project=733537716875
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/733537716875/locations/us-central1/pipelineJobs/training-from-firestore-20220401220220 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/733537716875/location

RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [automlimagetrainingjob-run].; Job (project_id = fantasymaps-334622, job_id = 1008133415414792192) is failed due to the above error.; Failed to handle the job: {project_number = 733537716875, job_id = 1008133415414792192}"


## Debug the pipeline code

In [46]:
# Stand-alone copy of the manifest component's body, so each step can be run
# and inspected directly in the notebook instead of inside a pipeline task.
project_id = PROJECT_ID
collection_name = "FantasyMaps"
bucket_name = GCS_BUCKET_NAME
timestamp_str = TIMESTAMP


import json

from google.cloud import firestore
from google.cloud import storage

gs_training_data_uri = f"Manifests/map_training_{timestamp_str}.jsonl"

firestore_client = firestore.Client(project=project_id)
collection_ref = firestore_client.collection(collection_name)

# Fetch only the two fields the manifest needs from the training documents.
docs = (collection_ref
        .where("source", "==", "TrainingData")
        .select(["gcsURI", "computedBBoxes"])
        .stream())

training_data = []
for doc in docs:
    doc_dict = doc.to_dict()
    training_data.append({
        "imageGcsUri": doc_dict["gcsURI"],
        "boundingBoxAnnotations": doc_dict["computedBBoxes"],
    })

# Serialise with json.dumps, not str(): str() emits Python repr (single
# quotes, None/True), which is not valid JSON and breaks the .jsonl file.
# NOTE(review): assumes computedBBoxes holds only JSON-serialisable values -
# confirm against the collection.
input_str = "\n".join(json.dumps(d) for d in training_data)

storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(bucket_name)
file_blob = bucket.blob(gs_training_data_uri)
file_blob.upload_from_string(input_str)
