# [title] (TODO)

## Prerequisites

1. Please ensure that you have a Document AI Warehouse instance in your project. You can follow [this quickstart](https://cloud.google.com/document-warehouse/docs/quickstart) to complete the setup.
2. Create a Document AI [Invoice processor](https://cloud.google.com/document-ai/docs/processors-list#processor_invoice-processor) and update the `DOCAI_PROCESSOR_ID` variable below.
3. If you are using a Vertex AI Workbench Managed Notebook, ensure to grant the following roles:
> * [roles/contentwarehouse.documentAdmin](https://cloud.google.com/document-warehouse/docs/manage-access-control)
> * [roles/documentai.apiUser](https://cloud.google.com/document-ai/docs/access-control/iam-roles)

## env setup

In [1]:
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

PROJECT_NUM              = !gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NUM              = PROJECT_NUM[0]

# locations / regions for cloud resources
LOCATION                 = 'us-central1'     # TODO
REGION                   = LOCATION          # TODO
BQ_LOCATION              = 'US'              # TODO
DOC_AI_LOCATION          = "us"              # Format is "us" or "eu"

# VPC network (TODO: public endpoints)
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network" # TODO

print(f"PROJECT_ID       : {PROJECT_ID}")
print(f"PROJECT_NUM      : {PROJECT_NUM}")
print(f"LOCATION         : {LOCATION}")
print(f"REGION           : {REGION}")
print(f"BQ_LOCATION      : {BQ_LOCATION}")
print(f"DOC_AI_LOCATION  : {DOC_AI_LOCATION}")
print(f"VPC_NETWORK_NAME : {VPC_NETWORK_NAME}")

PROJECT_ID       : hybrid-vertex
PROJECT_NUM      : 934903580331
LOCATION         : us-central1
REGION           : us-central1
BQ_LOCATION      : US
DOC_AI_LOCATION  : us
VPC_NETWORK_NAME : ucaip-haystack-vpc-network


In [2]:
import pandas as pd
import numpy as np

from google.cloud import aiplatform as vertex_ai
from google.cloud import storage
from google.cloud import bigquery

In [3]:
# cloud storage client
storage_client = storage.Client(project=PROJECT_ID)

# Vertex client
vertex_ai.init(project=PROJECT_ID, location=LOCATION)

# bigquery client
bqclient = bigquery.Client(
    project=PROJECT_ID,
    # location=LOCATION
)

In [4]:
# create new BQ datasets, tables, etc.?
CREATE_NEW_ASSETS         = True            # TODO: True | False

USE_CASE                  = 'invoice-recon'  # TODO
ACTOR_PREFIX              = 'jts'            # TODO
VERSION                   = 'v1'             # TODO

print(f"CREATE_NEW_ASSETS : {CREATE_NEW_ASSETS}")
print(f"ACTOR_PREFIX      : {ACTOR_PREFIX}")
print(f"VERSION           : {VERSION}")
print(f"USE_CASE          : {USE_CASE}")

CREATE_NEW_ASSETS : True
ACTOR_PREFIX      : jts
VERSION           : v1
USE_CASE          : invoice-recon


In [5]:
BUCKET_NAME              = f'a-{ACTOR_PREFIX}-{VERSION}-{USE_CASE}'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# bigquery dataset
MY_BQ_DATASET            = BUCKET_NAME.lower().replace(PROJECT_ID,"").replace("-","_").rstrip("_")

# vpc network
VPC_NETWORK_FULL         = f"projects/{PROJECT_NUM}/global/networks/{VPC_NETWORK_NAME}"

print(f"BUCKET_NAME        : {BUCKET_NAME}")
print(f"BUCKET_URI         : {BUCKET_URI}")
print(f"MY_BQ_DATASET      : {MY_BQ_DATASET}")
print(f"VPC_NETWORK_FULL   : {VPC_NETWORK_FULL}")

BUCKET_NAME        : a-jts-v1-invoice-recon
BUCKET_URI         : gs://a-jts-v1-invoice-recon
MY_BQ_DATASET      : a_jts_v1_invoice_recon
VPC_NETWORK_FULL   : projects/934903580331/global/networks/ucaip-haystack-vpc-network


In [6]:
if CREATE_NEW_ASSETS:
    ! gsutil mb -l $LOCATION $BUCKET_URI

Creating gs://a-jts-v1-invoice-recon/...


In [7]:
!gsutil ls $BUCKET_URI

In [8]:
if CREATE_NEW_ASSETS:
    ds = bigquery.Dataset(f"{PROJECT_ID}.{MY_BQ_DATASET}")
    ds.location = 'us' #Multi-region is REGION[0:2]
    ds = bqclient.create_dataset(dataset = ds, exists_ok = False)

    print(ds.full_dataset_id)

hybrid-vertex:a_jts_v1_invoice_recon


In [10]:
config = f"""
PROJECT_ID               = \"{PROJECT_ID}\"
PROJECT_NUM              = \"{PROJECT_NUM}\"
LOCATION                 = \"{LOCATION}\"

REGION                   = \"{REGION}\"
BQ_LOCATION              = \"{BQ_LOCATION}\"
DOC_AI_LOCATION          = \"{DOC_AI_LOCATION}\"
VPC_NETWORK_NAME         = \"{VPC_NETWORK_NAME}\"

ACTOR_PREFIX             = \"{ACTOR_PREFIX}\"
VERSION                  = \"{VERSION}\"

BUCKET_NAME              = \"{BUCKET_NAME}\"
BUCKET_URI               = \"{BUCKET_URI}\"

VPC_NETWORK_FULL         = \"{VPC_NETWORK_FULL}\"

MY_BQ_DATASET            = \"{MY_BQ_DATASET}\"
"""
print(config)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
DOC_AI_LOCATION          = "us"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

ACTOR_PREFIX             = "jts"
VERSION                  = "v1"

BUCKET_NAME              = "a-jts-v1-invoice-recon"
BUCKET_URI               = "gs://a-jts-v1-invoice-recon"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

MY_BQ_DATASET            = "a_jts_v1_invoice_recon"



In [11]:
!echo '{config}' | gsutil cp - {BUCKET_URI}/config/notebook_env.py

Copying from <STDIN>...
/ [1 files][    0.0 B/    0.0 B]                                                
Operation completed over 1 objects.                                              


In [12]:
!gsutil ls $BUCKET_URI

gs://a-jts-v1-invoice-recon/config/


## Enable APIs

In [None]:
# DocAI
!gcloud services enable documentai.googleapis.com

# Vertex AI 
!gcloud services enable aiplatform.googleapis.com

### do these in notebook terminal

In [None]:
# pip install --upgrade google-cloud-documentai-toolbox --user

# should install
# google-cloud-documentai==2.18.0
# google-cloud-documentai-toolbox==0.10.0a0

In [None]:
# pip install google-cloud-contentwarehouse

In [None]:
# pip install PyPDF2

## git ignore

In [13]:
%%writefile .gitignore
*.cpython-310.pyc
*checkpoint.ipynb
*.ipynb_checkpoints/*
*__pycache__
# *cpython-37.pyc
# .gitignore
# .DS_Store
*jt-wip*

Writing .gitignore


## gcloudignore

In [17]:
! gcloud config set gcloudignore/enabled true

Updated property [gcloudignore/enabled].


In [None]:
# %%writefile .gcloudignore
# .gcloudignore
# *.pkl
# *.png
# *.pdf
# *.jpg
# *.ipynb
# .git
# .github
# .ipynb_checkpoints/*
# *__pycache__
# *cpython-37.pyc