In [None]:
! pip install --upgrade -r 'requirements.txt' --user

In [None]:
import os 

# Restart the kernel unless IS_TESTING is set in the environment
# (presumably so the packages installed by the previous cell become importable).
if not os.getenv("IS_TESTING"):
    from IPython import Application

    kernel_app = Application.instance()
    # The True argument asks for a restart rather than a plain shutdown.
    kernel_app.kernel.do_shutdown(True)

In [1]:
import os
import sys

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

In [2]:
# Directory that receives config.toml: the "src" folder next to the current
# working directory (i.e. a sibling of the folder the notebook runs from).
workdir_parent = os.path.dirname(os.getcwd())
config_path = os.path.join(workdir_parent, "src")

In [3]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = !gcloud config get-value project
PROJECT_NUMS = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUMS[0]
GOOGLE_CLOUD_PROJECT = GCP_PROJECTS[0]
GOOGLE_CLOUD_LOCATION = "us-central1"
GOOGLE_CLOUD_LOCATION_MULTI_REGION = "us"
GOOGLE_CLOUD_GCS_BUCKET = f"{GOOGLE_CLOUD_PROJECT}-bucket"
GOOGLE_CLOUD_GCS_BUCKET_MULTI_REGION = f"{GOOGLE_CLOUD_PROJECT}-bucket-multi-us"
GOOGLE_CLOUD_SERVICE_ACCOUNT = f"{PROJECT_NUM}-compute@developer.gserviceaccount.com"
GOOGLE_GEMINI_MODEL_15 = "gemini-1.5-pro"
GOOGLE_GEMINI_MODEL_10 = "gemini-1.0-pro"
GOOGLE_CLOUD_BIGQUERY_PROJECT = GOOGLE_CLOUD_PROJECT
GOOGLE_CLOUD_BIGQUERY_DATASET = "rca_data"
GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION = "rca_data_us"
BASE_TABLE_NAME_EVENTS = "telco_rca_events"
BASE_TABLE_NAME_INCIDENTS = "telco_rca_incidents"

In [None]:
# Document AI client pointed at the endpoint for the multi-region location,
# which is also where the processor is looked up / created below.
client = documentai.DocumentProcessorServiceClient(
    client_options=ClientOptions(
        api_endpoint=f"{GOOGLE_CLOUD_LOCATION_MULTI_REGION}-documentai.googleapis.com"
    )
)

DOC_AI_PROCESSOR_DISPLAY_NAME = "rca_processor"
DOC_AI_PROCESSOR_TYPE = "OCR_PROCESSOR"
processor_parent = client.common_location_path(
    GOOGLE_CLOUD_PROJECT, GOOGLE_CLOUD_LOCATION_MULTI_REGION
)

# Reuse a processor left over from an earlier run instead of always creating a
# new one: re-running this cell would otherwise attempt to create a second
# processor with the same display name, and a failure here leaves
# DOC_AI_PROCESSOR_URI undefined for the cells that follow.
processor = next(
    (
        existing
        for existing in client.list_processors(parent=processor_parent)
        if existing.display_name == DOC_AI_PROCESSOR_DISPLAY_NAME
        and existing.type_ == DOC_AI_PROCESSOR_TYPE
    ),
    None,
)
if processor is None:
    processor = client.create_processor(
        parent=processor_parent,
        processor=documentai.Processor(
            display_name=DOC_AI_PROCESSOR_DISPLAY_NAME,
            type_=DOC_AI_PROCESSOR_TYPE,
        ),
    )

# Full resource name of the processor; written to config.toml later on.
DOC_AI_PROCESSOR_URI = processor.name

In [4]:
# Settings grouped by TOML table name, in the order they are written out.
config_sections = {
    "GCP": {
        "GOOGLE_CLOUD_PROJECT": GOOGLE_CLOUD_PROJECT,
        "GOOGLE_CLOUD_LOCATION": GOOGLE_CLOUD_LOCATION,
        "GOOGLE_CLOUD_LOCATION_MULTI_REGION": GOOGLE_CLOUD_LOCATION_MULTI_REGION,
        "GOOGLE_CLOUD_GCS_BUCKET": GOOGLE_CLOUD_GCS_BUCKET,
        "GOOGLE_CLOUD_GCS_BUCKET_MULTI_REGION": GOOGLE_CLOUD_GCS_BUCKET_MULTI_REGION,
        "GOOGLE_CLOUD_SERVICE_ACCOUNT": GOOGLE_CLOUD_SERVICE_ACCOUNT,
    },
    "VERTEX": {
        "GOOGLE_GEMINI_MODEL_15": GOOGLE_GEMINI_MODEL_15,
        "GOOGLE_GEMINI_MODEL_10": GOOGLE_GEMINI_MODEL_10,
    },
    "BIGQUERY": {
        "GOOGLE_CLOUD_BIGQUERY_PROJECT": GOOGLE_CLOUD_BIGQUERY_PROJECT,
        "GOOGLE_CLOUD_BIGQUERY_DATASET": GOOGLE_CLOUD_BIGQUERY_DATASET,
        "GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION": GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION,
        "BASE_TABLE_NAME_EVENTS": BASE_TABLE_NAME_EVENTS,
        "BASE_TABLE_NAME_INCIDENTS": BASE_TABLE_NAME_INCIDENTS,
    },
    "DOC_AI": {
        "DOC_AI_PROCESSOR_URI": DOC_AI_PROCESSOR_URI,
    },
}

# Render as "[SECTION]" headers followed by `KEY = "value"` lines, joined with
# newlines and without a trailing newline.
config = "\n".join(
    line
    for section_name, settings in config_sections.items()
    for line in (
        f"[{section_name}]",
        *(f'{key} = "{value}"' for key, value in settings.items()),
    )
)

In [5]:
# Write the rendered settings to <config_path>/config.toml from Python instead
# of piping them through `echo`: a quote or backslash inside a value can no
# longer be mangled by the shell, and a failed write raises an exception rather
# than letting the notebook carry on. The trailing newline matches what `echo`
# used to append.
with open(os.path.join(config_path, "config.toml"), "w", encoding="utf-8") as config_file:
    config_file.write(f"{config}\n")

In [None]:
! gsutil mb -l {GOOGLE_CLOUD_LOCATION} gs://{GOOGLE_CLOUD_GCS_BUCKET}
! gsutil mb -l {GOOGLE_CLOUD_LOCATION_MULTI_REGION} gs://{GOOGLE_CLOUD_GCS_BUCKET_MULTI_REGION}
! bq --location={GOOGLE_CLOUD_LOCATION} mk --dataset {GOOGLE_CLOUD_BIGQUERY_PROJECT}:{GOOGLE_CLOUD_BIGQUERY_DATASET}
! bq --location={GOOGLE_CLOUD_LOCATION_MULTI_REGION} mk --dataset {GOOGLE_CLOUD_BIGQUERY_PROJECT}:{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}

In [None]:
! gcloud storage cp gs://telco-rca-lab-public/telco_rca_events.csv .
! gcloud storage cp gs://telco-rca-lab-public/telco_rca_incidents.csv .

In [None]:
! bq --location={GOOGLE_CLOUD_LOCATION} load --autodetect --source_format=CSV {GOOGLE_CLOUD_BIGQUERY_DATASET}.{BASE_TABLE_NAME_EVENTS} telco_rca_events.csv
! bq --location={GOOGLE_CLOUD_LOCATION} load --autodetect --field_delimiter='|' --source_format=CSV {GOOGLE_CLOUD_BIGQUERY_DATASET}.{BASE_TABLE_NAME_INCIDENTS} telco_rca_incidents.csv

In [None]:
!gcloud storage cp gs://telco-rca-lab-public/docs/*.pdf gs://{GOOGLE_CLOUD_GCS_BUCKET_MULTI_REGION}/rca/

In [None]:
! bq mk --connection --location={GOOGLE_CLOUD_LOCATION_MULTI_REGION} --project_id={GOOGLE_CLOUD_BIGQUERY_PROJECT} --connection_type=CLOUD_RESOURCE genai
CONNECTION_SAS= !bq show --format=prettyjson --connection {GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_LOCATION_MULTI_REGION}.genai | jq -r ."cloudResource"."serviceAccountId"
CONNECTION_SA = CONNECTION_SAS[0]
! gcloud projects add-iam-policy-binding '{PROJECT_NUM}' --member='serviceAccount:{CONNECTION_SA}' --role='roles/aiplatform.user' --condition=None
! gcloud projects add-iam-policy-binding '{PROJECT_NUM}' --member='serviceAccount:{CONNECTION_SA}' --role='roles/documentai.viewer' --condition=None
! gcloud projects add-iam-policy-binding '{PROJECT_NUM}' --member='serviceAccount:{CONNECTION_SA}' --role='roles/storage.objectViewer' --condition=None