## Set up authentication for accessing Google Cloud Platform (GCP)

In [2]:
import os
import sys
def set_gcp_credentials_from_file(credential_path):
    """Register a service account key file via GOOGLE_APPLICATION_CREDENTIALS.

    Sets the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable of the
    current process to ``credential_path`` and prints a confirmation message.

    Parameters:
    - credential_path: str or os.PathLike. Location of the service account
      key file.

    Raises:
    - FileNotFoundError: if ``credential_path`` does not point to an existing
      regular file (a missing path or a directory).
    """
    # os.environ only stores str values, so normalise pathlib.Path inputs first.
    credential_path = os.fspath(credential_path)

    # isfile rather than exists: a directory is not a usable key file and
    # should be rejected here instead of failing later inside a client library.
    if not os.path.isfile(credential_path):
        raise FileNotFoundError(f"Service account key file not found at: {credential_path}")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credential_path
    print("GCP credentials have been set successfully.")

# Path to your service account key file.
# A GOOGLE_APPLICATION_CREDENTIALS value already present in the environment takes
# precedence, so the notebook can run on another machine without editing this cell;
# the literal below is only the fallback (an unset or empty variable selects it).
GOOGLE_APPLICATION_CREDENTIALS_PATH = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") or (
    "/Users/zacharynguyen/Documents/GitHub/2024/Applied-Generative-AI/IAM/zacharynguyen-genai-656c475b142a.json"
)

try:
    set_gcp_credentials_from_file(GOOGLE_APPLICATION_CREDENTIALS_PATH)
except FileNotFoundError as e:
    print(e)
    sys.exit(1)  # Abort with a non-zero exit code if the key file is not found


GCP credentials have been set successfully.


## Install Libraries and enable APIs

In [3]:
#!pip install -r requirements.txt 
#!gcloud auth login

In [4]:
#!gcloud services enable aiplatform.googleapis.com
#!gcloud services enable documentai.googleapis.com

## Set Up Variables

In [5]:
# Configuration for Project Environment and Data Handling


def derive_bq_location(region):
    """Return the BigQuery location to use for a given cloud region name.

    Regions whose name starts with 'us' or 'eu' map to that two-letter
    multi-region code (e.g. 'us-central1' -> 'us', 'europe-west4' -> 'eu').
    Any other region is returned unchanged, because a bare two-letter prefix
    such as 'as' (from 'asia-east1') is not a BigQuery location.

    Parameters:
    - region: str. A region name such as 'us-central1'.

    Returns:
    - str. The multi-region code for US/EU regions, otherwise `region` itself.
    """
    prefix = region[:2]
    return prefix if prefix in ('us', 'eu') else region


# Project and Data Analysis Settings
PROJECT_ID = 'zacharynguyen-genai'
REGION = 'us-central1'
EXPERIMENT = 'cigna-handbook'
SERIES = 'applied-genai-v3'

# Data Storage and Retrieval Configuration
SAVE_IN = 'ALL'  # Options: GCS, BQ, ALL
RETRIEVE_FROM = 'GCS'  # Options: GCS, BQ. Default action is to parse and embed if not present.

# Google Cloud Storage (GCS) Setup
GCS_BUCKET = PROJECT_ID  # Naming the bucket after the project ID for consistency

# BigQuery (BQ) Setup for Storing Results
BQ_PROJECT = PROJECT_ID
BQ_DATASET = SERIES.replace('-', '_')  # Formatting to comply with BQ naming conventions
BQ_TABLE = EXPERIMENT
BQ_REGION = derive_bq_location(REGION)  # Multi-region code for US/EU regions, else the region itself

# Document Source Configuration
# Specify the locations of source documents to be processed
source_documents = [
    'https://www.tn.gov/content/dam/tn/partnersforhealth/documents/cigna_member_handbook_2024.pdf'
]

# Prior Run Handling
# Determines whether to use data from a previous run based on the USE_PRIOR_RUN flag
USE_PRIOR_RUN = True  # Boolean flag to indicate preference for reusing previous data when available

# Initial Analysis Query
# Defining the first question to guide the analysis or processing of the document
question = "What situations are considered emergencies and covered if I visit the ER?"

# Note: Ensure that environment variables and necessary permissions are correctly set for GCS and BQ access.


In [7]:
def print_configuration():
    """Print the notebook's configuration values as an aligned `label: value` listing.

    Reads the module-level configuration constants and writes one line per
    setting to stdout, with every label left-justified to 25 characters.
    """
    labelled_values = [
        ("Project ID", PROJECT_ID),
        ("Region", REGION),
        ("Experiment", EXPERIMENT),
        ("Series", SERIES),
        ("Data Storage Option", SAVE_IN),
        ("Data Retrieval Source", RETRIEVE_FROM),
        ("GCS Bucket Name", GCS_BUCKET),
        ("BigQuery Project", BQ_PROJECT),
        ("BigQuery Dataset", BQ_DATASET),
        ("BigQuery Table", BQ_TABLE),
        ("BigQuery Region", BQ_REGION),
        # Several source documents are shown on one line, comma-separated
        ("Source Document(s)", ', '.join(source_documents)),
        ("Use Prior Run", USE_PRIOR_RUN),
        ("Initial Question", question),
    ]

    print("Configuration Details:\n")
    for label, setting in labelled_values:
        print(f"{label:<25}: {setting}")


# Show the active configuration
print_configuration()


Configuration Details:

Project ID               : zacharynguyen-genai
Region                   : us-central1
Experiment               : cigna-handbook
Series                   : applied-genai-v3
Data Storage Option      : ALL
Data Retrieval Source    : GCS
GCS Bucket Name          : zacharynguyen-genai
BigQuery Project         : zacharynguyen-genai
BigQuery Dataset         : applied_genai_v3
BigQuery Table           : cigna-handbook
BigQuery Region          : us
Source Document(s)       : https://www.tn.gov/content/dam/tn/partnersforhealth/documents/cigna_member_handbook_2024.pdf
Use Prior Run            : True
Initial Question         : What situations are considered emergencies and covered if I visit the ER?


## Import Libraries

In [16]:
# Standard library imports (plus `requests` and the Google Cloud NotFound exception) for basic operations and concurrency
import os
import io
import json
import base64
import requests
import concurrent.futures
import time
import asyncio
from google.cloud.exceptions import NotFound

# PDF manipulation and IPython display utilities
import PyPDF2
import IPython

# Imaging libraries for image manipulation
from PIL import Image, ImageFont, ImageDraw
import shapely

# Data manipulation and scientific computing libraries
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Vertex AI for machine learning models and Google Cloud services for storage and processing
import vertexai.language_models  # PaLM and Codey Models
import vertexai.generative_models  # for Gemini Models
from google.cloud import documentai, storage, bigquery
from google.api_core import retry
from google.cloud.exceptions import NotFound


## Create Bucket

In [17]:
def create_gcs_bucket(project_id, bucket_name, region):
    """
    Fetches a Cloud Storage bucket, creating it first if it does not exist.

    Parameters:
    - project_id: str. The GCP project used to build the storage client.
    - bucket_name: str. The name of the bucket to look up or create.
    - region: str. The location passed to bucket creation; not used when the
      bucket already exists.

    Returns:
    - The Bucket object for the existing or newly created bucket.
    """
    client = storage.Client(project=project_id)

    try:
        bucket = client.get_bucket(bucket_name)
    except NotFound:
        # Lookup failed, so the bucket has to be created in the requested location
        bucket = client.create_bucket(client.bucket(bucket_name), location=region)
        print(f"Bucket {bucket.name} created.")
    else:
        print(f"Bucket {bucket.name} already exists.")

    return bucket

In [18]:
# Create GCS Bucket (no-op apart from a message if it already exists; REGION is only used on creation)
create_gcs_bucket(PROJECT_ID, GCS_BUCKET, REGION)

Bucket zacharynguyen-genai already exists.


<Bucket: zacharynguyen-genai>

## Create BigQuery Dataset and Table

In [21]:
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

def create_bq_dataset_and_table(project_id, dataset_id, table_id, region):
    """
    Ensures a BigQuery dataset and a table inside it exist, creating whichever is missing.

    Progress messages and a Cloud Console link to the table are printed along the way.

    Parameters:
    - project_id: str. The GCP project ID.
    - dataset_id: str. The dataset ID within the project.
    - table_id: str. The table ID within the dataset.
    - region: str. The location assigned to the dataset when it has to be created;
      not used if the dataset already exists.

    Returns:
    - The BigQuery Table object for the existing or newly created table.
    """
    client = bigquery.Client(project=project_id)
    ds_ref = bigquery.DatasetReference(project_id, dataset_id)

    # Dataset: look it up first and only create it when the lookup reports NotFound
    try:
        dataset = client.get_dataset(ds_ref)
    except NotFound:
        pending_dataset = bigquery.Dataset(ds_ref)
        pending_dataset.location = region
        dataset = client.create_dataset(pending_dataset)
        print(f"Dataset {dataset.dataset_id} created.")
    else:
        print(f"Dataset {dataset.dataset_id} already exists.")

    # Table: same get-or-create pattern, scoped to the dataset reference above
    tbl_ref = ds_ref.table(table_id)
    try:
        table = client.get_table(tbl_ref)
    except NotFound:
        # A new table is created with a single nullable STRING column
        initial_schema = [bigquery.SchemaField("example_field", "STRING", mode="NULLABLE")]
        table = client.create_table(bigquery.Table(tbl_ref, schema=initial_schema))
        print(f"Table {table.table_id} created.")
    else:
        print(f"Table {table.table_id} already exists.")

    # Print a direct Cloud Console link to the table for easy access
    console_url = f"https://console.cloud.google.com/bigquery?project={project_id}&p={project_id}&d={dataset_id}&t={table_id}&page=table"
    print(f"Access your table directly: {console_url}")

    return table


In [23]:
# Create BigQuery Dataset and Table (each is created only if missing; BQ_REGION applies only when the dataset is created)
create_bq_dataset_and_table(PROJECT_ID, BQ_DATASET, BQ_TABLE, BQ_REGION)

Dataset applied_genai_v3 already exists.
Table cigna-handbook created.
Access your table directly: https://console.cloud.google.com/bigquery?project=zacharynguyen-genai&p=zacharynguyen-genai&d=applied_genai_v3&t=cigna-handbook&page=table


Table(TableReference(DatasetReference('zacharynguyen-genai', 'applied_genai_v3'), 'cigna-handbook'))