In [24]:
import os
# Path to the service-account key file used for Google Cloud authentication.
# A GOOGLE_APPLICATION_CREDENTIALS value already present in the environment takes
# precedence, so the notebook also runs on machines where the author's local path
# does not exist; the literal below is only the fallback.
GOOGLE_APPLICATION_CREDENTIALS = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/zacharynguyen/Documents/GitHub/2024/Applied-Generative-AI/IAM/zacharynguyen-genai-656c475b142a.json",
)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS

In [25]:
PROJECT_ID = 'zacharynguyen-genai' # replace with project ID
# Also expose the project ID to the process environment under the name PROJECT_ID.
os.environ["PROJECT_ID"] = PROJECT_ID

---
## Installs and API Enablement

The client packages may need to be installed in this environment.

### Installs (If Needed)

In [26]:
#!pip install -r requirements.txt 


### API Enablement

In [27]:
#!gcloud auth login

In [28]:
#!gcloud services enable aiplatform.googleapis.com
#!gcloud services enable documentai.googleapis.com

---
## Setup

Inputs

In [29]:
# Capture the output lines of `gcloud config get-value project` and display the second one.
# NOTE(review): index 1 presumably skips a leading status line printed by gcloud;
# confirm on your machine, since the position of the project ID depends on what gcloud prints.
project = !gcloud config get-value project
project[1]


'zacharynguyen-genai'

In [30]:
# Project and Experiment Configuration
# NOTE: PROJECT_ID is also assigned in an earlier cell of this notebook with the same value.
PROJECT_ID = 'zacharynguyen-genai'
REGION = 'us-central1'
EXPERIMENT = 'mlb-rules'
SERIES = 'applied-genai-v3'

# Storage Configuration
# Where to save results.
# NOTE(review): presumably one of 'GCS', 'BQ', or 'ALL' (both); confirm against the saving code.
SAVE_IN = 'ALL'
# Source from which to retrieve previous results, if available ('GCS' and 'BQ' are the values checked later)
RETRIEVE_FROM = 'GCS'

# Google Cloud Storage (GCS) Bucket Name
GCS_BUCKET = PROJECT_ID  # Using the project ID as the bucket name for simplicity

# BigQuery (BQ) Configuration
# Uses project ID for BQ project. Dataset and table names are derived from the experiment details
BQ_PROJECT = PROJECT_ID
BQ_DATASET = SERIES.replace('-', '_')  # Replacing dashes with underscores for BQ dataset naming
BQ_TABLE = EXPERIMENT
# NOTE(review): taking the first two characters gives 'us' for 'us-central1'; for other
# region prefixes the result may not be a valid BigQuery location; confirm before changing REGION.
BQ_REGION = REGION[:2]  # Extracts the first two characters from the REGION for BQ region code

# Document Source Configuration
# A list of source document URLs. Supports http:// or gs:// protocols
source_documents = [
    'https://img.mlbstatic.com/mlb-images/image/upload/mlb/wqn5ah4c3qtivwx3jatm.pdf'
]

# Operational Flags
# True: import results of a previous run when one is found in RETRIEVE_FROM.
# False: ignore any previous run and parse again.
# NOTE(review): whether existing data is deleted on a forced run is not visible in this section.
USE_PRIOR_RUN = True

# Initial Question for Processing
question = "How is baseball played?"


In [31]:
def print_configuration():
    """Print a sectioned, human-readable summary of the notebook configuration.

    Reads the module-level settings (project/experiment, storage, BigQuery,
    source documents, USE_PRIOR_RUN and question) and writes them to stdout.
    Takes no arguments and returns None.
    """
    # Either one line per source document, or a single placeholder line.
    if source_documents:
        document_lines = [
            f"  Document {position}: {location}"
            for position, location in enumerate(source_documents, start=1)
        ]
    else:
        document_lines = ["  No source documents specified.\n"]

    prior_run_label = 'Yes' if USE_PRIOR_RUN else 'No'

    # Entries ending in "\n" leave a blank line after the section they close.
    report_lines = [
        "Document Processing Workflow Configuration\n",
        "Project and Experiment Details:",
        f"  Project ID: {PROJECT_ID}",
        f"  Region: {REGION}",
        f"  Experiment: {EXPERIMENT}",
        f"  Series: {SERIES}\n",
        "Storage Configuration:",
        f"  Results Saving Option: {SAVE_IN}",
        f"  Results Retrieval Source: {RETRIEVE_FROM}",
        f"  GCS Bucket: {GCS_BUCKET}\n",
        "BigQuery Configuration:",
        f"  BQ Project: {BQ_PROJECT}",
        f"  BQ Dataset: {BQ_DATASET}",
        f"  BQ Table: {BQ_TABLE}",
        f"  BQ Region: {BQ_REGION}\n",
        "Document Source Details:",
        *document_lines,
        "\nOperational Flags:",
        f"  Use Prior Run: {prior_run_label}",
        "\nInitial Question for Processing:",
        f"  Question: \"{question}\"",
    ]
    print("\n".join(report_lines))

# Show the active configuration
print_configuration()


Document Processing Workflow Configuration

Project and Experiment Details:
  Project ID: zacharynguyen-genai
  Region: us-central1
  Experiment: mlb-rules
  Series: applied-genai-v3

Storage Configuration:
  Results Saving Option: ALL
  Results Retrieval Source: GCS
  GCS Bucket: zacharynguyen-genai

BigQuery Configuration:
  BQ Project: zacharynguyen-genai
  BQ Dataset: applied_genai_v3
  BQ Table: mlb-rules
  BQ Region: us

Document Source Details:
  Document 1: https://img.mlbstatic.com/mlb-images/image/upload/mlb/wqn5ah4c3qtivwx3jatm.pdf

Operational Flags:
  Use Prior Run: Yes

Initial Question for Processing:
  Question: "How is baseball played?"


Packages

In [32]:
import os
import io
import json
import base64
import requests
import concurrent.futures
import time
import asyncio

import PyPDF2
import IPython
import PIL, PIL.ImageFont, PIL.Image, PIL.ImageDraw
import shapely

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

import vertexai.language_models # PaLM and Codey Models
import vertexai.generative_models # for Gemini Models
from google.cloud import documentai
from google.cloud import storage
from google.cloud import bigquery
from google.api_core import retry

### Create GCS Bucket

In [33]:
def create_gcs_bucket(project_id, bucket_name, region):
    """Return the GCS bucket named ``bucket_name``, creating it if it does not exist.

    Parameters:
    - project_id (str): Project used to build the storage client.
    - bucket_name (str): Name of the bucket to fetch or create.
    - region (str): Location passed to ``create_bucket`` when the bucket has to be created.

    Returns:
    - The existing bucket, or the newly created one.
    """
    # Imported locally because this cell runs before the notebook's later
    # `from google.cloud.exceptions import NotFound` cell; without it, a missing
    # bucket would surface as a NameError from the `except` clause on a fresh kernel.
    from google.cloud.exceptions import NotFound

    storage_client = storage.Client(project=project_id)

    # Check if the bucket already exists
    try:
        existing_bucket = storage_client.get_bucket(bucket_name)
    except NotFound:
        # If the bucket does not exist, proceed to create it
        bucket = storage_client.bucket(bucket_name)
        new_bucket = storage_client.create_bucket(bucket, location=region)
        print(f"Bucket {new_bucket.name} created.")
        return new_bucket

    print(f"Bucket {existing_bucket.name} already exists.")
    return existing_bucket

In [34]:
# Ensure the GCS bucket exists: fetched if already present, created in REGION otherwise.
create_gcs_bucket(PROJECT_ID, GCS_BUCKET, REGION)

Bucket zacharynguyen-genai already exists.


<Bucket: zacharynguyen-genai>

### Create dataset and table in BigQuery

In [35]:
def create_bq_dataset_and_table(project_id, dataset_id, table_id, region):
    """Ensure a BigQuery dataset and table exist, creating whichever is missing.

    Parameters:
    - project_id (str): Project that owns the dataset; also used to build the client.
    - dataset_id (str): Dataset to fetch or create.
    - table_id (str): Table to fetch or create inside the dataset.
    - region (str): Location assigned to the dataset, used only when it is created.

    Returns:
    - The existing or newly created table. A newly created table gets a
      placeholder schema with one nullable STRING column, ``example_field``.
    """
    # Imported locally because this cell runs before the notebook's later
    # `from google.cloud.exceptions import NotFound` cell; without it, a missing
    # dataset or table would surface as a NameError on a fresh kernel.
    from google.cloud.exceptions import NotFound

    bq_client = bigquery.Client(project=project_id)

    # Attempt to get the dataset, create if not exists
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
    try:
        dataset = bq_client.get_dataset(dataset_ref)  # Make an API request.
        print(f"Dataset {dataset.dataset_id} already exists.")
    except NotFound:
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = region
        dataset = bq_client.create_dataset(dataset)  # Make an API request.
        print(f"Dataset {dataset.dataset_id} created.")

    # Attempt to get the table, create if not exists
    table_ref = dataset_ref.table(table_id)
    try:
        table = bq_client.get_table(table_ref)  # Make an API request.
        print(f"Table {table.table_id} already exists.")
    except NotFound:
        schema = [
            bigquery.SchemaField("example_field", "STRING", mode="NULLABLE")
        ]
        table = bigquery.Table(table_ref, schema=schema)
        table = bq_client.create_table(table)  # Make an API request.
        print(f"Table {table.table_id} created.")

    return table


In [36]:
# Ensure the BigQuery dataset and table exist; BQ_REGION is only applied if the dataset has to be created.
create_bq_dataset_and_table(PROJECT_ID, BQ_DATASET, BQ_TABLE, BQ_REGION)

Dataset applied_genai_v3 already exists.
Table mlb-rules already exists.


Table(TableReference(DatasetReference('zacharynguyen-genai', 'applied_genai_v3'), 'mlb-rules'))

Clients

In [49]:
from google.cloud import documentai, bigquery, storage
import vertexai

# Initialize Vertex AI with specified project and location
vertexai.init(project=PROJECT_ID, location=REGION)

# Derive the Document AI service location from the REGION variable:
# the first dash-separated token, e.g. 'us-central1' -> 'us'.
LOCATION = REGION.split('-')[0]
# The endpoint must be built from LOCATION; no lowercase `location` name is defined in this notebook.
docai_api_endpoint = f"{LOCATION}-documentai.googleapis.com"

# Initialize the Document AI clients for both synchronous and asynchronous operations
docai_client = documentai.DocumentProcessorServiceClient(client_options={"api_endpoint": docai_api_endpoint})
docai_async_client = documentai.DocumentProcessorServiceAsyncClient(client_options={"api_endpoint": docai_api_endpoint})

# Initialize the BigQuery client for data analytics operations
bq = bigquery.Client(project=PROJECT_ID)

# Initialize the Google Cloud Storage client and retrieve the specified bucket
gcs = storage.Client(project=PROJECT_ID)
bucket = gcs.bucket(GCS_BUCKET)

# Confirmation of clients initialization
print("Clients for Vertex AI, Document AI, BigQuery, and GCS have been initialized successfully.")


Clients for Vertex AI, Document AI, BigQuery, and GCS have been initialized successfully.


---
## Vertex LLM Setup


In [50]:
from vertexai import generative_models, language_models

# --- Gemini ---
# Text model used for generation with Gemini
gemini_text = generative_models.GenerativeModel("gemini-1.0-pro")

# --- PaLM ---
# Embedding model: dense vector representations of text
textembed_model = language_models.TextEmbeddingModel.from_pretrained('textembedding-gecko')

# Text generation models, all loaded through the same factory method
_load_palm_text_model = language_models.TextGenerationModel.from_pretrained
text_model_b = _load_palm_text_model('text-bison')        # standard Bison
text_model_b32 = _load_palm_text_model('text-bison-32k')  # Bison variant supporting up to 32k tokens
text_model_u = _load_palm_text_model('text-unicorn')      # Unicorn

# Report which notebook variables now hold the initialized models
print("\n".join([
    "Vertex AI models initialized successfully:",
    "- Gemini Text Model: gemini_text",
    "- PaLM Text Embedding Model: textembed_model",
    "- PaLM Text Generation Models: text_model_b, text_model_b32, text_model_u",
]))


Vertex AI models initialized successfully:
- Gemini Text Model: gemini_text
- PaLM Text Embedding Model: textembed_model
- PaLM Text Generation Models: text_model_b, text_model_b32, text_model_u


### Prompt

In [51]:
# Display the prompt that is sent to each model below.
question

'How is baseball played?'

### Embeddings

In [52]:
# Embed the question and display the first five values of the resulting vector.
textembed_model.get_embeddings([question])[0].values[0:5]

[0.02588232420384884,
 -0.03225235641002655,
 -0.01948062889277935,
 0.01719886064529419,
 -0.002375800861045718]

### Generation: PaLM `text-bison`

In [53]:
# Display the prompt again before generating with text-bison.
question

'How is baseball played?'

In [54]:
# Send the question to text-bison (text_model_b) with default generation settings and display the response.
response = text_model_b.predict(question)
response

 **Objective:**
The objective of baseball is to score runs by hitting the ball and advancing around the bases. The team with the most runs at the end of the game wins.

**Gameplay:**
- A baseball game is played between two teams, each with nine players on the field.
- The game is divided into nine innings, each consisting of two halves: the top of the inning when the visiting team bats, and the bottom of the inning when the home team bats.
- Each team takes turns batting and fielding. The batting team tries to hit the ball into fair territory (between the foul lines) and advance around

In [55]:
# Display the safety attributes attached to the latest response.
response.safety_attributes

{'Derogatory': 0.1, 'Insult': 0.1, 'Sexual': 0.1}

In [56]:
# Same prompt and model, this time with max_output_tokens set to 500.
response = text_model_b.predict(question, max_output_tokens = 500)
response

 **Objective:**
The objective of baseball is to score runs by hitting the ball and advancing around the bases. The team with the most runs at the end of the game wins.

**Gameplay:**
- A baseball game is played between two teams, each with nine players on the field.
- The game is divided into nine innings, each consisting of two halves: the top of the inning when the visiting team bats, and the bottom of the inning when the home team bats.
- Each team takes turns batting and fielding. The batting team tries to hit the ball into fair territory (between the foul lines) and advance around the bases. The fielding team tries to prevent the batters from reaching base and to get them out.

**Basic Rules:**
- A player can advance around the bases by hitting the ball and reaching first base, or by being awarded a walk (four balls outside the strike zone).
- A player can also advance around the bases by stealing a base (running to the next base while the pitcher is delivering the ball).
- A play

In [57]:
# Render the response text as Markdown rather than showing the raw repr.
IPython.display.Markdown(response.text)

 **Objective:**
The objective of baseball is to score runs by hitting the ball and advancing around the bases. The team with the most runs at the end of the game wins.

**Gameplay:**
- A baseball game is played between two teams, each with nine players on the field.
- The game is divided into nine innings, each consisting of two halves: the top of the inning when the visiting team bats, and the bottom of the inning when the home team bats.
- Each team takes turns batting and fielding. The batting team tries to hit the ball into fair territory (between the foul lines) and advance around the bases. The fielding team tries to prevent the batters from reaching base and to get them out.

**Basic Rules:**
- A player can advance around the bases by hitting the ball and reaching first base, or by being awarded a walk (four balls outside the strike zone).
- A player can also advance around the bases by stealing a base (running to the next base while the pitcher is delivering the ball).
- A player scores a run when they touch home plate after advancing around all the bases.
- The pitcher throws the ball from the pitcher's mound to the catcher, who is positioned behind home plate. The batter stands at home plate and tries to hit the ball with a bat.
- If the batter hits the ball into fair territory, they can run to first base. If they reach first base safely, they can try to advance to second base, third base, and home plate.
- The fielders try to catch the ball before it hits the ground or to throw the ball to a fielder who can catch it. If a fielder catches the ball before it hits the ground, the batter is out.
- If the batter hits the ball out of the park (over the fence), they hit a home run and score a run.
- The game ends after nine innings, or after extra innings if the score is tied. The team with the most runs at the end of the game wins.

**Scoring:**
- A team scores a run when a player touches home plate after advancing around all the bases.
- A player can score a run by hitting a home run, by being driven in (batted in) by another player, or by advancing around the bases on an error by the fielding team.

**Outs:**
- A batter is out if they:
  -

### Generation: PaLM `text-unicorn`

In [58]:
# Send the question to text-unicorn (text_model_u) with default generation settings and display the response.
response = text_model_u.predict(question)
response

Baseball is a bat-and-ball game played between two teams of nine players each. The game is played on a field with four bases arranged in a diamond shape. The objective of the game is to score runs by hitting the ball and running around the bases.

The game begins with one team batting and the other team fielding. The pitcher throws the ball to the batter, who tries to hit it with the bat. If the batter hits the ball, they run to first base. If they reach first base safely, they can continue running to second, third, and home base. If they reach home base, they score

In [59]:
# Render the text-unicorn response text as Markdown.
IPython.display.Markdown(response.text)

Baseball is a bat-and-ball game played between two teams of nine players each. The game is played on a field with four bases arranged in a diamond shape. The objective of the game is to score runs by hitting the ball and running around the bases.

The game begins with one team batting and the other team fielding. The pitcher throws the ball to the batter, who tries to hit it with the bat. If the batter hits the ball, they run to first base. If they reach first base safely, they can continue running to second, third, and home base. If they reach home base, they score

In [60]:
# Reuse a previous parsing run when one is allowed (USE_PRIOR_RUN) and available in RETRIEVE_FROM.
# PRIOR_PARSE is always set; files_pages and files_pages_chunks are loaded only when a run is imported.
# NOTE: bq_table_check is defined in a later cell of this notebook, so that cell must have been
# run before this one whenever RETRIEVE_FROM == 'BQ'. It takes the BigQuery client as its first
# argument, so the `bq` client is passed explicitly.
if USE_PRIOR_RUN == False:
    PRIOR_PARSE = False
    
    # do a check for prior run and present message if found letting user know the prior result exists but not being used
    if RETRIEVE_FROM == 'GCS' and len(list(bucket.list_blobs(prefix = f'{SERIES}/{EXPERIMENT}/files_pages.json'))) > 0:
        print(f'Previous results exists in GCS but forcing the creation of new parsing with USE_PRIOR_RUN = {USE_PRIOR_RUN}')
    elif RETRIEVE_FROM == 'BQ' and bq_table_check(bq, f'{BQ_DATASET}.{BQ_TABLE}_files_pages'):
        print(f'Previous results exists in BQ but forcing the creation of new parsing with USE_PRIOR_RUN = {USE_PRIOR_RUN}')

elif RETRIEVE_FROM == 'GCS' and len(list(bucket.list_blobs(prefix = f'{SERIES}/{EXPERIMENT}/files_pages.json'))) > 0:
    print('Importing previous run from GCS')

    # load files_pages: the file+page level information including docai responses in `parsing`
    # (the blob is read as one JSON document per line)
    blob = bucket.blob(f'{SERIES}/{EXPERIMENT}/files_pages.json')
    files_pages = [json.loads(line) for line in blob.download_as_text().splitlines()]
    
    # load files_pages_chunks: the chunks parsed from the files+pages
    blob = bucket.blob(f'{SERIES}/{EXPERIMENT}/files_pages_chunks.json')
    files_pages_chunks = [json.loads(line) for line in blob.download_as_text().splitlines()]
    
    # Set Indicator to prevent redoing the parsing later in this notebook
    PRIOR_PARSE = True

elif RETRIEVE_FROM == 'BQ' and bq_table_check(bq, f'{BQ_DATASET}.{BQ_TABLE}_files_pages'):
    print('Importing previous run from BigQuery')

    # load files_pages: the file+page level information including docai responses in `parsing`
    files_pages = bq.query(f'SELECT * FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_files_pages` ORDER BY file_index, page_index').to_dataframe().to_dict('records')
    # convert json string to dictionary:
    for page in files_pages:
        page['parsing'] = json.loads(page['parsing'])
    
    # load files_pages_chunks: the chunks parsed from the files+pages
    files_pages_chunks = bq.query(f'SELECT * FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_files_pages_chunks`').to_dataframe().to_dict('records')
    #convert json string to dictionary:
    for chunk in files_pages_chunks:
        chunk['metadata'] = json.loads(chunk['metadata'])
    # sort chunk by file, page, chunk number:
    files_pages_chunks = sorted(files_pages_chunks, key = lambda x: (x['metadata']['file_index'], x['metadata']['page_index'], x['metadata']['chunk']))

    # Set Indicator to prevent redoing the parsing later in this notebook
    PRIOR_PARSE = True
        
else:
    print('No previous run available to import')
    PRIOR_PARSE = False

Importing previous run from GCS


### Generation: PaLM `text-bison-32k`

In [61]:
# Send the question to text-bison-32k (text_model_b32) with default generation settings and display the response.
response = text_model_b32.predict(question)
response

 **Objective:**
The objective of baseball is to score runs by hitting the ball and advancing around the bases. The team with the most runs at the end of the game wins.

**Gameplay:**
- A baseball game is played between two teams, each with nine players on the field.
- The game is divided into nine innings, each consisting of two halves: the top of the inning when the visiting team bats, and the bottom of the inning when the home team bats.
- Each team takes turns batting and fielding. The batting team tries to hit the ball into fair territory (between the foul lines) and advance around

In [67]:
# Render the current response text as Markdown.
# NOTE(review): the saved output of this cell does not match the response shown by the
# preceding cell, and the execution counts jump (61 -> 67), so `response` was presumably
# reassigned in between; re-run the notebook top to bottom to confirm.
IPython.display.Markdown(response.text)

**Objective:**

The goal of baseball is for one team (the offense) to score more runs than the other team (the defense) by hitting a ball and running around bases.

**Setup and Equipment:**

* **Field:** A baseball field is diamond shaped with four bases: home, first, second and third.
* **Ball and Bat:** The baseball is a small, leather-covered sphere. The bat is made of wood or metal.

**Inning and Game Play:**

* A baseball game consists of nine innings. Each inning is divided into two parts, a top half and a bottom half.
* The offense (batting team) takes the field first in the top of the inning, trying to score runs by hitting the ball.
* The defense (fielding team) positions players around the field to try to prevent runs by catching or fielding the ball.

**Hitting:**

* The pitcher from the defense team throws a ball towards the batter from the offense team.
* If the batter hits a fair ball (within certain lines on the field), they attempt to run to first base.

**Baserunning and Scoring Runs:**

* After a batter hits a fair ball, they run around the bases in order (first, second, third, and finally, home).
* If the runner reaches home plate before three defensive outs (explained below) are made, the offense scores a point (called a "run").

**Outs:**

There are several ways a defensive team can get an "out":
* **Strikeouts (3):** Three missed swings or balls hit outside of the strike zone.
* **Groundouts:** A batter hits the ball on the ground and it is fielded and thrown to first base before the batter can reach it.
* **Flyouts:** A batter hits the ball into the air, and it is caught before it hits the ground.
* **Tagging out a runner:** Touching a runner with a live ball or a glove containing the ball.
* **Force outs (rare):** A runner is forced to advance to the next base due to a batter hitting the ball, and the defense completes a play on that base.

**Pitching:**

* The投手(投手隊的球手)向打擊手投球。目標是在好壞球區(好壞球區)投球，並迫使其揮空或擊出好打的球。
* **好壞球區:**一個想像的盒子狀區域，介於本壘板上方與膝蓋以下、本壘板內緣與外緣間。投手必須將球投進這個區域內。
* **四壞保送 (4):**投手投出四個壞球，打擊者可以自動上到一壘。
* **觸身球:**投手投出的球擊中打擊者，打擊者可自動上到一壘。

**其他規則和策略：**

* **盜壘：**一名跑壘員在投手投球時嘗試從一壘推進到二壘或三壘。
* **野手選擇：**球隊可以選擇不完成對跑者的出局機會，而是選擇將球傳到另一壘，讓其他跑者出界。
* **犧牲打：**打擊者犧牲自己出界，以推進跑壘員上到另一個壘包。
* **策略性走壘（策略性跑壘）：**球隊利用跑壘員的速度、技巧和判斷力，以增加得分機會。

### Generation: Gemini `gemini-1.0-pro`

In [68]:
# Send the question to the Gemini model (gemini_text) and display the raw response object.
response = gemini_text.generate_content(question)
response

candidates {
  content {
    role: "model"
    parts {
      text: "**Objective:**\n\n* To score more runs than the opposing team by advancing runners around the bases and touching home plate (the \"plate\").\n\n**Equipment:**\n\n* Baseball (a hard leather ball)\n* Baseball bat\n* Baseball glove\n* Baseball field (with four bases)\n\n**Gameplay:**\n\n**Inning:**\n* A game consists of nine innings, with each inning divided into two halves: the top and bottom of the inning.\n\n**Top of the Inning:**\n* The visiting team bats while the home team takes the field.\n* The visiting team\'s goal is to score runs.\n\n**Bottom of the Inning:**\n* The home team bats while the visiting team takes the field.\n* The home team\'s goal is to prevent the visiting team from scoring more runs than them.\n\n**Gameplay within an Inning:**\n\n**Pitching:**\n* The pitcher on the fielding team throws the ball over home plate towards the batter on the batting team.\n\n**Batting:**\n* The batter attempts to hit

In [69]:
# Print the response text as plain text.
print(response.text)

**Objective:**

* To score more runs than the opposing team by advancing runners around the bases and touching home plate (the "plate").

**Equipment:**

* Baseball (a hard leather ball)
* Baseball bat
* Baseball glove
* Baseball field (with four bases)

**Gameplay:**

**Inning:**
* A game consists of nine innings, with each inning divided into two halves: the top and bottom of the inning.

**Top of the Inning:**
* The visiting team bats while the home team takes the field.
* The visiting team's goal is to score runs.

**Bottom of the Inning:**
* The home team bats while the visiting team takes the field.
* The home team's goal is to prevent the visiting team from scoring more runs than them.

**Gameplay within an Inning:**

**Pitching:**
* The pitcher on the fielding team throws the ball over home plate towards the batter on the batting team.

**Batting:**
* The batter attempts to hit the pitched ball with their bat.

**Hitting:**
* If the batter successfully hits the ball, they drop 

In [70]:
# Render the same response text as Markdown.
IPython.display.Markdown(response.text)

**Objective:**

* To score more runs than the opposing team by advancing runners around the bases and touching home plate (the "plate").

**Equipment:**

* Baseball (a hard leather ball)
* Baseball bat
* Baseball glove
* Baseball field (with four bases)

**Gameplay:**

**Inning:**
* A game consists of nine innings, with each inning divided into two halves: the top and bottom of the inning.

**Top of the Inning:**
* The visiting team bats while the home team takes the field.
* The visiting team's goal is to score runs.

**Bottom of the Inning:**
* The home team bats while the visiting team takes the field.
* The home team's goal is to prevent the visiting team from scoring more runs than them.

**Gameplay within an Inning:**

**Pitching:**
* The pitcher on the fielding team throws the ball over home plate towards the batter on the batting team.

**Batting:**
* The batter attempts to hit the pitched ball with their bat.

**Hitting:**
* If the batter successfully hits the ball, they drop their bat and try to run to first base before the fielders can retrieve the ball and tag them out or throw the ball to a fielder who can tag them out.

**Running the Bases:**
* Once the batter hits the ball, they try to run around the bases in order: first, second, third, and home plate.
* Runners on base can advance by stealing bases or by being driven in by a batter who hits the ball into the outfield.

**Fielding:**
* The fielders try to retrieve the hit ball and either throw the ball to a base to prevent runners from advancing or to tag runners out by touching them with the ball.

**Outs:**
* Three outs mark the end of an inning for the batting team. Outs can occur in the following ways:
    * Strikeout: The batter misses three pitches.
    * Caught out: The fielder catches the ball before it hits the ground.
    * Force out: A runner is forced to run to the next base because a preceding runner is already on that base.
    * Tag out: A fielder tags the runner with the ball before the runner reaches a base.

**Runs:**
* A run is scored when a runner successfully touches home plate after having advanced around all four bases.

**Winning:**
* The team with the most runs at the end of the game wins.

---
## Retrieve Files From Previous Run on GCS Or BigQuery

This uses the input parameter set above: `RETRIEVE_FROM`.  If it is set to `BQ` or `GCS` then it will check the source for an available prior run and retrieve it if it exists.


Function to check for the existence of a BigQuery table:

In [71]:
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

def bq_table_check(bq_client, full_table_id):
    """
    Report whether a BigQuery table can be fetched with the given client.

    Parameters:
    - bq_client (bigquery.Client): Client used to look the table up.
    - full_table_id (str): Table ID in the format 'dataset.table_name'.

    Returns:
    - bool: True when ``get_table`` succeeds, False when it raises NotFound.
      Any other exception propagates to the caller.
    """
    try:
        bq_client.get_table(full_table_id)
    except NotFound:
        return False
    else:
        return True

# Example usage: look up both result tables of this experiment
bq_client = bigquery.Client(project=PROJECT_ID)

# Table IDs in 'dataset.table_name' form
pages_table_id = f'{BQ_DATASET}.{BQ_TABLE}_files_pages'
chunks_table_id = f'{BQ_DATASET}.{BQ_TABLE}_files_pages_chunks'

table_exists = bq_table_check(bq_client, pages_table_id)
chunks_table_exists = bq_table_check(bq_client, chunks_table_id)

print(f"Table exists: {table_exists}, Chunks table exists: {chunks_table_exists}")


Table exists: True, Chunks table exists: True


---
## Get/Create Document AI Processors

Document AI is composed of multiple processors.  In this case the Form Parser is used for its ability to detect and extract tables as well as perform OCR.  For a more thorough review of Document AI processors, including customized parsers, see the [Working With/Document AI](../Working%20With/Document%20AI/readme.md) section of this repository.  That section includes examples of processing documents at larger scales and storing the data for processing and retrieval.

Using the [General Form Processor](https://cloud.google.com/document-ai/docs/processors-list#general_processors).

In [72]:
PARSER_DISPLAY_NAME = 'my_general_processor'
PARSER_TYPE = 'FORM_PARSER_PROCESSOR'
PARSER_VERSION = 'pretrained-form-parser-v2.1-2023-06-26'

# Parent resource under which processors are listed and, if needed, created.
docai_parent = f'projects/{PROJECT_ID}/locations/{LOCATION}'

# Start from None so that a `parser` variable left over from earlier kernel state
# cannot be mistaken for a processor found by this lookup.
parser = None
for p in docai_client.list_processors(parent = docai_parent):
    if p.display_name == PARSER_DISPLAY_NAME:
        parser = p  # if several processors share the display name, the last one listed is kept

if parser is not None:
    print('Retrieved existing parser: ', parser.name)
else:
    # No processor with this display name exists yet: create one
    parser = docai_client.create_processor(
        parent = docai_parent,
        processor = dict(display_name = PARSER_DISPLAY_NAME, type_ = PARSER_TYPE, default_processor_version = PARSER_VERSION)
    )
    print('Created New Parser: ', parser.name)

NameError: name 'LOCATION' is not defined

---
## Get The Documents

Get the source PDF(s) from GCS or a URL and store them as a list of pages for each file: `files_pages`.

Get Location of Files

In [None]:
if not PRIOR_PARSE:
    # Classify every source document by where it lives: 'URL', 'GCS' or 'UNKNOWN'.
    # The resulting list is aligned by index with source_documents.
    document_locations = []
    for source_document in source_documents:
        if source_document.startswith('http'):
            location_kind = 'URL'
            location_message = f'Use requests to get online document: {source_document}'
        elif source_document.startswith('gs'):
            location_kind = 'GCS'
            location_message = f'Use GCS to get document in GCS: {source_document}'
        else:
            location_kind = 'UNKNOWN'
            location_message = f'The source_document variable points to a document in an unknown location type (not gs:// or http): {source_document}'
        document_locations.append(location_kind)
        print(location_message)
else:
    print('Using Prior Results')

Import the PDF to memory as bytes:

In [None]:
if PRIOR_PARSE:
    print('Using Prior Results')
else:
    # Raw bytes of each source document, aligned by index with source_documents;
    # None for documents whose location type is unknown.
    imported_documents = []
    for s, source_document in enumerate(source_documents):
        if document_locations[s] == 'URL':
            # Fail on HTTP errors instead of storing an error page as if it were a PDF;
            # the timeout keeps an unresponsive host from hanging the notebook.
            http_response = requests.get(source_document, timeout = 60)
            http_response.raise_for_status()
            imported_documents.append(http_response.content)
        elif document_locations[s] == 'GCS':
            if not source_document.startswith('gs://'):
                raise ValueError(f'Expected a gs://<bucket>/<object> URI, got: {source_document}')
            # Split gs://<bucket>/<object> so that objects in any bucket can be read,
            # not only those stored in GCS_BUCKET.
            source_bucket_name, _, source_blob_name = source_document[len('gs://'):].partition('/')
            blob = gcs.bucket(source_bucket_name).blob(source_blob_name)
            imported_documents.append(blob.download_as_bytes())
        elif document_locations[s] == 'UNKNOWN':
            imported_documents.append(None)

Convert from bytes to PDF:

In [None]:
if PRIOR_PARSE:
    print('Using Prior Results')
else:
    # One entry per imported document, aligned by index:
    # a PyPDF2 reader over the in-memory bytes, or None when nothing was imported.
    converted_documents = []
    for imported_document in imported_documents:
        if imported_document:
            converted_documents.append(PyPDF2.PdfReader(io.BytesIO(imported_document)))
        else:
            converted_documents.append(None)

Review number of pages per PDF:

In [None]:
if not PRIOR_PARSE:
    # Report the page count of every document that was converted to a PDF reader
    for doc_index, pdf_reader in enumerate(converted_documents):
        if not pdf_reader:
            continue
        page_total = len(pdf_reader.pages)
        print(f"{source_documents[doc_index]} has {page_total} pages")
else:
    print('Using Prior Results')

Split PDF(s) to list of individual pages for each file:

List of dictionaries with keys: file_index, page_index, raw_file_page

In [None]:
if PRIOR_PARSE:
    print('Using Prior Results')
else:
    # list of dictionaries, one per page:
    #   file_index (position in converted_documents), page_index (1-based), raw_file_page (single-page PDF as bytes)
    files_pages = []
    for c, converted_document in enumerate(converted_documents):
        if converted_document:
            for page_num, page in enumerate(converted_document.pages, 1):
                writer = PyPDF2.PdfWriter()
                writer.add_page(page)
                with io.BytesIO() as bytes_stream:
                    # Read the bytes back from the stream itself rather than from
                    # the return value of write(), so the result does not depend on
                    # what a given PyPDF2 release returns from write().
                    writer.write(bytes_stream)
                    files_pages.append(
                        dict(file_index = c, page_index = page_num, raw_file_page = bytes_stream.getvalue())
                    )
len(files_pages)

---
## Parse Documents

Results of:
- [google.cloud.documentai.DocumentProcessorServiceClient().process_document()](https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.services.document_processor_service.DocumentProcessorServiceClient#google_cloud_documentai_v1_services_document_processor_service_DocumentProcessorServiceClient_process_document)
  - are in the format of
    - [google.cloud.documentai_v1.types.ProcessResponse()](https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.ProcessResponse)
      - which contains `.document` in the format of:
        - [google.cloud.documentai_v1.types.Document](https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document)

Converting the Document to:
- JSON with .to_json()
- dictionary with .to_dict()

**Document AI Notes:**
- In this application we are using online processing.  This has a limit of 15 pages per document.  Switching to batch processing increases this to 100 pages for the Form Parser (General).
- Online processing has a default quota of 120 requests per minute per project. The code below implements waiting time to avoid exceeding this limit.
- [Reference](https://cloud.google.com/document-ai/quotas)

In [None]:
async def docai_runner(files_pages, limit_concur_requests = 120):
    """Parse every page in `files_pages` with Document AI and attach the result in place.

    Each entry of `files_pages` must hold the single-page PDF bytes under
    'raw_file_page'.  After the call every entry has a new key 'parsing' holding
    the processed document converted to a dictionary.

    Args:
        files_pages: list of page dictionaries; modified in place.
        limit_concur_requests: maximum number of requests in flight at once.

    Returns:
        None.

    Raises:
        RuntimeError: when a page still fails after 21 attempts (chained to the
            last underlying error).
    """
    max_attempts = 21
    limit = asyncio.Semaphore(limit_concur_requests)
    results = [None] * len(files_pages)

    # make requests - async
    async def make_request(p):

        async with limit:
            if limit.locked():
                await asyncio.sleep(0.01)

            fail_count = 0
            while True:
                try:
                    result = await docai_async_client.process_document(
                        request = dict(
                            raw_document = documentai.RawDocument(
                                content = files_pages[p]['raw_file_page'],
                                mime_type = 'application/pdf'
                            ),
                            name = parser.name
                        )
                    )
                    break
                except Exception as error:
                    # `Exception` (not a bare except) so task cancellation and
                    # keyboard interrupts are not swallowed by the retry loop
                    fail_count += 1
                    if fail_count >= max_attempts:
                        raise RuntimeError(f'Item {p} failed after {fail_count} attempts') from error
                    # exponential backoff: 1, 2, 4, 8, 16, 32, 32, ... seconds
                    # (`**` is power; `^` would be bitwise XOR)
                    await asyncio.sleep(2 ** (min(fail_count, 6) - 1))

            if fail_count > 0:
                print(f'Item {p} succeeded after fail count = {fail_count}')

        results[p] = documentai.Document.to_dict(result.document)

    # manage tasks
    tasks = [asyncio.create_task(make_request(p)) for p in range(len(files_pages))]
    await asyncio.gather(*tasks)

    # add parsing to input list of dictionaries for all the pages
    for c, content in enumerate(files_pages):
        content['parsing'] = results[c]

    return

In [None]:
if PRIOR_PARSE:
    print('Using Prior Results')
else:
    print('No Prior Results, Parsing with Document AI')
    await docai_runner(files_pages)
    # remove the raw file page
    for page in files_pages: del page['raw_file_page']

In [None]:
len(files_pages)

In [None]:
files_pages[0].keys()

In [None]:
files_pages[0]['parsing'].keys()

For each page's dictionary, add the path, file, page, and embedding of the full page's OCR results:

In [None]:
async def embedding_pages(files_pages, limit_concur_requests = 500):
    """Request a text embedding for the full text of every page and attach it in place.

    Reads `files_pages[i]['parsing']['text']` and writes the embedding values to
    `files_pages[i]['parsing']['embedding']`.  Pages with empty text are not sent
    to the model and receive the placeholder `[None]`.

    Args:
        files_pages: list of page dictionaries with a 'parsing' entry; modified in place.
        limit_concur_requests: maximum number of requests in flight at once.

    Returns:
        None.

    Raises:
        RuntimeError: when a page still fails after 21 attempts (chained to the
            last underlying error).
    """
    max_attempts = 21
    limit = asyncio.Semaphore(limit_concur_requests)
    results = [None] * len(files_pages)

    # make requests - async
    async def make_request(p):
        text = files_pages[p]['parsing']['text']
        if not text:
            # nothing to embed: keep the placeholder value used for empty pages
            results[p] = [None]
            return

        async with limit:
            if limit.locked():
                await asyncio.sleep(0.01)

            fail_count = 0
            while True:
                try:
                    result = await textembed_model.get_embeddings_async([text])
                    break
                except Exception as error:
                    # `Exception` (not a bare except) so task cancellation and
                    # keyboard interrupts are not swallowed by the retry loop
                    fail_count += 1
                    if fail_count >= max_attempts:
                        raise RuntimeError(f'Item {p} failed after {fail_count} attempts') from error
                    # exponential backoff: 1, 2, 4, 8, 16, 32, 32, ... seconds
                    # (`**` is power; `^` would be bitwise XOR)
                    await asyncio.sleep(2 ** (min(fail_count, 6) - 1))

            if fail_count > 0:
                print(f'Item {p} succeeded after fail count = {fail_count}')

        results[p] = result[0].values

    # manage tasks
    tasks = [asyncio.create_task(make_request(p)) for p in range(len(files_pages))]
    await asyncio.gather(*tasks)

    for c, content in enumerate(files_pages):
        content['parsing']['embedding'] = results[c]

    return

In [None]:
if PRIOR_PARSE:
    print('Using Prior Results')
else:
    print('No Prior Results, Using Document AI Parsing')
    await embedding_pages(files_pages)
    for c, content in enumerate(files_pages):
        document_image = PIL.Image.open(
            io.BytesIO(
                base64.decodebytes(content['parsing']['pages'][0]['image']['content'].encode('utf-8'))
            )
        )
        content['parsing']['path'] = source_documents[content['file_index']][:(-1*len(source_documents[content['file_index']].split('/')[-1]))]
        content['parsing']['file'] = source_documents[content['file_index']].split('/')[-1]
        content['parsing']['page'] = content['page_index']
        content['parsing']['vme_id'] = f"{content['file_index']}_{content['page_index']}"
        content['parsing']['dimensions'] = list(document_image.size)
        if not content['parsing']['text']:
            content['parsing']['embedding'] = []

In [None]:
len(files_pages)

In [None]:
files_pages[0].keys()

In [None]:
files_pages[0]['parsing'].keys()

### Parse Chunks From Documents

Elements to capture here are paragraphs and tables.  If a paragraph overlaps a table then include it within the table.

In [None]:
def _layout_text(page_text, layout):
    """Return the page text covered by the text segments of a layout's text anchor.

    Segment `end_index` is treated as exclusive (half-open) and a missing
    `start_index` is treated as 0.  A layout without a text anchor yields ''.
    """
    segments = (layout.get('text_anchor') or {}).get('text_segments') or []
    return ''.join(
        page_text[int(segment.get('start_index', 0)):int(segment['end_index'])]
        for segment in segments
    )


def _layout_vertices(layout, dimensions):
    """Return (pixel vertices, normalized vertices) for a layout's bounding polygon.

    Pixel vertices are the normalized vertices scaled by dimensions[0] (x) and dimensions[1] (y).
    """
    normalized_vertices = [
        dict(x = vertex['x'], y = vertex['y'])
        for vertex in layout['bounding_poly']['normalized_vertices']
    ]
    vertices = [
        dict(x = vertex['x'] * dimensions[0], y = vertex['y'] * dimensions[1])
        for vertex in normalized_vertices
    ]
    return vertices, normalized_vertices


if PRIOR_PARSE:
    print('Using Prior Document Preparation')
else:
    files_pages_chunks = []

    for content in files_pages:
        page = content['parsing']
        chunk_id = 0

        # parse tables from page; keep each table outline to test paragraphs against
        table_shapes = []
        for t, table in enumerate(page['pages'][0]['tables']):
            table_txt = _layout_text(page['text'], table['layout'])
            vertices, normalized_vertices = _layout_vertices(table['layout'], page['dimensions'])
            table_shapes.append(shapely.geometry.Polygon([(v['x'], v['y']) for v in vertices]))

            if table_txt != '':
                files_pages_chunks.append(
                    dict(
                        text = table_txt,
                        metadata = dict(
                            file_index = content['file_index'],
                            page_index = content['page_index'],
                            table = t + 1,
                            chunk = chunk_id + 1,
                            vme_id = page['vme_id'] + '_' + str(chunk_id),
                            vertices = vertices,
                            normalized_vertices = normalized_vertices
                        )
                    )
                )
                chunk_id += 1

        # parse paragraphs from page - not in tables or overlapping tables
        for g, paragraph in enumerate(page['pages'][0]['paragraphs']):
            paragraph_txt = _layout_text(page['text'], paragraph['layout'])
            if paragraph_txt == '':
                continue

            vertices, normalized_vertices = _layout_vertices(paragraph['layout'], page['dimensions'])

            # only use paragraphs that are not within/overlapping table boundaries
            p_shape = shapely.geometry.Polygon([(v['x'], v['y']) for v in vertices])
            if any(p_shape.intersects(t_shape) for t_shape in table_shapes):
                continue

            # save the paragraph as an element
            files_pages_chunks.append(
                dict(
                    text = paragraph_txt,
                    metadata = dict(
                        file_index = content['file_index'],
                        page_index = content['page_index'],
                        paragraph = g + 1,
                        chunk = chunk_id + 1,
                        vme_id = page['vme_id'] + '_' + str(chunk_id),
                        vertices = vertices,
                        normalized_vertices = normalized_vertices
                    )
                )
            )
            chunk_id += 1

In [None]:
len(files_pages_chunks)

In [None]:
files_pages_chunks[0].keys()

In [None]:
files_pages_chunks[0]['metadata'].keys()

---
## Get Embeddings


The `textembedding-gecko` model has a quota of 1500 requests per minute:
- [Quotas by region and model](https://cloud.google.com/vertex-ai/generative-ai/docs/quotas#quotas_by_region_and_model)

In [None]:
async def embedding_runner(files_pages_chunks, limit_concur_requests = 500):
    """Request a text embedding for every chunk and attach it in place.

    Reads `files_pages_chunks[i]['text']` and writes the embedding values to
    `files_pages_chunks[i]['embedding']`.

    Args:
        files_pages_chunks: list of chunk dictionaries with a 'text' entry; modified in place.
        limit_concur_requests: maximum number of requests in flight at once.

    Returns:
        None.

    Raises:
        RuntimeError: when a chunk still fails after 21 attempts (chained to the
            last underlying error).
    """
    max_attempts = 21
    limit = asyncio.Semaphore(limit_concur_requests)
    results = [None] * len(files_pages_chunks)

    # make requests - async
    async def make_request(p):

        async with limit:
            if limit.locked():
                await asyncio.sleep(0.01)

            fail_count = 0
            while True:
                try:
                    result = await textembed_model.get_embeddings_async([files_pages_chunks[p]['text']])
                    break
                except Exception as error:
                    # `Exception` (not a bare except) so task cancellation and
                    # keyboard interrupts are not swallowed by the retry loop
                    fail_count += 1
                    if fail_count >= max_attempts:
                        raise RuntimeError(f'Item {p} failed after {fail_count} attempts') from error
                    # exponential backoff: 1, 2, 4, 8, 16, 32, 32, ... seconds
                    # (`**` is power; `^` would be bitwise XOR)
                    await asyncio.sleep(2 ** (min(fail_count, 6) - 1))

            if fail_count > 0:
                print(f'Item {p} succeeded after fail count = {fail_count}')

        results[p] = result[0].values

    # manage tasks
    tasks = [asyncio.create_task(make_request(p)) for p in range(len(files_pages_chunks))]
    await asyncio.gather(*tasks)

    # add embeddings to input list of dictionaries for all the chunks
    for c, content in enumerate(files_pages_chunks):
        content['embedding'] = results[c]

    # pause for a minute after the batch
    # NOTE(review): presumably lets the per-minute request quota window reset - confirm
    await asyncio.sleep(60)

    return

In [None]:
if PRIOR_PARSE:
    print('Embeddings created on previous run.')
else:
    await embedding_runner(files_pages_chunks)

In [None]:
files_pages_chunks[0].keys()

In [None]:
files_pages_chunks[0]['metadata'].keys()

In [None]:
files_pages_chunks[0]['embedding'][0:5]

---
## Save Files For Future Runs: GCS, BigQuery

Use the value of the input parameter `SAVE_IN` to optionally write both `files_pages` and `files_pages_chunks` to `BQ`, `GCS` or `ALL` (both).

It can take awhile to run the parsing job above so save results for future runs of this notebook.  Also, this prevents recurring cost of running the Document AI parsing of the documents.

In [None]:
files_pages[0].keys()

In [None]:
files_pages_chunks[0].keys()

In [None]:
if PRIOR_PARSE:
    print('This run loaded results from a prior run.  Not overwriting.')
else:
    if SAVE_IN in ['GCS', 'ALL']:
        print('Writing contents of results and documents to GCS for future use.')

        # save files_pages: json lines
        blob = bucket.blob(f'{SERIES}/{EXPERIMENT}/files_pages.json')
        blob.upload_from_string('\n'.join([json.dumps(page) for page in files_pages]), content_type = 'application/json')

        # save files_pages_chunks: json lines
        blob = bucket.blob(f'{SERIES}/{EXPERIMENT}/files_pages_chunks.json')
        blob.upload_from_string('\n'.join([json.dumps(chunk) for chunk in files_pages_chunks]), content_type = 'application/json')

    if SAVE_IN in ['BQ', 'ALL']:
        print('Writing contents of results and documents to BigQuery for future use.')

        # create/link to dataset
        # location and labels must be set on a Dataset: attributes assigned to a bare
        # DatasetReference are not part of what create_dataset sends to the API
        ds = bigquery.Dataset(bigquery.DatasetReference(BQ_PROJECT, BQ_DATASET))
        ds.location = BQ_REGION
        ds.labels = {'series': f'{SERIES}', 'experiment': f'{EXPERIMENT}'}
        ds = bq.create_dataset(dataset = ds, exists_ok = True)

        # save files_pages: replaces any existing table contents (WRITE_TRUNCATE)
        load_job = bq.load_table_from_json(
            json_rows = files_pages,
            destination = ds.table(BQ_TABLE + '_files_pages'),
            job_config = bigquery.LoadJobConfig(
                source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
                write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE,
                create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED,
                schema = [
                    bigquery.SchemaField("file_index", "INT64"),
                    bigquery.SchemaField("page_index", "INT64"),
                    bigquery.SchemaField("parsing", "JSON")
                ]
            )
        )
        load_job.result()

        # save files_pages_chunks: replaces any existing table contents (WRITE_TRUNCATE)
        load_job = bq.load_table_from_json(
            json_rows = files_pages_chunks,
            destination = ds.table(BQ_TABLE + '_files_pages_chunks'),
            job_config = bigquery.LoadJobConfig(
                source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
                write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE,
                create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED,
                schema = [
                    bigquery.SchemaField("text", "STRING"),
                    bigquery.SchemaField("metadata", "JSON"),
                    bigquery.SchemaField("embedding", "FLOAT", "REPEATED")
                ]
            )
        )
        load_job.result()

---
## Embeddings Search: AKA Vector Search

There are many ways to do vector search. In production there are considerations, including:
- How many indexes?
- What is the size of indexes?
- What is the lifespan and frequency of indexes?
- How frequently will indexes get updates (append, update, delete)?
- How quickly do updates need to surface in searches?
- What is the next step after a search?  Does the corresponding text need to be retrieved separately?

This notebook is designed to show the workflow across a few documents and uses brute-force search of all embeddings locally using the common Python package `numpy`.  But what about scaling in production?

Google Cloud offers solutions for any workflow!
- Local to the application:
    - [ScaNN](https://github.com/google-research/google-research/tree/master/scann)
    - [Faiss](https://github.com/facebookresearch/faiss)
- With transactional data:
    - using [pgvector](https://github.com/pgvector/pgvector) with:
        - [Cloud SQL for PostgreSQL](https://cloud.google.com/sql/docs/postgres)
        - [AlloyDB for PostgreSQL](https://cloud.google.com/alloydb/docs)
        - blogs:
            - [Building AI-powered apps on Google Cloud databases using pgvector, LLMs and LangChain](https://cloud.google.com/blog/products/databases/using-pgvector-llms-and-langchain-with-google-cloud-databases)
    - [Spanner](https://cloud.google.com/spanner/docs)
        - [Vector Search in Spanner](https://cloud.google.com/spanner/docs/find-k-nearest-neighbors)
        - [langchain with Spanner](https://github.com/googleapis/langchain-google-spanner-python)
- In the data warehouse:
    - [BigQuery Vector Indexes](https://cloud.google.com/bigquery/docs/vector-search-intro)
- Fit-for-purpose: Fast, Scalable, and Flexible:
    - [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore/latest/overview) with built-in [Search using embeddings](https://cloud.google.com/vertex-ai/docs/featurestore/latest/embeddings-search) 
    - [Vertex AI Vector Search](https://cloud.google.com/vertex-ai/docs/vector-search/overview)

### Get Embedding for Question

In [None]:
print(question)

In [None]:
query_embed = np.array(textembed_model.get_embeddings([question])[0].values)
query_embed.shape

### Create Local Embedding DB - With Numpy!

In [None]:
embed_db = np.array([chunk['embedding'] for chunk in files_pages_chunks])
embed_db.shape

### Get Matches - With Numpy!

#### Dot Product

- measures alignment between two vectors
- large positive implies similar directions
- large negative implies opposite directions
- near zero implies orthogonal
- larger is more similar
- best for:
    - matching and retrieval


In [None]:
similarity = np.dot(query_embed, embed_db.T)
similarity.shape

In [None]:
# larger is more similar
matches = np.argsort(similarity)[::-1][:5].tolist()
matches = [(match, similarity[match]) for match in matches]
matches

In [None]:
for match in matches:
    print(files_pages_chunks[match[0]]['text'])

#### Cosine Similarity

- measures angle between two vectors
- represents the cosine of the angles resulting in values -1 to 1
- larger is more similar
- best for:
    - grouping for topics: different magnitudes can still group together
    - collaborative filtering for recommendation systems


In [None]:
cosine_similarity = similarity / (np.linalg.norm(query_embed) * np.linalg.norm(embed_db, axis = 1).T)
cosine_similarity.shape

In [None]:
# larger is more similar
matches = np.argsort(cosine_similarity)[::-1][:5].tolist()
matches = [(match, cosine_similarity[match]) for match in matches]
matches

In [None]:
for match in matches:
    print(files_pages_chunks[match[0]]['text'])

#### Euclidean Distance

- straight line distance between two vector points
- smaller is more similar
- note that smaller magnitude vectors with large angles may be deemed more similar than larger magnitude vectors with small angles
- best for:
    - clustering points in vector space
    - anomaly detection

In [None]:
euclidean_similarity = np.linalg.norm(embed_db - query_embed, axis = 1)
euclidean_similarity.shape

In [None]:
# smaller is more similar
matches = np.argsort(euclidean_similarity)[:5].tolist()
matches = [(match, euclidean_similarity[match]) for match in matches]
matches

In [None]:
for match in matches:
    print(files_pages_chunks[match[0]]['text'])

### Search Function: Pages

Using dot product, create an embedding database and accompanying search function.

In [None]:
# one row per page; a page whose embedding is empty/falsy gets a zero vector of length 768
pages_embed_db = np.array([
    page['parsing']['embedding'] or [0]*768
    for page in files_pages
])

In [None]:
def page_match(query):
    """Rank pages by dot-product similarity to `query`.

    Embeds the query with `textembed_model`, scores it against every row of
    `pages_embed_db`, and returns the top k `(row index, similarity)` tuples in
    descending similarity.  k grows as the best similarity drops:
    k = 1 + 3*int(10*(1 - best similarity)).
    """
    embedding = textembed_model.get_embeddings([query])[0]
    query_vector = np.array(embedding.values)
    scores = np.dot(query_vector, pages_embed_db.T)
    ranked = np.argsort(scores)[::-1].tolist()
    # algorithm to dynamically pick k
    best_score = scores[ranked[0]]
    k = 1 + 3*int(10*(1-best_score))
    return [(index, scores[index]) for index in ranked[:k]]

In [None]:
page_match(question)

### Search Function: Chunks

Using dot product, create an embedding database and accompanying search function.

In [None]:
chunks_embed_db = np.array([chunk['embedding'] for chunk in files_pages_chunks])

In [None]:
def chunk_match(query):
    """Rank chunks by dot-product similarity to `query`.

    Embeds the query with `textembed_model`, scores it against every row of
    `chunks_embed_db`, and returns the top k `(row index, similarity)` tuples in
    descending similarity.  k grows as the best similarity drops:
    k = 1 + 3*int(10*(1 - best similarity)).
    """
    embedding = textembed_model.get_embeddings([query])[0]
    query_vector = np.array(embedding.values)
    scores = np.dot(query_vector, chunks_embed_db.T)
    ranked = np.argsort(scores)[::-1].tolist()
    # algorithm to dynamically pick k
    best_score = scores[ranked[0]]
    k = 1 + 3*int(10*(1-best_score))
    return [(index, scores[index]) for index in ranked[:k]]

In [None]:
chunk_match(question)

---
## Visualize Relationships Between Chunks

The embeddings are incredibly high dimensional - 768!  To reduce this to a lower dimension, like 2 for plotting, the method [`t-SNE`](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) will be used.  This method models each embedding with a lower 2-dimensional point in a way that similar embeddings are modeled by nearby points and dissimilar embeddings by farther points.

### Visualizing Chunks

In [None]:
tsne = TSNE(random_state=0, n_iter=1000, init = 'pca', learning_rate = 'auto')
tsne_results = tsne.fit_transform(chunks_embed_db)

In [None]:
df_tsne = pd.DataFrame(data = tsne_results, columns = ["TSNE1", "TSNE2"])

In [None]:
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data = df_tsne, x = 'TSNE1', y = 'TSNE2')
plt.title("Scatter plot of chunks using t-SNE")
plt.xlabel("TSNE1")
plt.ylabel("TSNE2")
plt.axis("equal")

### Visualizing Chunks - Color Coded By Similarity To Prompt

In [None]:
query_embed = np.array(textembed_model.get_embeddings([question])[0].values)
similarity = np.dot(query_embed, chunks_embed_db.T)
df_tsne['similarity'] = similarity.tolist()

In [None]:
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data = df_tsne, x = 'TSNE1', y = 'TSNE2', hue = 'similarity', palette = sns.color_palette('coolwarm', as_cmap=True))
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.title("Scatter plot of chunks using t-SNE\nWith Similarity To Prompt Color Coding")
plt.xlabel("TSNE1")
plt.ylabel("TSNE2")
plt.axis("equal")

### Visualizing Chunks - Call Out Matches

In [None]:
matches = chunk_match(question)

In [None]:
df_tsne.iloc[[match[0] for match in matches]]

In [None]:
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data = df_tsne, x = 'TSNE1', y = 'TSNE2', hue = 'similarity', palette = sns.color_palette('coolwarm', as_cmap=True))
sns.scatterplot(data = df_tsne.iloc[[match[0] for match in matches]], x="TSNE1", y="TSNE2", color = 'black', marker = 's', s = 100)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.title("Scatter plot of chunks using t-SNE\nWith Similarity To Prompt Color Coding\nMatches Marked With Black Squares")
plt.xlabel("TSNE1")
plt.ylabel("TSNE2")
plt.axis("equal")

---
## Review Structure of Objects: `files_pages`, `files_pages_chunks`

In [None]:
files_pages[0].keys()

In [None]:
files_pages[0]['parsing'].keys()

In [None]:
files_pages_chunks[0].keys()

In [None]:
files_pages_chunks[0]['metadata'].keys()

---
## Q&A With DocumentBot

Make a function that receives the users questions and:
- finds and retrieves relevant sections of the rules
- prepares a prompt for Vertex AI Generative AI that includes the question and the context = sections of document
- Retrieves the response (answer) from Vertex AI Generative AI
- Retrieves the closest match section of the rules to the response/answer.
- Prepares and presents all the information back to the user

### Functions For Bot

#### Retrieval Functions

These retrieve context.

**NOTE**: `get_retrieval` calls `expand_retrieval`, which calls `get_retrieval`.  This could lead to infinite recursion, but it is prevented by `expand_retrieval` calling `get_retrieval` with `DISTANCE` left at its default, which stops a further call to `expand_retrieval`.  Max recursion depth is 1 in this case.

In [None]:
def get_chunks(query, k = -1, simk = -1):
    """Find chunks similar to `query` using dot-product similarity against `chunks_embed_db`.

    Args:
        query: text to embed with `textembed_model`.
        k: number of top matches to return when `simk <= 0`.  `k <= 0` picks k
            dynamically: 1 + 3*int(10*(1 - best similarity)).
        simk: similarity threshold.  `<= 0` uses `k`.  `> 0` returns every chunk
            with similarity >= 1 - simk, in index order (values above 1 simply
            lower the threshold further instead of returning malformed matches).

    Returns:
        List of `(chunk index, similarity)` tuples.
    """
    query_embed = np.array(textembed_model.get_embeddings([query])[0].values)
    # for dot product, higher is a better match
    similarity = np.dot(query_embed, chunks_embed_db.T)

    if simk > 0:
        # threshold mode: every chunk at or above the cutoff, in index order
        indices = np.where(similarity >= 1-simk)[0]
        return [(i, similarity[i]) for i in indices]

    ranked = np.argsort(similarity)[::-1].tolist()
    if k <= 0:
        # algorithm to dynamically pick k
        k = 1 + 3*int(10*(1-similarity[ranked[0]]))
    return [(match, similarity[match]) for match in ranked[0:k]]

def expand_retrieval(contexts, DISTANCE):
    """Collect extra chunks from the pages of `contexts`.

    For each context, `get_retrieval` is called with the context's text as the
    query, `simk = DISTANCE`, and the context's (file_index, page_index) as
    `file_page`.  Matches whose chunk index is already in `contexts`, or was
    already collected, are skipped.  Returns an empty list unless `DISTANCE > 0`.
    """
    additional_contexts = []
    if not DISTANCE > 0:
        return additional_contexts

    # chunk indexes already present: the incoming contexts plus anything collected so far
    seen_indexes = {context[0] for context in contexts}
    for context in contexts:
        page_key = (context[3]['file_index'], context[3]['page_index'])
        for match in get_retrieval(context[2], simk = DISTANCE, file_page = page_key):
            if match[0] in seen_indexes:
                continue
            seen_indexes.add(match[0])
            additional_contexts.append(match)

    return additional_contexts

def get_retrieval(question, k = -1, simk = -1, DISTANCE = 0, file_page = None):
    """Retrieve matching chunks as `(index, similarity, text, metadata, expanded)` tuples.

    Matches come from `get_chunks(question, k, simk)` and are looked up in
    `files_pages_chunks`.  When `file_page` (a `(file_index, page_index)` tuple)
    is given - the call made by `expand_retrieval` - only chunks on that page are
    kept and `expanded` is True; otherwise `expanded` is False.  With
    `DISTANCE > 0` the result is extended with `expand_retrieval(matches, DISTANCE)`.
    """
    from_expansion = bool(file_page)

    matches = []
    for match in get_chunks(question, k = k, simk = simk):
        chunk = files_pages_chunks[match[0]]
        metadata = chunk['metadata']
        # a call from expand_retrieval keeps only the chunks on the requested page
        if from_expansion and file_page != (metadata['file_index'], metadata['page_index']):
            continue
        matches.append(match + (chunk['text'], metadata, from_expansion))

    if DISTANCE > 0:
        matches = matches + expand_retrieval(matches, DISTANCE)

    return matches

#### Augmentation Functions

This function prepares the prompt by also adding retrieved context = augmenting.

In [None]:
def get_augmented(question, contexts, ground):
    """Build the prompt text for `question`.

    When `ground` is truthy the prompt starts with an instruction to answer only
    from the numbered contexts, lists the text of each context (element 2 of
    every tuple in `contexts`), and then states the question.  Otherwise it
    contains only the question.  In both cases the prompt ends with the
    answer/explanation trigger that includes "Think step by step."
    """
    pieces = []

    if ground:
        numbered_contexts = "\n".join(
            f'  * Context {number}: "{context[2]}"' for number, context in enumerate(contexts, 1)
        )
        pieces.append("Give a detailed answer to the question using only the information from the numbered contexts provided below.")
        pieces.append("\n\nContexts:\n")
        pieces.append(numbered_contexts)
        pieces.append("\n\nQuestion: " + question)
    else:
        pieces.append("Question: " + question)

    # trigger for the answer, including the zero shot chain of thought prompt "think step by step"
    pieces.append("\n\nAnswer the question and give and explanation. Think step by step.")

    return ''.join(pieces)

#### Generation Functions

These functions interact with LLMs to create responses.

In [None]:
def generate_gemini(prompt, genconfigs, model):
    """Generate text for `prompt` with a Gemini model, retrying when no text comes back.

    The first request uses `genconfigs` as given.  If reading `response.text`
    raises or yields nothing, up to 6 more requests are made with
    `temperature` set to 0.5, 0.4, ... down to 0.0.  `genconfigs` itself is
    never modified.

    Args:
        prompt: prompt passed to `model.generate_content`.
        genconfigs: keyword arguments for `vertexai.generative_models.GenerationConfig`.
        model: object exposing `generate_content(prompt, generation_config=...)`.

    Returns:
        The response text, or a fixed message asking to check the prompt when
        every attempt produced no text.
    """
    max_retries = 6

    def _attempt(configs):
        # one request; a response whose text cannot be read counts as "no text"
        response = model.generate_content(
            prompt,
            generation_config = vertexai.generative_models.GenerationConfig(**configs)
        )
        try:
            return response.text
        except Exception:
            return None

    text = _attempt(genconfigs)

    for counter in range(max_retries):
        if text:
            break
        # retry on a copy so the caller's dictionary is left untouched; empty text
        # counts as a failed attempt too, so the loop always terminates
        text = _attempt({**genconfigs, 'temperature': .5 - counter*.1})

    if not text:
        text = 'Please check the prompt, it appears the response is getting blocked.'

    return text

def generate_palm(prompt, genconfigs, model):
    """Return the text of `model.predict(prompt, **genconfigs)`."""
    return model.predict(prompt, **genconfigs).text

def get_generation(prompt, max_output_tokens, model):
    """Generate a response to `prompt` with the model selected by name.

    Args:
        prompt: prompt text.
        max_output_tokens: passed to the model as `max_output_tokens`.
        model: one of 'GEMINI', 'PALM_BISON', 'PALM_BISON32K', 'PALM_UNICORN'.
            The misspelled key 'PALM_BISION32k' is still accepted for
            backward compatibility.

    Returns:
        The value returned by `generate_gemini` (for 'GEMINI') or `generate_palm`.

    Raises:
        KeyError: when `model` is not a known name.
    """
    models = dict(
        GEMINI = gemini_text,
        PALM_BISON = text_model_b,
        PALM_BISON32K = text_model_b32,
        PALM_UNICORN = text_model_u
    )
    # legacy alias: the key was originally misspelled
    models['PALM_BISION32k'] = text_model_b32

    genconfigs = dict(max_output_tokens = max_output_tokens)

    if model == 'GEMINI':
        response = generate_gemini(prompt, genconfigs, models[model])
    else:
        response = generate_palm(prompt, genconfigs, models[model])

    return response

#### Presentation Functions

These prepare the response for presentation - and display the results.

In [None]:
# Get a font to use for annotating the page images.
# The Google Fonts stylesheet for Roboto Mono is downloaded, and the text between the
# first 'url(' and the following ')' in it is used as the download address of the font file.
font_source_url = "https://fonts.googleapis.com/css2?family=Roboto+Mono&display=swap"
font_source = requests.get(font_source_url).content.decode("utf-8")
# NOTE(review): str.find returns -1 when 'url(' is absent, which would make start_url 3
# and yield a meaningless font_url - no check is made here.
start_url = font_source.find('url(')+4
end_url = font_source.find(')', start_url)
font_url = font_source[start_url:end_url]
# load the downloaded font bytes at size 35; `font` is used when drawing annotation labels
font = PIL.ImageFont.truetype(io.BytesIO(requests.get(font_url).content), 35)

def get_presentation(question, contexts, DISTANCE, response, display_contexts, display_annotations):
    """Display the question, the response and, optionally, the sources and annotated pages.

    Args:
        question: the user's question, shown as Markdown.
        contexts: list of `(index, similarity, text, metadata, expanded)` tuples;
            `metadata` supplies 'file_index', 'page_index' and 'vertices'.
        DISTANCE: when > 0 a note about expanded contexts is added to the sources.
        response: the generated answer, shown as Markdown.
        display_contexts: show a source list (link, page, similarity) for the
            contexts that are not expanded ones.
        display_annotations: show each referenced page image with every context
            outlined (green = source, blue = expanded source).  Works with or
            without `display_contexts`.

    Returns:
        None.  This function references `files_pages` and `font` directly.
    """
    # repeat the question
    IPython.display.display(IPython.display.Markdown(f'**The Question:**\n\n{question}\n\n'))

    # show the answer
    IPython.display.display(IPython.display.Markdown(f'**The Response:**\n\n{response}\n\n'))

    # page details for every context: [path, file, page, file_index, page_index].
    # Built once here because both the source list and the annotations need it.
    context_pages = []
    if display_contexts or display_annotations:
        for context in contexts:
            context_pages.append(
                next(
                    [d['parsing']['path'], d['parsing']['file'], d['parsing']['page'], d['file_index'], d['page_index']]
                    for d in files_pages
                    if d['file_index'] == context[3]['file_index'] and d['page_index'] == context[3]['page_index']
                )
            )

    if display_contexts:
        # display the contexts information: page, similarity, hyperlink
        context_pres = '**Sources:**\n\n'
        context_types = [c[4] for c in contexts]
        if DISTANCE > 0:
            context_pres += f'Note: The {len(contexts) - sum(context_types)} contexts were expanded to gather {sum(context_types)} additional chunks on pages with matches using a similarity distance of {DISTANCE}.\n'
        for context, page in zip(contexts, context_pages):
            # expanded contexts are left out of the list (it can be very long for DISTANCE = 1, the full page)
            if not context[4]:
                context_pres += f'1. {page[0]}{page[1]}#page={page[2]}\n\t* page: {page[2]}, similarity to question is {context[1]:.3f}\n'
        IPython.display.display(IPython.display.Markdown(context_pres))

    if display_annotations:
        # display each page with annotations
        IPython.display.display(IPython.display.Markdown('**Annotated Document Pages**\n\n'))
        # unique pages across contexts: sorted list of tuple(file_index, page_index)
        unique_pages = sorted(set((page[3], page[4]) for page in context_pages))
        # one PIL image per unique page, decoded from the stored base64 content
        images = []
        for file_index, page_index in unique_pages:
            image = next(d['parsing']['pages'][0]['image']['content'] for d in files_pages if d['file_index'] == file_index and d['page_index'] == page_index)
            images.append(
                PIL.Image.open(
                    io.BytesIO(
                        base64.decodebytes(
                            image.encode('utf-8')
                        )
                    )
                )
            )
        # annotate the contexts on the pages:
        for c, context in enumerate(contexts):
            image = images[unique_pages.index((context[3]['file_index'], context[3]['page_index']))]
            vertices = context[3]['vertices']
            draw = PIL.ImageDraw.Draw(image)
            if not context[4]:
                color = 'green'
                prefix = 'Source'
            else:
                color = 'blue'
                prefix = 'Expanded Source'
            draw.polygon([
                vertices[0]['x'], vertices[0]['y'],
                vertices[1]['x'], vertices[1]['y'],
                vertices[2]['x'], vertices[2]['y'],
                vertices[3]['x'], vertices[3]['y']
            ], outline = color, width = 5)
            draw.text(
                xy = (vertices[1]['x'], vertices[1]['y']), text = f"{prefix} {c+1}", fill = color, anchor = 'rd', font = font
            )

        # show the annotated pages at a quarter of their size
        for image in images:
            IPython.display.display(image.resize(tuple([int(.25*x) for x in image.size])))

    return

In [None]:
def document_bot(question, max_output_tokens = 1000, DISTANCE = 0, MODEL = 'GEMINI', display_contexts = False, display_annotations = False, ground = True):
    """Run one retrieve -> augment -> generate -> present pass for a question.

    Args:
        question: the question handed to retrieval, prompt building and presentation.
        max_output_tokens: forwarded unchanged to `get_generation`.
        DISTANCE: forwarded to `get_retrieval` and `get_presentation`. Per the
            original notes: a float in [0, 1], where 0 returns no additional
            context and 1 returns all on unique pages.
        MODEL: forwarded to `get_generation`. Per the original notes, one of
            GEMINI, PALM_BISON, PALM_BISON32K, PALM_UNICORN.
        display_contexts: forwarded to `get_presentation`.
        display_annotations: forwarded to `get_presentation`.
        ground: when falsy, retrieval is skipped and an empty context list is used.

    Returns:
        The prompt returned by `get_augmented`.

    Note:
        The original comment states that this function relies on the
        module-level `font` without taking it as input; it is not referenced
        in this body, so presumably that happens inside a helper.
    """
    # R: Retrieval (skipped entirely for ungrounded runs)
    contexts = get_retrieval(question, DISTANCE = DISTANCE) if ground else []

    # A: Augmentation
    prompt = get_augmented(question, contexts, ground)

    # G: Generation
    answer = get_generation(prompt, max_output_tokens, MODEL)

    # Present the answer
    get_presentation(
        question,
        contexts,
        DISTANCE,
        answer,
        display_contexts,
        display_annotations,
    )

    return prompt

### Try Out Document_Bot:

#### The Question

In [None]:
# echo the `question` value that the cells below pass to document_bot
question

#### Get Ungrounded Response

In [None]:
# ground = False: document_bot skips get_retrieval and builds the prompt from an empty context list
prompt = document_bot(question, ground = False)

#### Get Grounded Response

In [None]:
# all defaults: ground = True, DISTANCE = 0, display_contexts and display_annotations left at False
prompt = document_bot(question)

#### Get Grounded Response, Print Out Contexts

In [None]:
# same grounded run, with display_contexts = True forwarded to get_presentation
prompt = document_bot(question, display_contexts = True)

#### Get Grounded Response, Print Out Contexts And Annotated Pages

In [None]:
# display_annotations = True additionally asks get_presentation for the annotated document pages
prompt = document_bot(question, display_contexts = True, display_annotations = True)

#### Get Grounded Response With Expanded Contexts, Print Out Contexts And Annotated Pages

In [None]:
# DISTANCE = 1 is the top of the [0, 1] range noted in document_bot ("return all on unique pages")
prompt = document_bot(question, DISTANCE = 1, display_contexts = True, display_annotations = True)

#### Get Grounded Response From Out Of Context Questions

Even though this question is on topic (baseball), it asks about specific players and teams, which is outside the scope of the indexed documents: they cover only the rules of the game.

In [None]:
# literal question instead of the shared `question` variable; all other arguments use their defaults (grounded, DISTANCE = 0)
prompt = document_bot('Who is the best pitcher for the Dodgers?')

### Harder Questions

In [None]:
# DISTANCE = 0.5 sits midway in the [0, 1] range noted in document_bot, so some expanded context is requested
prompt = document_bot("What is the definition of a balk?", DISTANCE = 0.5)

In [None]:
# grounded run with DISTANCE = 0.5 (midway in the [0, 1] range noted in document_bot)
prompt = document_bot("Is a rule broken if three infielders are positioned on the same side of the field where the batter is more likely to hit the ball?", DISTANCE = 0.5)

In [None]:
# default arguments: grounded, DISTANCE = 0 (no additional context)
prompt = document_bot("A batter hits a fair ball that goes over the outfield fence. Is this always a home run?")

In [None]:
# default arguments; the "Next Steps" notes below say the rules describe bases mainly through a graphic with dimensions
prompt = document_bot("Describe the shape of bases.")

---
## Next Steps!

While the document bot performs well and the configuration of `DISTANCE` allows users to gather expanded contexts, it can still be improved.

These enhancements will follow in additional notebooks:
- Enhanced context retrieval with a tree of contexts
    - This will provide much better context for general questions that infer information from a broad section of the document, like "How do I play baseball?"
- Multi-modal prompts
    - By including images from documents in the context retrieval and the prompt, context can also be inferred from the images. This will help with the question about bases, where the primary description in the rules is a graphic with dimensions.

In [77]:
# count the objects in `bucket` whose name starts with the files_pages.json path; the next cell treats a count > 0 as "a previous run exists in GCS"
# NOTE(review): this cell and the next carry execution counts 77/78 but sit after the "Next Steps" section - confirm they are not leftover copies of an earlier setup cell
len(list(bucket.list_blobs(prefix = f'{SERIES}/{EXPERIMENT}/files_pages.json')))

1

In [78]:
# Reuse a previous parsing run when one is available, otherwise flag that parsing has to be done.
# Reads: USE_PRIOR_RUN, RETRIEVE_FROM, SERIES, EXPERIMENT, BQ_PROJECT, BQ_DATASET, BQ_TABLE, bucket, bq, bq_table_check
# Sets:  PRIOR_PARSE in every branch; files_pages and files_pages_chunks only when a prior run is imported
# NOTE(review): this cell appears after the "Next Steps" section - confirm it is not a leftover copy of an earlier setup cell

# object names of the saved results inside the GCS bucket
prior_files_pages_path = f'{SERIES}/{EXPERIMENT}/files_pages.json'
prior_files_pages_chunks_path = f'{SERIES}/{EXPERIMENT}/files_pages_chunks.json'

# Probe only the configured source, and only once: `and` short-circuits, so the
# check belonging to the other source is never executed.
prior_in_gcs = RETRIEVE_FROM == 'GCS' and len(list(bucket.list_blobs(prefix = prior_files_pages_path))) > 0
prior_in_bq = RETRIEVE_FROM == 'BQ' and bq_table_check(f'{BQ_DATASET}.{BQ_TABLE}_files_pages')

if not USE_PRIOR_RUN:
    # a new parsing is forced; only tell the user when an existing result is being ignored
    PRIOR_PARSE = False
    if prior_in_gcs:
        print(f'Previous results exists in GCS but forcing the creation of new parsing with USE_PRIOR_RUN = {USE_PRIOR_RUN}')
    elif prior_in_bq:
        print(f'Previous results exists in BQ but forcing the creation of new parsing with USE_PRIOR_RUN = {USE_PRIOR_RUN}')

elif prior_in_gcs:
    print(f'Detected {prior_files_pages_path}')
    print('Importing previous run from GCS')

    # load files_pages: the file+page level information including docai responses in `parsing`
    # (each line of the object is parsed as one JSON record)
    blob = bucket.blob(prior_files_pages_path)
    files_pages = [json.loads(line) for line in blob.download_as_text().splitlines()]
    print(f'Loaded {prior_files_pages_path}')

    # load files_pages_chunks: the chunks parsed from the files+pages
    blob = bucket.blob(prior_files_pages_chunks_path)
    files_pages_chunks = [json.loads(line) for line in blob.download_as_text().splitlines()]
    print(f'Loaded {prior_files_pages_chunks_path}')

    # Set Indicator to prevent redoing the parsing later in this notebook
    PRIOR_PARSE = True

elif prior_in_bq:
    print('Importing previous run from BigQuery')

    # load files_pages: the file+page level information including docai responses in `parsing`
    files_pages = bq.query(f'SELECT * FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_files_pages` ORDER BY file_index, page_index').to_dataframe().to_dict('records')
    # `parsing` comes back as a JSON string: convert it to a dictionary
    for page in files_pages:
        page['parsing'] = json.loads(page['parsing'])

    # load files_pages_chunks: the chunks parsed from the files+pages
    files_pages_chunks = bq.query(f'SELECT * FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}_files_pages_chunks`').to_dataframe().to_dict('records')
    # `metadata` comes back as a JSON string: convert it to a dictionary
    for chunk in files_pages_chunks:
        chunk['metadata'] = json.loads(chunk['metadata'])
    # the chunk query has no ORDER BY, so sort by file, page, chunk number here
    files_pages_chunks = sorted(files_pages_chunks, key = lambda x: (x['metadata']['file_index'], x['metadata']['page_index'], x['metadata']['chunk']))

    # Set Indicator to prevent redoing the parsing later in this notebook
    PRIOR_PARSE = True

else:
    print('No previous run available to import')
    PRIOR_PARSE = False

# report the outcome for every branch (previously only the GCS import branch did)
print(f'PRIOR_PARSE: {PRIOR_PARSE}')

Detected applied-genai-v3/mlb-rules/files_pages.json
Importing previous run from GCS
Loaded applied-genai-v3/mlb-rules/files_pages.json
Loaded applied-genai-v3/mlb-rules/files_pages_chunks.json
PRIOR_PARSE: True
