# Embedding-based semantic search for issue resolution

In [None]:
import sys
import os

In [None]:
# Put the parent of the current working directory on the import path before
# importing the project helpers (presumably that is where `utils` lives --
# confirm against the repository layout).
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)
from utils import load_constants, run_query

In [None]:
# Load the project configuration once and expose each setting as a
# module-level constant for the cells below.
constants = load_constants()

# Google Cloud project, locations and Cloud Storage buckets.
gcp_settings = constants["GCP"]
GOOGLE_CLOUD_PROJECT = gcp_settings["GOOGLE_CLOUD_PROJECT"]
GOOGLE_CLOUD_LOCATION = gcp_settings["GOOGLE_CLOUD_LOCATION"]
GOOGLE_CLOUD_LOCATION_MULTI_REGION = gcp_settings["GOOGLE_CLOUD_LOCATION_MULTI_REGION"]
GOOGLE_CLOUD_GCS_BUCKET = gcp_settings["GOOGLE_CLOUD_GCS_BUCKET"]
GOOGLE_CLOUD_GCS_BUCKET_MULTI_REGION = gcp_settings["GOOGLE_CLOUD_GCS_BUCKET_MULTI_REGION"]

# Gemini model identifiers.
vertex_settings = constants["VERTEX"]
GOOGLE_GEMINI_MODEL_15 = vertex_settings["GOOGLE_GEMINI_MODEL_15"]
GOOGLE_GEMINI_MODEL_10 = vertex_settings["GOOGLE_GEMINI_MODEL_10"]

# BigQuery project, datasets and base table names.
bigquery_settings = constants["BIGQUERY"]
GOOGLE_CLOUD_BIGQUERY_PROJECT = bigquery_settings["GOOGLE_CLOUD_BIGQUERY_PROJECT"]
GOOGLE_CLOUD_BIGQUERY_DATASET = bigquery_settings["GOOGLE_CLOUD_BIGQUERY_DATASET"]
GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION = bigquery_settings[
    "GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION"
]
BASE_TABLE_NAME_EVENTS = bigquery_settings["BASE_TABLE_NAME_EVENTS"]
BASE_TABLE_NAME_INCIDENTS = bigquery_settings["BASE_TABLE_NAME_INCIDENTS"]

# Document AI processor referenced by the document-processing remote model.
DOC_AI_PROCESSOR_URI = constants["DOC_AI"]["DOC_AI_PROCESSOR_URI"]

In [None]:
# DDL for an external table over the objects under `rca/` in the multi-region
# bucket; the identifiers are assembled first so the statement stays readable.
docs_table = (
    f"{GOOGLE_CLOUD_BIGQUERY_PROJECT}."
    f"{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}."
    f"{BASE_TABLE_NAME_INCIDENTS}_docs"
)
genai_connection = (
    f"{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_LOCATION_MULTI_REGION}.genai"
)
rca_uri_pattern = f"gs://{GOOGLE_CLOUD_GCS_BUCKET_MULTI_REGION}/rca/*"

query_cext = f"""CREATE OR REPLACE EXTERNAL TABLE `{docs_table}`
  WITH CONNECTION `{genai_connection}`
  OPTIONS (
    object_metadata = 'SIMPLE',
    uris = ['{rca_uri_pattern}'],
    metadata_cache_mode= 'AUTOMATIC',
    max_staleness= INTERVAL 1 HOUR
  );"""

In [None]:
# DDL for the remote model `rca_processor`, backed by the Document AI
# processor configured in DOC_AI_PROCESSOR_URI and reached through the
# `genai` connection.
#
# Fix: the original wrote 'f{DOC_AI_PROCESSOR_URI}' inside this f-string, which
# rendered a literal "f" in front of the processor path and produced an
# invalid `document_processor` value.
query_cmodel = f"""
  CREATE OR REPLACE MODEL `{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.rca_processor`
  REMOTE WITH CONNECTION `{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_LOCATION_MULTI_REGION}.genai`
  OPTIONS (
    remote_service_type = 'CLOUD_AI_DOCUMENT_V1',   
    document_processor='{DOC_AI_PROCESSOR_URI}'
  );"""

In [None]:
# Submit the external-table DDL through the `run_query` helper from `utils`
# (presumably executes it on BigQuery -- confirm in utils).
run_query(query_cext)

In [None]:
# Submit the `rca_processor` remote-model DDL through the `run_query` helper.
run_query(query_cmodel)

In [None]:
# Statement that runs the `rca_processor` model over the `_docs` table with
# ML.PROCESS_DOCUMENT and stores the rows whose content type is PDF in
# `<incidents>_docs_parsed`.
incidents_table_prefix = (
    f"{GOOGLE_CLOUD_BIGQUERY_PROJECT}."
    f"{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}."
    f"{BASE_TABLE_NAME_INCIDENTS}"
)
processor_model = f"{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.rca_processor"

query_parse = f"""
  CREATE OR REPLACE TABLE `{incidents_table_prefix}_docs_parsed` AS
  SELECT *
  FROM ML.PROCESS_DOCUMENT(
    MODEL `{processor_model}`,
    TABLE `{incidents_table_prefix}_docs`)
  WHERE content_type = 'application/pdf';"""

In [None]:
# Submit the document-parsing statement through the `run_query` helper.
run_query(query_parse)

In [None]:
# DDL for the remote embedding model `gecko_embedder`, which points at the
# "textembedding-gecko-multilingual" endpoint through the `genai` connection.
embedder_model = f"{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.gecko_embedder"
genai_connection = (
    f"{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_LOCATION_MULTI_REGION}.genai"
)

query_emodel = f"""
CREATE OR REPLACE MODEL `{embedder_model}`
  REMOTE WITH CONNECTION `{genai_connection}`
  OPTIONS (ENDPOINT = "textembedding-gecko-multilingual");"""

In [None]:
# Submit the `gecko_embedder` remote-model DDL through the `run_query` helper.
run_query(query_emodel)

In [None]:
# Statement that embeds the parsed document text with `gecko_embedder` and
# keeps only the rows whose embedding status string is empty, writing them to
# `<incidents>_docs_embedded`. The document URI is carried along as `title`.
incidents_table_prefix = (
    f"{GOOGLE_CLOUD_BIGQUERY_PROJECT}."
    f"{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}."
    f"{BASE_TABLE_NAME_INCIDENTS}"
)
embedder_model = f"{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.gecko_embedder"

query_genembs = f"""
CREATE OR REPLACE TABLE `{incidents_table_prefix}_docs_embedded` AS
SELECT * FROM ML.GENERATE_EMBEDDING(
  MODEL `{embedder_model}`,
  (
    SELECT  JSON_VALUE(ml_process_document_result, '$.text') AS content, uri as title
    FROM `{incidents_table_prefix}_docs_parsed`
  )
)
WHERE LENGTH(ml_generate_embedding_status) = 0;"""

In [None]:
# Submit the embedding-generation statement through the `run_query` helper.
run_query(query_genembs)

In [None]:
# DDL for the remote text-generation model `gemini_model`, pointing at the
# endpoint named by GOOGLE_GEMINI_MODEL_10 through the `genai` connection.
#
# NOTE: this deliberately reuses the name `query_emodel` (also used for the
# embedding-model DDL) because the following cell runs `query_emodel`.
#
# Fix: the original wrote 'f{GOOGLE_GEMINI_MODEL_10}' inside this f-string,
# which rendered a literal "f" in front of the endpoint name.
query_emodel = f"""
CREATE OR REPLACE MODEL `{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.gemini_model`
  REMOTE WITH CONNECTION `{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_LOCATION_MULTI_REGION}.genai`
  OPTIONS (ENDPOINT = '{GOOGLE_GEMINI_MODEL_10}');"""

In [None]:
# Submit whatever DDL `query_emodel` currently holds; in top-to-bottom
# execution order that is the `gemini_model` definition from the previous
# cell, since the name is reused from the embedding-model cell.
run_query(query_emodel)

In [None]:
# Semantic search: embed the user's question with `gecko_embedder` and return
# the 5 nearest rows of the embedded-documents table via VECTOR_SEARCH.
user_query = 'Im having a high CPU utilization incident together with  Network Congestion Alert and High Active Connection Count Alert'

# The question is spliced into a single-quoted SQL string literal, so escape
# the characters that would otherwise terminate or corrupt it (backslash,
# single quote, newline). For text without those characters the generated SQL
# is unchanged.
user_query_literal = (
    user_query.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n")
)

query_search = f"""
SELECT *
FROM VECTOR_SEARCH(
  TABLE `{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.{BASE_TABLE_NAME_INCIDENTS}_docs_embedded`, 'ml_generate_embedding_result',
  (
  SELECT ml_generate_embedding_result, content AS query
  FROM ML.GENERATE_EMBEDDING(
   MODEL `{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.gecko_embedder`,
  (SELECT '{user_query_literal}' AS content))
  ),
  top_k => 5);"""

In [None]:
# Retrieval-augmented generation: embed the user's question, fetch the 10
# nearest documents with VECTOR_SEARCH, concatenate their text behind a fixed
# instruction, and send that prompt to `gemini_model` via ML.GENERATE_TEXT.
# The selected expression is the text of the first part of the first candidate.
user_query = "Im having a high CPU utilization incident together with  Network Congestion Alert and High Active Connection Count Alert"

# The question is spliced into a single-quoted SQL string literal, so escape
# the characters that would otherwise terminate or corrupt it (backslash,
# single quote, newline). For text without those characters the generated SQL
# is unchanged.
user_query_literal = (
    user_query.replace("\\", "\\\\").replace("'", "\\'").replace("\n", "\\n")
)

# NOTE(review): the prompt contains the retrieved articles but not the user's
# question itself -- confirm whether that is intended.
query_rag = f"""SELECT ml_generate_text_result.candidates[0].content.parts[0].text
FROM ML.GENERATE_TEXT(
  MODEL `{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.gemini_model`,
  (
    SELECT CONCAT(
      'Detail how to solve the issue using the following articles, produce a step by step guide ',
      STRING_AGG(base.content)
      ) AS prompt,
    FROM VECTOR_SEARCH(
  TABLE `{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.{BASE_TABLE_NAME_INCIDENTS}_docs_embedded`, 'ml_generate_embedding_result',
  (
  SELECT ml_generate_embedding_result, content AS query
  FROM ML.GENERATE_EMBEDDING(
   MODEL `{GOOGLE_CLOUD_BIGQUERY_DATASET_MULTI_REGION}.gecko_embedder`,
  (SELECT '{user_query_literal}' AS content))
  ),
  top_k => 10)
  ));"""

In [None]:
# Show the fully rendered RAG statement for inspection before running it.
print(query_rag)

In [None]:
# Run the RAG statement and print the first value of the `text` column of the
# result (presumably a pandas DataFrame, given the `.iloc` access -- confirm
# in utils.run_query).
rag_result = run_query(query_rag)
first_answer = rag_result['text'].iloc[0]
print(first_answer)