In [42]:
import os
# Import Google Cloud and Vertex AI libraries
from google.cloud import bigquery
import vertexai
import vertexai.language_models
import vertexai.preview.generative_models

# Set environment variables for Google Cloud credentials and TensorFlow logging level
GOOGLE_APPLICATION_CREDENTIALS = "/Users/zacharynguyen/Documents/GitHub/2024/Applied-Generative-AI/IAM/zacharynguyen-genai-656c475b142a.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Define Google Cloud project and region information
PROJECT_ID = 'zacharynguyen-genai'
REGION = 'us-central1'
EXPERIMENT = 'sdoh_cdc_wonder_natality'
SERIES = 'applied-genai'

# Initialize Vertex AI
vertexai.init(project=PROJECT_ID, location=REGION)

# Initialize the BigQuery client
bq = bigquery.Client(project=PROJECT_ID)

# Initialize Vertex AI models
textgen_model = vertexai.language_models.TextGenerationModel.from_pretrained('text-bison@002')
codegen_model = vertexai.language_models.CodeGenerationModel.from_pretrained('code-bison@002')
gemini_model = vertexai.preview.generative_models.GenerativeModel("gemini-pro")

# BigQuery constants for querying
BQ_PROJECT = 'bigquery-public-data'
BQ_DATASET = 'sdoh_cdc_wonder_natality'
BQ_TABLES = ['county_natality', 'county_natality_by_mother_race', 'county_natality_by_father_race']

# Construct and execute a BigQuery query to retrieve schema columns
query = f"""
    SELECT * EXCEPT(field_path, collation_name, rounding_mode)
    FROM `{BQ_PROJECT}.{BQ_DATASET}.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE table_name in ({','.join([f'"{table}"' for table in BQ_TABLES])})
"""
schema_columns = bq.query(query=query).to_dataframe()

# Define the question for the model
question = "Which mother's single race category reports the highest average number of births and what is that number?"


In [43]:
def initial_query(question, schema_columns):
    
    # code generation model
    codegen_model = vertexai.language_models.CodeGenerationModel.from_pretrained('code-bison@002')
    
    # initial request for query:
    context_prompt = f"""
context (BigQuery Table Schema):
{schema_columns.to_markdown(index = False)}

Write a query for Google BigQuery using fully qualified table names to answer this question:
{question}
"""

    context_query = codegen_model.predict(context_prompt, max_output_tokens = 256)
    
    # extract query from response
    if context_query.text.find("```") >= 0:
        context_query = context_query.text.split("```")[1]
        if context_query.startswith('sql'):
            context_query = context_query[3:]
        print('Initial Query:\n', context_query)
    else:
        print('No query provided (first try) - unforseen error, printing out response to help with editing this funcntion:\n', query_response.text)
    
    return context_query  

In [44]:
initial_query(question,schema_columns)

Initial Query:
 
SELECT
  Mothers_Single_Race,
  AVG(Births) AS average_births
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_mother_race`
GROUP BY
  Mothers_Single_Race
ORDER BY
  average_births DESC
LIMIT 1;


'\nSELECT\n  Mothers_Single_Race,\n  AVG(Births) AS average_births\nFROM\n  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_mother_race`\nGROUP BY\n  Mothers_Single_Race\nORDER BY\n  average_births DESC\nLIMIT 1;\n'

In [45]:
def codechat_start(question, query, schema_columns):

    # code chat model
    codechat_model = vertexai.language_models.CodeChatModel.from_pretrained('codechat-bison@002')
    
    # start a code chat session and give the schema for columns as the starting context:
    codechat = codechat_model.start_chat(
        context = f"""
The BigQuery Environment has tables defined by the follow schema:
{schema_columns.to_markdown(index = False)}

This session is trying to troubleshoot a Google BigQuery SQL query that is being writen to answer the question:
{question}

BigQuery SQL query that needs to be fixed:
{query}

Instructions:
As the user provides versions of the query and the errors returned by BigQuery, offer suggestions that fix the errors but it is important that the query still answer the original question.
"""
    )
    
    return codechat

In [46]:
def fix_query(query, max_fixes):
    
    # iteratively run query, and fix it using codechat until success (or max_fixes reached):
    fix_tries = 0
    answer = False
    while fix_tries < max_fixes:
        if not query: 
            return
        # run query:
        query_job = bq.query(query = query)
        # if errors, then generate repair query:
        if query_job.errors:
            fix_tries += 1
            
            if fix_tries == 1:
                codechat = codechat_start(question, query, schema_columns)
            
            # construct hint from error
            hint = ''
            for error in query_job.errors:
                # detect error message
                if 'message' in list(error.keys()):
                    # detect index of error location
                    if error['message'].rindex('[') and error['message'].rindex(']'):
                        begin = error['message'].rindex('[') + 1
                        end = error['message'].rindex(']')
                        # verify that it looks like an error location:
                        if end > begin and error['message'][begin:end].index(':'):
                            # retrieve the two parts of the error index: query line, query column
                            query_index = [int(q) for q in error['message'][begin:end].split(':')]
                            hint += query.split('\n')[query_index[0]-1].strip()
                            break
            
            # construct prompt to request a fix:
            fix_prompt = f"""This query:\n{query}\n\nReturns these errors:\n{query_job.errors}"""

            if hint != '':
                fix_prompt += f"""\n\nHint, the error appears to be in this line of the query:\n{hint}"""            
            
            query_response = codechat.send_message(fix_prompt)
            query_response = codechat.send_message('Respond with only the corrected query that still answers the question as a markdown code block.')
            if query_response.text.find("```") >= 0:
                query = query_response.text.split("```")[1]
                if query.startswith('sql'):
                    query = query[4:]
                print(f'Fix #{fix_tries}:\n', query)
            # response did not have a query????:
            else:
                query = ''
                print('No query in response...')

        # no error, break while loop
        else:
            break
    
    return query, query_job, fix_tries, codechat

In [47]:
def answer_question(question, query_job):

    # text generation model
    gemini_model = vertexai.preview.generative_models.GenerativeModel("gemini-pro")

    # answer question
    result = query_job.to_dataframe()
    question_prompt = f"""
Please derive insights from: {question}
Utilize the statistics from the BigQuery table relevant to this inquiry. Emphasize crucial discoveries and their implications for strategic actions, ensuring to mention specific statistics where possible. Avoid repeating the question or detailing the dataset's context.

Use this data:
{result.to_markdown(index = False)}
    """

    question_response = gemini_model.generate_content(question_prompt)
    
    return question_response.text

In [48]:
def BQ_QA(question, max_fixes = 10, schema_columns = schema_columns):
    
    # generate query
    query = initial_query(question, schema_columns)
    
    # run query:
    query_job = bq.query(query = query)
    # if errors, then generate repair query:
    if query_job.errors:
        print('found errors')
        query, query_job, fix_tries, codechat = fix_query(query, max_fixes)
    
    # respond with outcome:
    if query_job.errors:
        print(f'No answer generated after {fix_tries} tries.')
        return codechat
    else:
        question_response = answer_question(question, query_job)
        print(question_response)
        try:
            return codechat
        except:
            return None

In [49]:
session = BQ_QA("Which mother's single race category reports the highest average number of births and what is that number?")

Initial Query:
 
SELECT
  Mothers_Single_Race,
  AVG(Births) AS average_births
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_mother_race`
GROUP BY
  Mothers_Single_Race
ORDER BY
  average_births DESC
LIMIT 1;
**Key Insight:**

White mothers have the highest average number of births at 4528.37.

**Implications for Strategic Actions:**

* **Targeted Healthcare Programs:** Healthcare providers can design specific programs and interventions tailored to the needs of White mothers with multiple births, considering factors such as prenatal care, postpartum support, and access to resources.
* **Maternal Health Data Collection:** Collecting more detailed data on the single-race category of mothers can help identify disparities in birth outcomes and inform evidence-based policy decisions.
* **Community Outreach and Support:** Community-based organizations can prioritize outreach and support services for White mothers with multiple births, providing resources and educat

In [58]:
session = BQ_QA("Which county has the highest birth rates in Tenneesee?")

Initial Query:
 
SELECT
  county_natality.County_of_Residence,
  county_natality.Births
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  county_natality.County_of_Residence LIKE "%Tennessee%"
ORDER BY
  county_natality.Births DESC
LIMIT 1;
found errors
Fix #1:
 SELECT
  county_natality.County_of_Residence,
  county_natality.Births
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality` AS county_natality
WHERE
  county_natality.County_of_Residence LIKE "%Tennessee%"
ORDER BY
  county_natality.Births DESC
LIMIT 1;
Shelby County has the highest birth rates in Tennessee with 10,000 births.

Insights:

- **Increased Healthcare Demand:** The high birth rates imply a need for expanded healthcare services, including prenatal care, labor and delivery services, and neonatal care. Strategic actions should focus on increasing healthcare capacity in Shelby County.
- **Educational and Social Services:** The large number of births also highlights the need

In [51]:
session = BQ_QA ("How does the average pre-pregnancy BMI vary by county?")

Initial Query:
 
SELECT
  county_natality.County_of_Residence,
  AVG(county_natality.Ave_Pre_pregnancy_BMI) AS average_pre_pregnancy_bmi
FROM
  bigquery-public-data.sdoh_cdc_wonder_natality.county_natality
GROUP BY
  county_natality.County_of_Residence
ORDER BY
  average_pre_pregnancy_bmi DESC;
**Significant Discoveries and Their Implications**

- **Significant geographical disparities in pre-pregnancy BMI:** Counties within the same state can have significantly different average pre-pregnancy BMIs. For example, Oswego County, NY, has a pre-pregnancy BMI of 29.1667, while Livingston County, MI, has a pre-pregnancy BMI of 26.5367. This highlights the need for tailored approaches to addressing obesity and its associated health risks in different regions.

- **Counties with high pre-pregnancy BMI tend to cluster in certain areas:** There are clusters of counties with high pre-pregnancy BMI in the southeastern United States, the Midwest, and parts of the West Coast. This suggests that fact

In [60]:
session = BQ_QA("Are there significant differences in the outcomes (birth weight, gestational age) for different races of mothers?")

Initial Query:
 
WITH MotherRace AS (
  SELECT
    'All Races' AS Mother_Race,
    AVG(Ave_Birth_Weight_gms) AS Avg_Birth_Weight_All,
    AVG(Ave_OE_Gestational_Age_Wks) AS Avg_OE_Gestational_Age_All,
    AVG(Ave_LMP_Gestational_Age_Wks) AS Avg_LMP_Gestational_Age_All
  FROM
    `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_mother_race`
),
WhiteMothers AS (
  SELECT
    'White' AS Mother_Race,
    AVG(Ave_Birth_Weight_gms) AS Avg_Birth_Weight_White,
    AVG(Ave_OE_Gestational_Age_Wks) AS Avg_OE_Gestational_Age_White,
    AVG(Ave_LMP_Gestational_Age_Wks) AS Avg_LMP_Gestational_Age_White
  FROM
    `bigquery-public-data.sdoh_cdc_wonder_natality.county
found errors
Fix #1:
 WITH MotherRace AS (
  SELECT
    'All Races' AS Mother_Race,
    AVG(Ave_Birth_Weight_gms) AS Avg_Birth_Weight_All,
    AVG(Ave_OE_Gestational_Age_Wks) AS Avg_OE_Gestational_Age_All,
    AVG(Ave_LMP_Gestational_Age_Wks) AS Avg_LMP_Gestational_Age_All
  FROM
    `bigquery-public-data.sdoh_cdc_wonder