# Intelligent Query Crafting with BigQuery Context: Leveraging Vertex GenAI for Enhanced Business Insights on GCP

In [1]:
import os
GOOGLE_APPLICATION_CREDENTIALS = "/Users/zacharynguyen/Documents/GitHub/2024/Applied-Generative-AI/IAM/zacharynguyen-genai-656c475b142a.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS

In [2]:
PROJECT_ID='zacharynguyen-genai'
REGION = 'us-central1'
EXPERIMENT = 'sdoh_cdc_wonder_natality'
SERIES = 'applied-genai'

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import vertexai.language_models
import vertexai.preview.generative_models
from google.cloud import bigquery

In [4]:
# vertex ai clients
vertexai.init(project = PROJECT_ID, location = REGION)
# bigquery client
bq = bigquery.Client(project = PROJECT_ID)

## GOAL

In [19]:
question = "Which county has the highest number of births?"

##  Vertex LLM Setup

In [20]:
# create links to model: embedding api and text generation
textgen_model = vertexai.language_models.TextGenerationModel.from_pretrained('text-bison@002')
codegen_model = vertexai.language_models.CodeGenerationModel.from_pretrained('code-bison@002')
gemini_model = vertexai.preview.generative_models.GenerativeModel("gemini-pro")

In [21]:
textgen_model.predict(f"Write a Google SQL query that answers the following question.\nquestion: {question}")

 ```sql
SELECT county, COUNT(*) AS num_births
FROM births
GROUP BY county
ORDER BY num_births DESC
LIMIT 1;
```

In [22]:
codegen_model.predict(f"Write a Google SQL query that answers the following question.\nquestion: {question}")

```sql
SELECT county, COUNT(*) AS num_births
FROM births
GROUP BY county
ORDER BY num_births DESC
LIMIT 1;
```

In [23]:
print(gemini_model.generate_content(f"Write a Google SQL query that answers the following question.\nquestion: {question}", generation_config = dict(temperature=0)).text)

```sql
SELECT county, COUNT(*) AS num_births
FROM births
GROUP BY county
ORDER BY num_births DESC
LIMIT 1;
```


### The Challenge

While Large Language Models (LLMs) are capable of generating SQL queries, this process encounters notable obstacles:

- The queries often fail to target the appropriate tables.
- Incorrect column names are frequently used, not matching those in the intended tables.
- Essentially, the SQL produced serves more as an initial draft, requiring further refinement by the user to accurately fetch the necessary data in response to their query.

### Achieving Executable SQL Queries via LLM

To address these issues and generate fully operational SQL queries, a methodical strategy was developed through the repeated optimization of text prompts and methodologies tailored to distinct inquiries.

### Retrieve Table Schemas

In [24]:
BQ_PROJECT = 'bigquery-public-data'
BQ_DATASET = 'sdoh_cdc_wonder_natality'
BQ_TABLES = ['county_natality', 'county_natality_by_mother_race', 'county_natality_by_father_race']

In [25]:

query = f"""
    SELECT * EXCEPT(field_path, collation_name, rounding_mode)
    FROM `{BQ_PROJECT}.{BQ_DATASET}.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE table_name in ({','.join([f'"{table}"' for table in BQ_TABLES])})
"""
print(query)
schema_columns = bq.query(query = query).to_dataframe()


    SELECT * EXCEPT(field_path, collation_name, rounding_mode)
    FROM `bigquery-public-data.sdoh_cdc_wonder_natality.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
    WHERE table_name in ("county_natality","county_natality_by_mother_race","county_natality_by_father_race")


In [26]:
schema_columns.shape

(34, 6)

In [27]:
schema_columns.head()

Unnamed: 0,table_catalog,table_schema,table_name,column_name,data_type,description
0,bigquery-public-data,sdoh_cdc_wonder_natality,county_natality,Year,DATE,Year
1,bigquery-public-data,sdoh_cdc_wonder_natality,county_natality,County_of_Residence,STRING,County of Residence
2,bigquery-public-data,sdoh_cdc_wonder_natality,county_natality,County_of_Residence_FIPS,STRING,County of Residence Code
3,bigquery-public-data,sdoh_cdc_wonder_natality,county_natality,Births,INT64,Number of Births
4,bigquery-public-data,sdoh_cdc_wonder_natality,county_natality,Ave_Age_of_Mother,FLOAT64,Average Age of Mother (years)


In [28]:
schema_columns.to_markdown

<bound method DataFrame.to_markdown of            table_catalog              table_schema  \
0   bigquery-public-data  sdoh_cdc_wonder_natality   
1   bigquery-public-data  sdoh_cdc_wonder_natality   
2   bigquery-public-data  sdoh_cdc_wonder_natality   
3   bigquery-public-data  sdoh_cdc_wonder_natality   
4   bigquery-public-data  sdoh_cdc_wonder_natality   
5   bigquery-public-data  sdoh_cdc_wonder_natality   
6   bigquery-public-data  sdoh_cdc_wonder_natality   
7   bigquery-public-data  sdoh_cdc_wonder_natality   
8   bigquery-public-data  sdoh_cdc_wonder_natality   
9   bigquery-public-data  sdoh_cdc_wonder_natality   
10  bigquery-public-data  sdoh_cdc_wonder_natality   
11  bigquery-public-data  sdoh_cdc_wonder_natality   
12  bigquery-public-data  sdoh_cdc_wonder_natality   
13  bigquery-public-data  sdoh_cdc_wonder_natality   
14  bigquery-public-data  sdoh_cdc_wonder_natality   
15  bigquery-public-data  sdoh_cdc_wonder_natality   
16  bigquery-public-data  sdoh_cdc_wonder_n

### Code Generation LLM - With Context

In [29]:
print(question)

Which county has the highest number of births?


In [30]:
context_prompt = f"""
context (BigQuery Table Schema):
{schema_columns.to_markdown(index = False)}

Write a query for Google BigQuery using fully qualified table names to answer this question:
{question}
"""

#context_query = textgen_model.predict(f"Write a Google SQL query that answers the following question.\nquestion: {question}")
#context_query = codegen_model.predict(context_prompt, max_output_tokens = 500)
context_query = gemini_model.generate_content(context_prompt, generation_config = dict(temperature = 0))

print(context_query.text)

```sql
SELECT
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`.County_of_Residence,
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`.Births
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
ORDER BY
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`.Births DESC
LIMIT 1;
```


In [31]:
context_query = query = '\n'.join(context_query.text.split('\n')[1:-1])
print(context_query)

SELECT
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`.County_of_Residence,
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`.Births
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
ORDER BY
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`.Births DESC
LIMIT 1;


In [32]:
dry_run = bq.query(context_query, job_config = bigquery.QueryJobConfig(dry_run = True, use_query_cache = False))

In [33]:
dry_run.errors

In [34]:
dry_run.total_bytes_processed

54174

In [35]:
context_response = bq.query(context_query).to_dataframe()
context_response

Unnamed: 0,County_of_Residence,Births
0,"Los Angeles County, CA",123092


### Answer The Question
Now that a valid context has been retrieved from BigQuery it can be passed to a text generation LLM to answer the user questions.

In [37]:
question_prompt = f"""
context (result from BigQuery query):
{context_response.to_markdown(index = False)}

Answer the following question using the provided context.  Note that the context is a tabular result returned from a BigQuery query.  Do not repeat the question or the context when responding.
{question}
"""

question_response = textgen_model.predict(question_prompt)

print(question_response.text)

 Los Angeles County, CA


### Put It All Together
Ask a new question and try it out:

In [46]:
question = 'Which county has the highest number of births??'

In [47]:
context_prompt = f"""
context (BigQuery Table Schema):
{schema_columns.to_markdown(index = False)}

Write a query for Google BigQuery using fully qualified table names to answer this question:
{question}
"""

In [48]:
#context_query = textgen_model.predict(f"Write a Google SQL query that answers the following question.\nquestion: {question}")
#context_query = codegen_model.predict(context_prompt, max_output_tokens = 500)
context_query = gemini_model.generate_content(context_prompt, generation_config = dict(temperature = 0))
context_query

candidates {
  content {
    role: "model"
    parts {
      text: "```sql\nSELECT\n  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`.County_of_Residence,\n  SUM(`bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`.Births) AS Total_Births\nFROM\n  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`\nGROUP BY\n  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`.County_of_Residence\nORDER BY\n  Total_Births DESC\nLIMIT 1;\n```"
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
    probability_score: 0.351867646
    severity: HARM_SEVERITY_NEGLIGIBLE
    severity_score: 0.140336245
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
    probability_score: 0.294925213
    severity: HARM_SEVERITY_LOW
    severity_score: 0.213213295
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
    pr

In [49]:
context_response = bq.query(query = '\n'.join(context_query.text.split('\n')[1:-1])).to_dataframe()

In [50]:
print(context_response.to_markdown(index = False))

| County_of_Residence    |   Total_Births |
|:-----------------------|---------------:|
| Los Angeles County, CA |         350313 |


In [51]:
question_prompt = f"""
context (result from BigQuery query):
{context_response.to_markdown(index = False)}

Answer the following question.  Note that the context is a tabular result returned from a BigQuery query.  Do not repeat the question or the context when responding.
{question}
"""

question_response = textgen_model.predict(question_prompt)

print(question_response.text)

 Los Angeles County, CA


### More Complex

In [52]:
question = 'What is the average age of mothers giving birth?'

In [53]:
context_prompt = f"""
context (BigQuery Table Schema):
{schema_columns.to_markdown(index = False)}

Write a query for Google BigQuery using fully qualified table names to answer this question:
{question}
"""

#context_query = textgen_model.predict(f"Write a Google SQL query that answers the following question.\nquestion: {question}")
#context_query = codegen_model.predict(context_prompt, max_output_tokens = 500, temperature = 0.9)
context_query = gemini_model.generate_content(context_prompt, generation_config = dict(temperature = 0))

context_query = '\n'.join(context_query.text.split('\n')[1:-1])
context_response = bq.query(query = context_query).to_dataframe()

In [54]:

print(context_query)

SELECT
  AVG(Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`;


In [55]:
context_response

Unnamed: 0,average_age_of_mothers
0,28.695543


In [56]:
question_prompt = f"""
Answer the following question.  Note that the context above is a tabular result returned from a BigQuery query specific to this question.  Do not repeat the question or the context when responding.
{question}

Use this data:
{context_response.to_markdown(index = False)}
"""

question_response = gemini_model.generate_content(question_prompt)

print(question_response.text)

28.6955


### All Together and More Complex

In [66]:
question = 'What is the average age of mothers giving birth?'

In [67]:
context_prompt = f"""
context (BigQuery Table Schema):
{schema_columns.to_markdown(index = False)}
Note: The 'year' column adheres to the 'YYYY-MM-DD' format. Exercise caution when extracting the year, month, and day components.
Craft a SQL query for Google BigQuery, ensuring the use of fully qualified table names to address the following inquiry:
{question}
"""

context_query = gemini_model.generate_content(context_prompt, generation_config = dict(temperature = 0))

context_query = '\n'.join(context_query.text.split('\n')[1:-1])
context_query
#context_response = bq.query(query = context_query).to_dataframe()

'SELECT\n  AVG(Ave_Age_of_Mother) AS average_age_of_mothers\nFROM\n  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`;'

In [68]:
context_response

Unnamed: 0,average_age_of_mothers
0,28.695543


In [69]:
question_prompt = f"""
Answer the following question.  Note that the context above is a tabular result returned from a BigQuery query specific to this question.  Do not repeat the question or the context when responding.
{question}

Use this data:
{context_response.to_markdown(index = False)}
"""

question_response = gemini_model.generate_content(question_prompt)

print(question_response.text)

28.69


### Detect Error

In [70]:
question = 'What is the average age of mothers giving birth in the year of 2018?'

In [71]:
context_prompt = f"""
context (BigQuery Table Schema):
{schema_columns.to_markdown(index = False)}

Write a query for Google BigQuery using fully qualified table names to answer this question:
{question}
"""

context_query = codegen_model.predict(context_prompt, max_output_tokens = 256)

In [72]:
context_query

```sql
SELECT
  AVG(county_natality.Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  county_natality.Year = '2018';
```

In [73]:
context_query = '\n'.join(context_query.text.split('\n')[1:-1])
print(context_query)

SELECT
  AVG(county_natality.Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  county_natality.Year = '2018';


In [74]:
try:
    bq.query(context_query, job_config = bigquery.QueryJobConfig(dry_run = True, use_query_cache = False))
except Exception as err:
    errors = f"{type(err).__name__} was raised: {err}"
    print(errors)

BadRequest was raised: 400 POST https://bigquery.googleapis.com/bigquery/v2/projects/zacharynguyen-genai/jobs?prettyPrint=false: Unrecognized name: county_natality at [6:3]

Location: None
Job ID: 3c676027-4799-4911-a523-6a426ec68079


In [75]:
query_job = bq.query(query = context_query)

In [76]:

type(query_job)

google.cloud.bigquery.job.query.QueryJob

In [77]:

query_job.errors

[{'reason': 'invalidQuery',
  'location': 'query',
  'message': 'Unrecognized name: county_natality at [6:3]'}]

### Use A Chat Model To Iteratively Refine A Query:

In [78]:
codechat_model = vertexai.language_models.CodeChatModel.from_pretrained('codechat-bison@002')

In [79]:
codechat = codechat_model.start_chat(
    context = f"""
The BigQuery Environment has tables defined by the follow schema:
{schema_columns.to_markdown(index = False)}

This session is trying to troubleshoot a Google BigQuery SQL query that is being writen to answer the question:
{question}

BigQuery SQL query that needs to be fixed:
{context_query}

Instructions:
As the user provides versions of the query and the errors returned by BigQuery, offer suggestions that fix the errors but it is important that the query still answer the original question.
"""
)

In [80]:
errors

'BadRequest was raised: 400 POST https://bigquery.googleapis.com/bigquery/v2/projects/zacharynguyen-genai/jobs?prettyPrint=false: Unrecognized name: county_natality at [6:3]\n\nLocation: None\nJob ID: 3c676027-4799-4911-a523-6a426ec68079\n'

In [81]:
hint = ''
if errors.rindex('[') and errors.rindex(']'):
    begin = errors.rindex('[') + 1
    end = errors.rindex(']')
    if end > begin and errors[begin:end].index(':'):
        query_index = [int(q) for q in errors[begin:end].split(':')]
        hint += context_query.split('\n')[query_index[0] - 1].strip()
print(hint)

county_natality.Year = '2018';


In [82]:
fix_prompt = f"""
This query:
{context_query}

Returns these errors:
{query_job.errors}
"""

if hint != '':
    fix_prompt += f"""
Hint, the error appears to be in this line of the query:
{hint}
"""
print(fix_prompt)


This query:
SELECT
  AVG(county_natality.Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  county_natality.Year = '2018';

Returns these errors:
[{'reason': 'invalidQuery', 'location': 'query', 'message': 'Unrecognized name: county_natality at [6:3]'}]

Hint, the error appears to be in this line of the query:
county_natality.Year = '2018';


In [83]:
response = codechat.send_message(fix_prompt)

In [84]:
response

 To fix this error, the table alias `county_natality` needs to be added to the `Year` column. The corrected line should be:

```sql
county_natality.Year = '2018'
```

The updated query should look like this:

```sql
SELECT
  AVG(county_natality.Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  county_natality.Year = '2018';
```

This should resolve the error and allow the query to run successfully.

In [85]:
context_query = codechat.send_message(f'Respond with only the corrected query as a markdown code block.')

In [86]:
context_query

 ```sql
SELECT
  AVG(county_natality.Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  county_natality.Year = '2018';
```

In [87]:
if context_query.text.find("```") >= 0:
    context_query = response.text.split("```")[1]
    if context_query.startswith('sql'): context_query = context_query[3:]
    print(context_query)
else:
    print('no query in response')


county_natality.Year = '2018'


In [88]:
query_job = bq.query(query = context_query)

In [89]:
query_job.errors

[{'reason': 'invalidQuery',
  'location': 'query',
  'message': 'Syntax error: Unexpected identifier "county_natality" at [2:1]'}]

In [90]:
fix_prompt = f"""
This query:
{context_query}

Returns these errors:
{query_job.errors}
"""

if hint != '':
    fix_prompt += f"""
Hint, the error appears to be in this line of the query:
{hint}
"""
print(fix_prompt)


This query:

county_natality.Year = '2018'


Returns these errors:
[{'reason': 'invalidQuery', 'location': 'query', 'message': 'Syntax error: Unexpected identifier "county_natality" at [2:1]'}]

Hint, the error appears to be in this line of the query:
county_natality.Year = '2018';


In [91]:
response = codechat.send_message(fix_prompt)

In [92]:
response

 To fix this error, the `county_natality` table alias needs to be added to the beginning of the line. The corrected line should be:

```sql
county_natality.Year = '2018'
```

The updated query should look like this:

```sql
SELECT
  AVG(county_natality.Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  county_natality.Year = '2018';
```

This should resolve the error and allow the query to run successfully.

# FIX

In [93]:
def initial_query(question, schema_columns):
    
    # code generation model
    codegen_model = vertexai.language_models.CodeGenerationModel.from_pretrained('code-bison@002')
    
    # initial request for query:
    context_prompt = f"""
context (BigQuery Table Schema):
{schema_columns.to_markdown(index = False)}

Write a query for Google BigQuery using fully qualified table names to answer this question:
{question}
"""

    context_query = codegen_model.predict(context_prompt, max_output_tokens = 256)
    
    # extract query from response
    if context_query.text.find("```") >= 0:
        context_query = context_query.text.split("```")[1]
        if context_query.startswith('sql'):
            context_query = context_query[3:]
        print('Initial Query:\n', context_query)
    else:
        print('No query provided (first try) - unforseen error, printing out response to help with editing this funcntion:\n', query_response.text)
    
    return context_query  

In [94]:
def codechat_start(question, query, schema_columns):

    # code chat model
    codechat_model = vertexai.language_models.CodeChatModel.from_pretrained('codechat-bison@002')
    
    # start a code chat session and give the schema for columns as the starting context:
    codechat = codechat_model.start_chat(
        context = f"""
The BigQuery Environment has tables defined by the follow schema:
{schema_columns.to_markdown(index = False)}

This session is trying to troubleshoot a Google BigQuery SQL query that is being writen to answer the question:
{question}

BigQuery SQL query that needs to be fixed:
{query}

Instructions:
As the user provides versions of the query and the errors returned by BigQuery, offer suggestions that fix the errors but it is important that the query still answer the original question.
"""
    )
    
    return codechat

In [95]:
def fix_query(query, max_fixes):
    
    # iteratively run query, and fix it using codechat until success (or max_fixes reached):
    fix_tries = 0
    answer = False
    while fix_tries < max_fixes:
        if not query: 
            return
        # run query:
        query_job = bq.query(query = query)
        # if errors, then generate repair query:
        if query_job.errors:
            fix_tries += 1
            
            if fix_tries == 1:
                codechat = codechat_start(question, query, schema_columns)
            
            # construct hint from error
            hint = ''
            for error in query_job.errors:
                # detect error message
                if 'message' in list(error.keys()):
                    # detect index of error location
                    if error['message'].rindex('[') and error['message'].rindex(']'):
                        begin = error['message'].rindex('[') + 1
                        end = error['message'].rindex(']')
                        # verify that it looks like an error location:
                        if end > begin and error['message'][begin:end].index(':'):
                            # retrieve the two parts of the error index: query line, query column
                            query_index = [int(q) for q in error['message'][begin:end].split(':')]
                            hint += query.split('\n')[query_index[0]-1].strip()
                            break
            
            # construct prompt to request a fix:
            fix_prompt = f"""This query:\n{query}\n\nReturns these errors:\n{query_job.errors}"""

            if hint != '':
                fix_prompt += f"""\n\nHint, the error appears to be in this line of the query:\n{hint}"""            
            
            query_response = codechat.send_message(fix_prompt)
            query_response = codechat.send_message('Respond with only the corrected query that still answers the question as a markdown code block.')
            if query_response.text.find("```") >= 0:
                query = query_response.text.split("```")[1]
                if query.startswith('sql'):
                    query = query[4:]
                print(f'Fix #{fix_tries}:\n', query)
            # response did not have a query????:
            else:
                query = ''
                print('No query in response...')

        # no error, break while loop
        else:
            break
    
    return query, query_job, fix_tries, codechat

In [96]:
def answer_question(question, query_job):

    # text generation model
    gemini_model = vertexai.preview.generative_models.GenerativeModel("gemini-pro")

    # answer question
    result = query_job.to_dataframe()
    question_prompt = f"""
Answer the following question.  Note that the context is a tabular result returned from a BigQuery query specific to this question.  Do not repeat the question or the context when responding.
{question}

Use this data:
{result.to_markdown(index = False)}
    """

    question_response = gemini_model.generate_content(question_prompt)
    
    return question_response.text

In [97]:
def BQ_QA(question, max_fixes = 10, schema_columns = schema_columns):
    
    # generate query
    query = initial_query(question, schema_columns)
    
    # run query:
    query_job = bq.query(query = query)
    # if errors, then generate repair query:
    if query_job.errors:
        print('found errors')
        query, query_job, fix_tries, codechat = fix_query(query, max_fixes)
    
    # respond with outcome:
    if query_job.errors:
        print(f'No answer generated after {fix_tries} tries.')
        return codechat
    else:
        question_response = answer_question(question, query_job)
        print(question_response)
        try:
            return codechat
        except:
            return None

In [98]:
question

'What is the average age of mothers giving birth in the year of 2018?'

In [99]:
session = BQ_QA(question)

Initial Query:
 
SELECT
  AVG(county_natality.Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  county_natality.Year = '2018';
found errors
Fix #1:
 SELECT
  AVG(Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  Year = '2018';
Fix #2:
 SELECT
  AVG(Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  Year = DATE('2018');
Fix #3:
 SELECT
  AVG(Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  Year = '2018';
Fix #4:
 SELECT
  AVG(Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  CAST(Year as DATE) = DATE('2018');
Fix #5:
 SELECT
  AVG(Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_nata

In [100]:
for message in session.message_history:
    print(message.content)

This query:

SELECT
  AVG(county_natality.Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  county_natality.Year = '2018';


Returns these errors:
[{'reason': 'invalidQuery', 'location': 'query', 'message': 'Unrecognized name: county_natality at [7:3]'}]

Hint, the error appears to be in this line of the query:
county_natality.Year = '2018';
 The error message suggests that the `county_natality` table is not recognized. This could be because the table name is misspelled or because the table is not in the expected dataset. To fix this, make sure that the table name is spelled correctly and that the table is in the `bigquery-public-data.sdoh_cdc_wonder_natality` dataset.

Here's the corrected query:

```sql
SELECT
  AVG(Ave_Age_of_Mother) AS average_age_of_mothers
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE
  Year = '2018';
```
Respond with only the corrected query that still answer

In [101]:
session = BQ_QA("How does the average pre-pregnancy BMI vary by county?")

Initial Query:
 
SELECT
  county_natality.County_of_Residence,
  AVG(county_natality.Ave_Pre_pregnancy_BMI) AS average_pre_pregnancy_bmi
FROM
  bigquery-public-data.sdoh_cdc_wonder_natality.county_natality
GROUP BY
  county_natality.County_of_Residence
ORDER BY
  average_pre_pregnancy_bmi DESC;
The lowest pre-pregnancy BMI values are in San Francisco County (24.0067), New York County (23.8967), and Arlington County (24.3567).


In [104]:
session = BQ_QA("How does the average gestational age at birth vary by the mother's single race and show me the number?")

Initial Query:
 
SELECT
  county_natality_by_mother_race.Mothers_Single_Race,
  AVG(county_natality_by_mother_race.Ave_OE_Gestational_Age_Wks) AS average_gestational_age,
  COUNT(county_natality_by_mother_race.Births) AS birth_count
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_mother_race`
GROUP BY
  county_natality_by_mother_race.Mothers_Single_Race
ORDER BY
  average_gestational_age DESC;
found errors
Fix #1:
 SELECT
  county_natality.Mothers_Single_Race,
  AVG(county_natality.Ave_OE_Gestational_Age_Wks) AS average_gestational_age,
  COUNT(county_natality.Births) AS birth_count
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
GROUP BY
  county_natality.Mothers_Single_Race
ORDER BY
  average_gestational_age DESC;
Fix #2:
 SELECT
  county_natality_by_mother_race.Mothers_Single_Race,
  AVG(county_natality_by_mother_race.Ave_OE_Gestational_Age_Wks) AS average_gestational_age,
  COUNT(county_natality_by_mother_race.Births) AS birth_count
F

In [106]:
session = BQ_QA("Which mother's single race category reports the highest average number of births and what is that number?")

Initial Query:
 
SELECT
  Mothers_Single_Race,
  AVG(Births) AS average_births
FROM
  `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_mother_race`
GROUP BY
  Mothers_Single_Race
ORDER BY
  average_births DESC
LIMIT 1;
White, with an average of 4528.37 births


In [107]:
session = BQ_QA("How does the average gestational age at birth vary by the mother's single race?")

Initial Query:
 
SELECT
  county_natality_by_mother_race.Year,
  county_natality_by_mother_race.Mothers_Single_Race,
  AVG(county_natality_by_mother_race.Ave_OE_Gestational_Age_Wks) AS Average_OE_Gestational_Age_Weeks
FROM
  bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_mother_race
GROUP BY
  county_natality_by_mother_race.Year,
  county_natality_by_mother_race.Mothers_Single_Race
ORDER BY
  county_natality_by_mother_race.Year,
  Average_OE_Gestational_Age_Weeks DESC;
The average gestational age at birth for White mothers is consistently higher than all other races by about 0.2 years (8-9 days). American Indian or Alaska Native mothers have gestational ages about 0.1 less than White mothers, but consistently about 0.02 fewer than all other races. This means that Black or African American mothers, along with Asian, Hawaiian, or Mixed-race mothers (more than one race) each have consistently younger gestational ages than White mothers by at least 0.4 or about 17 days.
