# Dependencies

In [17]:
import sys
from tqdm.notebook import tqdm

In [3]:
!{sys.executable} -m pip install langchain==0.0.340 --quiet
!{sys.executable} -m pip install chromadb==0.4.13 --quiet
!{sys.executable} -m pip install google-cloud-aiplatform --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.12.0 requires dill<0.3.7,>=0.3.0, but you have dill 0.3.8 which is incompatible.[0m[31m
[0mzsh:1: no matches found: google-cloud-bigquery[pandas]


# BigQuery configuration

Don't change these options. These are the public datasets used in this workshop.

In [5]:
# Dataset and project used below to build fully qualified BigQuery table IDs.
BIGQUERY_DATASET = "wh_raw"
BIGQUERY_PROJECT = "analytics-147612"

In [None]:
from google.oauth2 import service_account

# Schemas as context for the prompt

In [7]:
import json
from google.cloud import bigquery

# Reuse the project configured in the BigQuery configuration cell instead of
# repeating the literal, so the client and the table paths cannot drift apart.
project = BIGQUERY_PROJECT
# Default location passed to the client.
location = "EU"
bq_client = bigquery.Client(project=project, location=location)



In [18]:
# Materialise the listing first: `list_tables` hands back a lazy iterator with
# no length, so tqdm could not show a total (it printed "0it") and `bq_tables`
# was left exhausted once the loop finished.
bq_tables = list(
    bq_client.list_tables(dataset=f"{BIGQUERY_PROJECT}.{BIGQUERY_DATASET}")
)

# One text document per table, describing its schema as JSON.
schemas = []
for bq_table in tqdm(bq_tables, desc="Fetching schemas"):
    t = bq_client.get_table(
        f"{BIGQUERY_PROJECT}.{BIGQUERY_DATASET}.{bq_table.table_id}"
    )
    schema_fields = [f.to_api_repr() for f in t.schema]
    schema = f"The schema for table {bq_table.table_id} is the following: \n```{json.dumps(schema_fields, indent=1)}```"
    schemas.append(schema)

print(f"Found {len(schemas)} tables in dataset {BIGQUERY_PROJECT}:{BIGQUERY_DATASET}")

0it [00:00, ?it/s]

Found 1464 tables in dataset analytics-147612:wh_raw


# Vector store

We add the schemas as documents to a vector store, to be added to the prompt later.

We will retrieve only one document from the store for the prompt: the most relevant doc.

In [None]:
import openai
from langchain.embeddings import (
    OpenAIEmbeddings,
)  # This is hypothetical; adjustments may be necessary
from langchain.vectorstores import Chroma

# Assumes the OpenAI API key is available in the environment; alternatively set
# it manually before running this cell:
# openai.api_key = "your-api-key-here"

embeddings = OpenAIEmbeddings(api_key=openai.api_key)

# Drop the collection left by a previous run of this cell so the schemas are
# not indexed twice.
try:
    vector_store.delete_collection()
except NameError:
    # First run on a fresh kernel: `vector_store` does not exist yet.
    print("No need to clean the vector store")
except Exception as exc:
    # Cleaning is best effort, but the reason is reported and interrupts such
    # as KeyboardInterrupt are no longer swallowed.
    print(f"Could not clean the vector store: {exc!r}")
vector_store = Chroma.from_texts(schemas, embedding=embeddings)

n_docs = len(vector_store.get()["ids"])
# k=1: only the single most relevant schema is retrieved for the prompt.
retriever = vector_store.as_retriever(search_kwargs={"k": 1})
print(f"The vector store has {n_docs} documents")

In [None]:
from langchain.embeddings import VertexAIEmbeddings
from langchain.vectorstores import Chroma

embeddings = VertexAIEmbeddings()

# Drop the collection left by a previous run of this cell so the schemas are
# not indexed twice.
try:
    vector_store.delete_collection()
except NameError:
    # First run on a fresh kernel: `vector_store` does not exist yet.
    print("No need to clean the vector store")
except Exception as exc:
    # Cleaning is best effort, but the reason is reported and interrupts such
    # as KeyboardInterrupt are no longer swallowed.
    print(f"Could not clean the vector store: {exc!r}")
vector_store = Chroma.from_texts(schemas, embedding=embeddings)
n_docs = len(vector_store.get()["ids"])
# k=1: only the single most relevant schema is retrieved for the prompt.
retriever = vector_store.as_retriever(search_kwargs={"k": 1})
print(f"The vector store has {n_docs} documents")

# Models

In [None]:
from langchain.chat_models import ChatVertexAI
from langchain.llms import VertexAI

# Three chat model instances, one per role suggested by their names.
# Only `query_model` selects a specific model ("codechat-bison"); the other two
# omit `model_name` and therefore use ChatVertexAI's default.
query_model = ChatVertexAI(model_name="codechat-bison", max_output_tokens=2048)
interpret_data_model = ChatVertexAI(max_output_tokens=2048)
agent_model = ChatVertexAI(max_output_tokens=1024)

# Get a SQL query chain

In [None]:
# Prompt template for SQL generation. Placeholders to fill: {context},
# {chat_history}, {project}, {dataset} and {question}.
SQL_PROMPT = """You are a SQL and BigQuery expert.

Your job is to create a query for BigQuery in SQL.

The following paragraph contains the schema of the table used for a query. It is encoded in JSON format.

{context}

Create a BigQuery SQL query for the following user input, using the above table.

The user and the agent have done this conversation so far:
{chat_history}

Follow these restrictions strictly:
- Only return the SQL code.
- Do not add backticks or any markup. Only write the query as output. NOTHING ELSE.
- In FROM, always use the full table path, using `{project}` as project and `{dataset}` as dataset.
- Always transform country names to full uppercase. For instance, if the country is Japan, you should use JAPAN in the query.

User input: {question}

SQL query:
"""

In [None]:
from langchain.schema.vectorstore import VectorStoreRetriever


def get_documents(retriever: "VectorStoreRetriever", question: str) -> str:
    """Return the text of the most relevant document for a question.

    Args:
        retriever: Retriever queried through ``get_relevant_documents``.
        question: Text used to look up relevant documents.

    Returns:
        The ``page_content`` of the first retrieved document followed by a
        newline, or an empty string when nothing is retrieved.
    """
    # Only the first document is used; the default covers an empty result so
    # the function always returns a string instead of None.
    first_doc = next(iter(retriever.get_relevant_documents(question)), None)
    if first_doc is None:
        return ""
    return first_doc.page_content + "\n"

In [None]:
from operator import itemgetter
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser

# Write a chain named `query`
### TODO
### TODO
### TODO
### TODO
### TODO

In [None]:
from langchain.callbacks.tracers import ConsoleCallbackHandler

# Example
# Invoke the `query` chain with a sample question and an empty chat history.
# A ConsoleCallbackHandler is attached as a callback for this run.
x = {
    "input": "Which countries in Asia had more houses damaged? Give me the top 3",
    "chat_history": "",
}
print(query.invoke(x, config={"callbacks": [ConsoleCallbackHandler()]}))

# Add more outputs to the previous chain

In [None]:
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.schema.runnable import RunnableLambda


def _dict_to_json(x: dict) -> str:
    return "```\n" + json.dumps(x) + "\n```"


# Fields that the structured output parser expects in the query chain output.
query_response_schema = [
    ResponseSchema(name="query", description="SQL query to solve the user question."),
    ResponseSchema(name="question", description="Question asked by the user."),
    ResponseSchema(
        name="context", description="Documents retrieved from the vector store."
    ),
]
query_output_parser = StructuredOutputParser.from_response_schemas(
    query_response_schema
)
# NOTE(review): `docs`, `question` and `query` are not defined in the visible
# cells; presumably they come from the TODO cell that builds the `query`
# chain. Confirm before running.
# The final dict is rendered as a fenced JSON string by `_dict_to_json`.
query_output_json = (
    docs
    | question
    | {"query": query}
    | RunnableLambda(_dict_to_json)
    | StrOutputParser()
)
# Feed the fenced JSON string to the structured output parser.
query_output = query_output_json | query_output_parser

In [None]:
# Example
# Invoke the extended chain with the same sample question and an empty chat
# history.
x = {
    "input": "Which countries in Asia had more houses damaged? Give me the top 3",
    "chat_history": "",
}
query_output.invoke(x)  # Output is now a dictionary, input for the next chain

# Interpret the output chain

In [None]:
# Prompt template for turning a query result into a short answer.
# Placeholders to fill: {context}, {question}, {query} and {result}.
INTERPRET_PROMPT = """You are a BigQuery expert. You are also expert in extracting data from CSV.

The following paragraph describes the schema of the table used for a query. It is encoded in JSON format.

{context}

A user asked this question:
{question}

To find the answer, the following SQL query was run in BigQuery:
```
{query}
```

The result of that query was the following table in CSV format:
```
{result}
```

Based on those results, provide a brief answer to the user question.

Follow these restrictions strictly:
- Do not add any explanation about how the answer is obtained, just write the answer.
- Extract any value related to the answer only from the result of the query. Do not use any other data source.
- Just write the answer, omit the question from your answer, this is a chat, just provide the answer.
- If you cannot find the answer in the result, do not make up any data, just say "I cannot find the answer"
"""

In [None]:
from google.cloud import bigquery


def get_bq_csv(
    bq_client: "bigquery.Client", query: str, location: "str | None" = None
) -> str:
    """Run a SQL query in BigQuery and return the result as CSV text.

    Args:
        bq_client: BigQuery client used to submit the query.
        query: SQL text to execute.
        location: Location in which to run the query job. When None (the
            default) no location is forced and the client's own default
            location applies.

    Returns:
        The query result serialised as CSV without the DataFrame index.
    """
    # A location hard-coded here can disagree with the one the client was
    # created with, so it is only forwarded when the caller asks for it.
    df = bq_client.query(query, location=location).to_dataframe()
    return df.to_csv(index=False)

In [None]:
# Get the output of the previous chain

# Write a chain named `run_bq_result`
### TODO
### TODO
### TODO
### TODO
### TODO

In [None]:
# Example
# Chain the two stages by hand: the output of `query_output` is the input of
# `run_bq_result`.
x = {
    "input": "Which countries in Asia had more houses damaged? Give me the top 3",
    "chat_history": "",
}
print(run_bq_result.invoke(query_output.invoke(x)))

# Agent: putting everything together

In [None]:
from langchain.memory import ConversationBufferWindowMemory

# Windowed conversation memory shared by the agent and its BigQuery tool.
# It is stored under the "chat_history" key with a window size of k=10 and
# configured to return message objects.
agent_memory = ConversationBufferWindowMemory(
    memory_key="chat_history", k=10, return_messages=True
)

In [None]:
# Instructions given to the agent as its system message. The text names the
# two tools the agent may call: user_question_tool and Calculator.
AGENT_PROMPT = """You are a very powerful assistant that can answer questions using BigQuery.

You can invoke the tool user_question_tool to answer questions using BigQuery.

You can invoke the tool Calculator if you need to do mathematical operations.

Always use the tools to try to answer the questions. Use the chat history for context. Never try to use any other external information.

Assume that the user may write with misspellings, fix the spelling of the user before passing the question to any tool.

Don't mention what tool you have used in your answer.
"""

In [None]:
from langchain import LLMMathChain
from langchain.tools import Tool

# Math chain built on the agent model, exposed to the agent as the "Calculator"
# tool with a synchronous (`func`) and an asynchronous (`coroutine`) entry point.
math_chain = LLMMathChain.from_llm(llm=agent_model)
math_tool = Tool(
    name="Calculator",
    description="Useful for when you need to answer questions about math.",
    func=math_chain.run,
    coroutine=math_chain.arun,
)

In [None]:
from langchain.tools import tool
from langchain.callbacks.tracers import ConsoleCallbackHandler


@tool
def user_question_tool(question) -> str:
    """Useful to answer natural language questions from users using BigQuery."""
    # Both chains run with the same console callback attached.
    run_config = {"callbacks": [ConsoleCallbackHandler()]}
    # The conversation so far, as stripped text, is passed along with the question.
    history = agent_memory.buffer_as_str.strip()
    chain_input = {"input": question, "chat_history": history}
    # Stage 1 produces the query payload; stage 2 consumes it and returns the answer.
    query_payload = query_output.invoke(chain_input, config=run_config)
    answer = run_bq_result.invoke(query_payload, config=run_config)
    return answer.strip()

In [None]:
from langchain.agents import AgentType, initialize_agent, AgentExecutor

# Extra arguments for the agent constructor; carries the system prompt.
agent_kwargs = {"system_message": AGENT_PROMPT}
agent_tools = [math_tool, user_question_tool]

# Start from an empty conversation so earlier runs do not leak into this one.
agent_memory.clear()

# Fill the missing options
agent = initialize_agent(
    ### TODO
    ### TODO
    ### TODO
    # The keyword has to be spelled `agent_kwargs`; a misspelled name is not
    # recognised by `initialize_agent` and the system message would never
    # reach the agent.
    agent_kwargs=agent_kwargs,
    max_iterations=5,
    early_stopping_method="generate",
    verbose=True,
)

In [None]:
q = "Which countries had more houses damaged? Give me the top 3"
# Send the question defined on the previous line, not the `x` example dict
# left over from the earlier chain demos.
agent.invoke(q)

In [None]:
# Display the memory object to inspect what has been recorded so far.
agent_memory

In [None]:
# Follow-up question: "those countries" is only resolvable from the chat
# history of the previous exchange.
q = "Of those countries, which one had more deaths?"
agent.invoke(q)