In [1]:
from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain.llms.openai import OpenAI
from langchain.agents import AgentExecutor
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI

# Add the parent directory to the module search path so the local `common` package can be imported
import sys
sys.path.insert(0, '../')
from common.utils import Utils


In [2]:

import os

# Connection URI for the database every agent in this notebook queries.
# Set the SQL_AGENT_DATABASE_URI environment variable to point at another
# database (or other credentials) without editing the notebook; when it is
# unset, the local PostgreSQL test database is used.
# Example of a MySQL URI: "mysql+mysqlconnector://<user>:<password>@localhost:3306/sys"
db = SQLDatabase.from_uri(
    os.environ.get(
        'SQL_AGENT_DATABASE_URI',
        'postgresql+psycopg2://test:test@localhost/test',
    )
)


In [3]:
# Deterministic (temperature 0) chat model shared by the toolkit and the agents below.
llm = ChatOpenAI(
    openai_api_key=Utils.get_openai_key(),
    temperature=0,
    model='gpt-4-0613',
)

# SQL tools bound to the database connection `db` and driven by `llm`.
toolkit = SQLDatabaseToolkit(llm=llm, db=db)

## Agent Without Vector Store Support

In [4]:

# Baseline SQL agent: only the stock toolkit tools, no retriever tools.
# It reuses the shared `llm` (same model and temperature the toolkit was built
# with) instead of constructing a second, identical ChatOpenAI client.
agent_executor_2 = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=False,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    # Ask the agent to show and explain the SQL behind its answer.
    suffix="""When giving the final answer, also append the final SQL and explain it."""
)


In [5]:

# Ask the baseline agent (created without any extra retriever tools) the sample
# question and print the answer so its embedded newlines are rendered.
response = agent_executor_2.run("How many employees's departments are in america")
print(response)

There are no employees whose departments are located in America.

The SQL query used to find this is:
```sql
SELECT COUNT(DISTINCT e.employee_id) 
FROM employees e 
JOIN departments d ON e.department_id = d.department_id 
JOIN locations l ON d.location_id = l.location_id 
JOIN countries c ON l.country_id = c.country_id 
WHERE c.country_name = 'America'
```
This query works by joining the employees, departments, locations, and countries tables together. It then counts the number of unique employee IDs where the country name is 'America'.


## Agent With Vector Store Search For Similar Question and SQL Pair

In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

from pathlib import Path

embeddings = OpenAIEmbeddings(openai_api_key=Utils.get_openai_key())

# Known-good question -> SQL pairs offered to the agent as worked examples.
few_shots = {
    'Find all employees whose departments are in America': "SELECT COUNT(*) FROM employees e JOIN departments d ON e.department_id = d.department_id WHERE d.location_id IN (SELECT location_id FROM locations WHERE country_id = 'US');",
}

# The question is the embedded text; the SQL travels along as metadata.
few_shot_docs = [
    Document(page_content=question, metadata={"sql_query": sql_query})
    for question, sql_query in few_shots.items()
]

# Reuse the saved FAISS index when it exists; otherwise embed the examples once
# and persist them, so the notebook also runs from a fresh checkout.
SIMILAR_EXAMPLE_INDEX_DIR = "similar_example_store_index"
if Path(SIMILAR_EXAMPLE_INDEX_DIR).exists():
    vector_db = FAISS.load_local(SIMILAR_EXAMPLE_INDEX_DIR, embeddings)
else:
    vector_db = FAISS.from_documents(few_shot_docs, embeddings)
    vector_db.save_local(SIMILAR_EXAMPLE_INDEX_DIR)
retriever = vector_db.as_retriever()

from langchain.agents.agent_toolkits import create_retriever_tool

# Description shown to the agent so it knows when to call the example retriever.
tool_description = """
This tool will help you understand similar examples to adapt them to the user question.
Input to this tool should be the user question.
"""

# Wrap the few-shot retriever as an agent tool.
retriever_tool_example = create_retriever_tool(
    retriever,
    description=tool_description,
    name="sql_get_similar_examples",
)

In [7]:
# Prompt suffix: consult the stored examples first and only fall back to
# inspecting tables and schemas when the examples are not enough.
custom_suffix_example = """
I should first get the similar examples I know.
If the examples are enough to construct the query, I can build it.
Otherwise, I can then look at the tables in the database to see what I can query.
Then I should query the schema of the most relevant tables.

When giving the final answer, also append the final SQL and explain it.
"""

# The similar-example retriever is the only tool added on top of the SQL toolkit.
custom_tool_list = [retriever_tool_example]

agent_1 = create_sql_agent(
    toolkit=toolkit,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix=custom_suffix_example,
    extra_tools=custom_tool_list,
    verbose=True,
)

In [8]:
# Same question as the baseline run, now put to the agent that also has the
# similar-example retriever tool; verbose mode prints the tool calls it makes.
response = agent_1.run("How many employees's departments are in america")
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `sql_get_similar_examples` with `How many employees's departments are at america`


[0m[33;1m[1;3m[Document(page_content='Find all employees whose departments are in America', metadata={'sql_query': "SELECT COUNT(*) FROM employees e JOIN departments d ON e.department_id = d.department_id WHERE d.location_id IN (SELECT location_id FROM locations WHERE country_id = 'US');"})][0m[32;1m[1;3m
Invoking: `sql_db_query_checker` with `SELECT COUNT(*) FROM employees e JOIN departments d ON e.department_id = d.department_id WHERE d.location_id IN (SELECT location_id FROM locations WHERE country_id = 'US');`
responded: The similar example provided gives me a good starting point to construct the SQL query. The query is counting the number of employees whose departments are located in America. The 'employees' table is joined with the 'departments' table on the 'department_id' field. The 'WHERE' clause is used to filter the

## Agent with Vector Store Checking Filtering Condition

In [5]:
import ast
import re


def run_query_save_results(db, query):
    """Run ``query`` on ``db`` and return its values as a flat list of cleaned strings.

    Args:
        db: Object whose ``run(query)`` returns the result rows rendered as a
            Python-literal string (a list of tuples).
        query: SQL text to execute.

    Returns:
        A list with every truthy cell of every row, with standalone digit runs
        removed and surrounding whitespace stripped. An empty list when the
        query produced no output.
    """
    raw = db.run(query)
    # LangChain's SQLDatabase.run returns an empty string when no rows come
    # back, and ast.literal_eval("") raises SyntaxError.
    if not raw:
        return []
    rows = ast.literal_eval(raw)
    # Flatten the rows and drop NULL/empty cells.
    values = [cell for row in rows for cell in row if cell]
    return [re.sub(r"\b\d+\b", "", value).strip() for value in values]


# Collect the literal values the agent may need to spell exactly when filtering:
# country names, country ids and department names (queried in that order).
country_name, country_id, department_name = (
    run_query_save_results(db, lookup_sql)
    for lookup_sql in (
        "SELECT country_name From countries",
        "SELECT country_id From countries",
        "SELECT  department_name FROM departments",
    )
)

In [14]:
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS


from pathlib import Path

# All country names, country ids and department names, as one list of strings to embed.
texts = country_name + country_id + department_name

embeddings = OpenAIEmbeddings(openai_api_key=Utils.get_openai_key())

# Reuse the saved FAISS index when it exists; otherwise embed `texts` once and
# persist the result, so the notebook also runs from a fresh checkout.
NAME_SEARCH_INDEX_DIR = "name_search_store_index"
if Path(NAME_SEARCH_INDEX_DIR).exists():
    vector_db_1 = FAISS.load_local(NAME_SEARCH_INDEX_DIR, embeddings)
else:
    vector_db_1 = FAISS.from_texts(texts, embeddings)
    vector_db_1.save_local(NAME_SEARCH_INDEX_DIR)

# Expose the index to the agent as a tool for looking up stored spellings.
retriever_1 = vector_db_1.as_retriever()
retriever_tool_name = create_retriever_tool(
    retriever_1,
    name="name_search",
    description="use to learn how a piece of data is actually written, can be from names, surnames addresses etc",
)
custom_tool_list = [retriever_tool_name]

In [17]:
from langchain.agents import create_sql_agent, AgentType
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.utilities import SQLDatabase
from langchain.chat_models import ChatOpenAI

# `db` is the connection opened earlier in the notebook; only the model and
# toolkit are rebuilt here.
llm = ChatOpenAI(
    openai_api_key=Utils.get_openai_key(),
    temperature=0,
    model_name="gpt-4",
)

toolkit = SQLDatabaseToolkit(llm=llm, db=db)

# Prompt suffix: check proper nouns with name_search before inspecting the schema.
custom_suffix = """
If a query statement asks for me to filter based on proper nouns, I should first check the spelling using the name_search tool.
Otherwise, I can then look at the tables in the database to see what I can query.
Then I should query the schema of the most relevant tables.

When giving the final answer, also append the final SQL to it and explain the SQL.
"""

agent = create_sql_agent(
    toolkit=toolkit,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix=custom_suffix,
    extra_tools=custom_tool_list,
    verbose=True,
)

In [18]:
# Probe question that uses the word "British" as the filter — presumably chosen
# to exercise the name_search tool, since the prompt tells the agent to check
# proper nouns with it first.
print(agent.run("How many employees's departments are in British"))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `name_search` with `British`


[0m[33;1m[1;3m[Document(page_content='UK'), Document(page_content='US'), Document(page_content='United Kingdom'), Document(page_content='Australia')][0m[32;1m[1;3m
Invoking: `sql_db_list_tables` with ``
responded: The term "British" could refer to the United Kingdom. Let's proceed with this assumption and check the tables in the database.

[0m[38;5;200m[1;3mcountries, departments, employees, locations, t_stock[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `employees, departments, locations`
responded: The relevant tables for this query seem to be "employees", "departments", and "locations". Let's check the schema of these tables to understand their structure and relationships.

[0m[33;1m[1;3m
CREATE TABLE departments (
	department_id INTEGER NOT NULL, 
	department_name VARCHAR(30) NOT NULL, 
	manager_id INTEGER, 
	location_id INTEGER, 
	CONSTRAINT dept_id_pk PRIMARY KEY 

## Agent with Vector Stores Supporting Both Similar-Query Search and Filter-Condition Checking

In [7]:

# Give the agent both retriever tools: stored-spelling lookup and similar examples.
custom_tool_list = [retriever_tool_name, retriever_tool_example]

# Prompt suffix combining the two strategies: examples first, then proper-noun
# spelling checks, with schema inspection as the fallback.
custom_suffix = """
I should first get the similar examples I know.
If the examples are enough to construct the query, I can build it.
Otherwise, I can then look at the tables in the database to see what I can query.
Then I should query the schema of the most relevant tables.

If a user asks for me to filter based on proper nouns, I should first check the spelling using the name_search tool.
Otherwise, I can then look at the tables in the database to see what I can query.
Then I should query the schema of the most relevant tables.

When giving the final answer, also append the final SQL to it and explain the SQL in natural languages.
"""

agent_compose = create_sql_agent(
    toolkit=toolkit,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix=custom_suffix,
    extra_tools=custom_tool_list,
    verbose=True,
)

In [9]:
# Run the combined agent on the "British" question. As the cell's last
# expression, the returned answer string is shown as the cell output (its repr,
# with escaped newlines).
agent_compose.run("How many employees's departments are in British")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `sql_get_similar_examples` with `How many employees's departments are in British`


[0m[38;5;200m[1;3m[Document(page_content='Find all employees whose departments are in America', metadata={'sql_query': "SELECT COUNT(*) FROM employees e JOIN departments d ON e.department_id = d.department_id WHERE d.location_id IN (SELECT location_id FROM locations WHERE country_id = 'US');"})][0m[32;1m[1;3m
Invoking: `sql_db_list_tables` with ``


[0m[38;5;200m[1;3mcountries, departments, employees, locations, t_stock[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `countries, departments, employees, locations`


[0m[33;1m[1;3m
CREATE TABLE countries (
	country_id CHAR(2) NOT NULL, 
	country_name VARCHAR(40), 
	region_id INTEGER, 
	CONSTRAINT country_c_id_pk PRIMARY KEY (country_id)
)

/*
3 rows from countries table:
country_id	country_name	region_id
AR	Argentina	2
AU	Australia	3
BE	Belgium	1
*/


CREATE TABLE departme

"There are 35 employees whose departments are in the United Kingdom.\n\nThe SQL query I used to get this answer is:\n```sql\nSELECT COUNT(*) \nFROM employees e \nJOIN departments d ON e.department_id = d.department_id \nWHERE d.location_id IN (\n    SELECT location_id \n    FROM locations \n    WHERE country_id = 'UK'\n);\n```\nThis query first joins the employees and departments tables on the department_id column. Then it filters the rows where the location_id of the department is in the list of location_ids that are in the UK (obtained from the locations table). Finally, it counts the number of such employees."

In [10]:
# Answer string copied verbatim from the previous cell's output; printing it
# renders the escaped newlines so the SQL block is readable.
response = "There are 35 employees whose departments are in the United Kingdom.\n\nThe SQL query I used to get this answer is:\n```sql\nSELECT COUNT(*) \nFROM employees e \nJOIN departments d ON e.department_id = d.department_id \nWHERE d.location_id IN (\n    SELECT location_id \n    FROM locations \n    WHERE country_id = 'UK'\n);\n```\nThis query first joins the employees and departments tables on the department_id column. Then it filters the rows where the location_id of the department is in the list of location_ids that are in the UK (obtained from the locations table). Finally, it counts the number of such employees."
print(response)

There are 35 employees whose departments are in the United Kingdom.

The SQL query I used to get this answer is:
```sql
SELECT COUNT(*) 
FROM employees e 
JOIN departments d ON e.department_id = d.department_id 
WHERE d.location_id IN (
    SELECT location_id 
    FROM locations 
    WHERE country_id = 'UK'
);
```
This query first joins the employees and departments tables on the department_id column. Then it filters the rows where the location_id of the department is in the list of location_ids that are in the UK (obtained from the locations table). Finally, it counts the number of such employees.
