In [1]:
from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain.llms.openai import OpenAI
from langchain.agents import AgentExecutor
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI

# add the parent directory to the module search path so that `common.utils` can be imported
import sys
sys.path.insert(0, '../')
from common.utils import Utils


In [2]:

import os

# The connection URI is taken from the METASTORE_DB_URI environment variable so that
# real credentials do not have to be committed with the notebook. When the variable
# is unset, the original local development database is used.
# Alternative backend example: "mysql+mysqlconnector://<user>:<password>@localhost:3306/sys"
DEFAULT_DB_URI = 'postgresql+psycopg2://flowise:flowise@localhost/metastore'
db = SQLDatabase.from_uri(os.environ.get("METASTORE_DB_URI", DEFAULT_DB_URI))


In [5]:
# Shared chat model (deterministic: temperature 0) and the SQL toolkit that wraps
# the `db` connection; both are reused by the agent cells that follow.
llm = ChatOpenAI(
    openai_api_key=Utils.get_openai_key(),
    temperature=0,
    model='gpt-4-0613',
)
toolkit = SQLDatabaseToolkit(llm=llm, db=db)

## 1. Agent without a Supporting Vector Store

In [7]:
# Plain SQL agent: only the SQLDatabaseToolkit tools, no retriever tools.
# The shared `llm` (gpt-4-0613, temperature 0) is reused instead of constructing a
# second, identically configured ChatOpenAI client, matching the other agent cells.
# The suffix spells out a five-step procedure the agent should follow.
agent_executor_1 = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix="""
    To answer my questions, you need to do following things:
    1. generate a correct SQL
    2. send the SQL to database and get the answer from database
    3. Base on the answer from database, give me the answer
    4. give me the SQL that you generated
    5. explain the SQL that you generated
    """
)
response = agent_executor_1.run("Find investors who trade for three consecutive days")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with ``
responded: To answer this question, I first need to know the structure of the database. I will use the `sql_db_list_tables` tool to get a list of all tables in the database.

[0m[38;5;200m[1;3minvestor_definition, investor_trade, stock_definition, stock_dividend, stock_price, stock_type_definition[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `investor_trade`
responded: The database contains the following tables: investor_definition, investor_trade, stock_definition, stock_dividend, stock_price, stock_type_definition. 

To find investors who trade for three consecutive days, I will need to use the investor_trade table. I will use the `sql_db_schema` tool to get the schema and sample rows for this table.

[0m[33;1m[1;3m
CREATE TABLE investor_trade (
	holding_date DATE NOT NULL, 
	investor_id VARCHAR(255) NOT NULL, 
	stock_code VARCHAR(255) NOT NULL, 
	direction VARCHAR(255), 
	vo

In [11]:
# Same question as the five-step variant, but with a one-sentence suffix that only
# asks for the final SQL and its explanation.
# The shared `llm` is reused rather than building another identical ChatOpenAI client,
# consistent with the other agent cells in this notebook.
agent_executor_1 = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix="""When giving the final answer, also append the final SQL and explain it."""
)
response = agent_executor_1.run("Find investors who trade for three consecutive days")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo answer this question, I would need to know the structure of the database. I will first list all the tables in the database. 

Let's use the `sql_db_list_tables` tool to get the list of tables. 

```typescript
const tables = await ai.tools.functions.sql_db_list_tables({ __arg1: "" });
```

After getting the list of tables, I will check the schema of the relevant tables. Let's assume that the relevant table is `trades`. 

I will use the `sql_db_schema` tool to get the schema of the `trades` table.

```typescript
const schema = await ai.tools.functions.sql_db_schema({ __arg1: "trades" });
```

Assuming the `trades` table has the following columns: `investor_id`, `trade_date`.

The SQL query to find investors who trade for three consecutive days would look something like this:

```sql
SELECT investor_id
FROM (
    SELECT investor_id, trade_date,
           trade_date - ROW_NUMBER() OVER(PARTITION BY investor_id ORDER BY trade_

In [8]:
# Print the agent's final answer stored in `response` (plain text, including the SQL block).
print(response)

The investors who traded for three consecutive days are investors with the IDs 1, 2, and 3. 

The SQL query I used to find this information is:

```sql
SELECT investor_id
FROM (
    SELECT investor_id, holding_date,
           holding_date - (ROW_NUMBER() OVER(PARTITION BY investor_id ORDER BY holding_date) * INTERVAL '1 day') as grp
    FROM investor_trade
) t
GROUP BY investor_id, grp
HAVING COUNT(*) >= 3
LIMIT 10;
```

This query works by creating a group for each set of consecutive dates for each investor. It then counts the number of dates in each group and returns the investor_id for groups with three or more dates.


In [10]:
# Profit-and-loss question asked without any formula: the agent only gets the
# question text plus the instruction to append and explain the final SQL.
agent_executor_1 = create_sql_agent(
    toolkit=toolkit,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix="When giving the final answer, also append the final SQL and explain it.",
    verbose=True,
)
response = agent_executor_1.run(
    "Find the accumulated profits and loss for investor mike, use the close price as the trade price"
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with ``


[0m[38;5;200m[1;3minvestor_definition, investor_trade, stock_definition, stock_dividend, stock_price, stock_type_definition[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `investor_definition, investor_trade, stock_price`


[0m[33;1m[1;3m
CREATE TABLE investor_definition (
	id VARCHAR(32) NOT NULL, 
	investor_name VARCHAR(90), 
	investor_country VARCHAR(90), 
	gender VARCHAR(255), 
	age INTEGER, 
	num_years_invest INTEGER, 
	busu_unit VARCHAR(255) NOT NULL, 
	country VARCHAR(255) NOT NULL, 
	CONSTRAINT investor_definition_pkey PRIMARY KEY (id)
)

/*
3 rows from investor_definition table:
id	investor_name	investor_country	gender	age	num_years_invest	busu_unit	country
1	mike	CN	male	29	10	RMG	CN
2	jack	IN	male	22	5	T&O	IN
3	yoyo	SG	female	29	7	MOT	SG
*/


CREATE TABLE investor_trade (
	holding_date DATE NOT NULL, 
	investor_id VARCHAR(255) NOT NULL, 
	stock_code VARCHAR(255) NOT 

In [13]:
# Same profit-and-loss question, now with a one-line formula appended on a second
# line of the prompt to tell the agent how the figure is defined.
agent_executor_1 = create_sql_agent(
    toolkit=toolkit,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix="When giving the final answer, also append the final SQL and explain it.",
    verbose=True,
)
response = agent_executor_1.run(
    "Find the accumulated profits and loss for investor mike, use the close price as the trade price. \n"
    "the accumulated_profit_loss = total_money_on_selling_stock - total_money_on_buying_stock + current_value_held_stock"
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with ``


[0m[38;5;200m[1;3minvestor_definition, investor_trade, stock_definition, stock_dividend, stock_price, stock_type_definition[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `investor_definition, investor_trade, stock_price`


[0m[33;1m[1;3m
CREATE TABLE investor_definition (
	id VARCHAR(32) NOT NULL, 
	investor_name VARCHAR(90), 
	investor_country VARCHAR(90), 
	gender VARCHAR(255), 
	age INTEGER, 
	num_years_invest INTEGER, 
	busu_unit VARCHAR(255) NOT NULL, 
	country VARCHAR(255) NOT NULL, 
	CONSTRAINT investor_definition_pkey PRIMARY KEY (id)
)

/*
3 rows from investor_definition table:
id	investor_name	investor_country	gender	age	num_years_invest	busu_unit	country
1	mike	CN	male	29	10	RMG	CN
2	jack	IN	male	22	5	T&O	IN
3	yoyo	SG	female	29	7	MOT	SG
*/


CREATE TABLE investor_trade (
	holding_date DATE NOT NULL, 
	investor_id VARCHAR(255) NOT NULL, 
	stock_code VARCHAR(255) NOT 

In [17]:
# Profit-and-loss question with a step-by-step formula (x, m, c, s, f, y) in the prompt
# and the five-step procedure in the suffix.
# The shared `llm` is reused rather than building another identical ChatOpenAI client,
# consistent with the other agent cells in this notebook.
agent_executor_1 = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix="""
    To answer my questions, you need to do following steps:
    1. generate a correct SQL
    2. send the SQL to database and get the answer from database
    3. Base on the answer from database, give me the answer
    4. give me the SQL that you generated
    5. explain the SQL that you generated
    """
)
# The prompt is assembled from explicit per-line literals so the exact text
# (including the space before the first line break) is visible in the source.
response = agent_executor_1.run(
    "Find the accumulated profits and loss for investor with name mike, use close price as the trade price. \n"
    "formula for accumulated profits and loss:\n"
    "x = stock volume that the investor holds for a specific day\n"
    "m = stock price for a specific day\n"
    "c = total buy for a date, x * m * -1\n"
    "s = total sell for a date, x * m * 1\n"
    "f = profit and loss for a date, s - c\n"
    "y = accumulated profits and loss, sum all the f\n"
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with ``
responded: To generate the correct SQL, I first need to understand the structure of the database. I will use the `sql_db_list_tables` tool to get a list of all the tables in the database.

[0m[38;5;200m[1;3minvestor_definition, investor_trade, stock_definition, stock_dividend, stock_price, stock_type_definition[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `investor_definition, investor_trade, stock_definition, stock_price`
responded: The database contains the following tables: investor_definition, investor_trade, stock_definition, stock_dividend, stock_price, stock_type_definition.

To generate the SQL, I need to know the schema of these tables. I will use the `sql_db_schema` tool to get the schema and sample rows for these tables.

[0m[33;1m[1;3m
CREATE TABLE investor_definition (
	id VARCHAR(32) NOT NULL, 
	investor_name VARCHAR(90), 
	investor_country VARCHAR(90), 
	gender V

## 2. Agent with Vector Store Search for Similar Question–SQL Pairs

### 2.1 Create a Vector Store of Question–SQL Pairs

In [27]:
import os

from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

# Directory that holds the persisted FAISS index of example questions.
EXAMPLE_INDEX_DIR = "similar_example_store_index"

embeddings = OpenAIEmbeddings(openai_api_key=Utils.get_openai_key())

# Known question -> SQL pairs. The question is embedded for similarity search and
# the SQL is carried along as document metadata.
few_shots = {
    'Find all employees whose departments are in America': "SELECT COUNT(*) FROM employees e JOIN departments d ON e.department_id = d.department_id WHERE d.location_id IN (SELECT location_id FROM locations WHERE country_id = 'US');",
}
few_shot_docs = [
    Document(page_content=question, metadata={"sql_query": sql_query})
    for question, sql_query in few_shots.items()
]

# Load the saved index when it exists; otherwise build it from the examples above and
# save it, so the cell also works on a fresh checkout without un-commenting code.
if os.path.isdir(EXAMPLE_INDEX_DIR):
    vector_db = FAISS.load_local(EXAMPLE_INDEX_DIR, embeddings)
else:
    vector_db = FAISS.from_documents(few_shot_docs, embeddings)
    vector_db.save_local(EXAMPLE_INDEX_DIR)
retriever = vector_db.as_retriever()

tool_description = """
This tool will help you understand similar examples to adapt them to the user question.
Input to this tool should be the user question.
"""

# Expose the retriever to the agent as a tool named `sql_get_similar_examples`.
retriever_tool_example = create_retriever_tool(
    retriever, name="sql_get_similar_examples", description=tool_description
)

In [31]:
# Instructions appended to the agent prompt: consult the stored examples first and
# only fall back to inspecting tables and schemas when they are not sufficient.
custom_suffix_example = """
I should first get the similar examples I know.
If the examples are enough to construct the query, I can build it.
Otherwise, I can then look at the tables in the database to see what I can query.
Then I should query the schema of the most relevant tables.

When giving the final answer, also append the final SQL and explain it.
"""

# Only the similar-example retriever is added on top of the toolkit's own tools.
custom_tool_list = [retriever_tool_example]

agent_2 = create_sql_agent(
    toolkit=toolkit,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix=custom_suffix_example,
    extra_tools=custom_tool_list,
    verbose=True,
)

`agent_scratchpad` should be a variable in prompt.input_variables. Did not find it, so adding it at the end.


In [None]:
# Ask agent_2 a question worded close to the stored few-shot example; the answer text is kept in `response`.
response = agent_2.run("How many employees are there, whose departments are in america")

In [30]:
# Print the answer text currently held in `response`.
print(response)

There are 69 employees whose departments are located in America.

The SQL query used to get this answer is:
```sql
SELECT COUNT(*) 
FROM employees e 
JOIN departments d ON e.department_id = d.department_id 
WHERE d.location_id IN (SELECT location_id FROM locations WHERE country_id = 'US');
```
This query first joins the employees and departments tables on the department_id. It then filters for departments that are located in America (represented by the country_id 'US'). The count of these employees is then returned.


## 3. Agent with a Vector Store for Checking Filter Conditions

In [5]:
import ast
import re


def run_query_save_results(db, query):
    """Run ``query`` through ``db`` and return its values as a flat list of cleaned strings.

    Args:
        db: Object with a ``run(query)`` method whose return value is the string
            form of a list of row tuples, e.g. ``"[('UK',), ('US',)]"``.
        query: SQL text passed unchanged to ``db.run``.

    Returns:
        A list with one entry per truthy value in the result (``None`` and empty
        strings are dropped). In each entry, standalone runs of digits are removed
        and surrounding whitespace is stripped. An empty list is returned when the
        query yields no output.
    """
    raw = db.run(query)
    # A query with no rows yields an empty string, which ast.literal_eval
    # rejects with a SyntaxError; treat it as "no values" instead.
    if not raw or not raw.strip():
        return []
    rows = ast.literal_eval(raw)
    # Flatten the row tuples and drop falsy values.
    values = [value for row in rows for value in row if value]
    # Remove standalone numbers (e.g. "Route 66 Diner" -> "Route  Diner") and trim the ends.
    return [re.sub(r"\b\d+\b", "", value).strip() for value in values]


# Collect the distinct spellings that will be indexed for proper-noun lookup.
# NOTE(review): the recorded outputs list investor_*/stock_* tables for the metastore
# connection, while this section's outputs list countries/departments/employees —
# confirm which database `db` must point to before running these queries.
country_name = run_query_save_results(
    db=db, query="SELECT country_name From countries"
)
country_id = run_query_save_results(
    db=db, query="SELECT country_id From countries"
)
department_name = run_query_save_results(
    db=db, query="SELECT  department_name FROM departments"
)

In [14]:
import os

from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Directory that holds the persisted FAISS index of names found in the database.
NAME_INDEX_DIR = "name_search_store_index"

# All name-like values gathered from the database, in one flat list.
texts = country_name + country_id + department_name

embeddings = OpenAIEmbeddings(openai_api_key=Utils.get_openai_key())

# Load the saved index when it exists; otherwise embed `texts`, build the index and
# save it, so the cell also works on a fresh checkout without un-commenting code.
if os.path.isdir(NAME_INDEX_DIR):
    vector_db_1 = FAISS.load_local(NAME_INDEX_DIR, embeddings)
else:
    vector_db_1 = FAISS.from_texts(texts, embeddings)
    vector_db_1.save_local(NAME_INDEX_DIR)

# Expose the index to the agent as the `name_search` tool.
retriever_1 = vector_db_1.as_retriever()
retriever_tool_name = create_retriever_tool(
    retriever_1,
    name="name_search",
    description="use to learn how a piece of data is actually written, can be from names, surnames addresses etc",
)
custom_tool_list = [retriever_tool_name]

In [17]:
from langchain.agents import create_sql_agent, AgentType
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.utilities import SQLDatabase
from langchain.chat_models import ChatOpenAI

# Alternative sample database (disabled): SQLDatabase.from_uri("sqlite:///Chinook.db")

# Prompt suffix: check the spelling of proper nouns with name_search before
# exploring tables and schemas.
custom_suffix = """
If a query statement asks for me to filter based on proper nouns, I should first check the spelling using the name_search tool.
Otherwise, I can then look at the tables in the database to see what I can query.
Then I should query the schema of the most relevant tables.

When giving the final answer, also append the final SQL to it and explain the SQL.
"""

# This cell rebinds `llm` and `toolkit`; cells below use these new objects.
llm = ChatOpenAI(
    openai_api_key=Utils.get_openai_key(),
    temperature=0,
    model_name="gpt-4",
)
toolkit = SQLDatabaseToolkit(llm=llm, db=db)

agent_3 = create_sql_agent(
    toolkit=toolkit,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix=custom_suffix,
    extra_tools=custom_tool_list,
    verbose=True,
)

In [18]:
# The question filters on a proper noun ("British"); the custom suffix instructs the
# agent to check such terms with the name_search tool before building SQL.
print(agent_3.run("How many employees's departments are in British"))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `name_search` with `British`


[0m[33;1m[1;3m[Document(page_content='UK'), Document(page_content='US'), Document(page_content='United Kingdom'), Document(page_content='Australia')][0m[32;1m[1;3m
Invoking: `sql_db_list_tables` with ``
responded: The term "British" could refer to the United Kingdom. Let's proceed with this assumption and check the tables in the database.

[0m[38;5;200m[1;3mcountries, departments, employees, locations, t_stock[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `employees, departments, locations`
responded: The relevant tables for this query seem to be "employees", "departments", and "locations". Let's check the schema of these tables to understand their structure and relationships.

[0m[33;1m[1;3m
CREATE TABLE departments (
	department_id INTEGER NOT NULL, 
	department_name VARCHAR(30) NOT NULL, 
	manager_id INTEGER, 
	location_id INTEGER, 
	CONSTRAINT dept_id_pk PRIMARY KEY 

## 4. Agent with Vector Stores Supporting Similar-Query Search and Filter-Condition Checking

In [7]:

# Both retriever tools are offered: spelling lookup and similar-example search.
custom_tool_list = [retriever_tool_name, retriever_tool_example]

# Combined instructions: the similar-example guidance followed by the
# proper-noun spelling guidance.
custom_suffix = """
I should first get the similar examples I know.
If the examples are enough to construct the query, I can build it.
Otherwise, I can then look at the tables in the database to see what I can query.
Then I should query the schema of the most relevant tables.

If a user asks for me to filter based on proper nouns, I should first check the spelling using the name_search tool.
Otherwise, I can then look at the tables in the database to see what I can query.
Then I should query the schema of the most relevant tables.

When giving the final answer, also append the final SQL to it and explain the SQL in natural languages.
"""

agent_compose = create_sql_agent(
    toolkit=toolkit,
    llm=llm,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    suffix=custom_suffix,
    extra_tools=custom_tool_list,
    verbose=True,
)

In [51]:
# Run the combined agent; as the cell's last expression, the returned answer string is shown as the cell output.
agent_compose.run("How many employees's departments are in America")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `sql_get_similar_examples` with `How many employees's departments are in America`


[0m[38;5;200m[1;3m[Document(page_content='Find all employees whose departments are in America', metadata={'sql_query': "SELECT COUNT(*) FROM employees e JOIN departments d ON e.department_id = d.department_id WHERE d.location_id IN (SELECT location_id FROM locations WHERE country_id = 'US');"})][0m[32;1m[1;3m
Invoking: `sql_db_query_checker` with `SELECT COUNT(*) FROM employees e JOIN departments d ON e.department_id = d.department_id WHERE d.location_id IN (SELECT location_id FROM locations WHERE country_id = 'US');`
responded: The similar example provided gives me a good starting point to construct the SQL query. The query is counting the number of employees whose departments are located in America. The 'US' country_id is used to represent America in the database. Let's construct the query.

[0m[36;1m[1;3mSELECT COUNT(*) 

"There are 69 employees whose departments are located in America.\n\nThe SQL query used to find this is: \n```sql\nSELECT COUNT(*) \nFROM employees e \nJOIN departments d ON e.department_id = d.department_id \nWHERE d.location_id IN (SELECT location_id FROM locations WHERE country_id = 'US');\n```\nThis query first joins the employees and departments tables on the department_id. It then filters for departments that are located in America, represented by the country_id 'US' in the locations table. The count of these employees is then returned."

In [10]:
# `response` is assigned a hard-coded answer string here (no agent call in this cell);
# printing it renders the embedded "\n" escapes as line breaks.
# NOTE(review): presumably pasted from an earlier agent run — confirm its origin.
response = "There are 35 employees whose departments are in the United Kingdom.\n\nThe SQL query I used to get this answer is:\n```sql\nSELECT COUNT(*) \nFROM employees e \nJOIN departments d ON e.department_id = d.department_id \nWHERE d.location_id IN (\n    SELECT location_id \n    FROM locations \n    WHERE country_id = 'UK'\n);\n```\nThis query first joins the employees and departments tables on the department_id column. Then it filters the rows where the location_id of the department is in the list of location_ids that are in the UK (obtained from the locations table). Finally, it counts the number of such employees."
print(response)

There are 35 employees whose departments are in the United Kingdom.

The SQL query I used to get this answer is:
```sql
SELECT COUNT(*) 
FROM employees e 
JOIN departments d ON e.department_id = d.department_id 
WHERE d.location_id IN (
    SELECT location_id 
    FROM locations 
    WHERE country_id = 'UK'
);
```
This query first joins the employees and departments tables on the department_id column. Then it filters the rows where the location_id of the department is in the list of location_ids that are in the UK (obtained from the locations table). Finally, it counts the number of such employees.
