# Getting M-Schema Up and Running

## Testing the SQLAlchemy `create_engine` Connection

In [None]:
from sqlalchemy import create_engine, text

# Smoke-test the Oracle connection: open one connection, run a trivial query
# and print its rows. Any failure is printed rather than raised.
# NOTE(review): the URL below embeds the database user and password in plain
# text — consider moving them out of the notebook before sharing it.
try:
    oracle_url = "oracle+oracledb://MONDIAL_GPT:TextDB123@localhost:1522/?service_name=XEPDB1"
    db_engine = create_engine(oracle_url)
    with db_engine.connect() as conn:
        rows = conn.execute(text("SELECT name FROM v$database")).fetchall()
        print(rows)
except Exception as err:
    print("Connection failed:", err)

## Importing and Testing M-Schema

In [9]:
from mschema.schema_engine import SchemaEngine

# Build a SchemaEngine over the engine created earlier, print its M-Schema
# text, and save the schema object to ./<db_name>.json.
db_name = "MONDIAL_GPT"

schema_engine = SchemaEngine(db_name=db_name, engine=db_engine)
mschema = schema_engine.mschema

# Kept in a variable because the prompt cell appends it to the template.
mschema_str = mschema.to_mschema()
print(mschema_str)

schema_json_path = f'./{db_name}.json'
mschema.save(schema_json_path)

  self._metadata.reflect(
  self._metadata.reflect(
  self._metadata.reflect(
  fields = self._inspector.get_columns(table_name, schema=self._schema)
  fields = self._inspector.get_columns(table_name, schema=self._schema)
  fields = self._inspector.get_columns(table_name, schema=self._schema)


【DB_ID】 MONDIAL_GPT
【Schema】
# Table: countryothername
[
(country:VARCHAR, Primary Key),
(othername:VARCHAR, Primary Key)
]
# Table: mondial_islandin
[
(island:VARCHAR, Examples: [Chongming, Manhattan, Svalbard]),
(sea:VARCHAR, Examples: [Norwegian Sea, Greenland Sea, Barents Sea]),
(lake:VARCHAR, Examples: [Lake Huron, Lake Nicaragua, Ozero Baikal]),
(river:VARCHAR, Examples: [Yangtze, Hudson River, Donau]),
(meta_repcol:VARCHAR, Examples: [Chongming, Manhattan, Svalbard])
]
# Table: geo_island
[
(island:VARCHAR, Primary Key, Examples: [Sumatra, Honshu, Unalaska]),
(country:VARCHAR, Primary Key, Examples: [RI, J, USA]),
(province:VARCHAR, Primary Key, Examples: [Aceh, Aichi, Akita])
]
# Table: mondial_provinceothername
[
(province:VARCHAR, Examples: [Liï¿½ge]),
(country:VARCHAR, Examples: [B, CO, E]),
(othername:VARCHAR, Examples: [Luik]),
(meta_repcol:VARCHAR, Examples: [Luik])
]
# Table: teste_funcionario
[
(idfuncionario:INTEGER, Primary Key, Examples: [1, 2, 3]),
(nome:VARCHAR, Ex

In [15]:
from langchain_openai import ChatOpenAI
from langchain.chains import create_sql_query_chain
from urllib.parse import quote  
from langchain.callbacks import get_openai_callback
import time
from dotenv import load_dotenv
import os
import sys
import json
load_dotenv()

import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)

# Make the project's ``functions`` directory importable. It lives two levels
# above the current working directory.
# os.pardir / os.path.join avoid the invalid "\." escape and the hard-coded
# Windows separators of a hand-written relative path.
experiment_path = os.path.join(os.pardir, os.pardir)
path = os.path.abspath('')
module_path = os.path.join(path, experiment_path)
functions_path = os.path.join(module_path, "functions")
# Test the exact entry that gets appended, so re-running the cell does not
# pile up duplicate sys.path entries.
if functions_path not in sys.path:
    sys.path.append(functions_path)

from sqldatabase_langchain_utils import SQLDatabaseLangchainUtils

# SCHEMA names the connection-config file and the result file; PREFIX names
# the dataset folder and is the table-name prefix filtered out further down.
SCHEMA = "mondial_gpt"
PREFIX = "mondial"

# JSON file that receives the generated queries for this schema.
FILE_NAME_RESULT = "sql_queries_chatgpt_mschema_" + SCHEMA + ".json"

def save_queries(queries):
    """Write *queries* to ``FILE_NAME_RESULT`` as ``{"queries": queries}``.

    The file is overwritten on every call and indented with four spaces.

    Args:
        queries: JSON-serializable collection of query records.
    """
    data = {"queries": queries}
    # Explicit UTF-8 so the writer matches the encoding read_queries() opens
    # the same file with, instead of depending on the platform default.
    with open(FILE_NAME_RESULT, "w", encoding="utf-8") as arquivo_json:
        json.dump(data, arquivo_json, indent=4)

def read_queries():
    """Return the ``"queries"`` entry stored in ``FILE_NAME_RESULT``.

    Bytes that are not valid UTF-8 are dropped (``errors='ignore'``) and the
    JSON is parsed with ``strict=False``.
    """
    with open(FILE_NAME_RESULT, encoding='utf-8', errors='ignore') as source:
        payload = json.load(source, strict=False)
    return payload["queries"]

# Read the connection settings for SCHEMA and build a first database wrapper
# with no table filter; it is used below to list the available tables.
json_file_path = f"../../datasets/{SCHEMA}_db_connection.json"
with open(json_file_path, encoding='utf-8', errors='ignore') as connection_file:
    raw_settings = connection_file.read()
db_connection = json.loads(raw_settings, strict=False)

db = SQLDatabaseLangchainUtils(db_connection=db_connection)

# Table names to leave out when choosing which tables to include.
# The first group is SCHEMA-prefixed; the last three are fixed test tables.
_schema_scoped_suffixes = (
    "tmdp",
    "tmdpmap",
    "tmds",
    "tmjmap",
    "tpv",
    "tmdc",
    "tmdcmap",
    "tmdej",
    "log_action",
    "log_error",
    "favorite_item",
    "favorite_query",
    "favorite_tag",
    "favorite_tag_item",
    "favorite_visualization",
    "dashboard",
    "history",
)
exclude = [f"{SCHEMA}_{suffix}" for suffix in _schema_scoped_suffixes]
exclude += ["teste_cliente", "teste_fornecedor", "teste_funcionario"]

# Keep only the tables that neither start with PREFIX nor appear in `exclude`,
# then rebuild the wrapper restricted to that list.
include_tables = []
for table_name in db.get_table_names():
    if table_name.startswith(PREFIX) or table_name in exclude:
        continue
    include_tables.append(table_name)
db = SQLDatabaseLangchainUtils(db_connection=db_connection, include_tables=include_tables)

  self._metadata.reflect(
  self._metadata.reflect(
  self._metadata.reflect(


### Prompt

In [20]:
from langchain.prompts.prompt import PromptTemplate

# Build the prompt: the static instructions from prompt.txt, then the
# M-Schema text, then a trailing question slot.
# The context manager closes the file even if read() raises.
with open("prompt.txt", "r") as f:
    prompt_template = f.read()

# NOTE(review): mschema_str is spliced into the template text, so any literal
# braces in it would presumably be parsed as placeholders — verify.
prompt_template += mschema_str
prompt_template += "\n\nQuestion: {input}"

PROMPT = PromptTemplate(
    input_variables=["input", "top_k", "table_info"], template=prompt_template
)

print(PROMPT)

input_variables=['input', 'table_info', 'top_k'] input_types={} partial_variables={} template='You are an Oracle SQL expert. Given an input question, first create a syntactically correct Oracle SQL query to run, then look at the results of the query and return the answer to the input question.\nUnless the user specifies in the question a specific number of examples to obtain, don\'t query for at {top_k} most results or any using the FETCH FIRST n ROWS ONLY clause as per Oracle SQL. You can order the results to return the most informative data in the database.\nNever query for all columns from a table. You must query only the columns that are needed to answer the question.\nPay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.\nPay attention to use TRUNC(SYSDATE) function to get the current date, if the question involves "today". \n\nSome hints:\n- Don\'t u

In [21]:
# Combine the chat model, the `db.db` handle and the custom prompt into a
# SQL-generation chain. The bare name on the last line displays it.
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-16k')
query_chain = create_sql_query_chain(llm, db.db, prompt=PROMPT)
query_chain

RunnableAssign(mapper={
  input: RunnableLambda(...),
  table_info: RunnableLambda(...)
})
| RunnableLambda(lambda x: {k: v for k, v in x.items() if k not in ('question', 'table_names_to_use')})
| PromptTemplate(input_variables=['input', 'table_info'], input_types={}, partial_variables={'top_k': '5'}, template='You are an Oracle SQL expert. Given an input question, first create a syntactically correct Oracle SQL query to run, then look at the results of the query and return the answer to the input question.\nUnless the user specifies in the question a specific number of examples to obtain, don\'t query for at {top_k} most results or any using the FETCH FIRST n ROWS ONLY clause as per Oracle SQL. You can order the results to return the most informative data in the database.\nNever query for all columns from a table. You must query only the columns that are needed to answer the question.\nPay attention to use only the column names you can see in the tables below. Be careful to not query 

In [22]:
# Load the question records for the PREFIX dataset; only the "queries" entry
# of the file is kept. The bare name on the last line displays the list.
json_file_path = f"../../datasets/{PREFIX}/queries_{PREFIX}.json"
with open(json_file_path, encoding='utf-8', errors='ignore') as json_data:
    queries = json.load(json_data, strict=False)['queries']
queries

[{'id': '1',
  'question': 'What is the area of Thailand?',
  'query_string': '',
  'type': 'simple'},
 {'id': '2',
  'question': 'What are the provinces with an area greater than 10000?',
  'query_string': '',
  'type': 'simple'},
 {'id': '3',
  'question': 'What are the languages spoken in Poland?',
  'query_string': '',
  'type': 'medium'},
 {'id': '4',
  'question': 'How deep is Lake Kariba?',
  'query_string': '',
  'type': 'simple'},
 {'id': '5',
  'question': 'What is the total of provinces of Netherlands?',
  'query_string': '',
  'type': 'complex'},
 {'id': '6',
  'question': 'What is the percentage of religious people are hindu in thailand?',
  'query_string': '',
  'type': 'complex'},
 {'id': '7',
  'question': 'List the number of provinces each river flows through.',
  'query_string': '',
  'type': 'medium'},
 {'id': '8',
  'question': 'Find all countries that became independent between 8/1/1910 and 8/1/1950.',
  'query_string': '',
  'type': 'complex'},
 {'id': '9',
  'que

In [23]:
# Run one question through the chain and print the token counts and cost
# collected by the OpenAI callback, one value per line.
with get_openai_callback() as cb:
    sql_query = query_chain.invoke({"question":"What are the provinces with an area more than 10000?"})

    for usage_field in ("total_tokens", "prompt_tokens", "completion_tokens", "total_cost"):
        print(getattr(cb, usage_field))
sql_query
        

  metadata_table_names = [tbl.name for tbl in self._metadata.sorted_tables]
  for tbl in self._metadata.sorted_tables


15629
15617
12
0.046899


'SELECT name \nFROM province \nWHERE area > 10000'

In [24]:
import warnings
from sqlalchemy import exc

# Suppress SAWarning
warnings.filterwarnings("ignore", category=exc.SAWarning, message=".*Cannot correctly sort tables.*")

# Generate SQL for every question and store the query text, token usage,
# cost and elapsed time on each record.
number_of_queries_to_delay = 25
count = 0
for instance in queries:
    # Sleep 10 s after each batch of `number_of_queries_to_delay` requests
    # (presumably to stay under API rate limits — confirm).
    if count == number_of_queries_to_delay:
        count = 0
        time.sleep(10)
    with get_openai_callback() as cb:
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        sql_query = query_chain.invoke({"question":instance["question"]})
        end_time = time.perf_counter()
        instance["query_string"] = sql_query
        instance['total_tokens'] = cb.total_tokens
        instance['prompt_tokens'] = cb.prompt_tokens
        instance['completion_tokens'] = cb.completion_tokens
        instance['total_cost'] = cb.total_cost
        instance['time'] = end_time - start_time
        print(instance['id'], instance['question'], instance["query_string"], instance['time'], instance['total_cost'])
    # Save after every question so results gathered so far are on disk if a
    # later request fails.
    save_queries(queries)
    count += 1
queries

1 What is the area of Thailand? SELECT area FROM country WHERE name = 'Thailand' 1.449566125869751 0.046877
2 What are the provinces with an area greater than 10000? SELECT name 
FROM province 
WHERE area > 10000 1.618485927581787 0.046899
3 What are the languages spoken in Poland? SELECT name
FROM language
WHERE country = 'PL' 1.1005311012268066 0.046884
4 How deep is Lake Kariba? SELECT depth FROM lake WHERE name = 'Lake Kariba' 0.9672372341156006 0.046881
5 What is the total of provinces of Netherlands? SELECT COUNT(*) FROM province WHERE country = 'NL' 1.389159917831421 0.046883
6 What is the percentage of religious people are hindu in thailand? SELECT percentage
FROM religion
WHERE country = 'TH' AND name = 'Hindu' 1.2618157863616943 0.046934
7 List the number of provinces each river flows through. SELECT river.name, COUNT(DISTINCT located.province) AS num_provinces
FROM river
JOIN located ON river.name = located.river
GROUP BY river.name
ORDER BY num_provinces DESC 1.636632204055

[{'id': '1',
  'question': 'What is the area of Thailand?',
  'query_string': "SELECT area FROM country WHERE name = 'Thailand'",
  'type': 'simple',
  'total_tokens': 15622,
  'prompt_tokens': 15611,
  'completion_tokens': 11,
  'total_cost': 0.046877,
  'time': 1.449566125869751},
 {'id': '2',
  'question': 'What are the provinces with an area greater than 10000?',
  'query_string': 'SELECT name \nFROM province \nWHERE area > 10000',
  'type': 'simple',
  'total_tokens': 15629,
  'prompt_tokens': 15617,
  'completion_tokens': 12,
  'total_cost': 0.046899,
  'time': 1.618485927581787},
 {'id': '3',
  'question': 'What are the languages spoken in Poland?',
  'query_string': "SELECT name\nFROM language\nWHERE country = 'PL'",
  'type': 'medium',
  'total_tokens': 15624,
  'prompt_tokens': 15612,
  'completion_tokens': 12,
  'total_cost': 0.046884,
  'time': 1.1005311012268066},
 {'id': '4',
  'question': 'How deep is Lake Kariba?',
  'query_string': "SELECT depth FROM lake WHERE name = 