### Acquiring SQL Templates and Placeholders

In [1]:
import re
from utils import read_template_from_txt_files
# Load the SQL templates via the project helper in utils.
# NOTE(review): presumably a dict mapping table name -> list of template strings
# (it is iterated with .items() and its values are scanned for placeholders below) - confirm.
all_templates = read_template_from_txt_files()


In [2]:
def extract_placeholders(sql_templates):
    """
    Collect the bracketed placeholders of every SQL query template.

    Args:
    sql_templates (List[str]): SQL query templates to scan.

    Returns:
    List[List[str]]: One list per template, in input order, holding the text found
    between each pair of square brackets (brackets excluded), in order of appearance.
    """
    # A non-empty run of characters other than "]" wrapped in "[...]", e.g. [Some.Placeholder]
    bracketed = re.compile(r"\[([^\]]+)\]")
    return [bracketed.findall(sql_text) for sql_text in sql_templates]

# Build {table name: [placeholders of template 0, placeholders of template 1, ...]}.
all_placeholders = dict()
for key in all_templates:
    templates = all_templates[key]
    placeholders = extract_placeholders(templates)
    all_placeholders.update({key: placeholders})
    



In [16]:
all_placeholders

{'Project': [['Project.Name'],
  ['Project.StartDate'],
  ['Project.Asset'],
  ['Project.Location'],
  ['Project.Status'],
  ['Project.StartDate', 'Project.EndDate'],
  ['Project.Asset', 'Project.Location']],
 'Capability': [['Capability.Name'], ['Capability.Name']],
 'Employee': [['Employee.Name'],
  ['Employee.JobTitle'],
  ['Employee.Location'],
  ['Employee.Department'],
  ['Employee.BusinessAddress'],
  ['Employee.Name'],
  ['Employee.Name'],
  ['Employee.Name'],
  ['Employee.Department'],
  ['Employee.Location'],
  ['Employee.JobTitle'],
  ['Employee.Department']],
 'Company': [['Company.Name'],
  ['Company.Name'],
  ['Company.Name'],
  ['Company.Industry'],
  ['Company.Industry'],
  ['Company.Industry']]}

## Generate SQL Queries and Answers
* Fill-in procedure

In [64]:
from langchain import SQLDatabase
from langchain.tools.sql_database.tool import QuerySQLDataBaseTool
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.sql.sqltypes import Date, DateTime, Time
import itertools
import datetime

class DBTool:
    """Small helper bundling the database handles used by this notebook.

    A SQLAlchemy engine/metadata pair is used for schema reflection, and a
    LangChain ``SQLDatabase`` built from the same connection string is used
    to execute SQL text.
    """

    def __init__(self, connection_string) -> None:
        """Create the engine, metadata and LangChain wrapper for ``connection_string``."""
        self.engine = create_engine(connection_string)
        self.metadata = MetaData()

        # for query (TODO: decouple this from langchain)
        self.langchain_db = SQLDatabase.from_uri(connection_string, sample_rows_in_table_info=0)

    def query(self, sql):
        """Execute ``sql`` via LangChain's ``run_no_throw`` and return its result unchanged.

        NOTE(review): the result is consumed as a string repr of a list of row
        tuples elsewhere in this notebook; ``run_no_throw`` presumably reports
        failures in the returned value instead of raising - confirm against the
        installed LangChain version.
        """
        return self.langchain_db.run_no_throw(sql)

    def get_date_columns(self, table_name):
        """Return the names of ``table_name``'s columns typed Date, DateTime or Time.

        The table definition is reflected from the live database through
        ``self.engine`` and registered on ``self.metadata``.
        """
        # ``autoload_with`` alone triggers reflection; the separate
        # ``autoload=True`` flag is deprecated in SQLAlchemy 1.4 and removed in 2.0.
        table = Table(table_name, self.metadata, autoload_with=self.engine)
        return [column.name for column in table.columns if isinstance(column.type, (Date, DateTime, Time))]

# Credentials must not be stored in the notebook. Provide the full SQLAlchemy
# URL (e.g. mysql+pymysql://<user>:<password>@localhost:3306/Aurecon) through
# the AURECON_DB_URI environment variable before running this cell.
import os

db_tool = DBTool(os.environ["AURECON_DB_URI"])


In [69]:

# def generate_queries(table_name, all_templates, all_placeholders, query_sql_database_tool):
#     sql_queries = []
#     for i, placeholders in enumerate(all_placeholders[table_name]):
#         fill_in_for_placeholders = {}
#         for placeholder in placeholders:
#             # print(placeholder)
            
#             # get fill-in values for the column
#             table_name = placeholder.split(".")[0]
#             column_name = placeholder.split(".")[1]

#             # select unique values from the column
#             query = f"SELECT DISTINCT {column_name} FROM {table_name};"
#             fill_in = query_sql_database_tool.run(query)
#             fill_in_for_placeholders[placeholder] = [t[0] for t in eval(fill_in)]
        

#         # fill-in the column name with the fill-in values
#         for fill_in_value in fill_in:
#             if fill_in_value == None:
#                 continue
#             sql_query = all_templates[table_name][i].replace('['+placeholder+']', fill_in_value)
#             # print(sql_query)
#             sql_queries.append(sql_query)
    
#     return sql_queries
            
def generate_queries(table_name, all_templates, all_placeholders, db_tool):
    """
    Fill every SQL template of one table with the values found in the database.

    For each template, the distinct values of every placeholder column are
    fetched from ``table_name`` and one query is produced per combination
    (Cartesian product) of those values.

    Args:
    table_name (str): Key into ``all_templates`` / ``all_placeholders``; also the table
        the fill-in values are selected from.
    all_templates (Dict[str, List[str]]): SQL templates per table.
    all_placeholders (Dict[str, List[List[str]]]): Placeholders ("Table.Column") per
        template, aligned with ``all_templates``.
    db_tool: Object whose ``query(sql)`` returns the string repr of a list of row tuples.

    Returns:
    List[str]: The filled-in queries, template by template. ``None`` column values are
    skipped, a template whose placeholder column has no values yields no queries, and a
    template without placeholders is returned unchanged.
    """
    sql_queries = []
    for placeholders, template in zip(all_placeholders[table_name], all_templates[table_name]):
        # A placeholder repeated inside one template is replaced everywhere by a
        # single value, so it must contribute only one axis to the product.
        unique_placeholders = list(dict.fromkeys(placeholders))

        fill_in_values = []
        for placeholder in unique_placeholders:
            # Placeholders look like "Table.Column"; only the column part is needed.
            column_name = placeholder.split(".")[1]

            # Select unique values from the column
            query = f"SELECT DISTINCT {column_name} FROM {table_name};"
            raw_rows = db_tool.query(query)

            # NOTE(review): eval() executes whatever text the database layer returns.
            # It is kept because results may contain reprs such as datetime.date(...)
            # that ast.literal_eval cannot parse; only use against a trusted database.
            # An empty response is treated as "no rows" instead of crashing eval().
            rows = eval(raw_rows) if raw_rows else []

            # NULLs cannot be substituted into a template, so drop them here.
            fill_in_values.append([row[0] for row in rows if row[0] is not None])

        # One query per combination of fill-in values (Cartesian product).
        for combination in itertools.product(*fill_in_values):
            # Start from the pristine template each time; reusing the previously
            # filled string would leave no placeholders to replace.
            filled_query = template
            for placeholder, value in zip(unique_placeholders, combination):
                filled_query = filled_query.replace('[' + placeholder + ']', str(value))
            sql_queries.append(filled_query)

    return sql_queries

# Generate the filled-in queries for every table and report per-table statistics.
generated_queries = dict()

for table_name in all_templates:

    print('Table name: ', table_name)
    print('Date-type columns: ', db_tool.get_date_columns(table_name))

    table_queries = generate_queries(table_name, all_templates, all_placeholders, db_tool)
    generated_queries[table_name] = table_queries
    print("The number of generated queries: ", len(table_queries), '\n')

Table name:  Project
Date-type columns:  ['StartDate', 'EndDate']
The number of generated queries:  50 

Table name:  Capability
Date-type columns:  []
The number of generated queries:  12 

Table name:  Employee
Date-type columns:  []
The number of generated queries:  33 

Table name:  Company
Date-type columns:  []
The number of generated queries:  15 



In [72]:
# Run every generated query and keep (query, answer) pairs grouped by table.
queries_answers = dict()
for table_name, list_queries in generated_queries.items():
    print('Table name: ', table_name)
    query_answer_tuples = [(query, db_tool.query(query)) for query in list_queries]
    queries_answers[table_name] = query_answer_tuples



Table name:  Project
Table name:  Capability
Table name:  Employee
Table name:  Company


In [73]:
queries_answers

{'Project': [("SELECT Name, Location, Status FROM Project WHERE Name = 'Carlton Innovation Precinct'",
   "[('Carlton Innovation Precinct', 'Melbourne, Victoria', 'inactive')]"),
  ("SELECT Name, Location, Status FROM Project WHERE Name = '25 King Street'",
   "[('25 King Street', 'Queensland', 'Active')]"),
  ("SELECT Name, Location, Status FROM Project WHERE Name = '5 Martin Place'",
   "[('5 Martin Place', 'Sydney', 'Active')]"),
  ("SELECT Name, Location, Status FROM Project WHERE Name = 'Australian Embassy Washington'",
   "[('Australian Embassy Washington', 'Washington', 'Inactive')]"),
  ("SELECT Name, StartDate, EndDate FROM Project WHERE StartDate >= '2018-01-03'",
   "[('Carlton Innovation Precinct', datetime.date(2018, 1, 3), datetime.date(2019, 8, 16))]"),
  ("SELECT Name, StartDate, EndDate FROM Project WHERE StartDate >= '2016-06-27'",
   "[('Carlton Innovation Precinct', datetime.date(2018, 1, 3), datetime.date(2019, 8, 16)), ('25 King Street', datetime.date(2016, 6, 27)