# Query generator

This module generates a collection of SQL queries by combining the filter and join clauses collected from the Join Order Benchmark (JOB). The generated queries are split into training, validation and test sets for the model.

In [1]:
import json
import os
from itertools import combinations

# All input and output paths are resolved relative to the current working directory.
this_folder = os.path.abspath(os.getcwd())

# Dataset size label; it selects the output sub-folder. Uncomment exactly one.
#size = "small"
#size = "medium"
#size = "large"
size = "main"

# Workload label; it selects both the seed file that is loaded and the output
# sub-folder. Uncomment exactly one.
#workload = "execution_time"
workload = "cardinality"

# The seed file name is derived from the workload so the two cannot drift apart,
# and the context manager guarantees the file handle is closed after loading.
with open(this_folder + "//JOB_query_seed_" + workload + ".json", "r") as f:
    query_seed = json.load(f)

# Output folder (relative to this_folder) that receives the generated queries.
path = "//queries//" + workload + "//" + size + "//"

In [2]:
def limit_query_seed(query_seed, num_of_filters, num_of_joins, num_of_selects):
    """Return a copy of the seed truncated to the first N clauses of each kind.

    Args:
        query_seed: dict with the keys "selects", "joins", "filters" (sliceable
            sequences) and "table_aliases".
        num_of_filters: number of leading filter clauses to keep.
        num_of_joins: number of leading join clauses to keep.
        num_of_selects: number of leading select clauses to keep.

    Returns:
        A new dict with the same four keys. "table_aliases" is passed through
        unchanged (the same object, not a copy).
    """
    return {
        "selects": query_seed["selects"][:num_of_selects],
        "joins": query_seed["joins"][:num_of_joins],
        # Filters are limited by their own bound, not by the join bound.
        "filters": query_seed["filters"][:num_of_filters],
        "table_aliases": query_seed["table_aliases"],
    }

In [3]:
#query_seed = limit_query_seed(query_seed, num_of_filters = 100, num_of_joins = 100, num_of_selects = 100)

In [4]:
def query_generator(query_seed, max_num_of_filters, max_num_of_joins, max_num_of_tables):
    """Enumerate query descriptions from combinations of seed filters and joins.

    Every combination of 1..max_num_of_filters filters is paired with every
    combination of 1..max_num_of_joins joins. A pairing is kept when the
    distinct table aliases it references number at most max_num_of_tables.
    Each kept pairing is then emitted once for every select clause whose
    table alias is among the pairing's aliases.

    Args:
        query_seed: dict with "filters" (items carrying "table_alias"),
            "joins" (items carrying "table_alias1" and "table_alias2") and
            "selects" (items carrying "table_alias").
        max_num_of_filters: largest number of filters combined in one query.
        max_num_of_joins: largest number of joins combined in one query.
        max_num_of_tables: largest number of distinct table aliases allowed.

    Returns:
        A list of dicts with the keys "select" (one select item), "joins" and
        "filters" (tuples of seed items) and "table_aliases" (a sorted list of
        the distinct aliases). The outer order follows the select order, then
        filter combinations, then join combinations.
    """
    filters = query_seed["filters"]
    joins = query_seed["joins"]
    selects = query_seed["selects"]

    # Join combinations are reused for every filter combination, so build them
    # (together with the aliases they touch) once up front.
    join_combs = []
    for k in range(1, max_num_of_joins + 1):
        for join_comb in combinations(joins, k):
            join_aliases = set()
            for j in join_comb:
                join_aliases.add(j["table_alias1"])
                join_aliases.add(j["table_alias2"])
            join_combs.append((join_comb, join_aliases))

    queries = []
    for k in range(1, max_num_of_filters + 1):
        for filter_comb in combinations(filters, k):
            filter_aliases = {v["table_alias"] for v in filter_comb}
            for join_comb, join_aliases in join_combs:
                # Sorting makes the alias order (and therefore the generated
                # FROM clause) reproducible; plain set iteration order for
                # strings changes between interpreter runs.
                table_aliases = sorted(filter_aliases | join_aliases)
                if len(table_aliases) <= max_num_of_tables:
                    queries.append({
                        "filters": filter_comb,
                        "joins": join_comb,
                        "table_aliases": table_aliases,
                    })

    # Attach each select clause only to the pairings that involve its table.
    return [
        {
            "select": s,
            "joins": q["joins"],
            "filters": q["filters"],
            "table_aliases": q["table_aliases"],
        }
        for s in selects
        for q in queries
        if s["table_alias"] in q["table_aliases"]
    ]

In [5]:
def construct_queries(queries, query_seed):
    """Render each query description as SQL and write it to its split folder.

    The SQL has the form
    "SELECT <select> FROM <table> AS <alias>, ... WHERE <cond> AND ...;".
    Filter conditions come first, followed by join conditions; the WHERE
    clause is omitted when a query has no conditions at all.

    Files are named "<index>.sql" and are distributed by the query's index:
    indices not divisible by 3 go to "training", indices divisible by 6 go to
    "validation", and the remaining multiples of 3 go to "test". The target
    folders live under the module-level `this_folder + path` and are created
    if they do not exist yet.

    Args:
        queries: sequence of dicts with "select" (a dict holding the "select"
            text), "filters" (items holding "filter" text), "joins" (items
            holding "join" text) and "table_aliases" (iterable of aliases).
        query_seed: dict whose "table_aliases" entry maps alias to table name.

    Returns:
        None. The function is used for its file-writing side effect.
    """
    aliases_to_tables = query_seed["table_aliases"]

    # Create the destination folders up front so the first write cannot fail
    # with FileNotFoundError on a fresh checkout.
    split_dirs = {
        name: this_folder + path + name
        for name in ("training", "validation", "test")
    }
    for split_dir in split_dirs.values():
        os.makedirs(split_dir, exist_ok=True)

    for i, q in enumerate(queries):
        from_part = " FROM " + ", ".join(
            aliases_to_tables[alias] + " AS " + alias
            for alias in q["table_aliases"]
        )
        conditions = [f["filter"] for f in q["filters"]]
        conditions += [j["join"] for j in q["joins"]]
        where_part = (" WHERE " + " AND ".join(conditions)) if conditions else ""

        query = "SELECT " + q["select"]["select"] + from_part + where_part + ";"

        if i % 3 != 0:
            split = "training"
        elif i % 2 == 0:
            split = "validation"
        else:
            split = "test"

        with open(split_dirs[split] + "//" + str(i) + ".sql", "w") as output:
            output.write(query)
                    

In [6]:
# Enumerate the query descriptions from the seed, allowing at most two filters,
# two joins and two tables per query.
final_queries = query_generator(
    query_seed,
    max_num_of_filters = 2,
    max_num_of_joins = 2,
    max_num_of_tables = 2,
)

# Render the descriptions as SQL and write them to the split folders.
construct_queries(final_queries, query_seed)