# Circuit learning method for SQL: training and test data generation for execution time prediction

This notebook contains the data preparation and generation for the circuit learning process. Running the this notebook produces the training and test data which can be found from the `data` folder. This code is represented for reproducibility reasons and there is no need to rerun it every time. The queries are simplified versions of Join Order Benchmark queries.

## Import queries

Training and test data are stored to `data` folder in json files.

In [2]:
# Read queries from the join order benchmark
import glob
import os
import re
import json
import psycopg2
from pathlib import Path

this_folder = os.path.abspath(os.getcwd()) + "/data"

In [3]:
query_path_training = glob.glob(this_folder + '/training_set/*')
query_path_validation = glob.glob(this_folder + "/validation_set/*")
query_path_test = glob.glob(this_folder + "/test_set/*")

In [4]:
def read_queries(files):
    queries = []
    for i, query in enumerate(files):
        base_name = Path(query).stem
        f = open(query, "r")
        queries.append({ 'name': base_name, 'query': f.read() })
    return queries
    
training_queries = read_queries(query_path_training)
validation_queries = read_queries(query_path_validation)
test_queries = read_queries(query_path_test)
        
print("Number of training queries is ", len(training_queries))
print("Number of validation queries is ", len(validation_queries))
print("Number of test queries is ", len(test_queries))

Number of training queries is  400
Number of validation queries is  50
Number of test queries is  50


### Generating training and test data

In [5]:
def genereta_data(queries, workload, ty):
    try:
        connection = psycopg2.connect(
        dbname="ergastF1",
        user="sql2circuits",
        password="privet834",
        host="localhost",
        port="5432"
         )
        print("Connected to database successfully")
    except psycopg2.Error as e:
        print(f"Error: {e}")

    cursor = connection.cursor()
    cursor.execute("SET statement_timeout = 20000; COMMIT;")
    shots_per_query = 10
    data = []
    file_name = "data/" + workload + "/" + ty + "_data.json"
    root_name = ty + "_data"
    
    for query in queries:
        try:
            total_running_time = 0.0
            for _ in range(shots_per_query):
                cursor = connection.cursor()
                cursor.execute("EXPLAIN ANALYZE " + query['query'])
                res = cursor.fetchall()
                ex_time = float(re.findall("\d+\.\d+", res[-1][0])[0])
                plan_time = float(re.findall("\d+\.\d+", res[-2][0])[0])
                total_running_time += ex_time + plan_time
            data.append( {'name': query['name'], 'time': round(total_running_time / shots_per_query, 4) })

        except (Exception, psycopg2.Error) as error:
            print("Error while fetching data from PostgreSQL", error)
            print(query)

    if connection:
        cursor.close()
        connection.close()
        print("PostgreSQL connection is closed")
        
    with open(file_name, 'w') as outfile:
        json.dump({ root_name: data }, outfile)

In [5]:
workload = "execution_time"
genereta_data(training_queries, workload, "training")
genereta_data(validation_queries, workload, "validation")
genereta_data(test_queries, workload, "test")

Connected to database successfully
Error while fetching data from PostgreSQL canceling statement due to statement timeout

{'name': 'q_147', 'query': 'SELECT constructorresults.constructorresultsid, constructorresults.raceid FROM constructorresults, constructorstandings, results, status WHERE constructorresults.constructorid=constructorstandings.constructorid AND constructorresults.constructorid=results.constructorid AND constructorstandings.constructorid=results.constructorid AND results.statusid=status.statusid;\n'}
Error while fetching data from PostgreSQL canceling statement due to statement timeout

{'name': 'q_248', 'query': 'SELECT driverstandings.driverstandingsid, driverstandings.raceid FROM driverstandings, results, laptimes WHERE driverstandings.driverid=results.driverid AND driverstandings.driverid=laptimes.driverid AND laptimes.driverid=results.driverid;\n'}
Error while fetching data from PostgreSQL canceling statement due to statement timeout

{'name': 'q_110', 'query': '