# Circuit learning method for SQL: training and test data generation for cardinality prediction

## Import queries

Training, validation, and test data are stored as JSON files in the `data` folder.

In [1]:
# Read queries from the join order benchmark
import glob
import os
import re
import json
import psycopg2
from pathlib import Path

# Absolute path of the directory the notebook kernel was started in.
this_folder = os.path.abspath(os.curdir)

In [2]:
# Database credentials
#
# Every setting can be overridden through an environment variable so that
# real credentials and machine-specific paths do not have to be committed
# with the notebook. The literals below are only local-development fallbacks.

port = os.environ.get("PGPORT", "5432")
pg_db_name = os.environ.get("PGDATABASE", "imdb2017")
pg_user = os.environ.get("PGUSER", "postgres")
pg_pw = os.environ.get("PGPASSWORD", "0000")
imdb_file_path = os.environ.get("IMDB_FILE_PATH", "C://Users//valte//Documents//frozendata")

# Connection URL built from the settings above (host is always localhost).
pg_connection = "postgresql://" + pg_user + ":" + pg_pw + "@localhost:" + port + "/" + pg_db_name

In [3]:
# Query set to process. `size` selects the sub-folder of queries;
# the available options are "small", "medium", "large" and "main".
size = "main"
workload = "cardinality"

# Root folder of the selected query set: <cwd>//queries//<workload>//<size>//
path = this_folder + "//queries//" + workload + "//" + size + "//"

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the order of the queries (and of the generated data) identical between runs.
query_path_training = sorted(glob.glob(path + "training//[0-9]*.sql"))
query_path_validation = sorted(glob.glob(path + "validation//[0-9]*.sql"))
query_path_test = sorted(glob.glob(path + "test//[0-9]*.sql"))

In [4]:
def read_queries(files):
    """Load SQL query files into a list of named query records.

    Args:
        files: iterable of paths to query files.

    Returns:
        A list with one dict per file, in the order given:
        ``{'name': <file name without extension>, 'query': <file contents>}``.
    """
    queries = []
    for query_file in files:
        # The context manager closes each handle right after reading; the
        # previous version left every file open until garbage collection.
        with open(query_file, "r") as f:
            queries.append({ 'name': Path(query_file).stem, 'query': f.read() })
    return queries
    
# Load the three query splits from the paths collected above.
training_queries = read_queries(query_path_training)
validation_queries = read_queries(query_path_validation)
test_queries = read_queries(query_path_test)

# Report how many queries each split contains.
for message, loaded in (
    ("Number of training queries is ", training_queries),
    ("Number of validation queries is ", validation_queries),
    ("Number of test queries is ", test_queries),
):
    print(message, len(loaded))

Number of training queries is  448
Number of validation queries is  113
Number of test queries is  112


### Generating training and test data

In [5]:
def genereta_data(queries, workload, ty, size, timeout_ms=20000):
    """Run every query under EXPLAIN ANALYZE and store its cardinality as JSON.

    Connects to the PostgreSQL database described by the module-level
    ``pg_user``, ``pg_pw``, ``port`` and ``pg_db_name`` settings. For each
    query the second ``rows=`` value on the first plan line is recorded; in
    EXPLAIN ANALYZE output that is the actual row count of the top plan node.
    Queries that fail (for example by hitting the statement timeout) are
    printed and left out of the result.

    Args:
        queries: iterable of dicts with the keys ``'name'`` and ``'query'``.
        workload: workload folder name used in the output path.
        ty: data split name (e.g. "training"); used for the file name and
            for the root key ``<ty>_data`` of the JSON document.
        size: query-set folder name used in the output path.
        timeout_ms: per-statement timeout in milliseconds.

    Returns:
        None. The result is written to
        ``data//<workload>//<size>//<ty>_data.json``.
    """
    file_name = "data//" + workload + "//" + size + "//" + ty + "_data.json"
    root_name = ty + "_data"
    data = []

    # Create the target folder up front so a missing directory cannot throw
    # away the results after all the (slow) queries have already been run.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    connection = psycopg2.connect(user=pg_user, password=pg_pw, host="localhost", port=port, database=pg_db_name)
    try:
        # Autocommit runs every statement in its own transaction, so a timed
        # out query cannot leave the session in an aborted transaction that
        # would make all following queries fail.
        connection.autocommit = True
        with connection.cursor() as cursor:
            cursor.execute("SET statement_timeout = %d" % int(timeout_ms))

        for query in queries:
            try:
                # One short-lived cursor per query, closed by the with-block.
                with connection.cursor() as cursor:
                    cursor.execute("EXPLAIN ANALYZE " + query['query'])
                    res = cursor.fetchall()
                # Index 0 is the planner's estimate, index 1 the actual count.
                cardinality = int(re.findall(r"rows=(\d+)", res[0][0])[1])
                data.append({ 'name': query['name'], 'cardinality': cardinality })
            except Exception as error:
                # Best effort: report the failing query and carry on.
                print("Error while fetching data from PostgreSQL", error)
                print(query)
    finally:
        connection.close()
        print("PostgreSQL connection is closed")

    with open(file_name, 'w') as outfile:
        json.dump({ root_name: data }, outfile)

In [6]:
# Generate one data file per split, in the order training, validation, test.
for split_name, split_queries in (
    ("training", training_queries),
    ("validation", validation_queries),
    ("test", test_queries),
):
    genereta_data(split_queries, workload, split_name, size)

Error while fetching data from PostgreSQL canceling statement due to statement timeout

{'name': '127', 'query': "SELECT mc.note AS production_note FROM movie_companies AS mc, movie_info AS mi WHERE mc.note NOT LIKE '%(as Metro-Goldwyn-Mayer Pictures)%' AND mc.note IS NOT NULL AND mc.movie_id = mi.movie_id;"}
Error while fetching data from PostgreSQL canceling statement due to statement timeout

{'name': '128', 'query': "SELECT mc.note AS production_note FROM cast_info AS ci, movie_companies AS mc WHERE mc.note NOT LIKE '%(as Metro-Goldwyn-Mayer Pictures)%' AND mc.note IS NOT NULL AND ci.movie_id = mc.movie_id;"}
Error while fetching data from PostgreSQL canceling statement due to statement timeout

{'name': '328', 'query': 'SELECT t.title AS movie_title FROM cast_info AS ci, title AS t WHERE t.production_year < 2011 AND t.id = ci.movie_id;'}
Error while fetching data from PostgreSQL canceling statement due to statement timeout

{'name': '400', 'query': 'SELECT t.production_year AS mov