#### TPC-H Workload Generator

We will use dbgen/qgen to generate workloads for the TPC-H benchmark.

In [6]:
import sys
import random
import pandas as pd
import os
import re
import json
import subprocess

In [7]:
# Locations of the qgen tool directory and of the directory that receives the generated workload files.
qgen_root_path = '/home/tanzid/Code/DBMS/tpch-dbgen-master/tpch-dbgen-master/'
workload_output_path = '/home/tanzid/Code/DBMS/datagen/TPCH_workloads/'

# Subset of templates to use: 1 through 22, with template 15 excluded.
templates = [template_num for template_num in range(1, 23) if template_num != 15]

# Per-template payload columns.
# Shape: {template number: {table name: [column name, ...]}}.
# There is one entry per template number in use (no entry for 15).
# NOTE(review): presumably these are the columns each query reads in its
# SELECT list / aggregates — confirm against the consumer of the workload file.
payloads = {1: {"lineitem": ["l_returnflag", "l_linestatus", "l_quantity", "l_extendedprice", "l_discount", "l_tax"]},
            2: {"supplier": ["s_acctbal", "s_name", "s_address", "s_phone", "s_comment"], "nation": ["n_name"], "part":
                ["p_partkey", "p_mfgr"], "partsupp": ["ps_supplycost"]},
            3: {"lineitem": ["l_orderkey", "l_extendedprice", "l_discount"], "orders": ["o_orderdate",
                                                                                        "o_shippriority"]},
            4: {"orders": ["o_orderpriority"]},
            5: {"nation": ["n_name"], "lineitem": ["l_extendedprice", "l_discount"]},
            6: {"lineitem": ["l_extendedprice", "l_discount"]},
            7: {"lineitem": ["l_shipdate", "l_extendedprice", "l_discount"], "nation": ["n_name"]},
            8: {"orders": ["o_orderdate"], "lineitem": ["l_extendedprice", "l_discount"], "nation": ["n_name"]},
            9: {"orders": ["o_orderdate"], "lineitem": ["l_extendedprice", "l_discount", "l_quantity"], "partsupp":
                ["ps_supplycost"], "nation": ["n_name"]},
            10: {"customer": ["c_custkey", "c_name", "c_acctbal", "c_address", "c_phone", "c_comment"], "lineitem":
                ["l_extendedprice", "l_discount"], "nation": ["n_name"]},
            11: {"partsupp": ["ps_partkey", "ps_supplycost", "ps_availqty"]},
            12: {"lineitem": ["l_shipmode"], "orders": ["o_orderpriority"]},
            13: {"customer": ["c_custkey"], "orders": ["o_orderkey"]},
            14: {"part": ["p_type"], "lineitem": ["l_extendedprice", "l_discount"]},
            16: {"part": ["p_brand", "p_type", "p_size"]},
            17: {"lineitem": ["l_extendedprice", "l_quantity"]},
            18: {"customer": ["c_name"], "orders": ["o_orderdate", "o_totalprice"], "lineitem": ["l_quantity"]},
            19: {"lineitem": ["l_extendedprice", "l_discount"]},
            20: {"supplier": ["s_name", "s_address"], "lineitem": ["l_quantity"]},
            21: {"supplier": ["s_name"]},
            22: {"customer": ["c_phone", "c_acctbal"]}}

# Per-template predicate columns.
# Shape: {template number: {table name: {column name: predicate-type code}}}.
# The codes used below are "r", "e", "c", "j", "jpk" and "jpk2"; their meaning is
# not defined in this file.
# NOTE(review): presumably r = range predicate, e = equality predicate,
# c = other comparison (e.g. LIKE or column-to-column), j = join column,
# jpk = join on a primary-key column, jpk2 = join on the second column of a
# composite primary key — confirm against the code that consumes these codes.
predicates = {1: {"lineitem": {"l_shipdate": "r"}},
              2: {"part": {"p_size": "e", "p_type": "c"}, "partsupp": {"ps_suppkey": "j", "ps_partkey": "j"},
                  "supplier": {"s_nationkey": "j"}, "nation": {"n_regionkey": "j"}, "region": {"r_name": "e"}},
              3: {"orders": {"o_custkey": "j", "o_orderdate": "r"}, "customer": {"c_mktsegment": "e"}, "lineitem":
                  {"l_shipdate": "r", "l_orderkey": "j"}},
              4: {"orders": {"o_orderkey": "jpk", "o_orderdate": "r"}, "lineitem":
                  {"l_orderkey": "jpk", "l_commitdate": "c", "l_receiptdate": "c"}},
              5: {"orders": {"o_custkey": "j", "o_orderkey": "jpk", "o_orderdate": "r"},
                  "lineitem": {"l_orderkey": "jpk", "l_suppkey": "j"},
                  "customer": {"c_custkey": "jpk", "c_nationkey": "j"},
                  "supplier": {"s_nationkey": "j", "s_suppkey": "jpk"},
                  "nation": {"n_nationkey": "jpk", "n_regionkey": "j"},
                  "region": {"r_regionkey": "jpk", "r_name": "e"}},
              6: {"lineitem": {"l_shipdate": "r", "l_discount": "r", "l_quantity": "r"}},
              7: {"orders": {"o_custkey": "j", "o_orderkey": "jpk"},
                  "lineitem": {"l_orderkey": "jpk", "l_suppkey": "j", "l_shipdate": "r"},
                  "customer": {"c_custkey": "jpk", "c_nationkey": "j"},
                  "supplier": {"s_nationkey": "j", "s_suppkey": "jpk"},
                  "nation": {"n_nationkey": "jpk", "n_name": "e"}},
              8: {"orders": {"o_custkey": "j", "o_orderkey": "jpk", "o_orderdate": "r"},
                  "lineitem": {"l_orderkey": "jpk", "l_suppkey": "j", "l_partkey": "j"},
                  "customer": {"c_custkey": "jpk", "c_nationkey": "j"},
                  "supplier": {"s_nationkey": "j", "s_suppkey": "jpk"},
                  "nation": {"n_nationkey": "jpk", "n_regionkey": "j"},
                  "region": {"r_regionkey": "jpk", "r_name": "e"}, "part": {"p_partkey": "jpk", "p_type": "e"}},
              9: {"part": {"p_partkey": "jpk", "p_name": "c"}, "supplier": {"s_nationkey": "j", "s_suppkey": "jpk"},
                  "lineitem": {"l_orderkey": "jpk", "l_suppkey": "j", "l_partkey": "j"},
                  "partsupp": {"ps_suppkey": "jpk2", "ps_partkey": "jpk"}, "orders": {"o_orderkey": "jpk"},
                  "nation": {"n_nationkey": "jpk"}},
              10: {"orders": {"o_custkey": "j", "o_orderkey": "jpk", "o_orderdate": "r"},
                   "lineitem": {"l_orderkey": "jpk", "l_returnflag": "e"},
                   "customer": {"c_custkey": "jpk", "c_nationkey": "j"}, "nation": {"n_nationkey": "jpk"}},
              11: {"partsupp": {"ps_suppkey": "jpk2"}, "supplier": {"s_suppkey": "jpk", "s_nationkey": "j"},
                   "nation": {"n_nationkey": "jpk", "n_name": "e"}},
              12: {"orders": {"o_orderkey": "jpk"},
                   "lineitem": {"l_orderkey": "jpk", "l_shipmode": "e", "l_receiptdate": "r",
                                "l_shipdate": "r", "l_commitdate": "r"}},
              13: {"customer": {"c_custkey": "jpk"}, "orders": {"o_comment": "c", "o_custkey": "j"}},
              14: {"lineitem": {"l_shipdate": "r", "l_partkey": "j"}, "part": {"p_partkey": "jpk"}},
              16: {"partsupp": {"ps_partkey":"jpk", "ps_suppkey":"jpk2"},
                   "part": {"p_partkey": "jpk", "p_size": "e", "p_brand": "r", "p_type": "c"},
                   "supplier": {"s_comment":"c"}},
              17: {"part": {"p_partkey": "jpk", "p_container": "e", "p_brand": "e"}, "lineitem": {"l_partkey": "j"}},
              18: {"orders": {"o_custkey": "j", "o_orderkey": "jpk"}, "customer": {"c_custkey": "jpk"},
                   "lineitem": {"l_orderkey": "jpk"}},
              19: {"lineitem": {"l_shipmode": "e", "l_partkey": "j", "l_quantity": "r", "l_shipinstruct": "e"},
                   "part": {"p_size": "r", "p_partkey": "jpk", "p_brand": "e", "p_container": "e"}},
              20: {"supplier": {"s_suppkey": "jpk", "s_nationkey": "j"},
                   "partsupp": {"ps_partkey": "jpk", "ps_availqty": "r", "ps_suppkey": "jpk2"},
                   "lineitem": {"l_suppkey": "j", "l_partkey": "j", "l_shipdate": "r"},
                   "nation": {"n_nationkey": "jpk", "n_name": "e"}, "part": {"p_name": "r"}},
              21: {"supplier": {"s_suppkey": "jpk", "s_nationkey": "j"},
                   "orders": {"o_orderkey": "jpk", "o_orderstatus": "e"},
                   "lineitem": {"l_suppkey": "j", "l_orderkey": "jpk", "l_receiptdate": "c", "l_commitdate": "c"},
                   "nation": {"n_nationkey": "jpk", "n_name": "e"}},
              22: {"customer": {"c_acctbal": "r", "c_phone": "c", "c_custkey": "jpk"}, "orders": {"o_custkey": "j"}}}

# Per-template ORDER BY columns.
# Shape: {template number: {table name: [(column name, "asc" | "desc"), ...]}}.
# An empty dict means no order-by columns are recorded for that template.
# The (column, direction) pairs are tuples, so json.dump writes them as 2-element arrays.
# NOTE(review): the standard TPC-H Q17 text has no ORDER BY clause — confirm
# that the entry for template 17 is intentional.
order_bys = {1: {"lineitem": [("l_returnflag", "asc"), ("l_linestatus", "asc")]},
             2: {"supplier": [("s_acctbal", "desc"), ("s_name", "asc")], "nation": [("n_name", "asc")], "part": [("p_partkey", "asc")]},
             3: {"orders": [("o_orderdate", "asc")]},
             4: {"orders": [("o_orderpriority", "asc")]},
             5: {},
             6: {},
             7: {},
             8: {},
             9: {},
             10: {},
             11: {},
             12: {"lineitem": [("l_shipmode", "asc")]},
             13: {},
             14: {},
             16: {"part": [("p_brand", "asc"), ("p_type", "asc"), ("p_size", "asc")]},
             17: {"lineitem": [("l_extendedprice", "asc")]},
             18: {"orders": [("o_totalprice", "desc"), ("o_orderdate", "asc")]},
             19: {},
             20: {"supplier": [("s_name", "asc")]},
             21: {"supplier": [("s_name", "asc")]},
             22: {}}

# Per-template GROUP BY columns.
# Shape: {template number: {table name: [column name, ...]}}.
# An empty dict means no group-by columns are recorded for that template.
# Every column entry is a plain string (unlike order_bys, there is no direction).
group_bys = {1: {"lineitem": ["l_returnflag", "l_linestatus"]},
             2: {},
             3: {"lineitem": ["l_orderkey"], "orders": ["o_orderdate", "o_shippriority"]},
             4: {"orders": ["o_orderpriority"]},
             5: {},
             6: {},
             7: {"nation": ["n_name"]},
             8: {},
             9: {"nation": ["n_name"]},
             10: {"customer": ["c_custkey", "c_name", "c_acctbal", "c_phone", "c_address", "c_comment"], "nation": ["n_name"]},
             11: {},
             12: {"lineitem": ["l_shipmode"]},
             13: {},
             14: {},
             16: {"part": ["p_brand", "p_type", "p_size"]},
             # Template 17 previously held an (column, "asc") tuple copied from
             # order_bys; the standard Q17 text has no GROUP BY, so nothing is recorded.
             17: {},
             18: {"orders": ["o_orderkey", "o_totalprice", "o_orderdate"], "customer": ["c_name", "c_custkey"]},
             19: {},
             20: {},
             21: {"supplier": ["s_name"]},
             22: {}}


# define a query object class
class Query:
    """One generated benchmark query together with its per-template metadata.

    The attributes are plain instance attributes, so ``instance.__dict__``
    yields them in the order they are assigned in ``__init__``.

    Args:
        template_id: number of the template the query was generated from.
        query_string: SQL text of the query.
        payload: payload columns for the template, keyed by table name.
        predicates: predicate columns for the template, keyed by table name.
        order_bys: order-by columns for the template, keyed by table name.
        group_bys: group-by columns for the template, keyed by table name.
        benchmark: name of the benchmark the query belongs to.
    """

    def __init__(self, template_id, query_string, payload, predicates, order_bys, group_bys, benchmark="TPC-H"):
        self.template_id = template_id
        self.query_string = query_string
        self.payload = payload
        self.predicates = predicates
        self.order_bys = order_bys
        self.group_bys = group_bys
        self.benchmark = benchmark

    def __str__(self):
        """Return a multi-line, human-readable summary of the query."""
        # Uses template_id (the attribute that actually exists) and one field per line.
        return (
            f"template: {self.template_id}\n"
            f"query string: {self.query_string}\n"
            f"payload: {self.payload}\n"
            f"predicates: {self.predicates}\n"
            f"order_bys: {self.order_bys}\n"
            f"group_bys: {self.group_bys}"
        )


In [8]:
# ---- workload generation parameters ----

# Static mode: number of rounds; each round contains one query per template.
num_rounds_static = 100

# Random mode: total number of queries in the workload.
num_queries = 1000

# Dynamic mode: the workload is a sequence of phases ("shifts").
# Number of phases/shifts in the workload.
num_shifts = 5
# Number of rounds in each shift.
num_rounds_per_shift = 20
# Number of templates in each shift.
num_templates_per_shift = 10


#### Generate the workload

In [9]:
# Working directory of the process at the time this cell runs.
original_dir = os.getcwd()

# Accepted values for workload_type: ['static', 'random', 'dynamic'],
# e.g. workload_type = 'dynamic'
def _extract_query_string(qgen_output):
    """Return the SQL statement contained in raw qgen output.

    Everything before the first run of three newlines is dropped. Of the
    remaining ';'-separated pieces only the last two are kept, since the
    leading piece may contain an erroneous statement, and a single ';' is
    appended to the result.
    """
    body = ''.join(qgen_output.split('\n\n\n')[1:])
    return ''.join(body.split(';')[-2:]) + ';'


def _write_query(output_file, template_num, seed):
    """Run qgen for one template and seed, and append the query as one JSON line.

    A failing qgen invocation is reported on stdout and skipped, so one bad
    template does not abort the whole workload.
    """
    try:
        # Argument list instead of a shell string: no shell parsing is involved.
        # cwd= runs qgen inside its own directory without changing the working
        # directory of this process, so nothing has to be restored afterwards.
        raw_output = subprocess.check_output(
            ["./qgen", "-r", str(seed), str(template_num)],
            cwd=qgen_root_path,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable qgen binary, which the
        # shell used to report as a non-zero exit status.
        print(f"Error executing qgen with template {template_num}: {e}")
        return

    query = Query(
        template_num,
        _extract_query_string(raw_output.decode('utf-8')),
        payloads[template_num],
        predicates[template_num],
        order_bys[template_num],
        group_bys[template_num],
    )
    # One JSON object per line.
    json.dump(query.__dict__, output_file)
    output_file.write('\n')


def generate_workload(workload_type='static'):
    """Generate a TPC-H workload file with qgen.

    The workload is written under ``workload_output_path`` with one JSON
    object per line (the ``__dict__`` of a ``Query``). The size of the
    workload is controlled by the module-level parameters.

    Args:
        workload_type: one of
            'static'  - ``num_rounds_static`` rounds; every round contains one
                        query per template, in the fixed order of ``templates``.
            'random'  - ``num_queries`` queries, each from a randomly chosen
                        template; there is no periodicity and hence no rounds.
            'dynamic' - ``num_shifts`` phases; each phase samples
                        ``num_templates_per_shift`` templates and repeats them
                        in a fixed order for ``num_rounds_per_shift`` rounds.

    Raises:
        ValueError: if ``workload_type`` is not one of the values above.
        FileNotFoundError: if ``qgen_root_path`` is not an existing directory.
    """
    # Reject unknown modes up front instead of silently generating nothing.
    if workload_type not in ('static', 'random', 'dynamic'):
        raise ValueError(
            f"Unknown workload_type {workload_type!r}; expected 'static', 'random' or 'dynamic'"
        )

    # Fail once, before creating the output file, rather than once per query.
    if not os.path.isdir(qgen_root_path):
        raise FileNotFoundError(f"qgen directory not found: {qgen_root_path}")

    if workload_type == 'static':
        output_filename = f"{workload_output_path}TPCH_{workload_type}_{num_rounds_static}_workload.json"
    elif workload_type == 'random':
        output_filename = f"{workload_output_path}TPCH_{workload_type}_{num_queries}_workload.json"
    else:
        output_filename = f"{workload_output_path}TPCH_{workload_type}_{num_shifts}_{num_templates_per_shift}_{num_rounds_per_shift}_workload.json"

    with open(output_filename, 'w') as output_file:
        if workload_type == 'static':
            for _ in range(num_rounds_static):
                # one random seed per round, shared by all templates of the round
                seed = random.randint(0, 10000000)
                for template_num in templates:
                    _write_query(output_file, template_num, seed)

        elif workload_type == 'random':
            for _ in range(num_queries):
                # fresh random seed and random template for every query
                seed = random.randint(0, 10000000)
                template_num = random.choice(templates)
                _write_query(output_file, template_num, seed)

        else:  # 'dynamic'
            for _ in range(num_shifts):
                # templates used throughout the current shift
                shift_templates = random.sample(templates, num_templates_per_shift)
                for _ in range(num_rounds_per_shift):
                    # one random seed per round, shared by all templates of the round
                    seed = random.randint(0, 10000000)
                    for template_num in shift_templates:
                        _write_query(output_file, template_num, seed)


In [10]:
# Produce one workload file per mode, in the order static, random, dynamic.
for workload_type in ('static', 'random', 'dynamic'):
    generate_workload(workload_type)