#### TPC-H Workload Generator

We will use the TPC-H dbgen/qgen tools to generate query workloads for the TPC-H benchmark.

In [11]:
import sys
import random
import pandas as pd
import os
import re
import json
import subprocess

In [57]:
# Filesystem locations: the directory holding the qgen binary, and the
# directory the generated workload files are written to.
qgen_root_path = '/home/tanzid/Code/DBMS/tpch-dbgen-master/tpch-dbgen-master/'
workload_output_path = '/home/tanzid/Code/DBMS/datagen/TPCH_workloads/'

# Subset of query templates to use: 1 through 22, with template 15 excluded.
templates = [template_id for template_id in range(1, 23) if template_id != 15]

# Per-template payload metadata, attached to every generated Query.
# Structure: {template number: {table name: [column names]}}.
# Template 15 has no entry (it is excluded from `templates`).
# NOTE(review): "payload" presumably means the columns a template reads in its
# select list -- confirm against the consumer of the workload files.
payloads = {1: {"LINEITEM": ["L_RETURNFLAG", "L_LINESTATUS", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX"]},
            2: {"SUPPLIER": ["S_ACCTBAL", "S_NAME", "S_ADDRESS", "S_PHONE", "S_COMMENT"], "NATION": ["N_NAME"], "PART":
                ["P_PARTKEY", "P_MFGR"], "PARTSUPP": ["PS_SUPPLYCOST"]},
            3: {"LINEITEM": ["L_ORDERKEY", "L_EXTENDEDPRICE", "L_DISCOUNT"], "ORDERS": ["O_ORDERDATE",
                                                                                        "O_SHIPPRIORITY"]},
            4: {"ORDERS": ["O_ORDERPRIORITY"]},
            5: {"NATION": ["N_NAME"], "LINEITEM": ["L_EXTENDEDPRICE", "L_DISCOUNT"]},
            6: {"LINEITEM": ["L_EXTENDEDPRICE", "L_DISCOUNT"]},
            7: {"LINEITEM": ["L_SHIPDATE", "L_EXTENDEDPRICE", "L_DISCOUNT"], "NATION": ["N_NAME"]},
            8: {"ORDERS": ["O_ORDERDATE"], "LINEITEM": ["L_EXTENDEDPRICE", "L_DISCOUNT"], "NATION": ["N_NAME"]},
            9: {"ORDERS": ["O_ORDERDATE"], "LINEITEM": ["L_EXTENDEDPRICE", "L_DISCOUNT", "L_QUANTITY"], "PARTSUPP":
                ["PS_SUPPLYCOST"], "NATION": ["N_NAME"]},
            10: {"CUSTOMER": ["C_CUSTKEY", "C_NAME", "C_ACCTBAL", "C_ADDRESS", "C_PHONE", "C_COMMENT"], "LINEITEM":
                ["L_EXTENDEDPRICE", "L_DISCOUNT"], "NATION": ["N_NAME"]},
            11: {"PARTSUPP": ["PS_PARTKEY", "PS_SUPPLYCOST", "PS_AVAILQTY"]},
            12: {"LINEITEM": ["L_SHIPMODE"], "ORDERS": ["O_ORDERPRIORITY"]},
            13: {"CUSTOMER": ["C_CUSTKEY"], "ORDERS": ["O_ORDERKEY"]},
            14: {"PART": ["P_TYPE"], "LINEITEM": ["L_EXTENDEDPRICE", "L_DISCOUNT"]},
            16: {"PART": ["P_BRAND", "P_TYPE", "P_SIZE"]},
            17: {"LINEITEM": ["L_EXTENDEDPRICE", "L_QUANTITY"]},
            18: {"CUSTOMER": ["C_NAME"], "ORDERS": ["O_ORDERDATE", "O_TOTALPRICE"], "LINEITEM": ["L_QUANTITY"]},
            19: {"LINEITEM": ["L_EXTENDEDPRICE", "L_DISCOUNT"]},
            20: {"SUPPLIER": ["S_NAME", "S_ADDRESS"], "LINEITEM": ["L_QUANTITY"]},
            21: {"SUPPLIER": ["S_NAME"]},
            22: {"CUSTOMER": ["C_PHONE", "C_ACCTBAL"]}}

# Per-template predicate metadata, attached to every generated Query.
# Structure: {template number: {table name: {column name: predicate code}}}.
# The codes used below are "r", "e", "c", "j", "jpk" and "jpk2".
# NOTE(review): the codes are not defined in this file. Presumably "r" = range,
# "e" = equality, "j" = join and "jpk"/"jpk2" = join on (part of) a primary
# key; the meaning of "c" is unclear -- confirm with the consumer of the
# workload files before relying on any of these.
predicates = {1: {"LINEITEM": {"L_SHIPDATE": "r"}},
              2: {"PART": {"P_SIZE": "e", "P_TYPE": "c"}, "PARTSUPP": {"PS_SUPPKEY": "j", "PS_PARTKEY": "j"},
                  "SUPPLIER": {"S_NATIONKEY": "j"}, "NATION": {"N_REGIONKEY": "j"}, "REGION": {"R_NAME": "e"}},
              3: {"ORDERS": {"O_CUSTKEY": "j", "O_ORDERDATE": "r"}, "CUSTOMER": {"C_MKTSEGMENT": "e"}, "LINEITEM":
                  {"L_SHIPDATE": "r", "L_ORDERKEY": "j"}},
              4: {"ORDERS": {"O_ORDERKEY": "jpk", "O_ORDERDATE": "r"}, "LINEITEM":
                  {"L_ORDERKEY": "jpk", "L_COMMITDATE": "c", "L_RECEIPTDATE": "c"}},
              5: {"ORDERS": {"O_CUSTKEY": "j", "O_ORDERKEY": "jpk", "O_ORDERDATE": "r"},
                  "LINEITEM": {"L_ORDERKEY": "jpk", "L_SUPPKEY": "j"},
                  "CUSTOMER": {"C_CUSTKEY": "jpk", "C_NATIONKEY": "j"},
                  "SUPPLIER": {"S_NATIONKEY": "j", "S_SUPPKEY": "jpk"},
                  "NATION": {"N_NATIONKEY": "jpk", "N_REGIONKEY": "j"},
                  "REGION": {"R_REGIONKEY": "jpk", "R_NAME": "e"}},
              6: {"LINEITEM": {"L_SHIPDATE": "r", "L_DISCOUNT": "r", "L_QUANTITY": "r"}},
              7: {"ORDERS": {"O_CUSTKEY": "j", "O_ORDERKEY": "jpk"},
                  "LINEITEM": {"L_ORDERKEY": "jpk", "L_SUPPKEY": "j", "L_SHIPDATE": "r"},
                  "CUSTOMER": {"C_CUSTKEY": "jpk", "C_NATIONKEY": "j"},
                  "SUPPLIER": {"S_NATIONKEY": "j", "S_SUPPKEY": "jpk"},
                  "NATION": {"N_NATIONKEY": "jpk", "N_NAME": "e"}},
              8: {"ORDERS": {"O_CUSTKEY": "j", "O_ORDERKEY": "jpk", "O_ORDERDATE": "r"},
                  "LINEITEM": {"L_ORDERKEY": "jpk", "L_SUPPKEY": "j", "L_PARTKEY": "j"},
                  "CUSTOMER": {"C_CUSTKEY": "jpk", "C_NATIONKEY": "j"},
                  "SUPPLIER": {"S_NATIONKEY": "j", "S_SUPPKEY": "jpk"},
                  "NATION": {"N_NATIONKEY": "jpk", "N_REGIONKEY": "j"},
                  "REGION": {"R_REGIONKEY": "jpk", "R_NAME": "e"}, "PART": {"P_PARTKEY": "jpk", "P_TYPE": "e"}},
              9: {"PART": {"P_PARTKEY": "jpk", "P_NAME": "c"}, "SUPPLIER": {"S_NATIONKEY": "j", "S_SUPPKEY": "jpk"},
                  "LINEITEM": {"L_ORDERKEY": "jpk", "L_SUPPKEY": "j", "L_PARTKEY": "j"},
                  "PARTSUPP": {"PS_SUPPKEY": "jpk2", "PS_PARTKEY": "jpk"}, "ORDERS": {"O_ORDERKEY": "jpk"},
                  "NATION": {"N_NATIONKEY": "jpk"}},
              10: {"ORDERS": {"O_CUSTKEY": "j", "O_ORDERKEY": "jpk", "O_ORDERDATE": "r"},
                   "LINEITEM": {"L_ORDERKEY": "jpk", "L_RETURNFLAG": "e"},
                   "CUSTOMER": {"C_CUSTKEY": "jpk", "C_NATIONKEY": "j"}, "NATION": {"N_NATIONKEY": "jpk"}},
              11: {"PARTSUPP": {"PS_SUPPKEY": "jpk2"}, "SUPPLIER": {"S_SUPPKEY": "jpk", "S_NATIONKEY": "j"},
                   "NATION": {"N_NATIONKEY": "jpk", "N_NAME": "e"}},
              12: {"ORDERS": {"O_ORDERKEY": "jpk"},
                   "LINEITEM": {"L_ORDERKEY": "jpk", "L_SHIPMODE": "e", "L_RECEIPTDATE": "r",
                                "L_SHIPDATE": "r", "L_COMMITDATE": "r"}},
              13: {"CUSTOMER": {"C_CUSTKEY": "jpk"}, "ORDERS": {"O_COMMENT": "c", "O_CUSTKEY": "j"}},
              14: {"LINEITEM": {"L_SHIPDATE": "r", "L_PARTKEY": "j"}, "PART": {"P_PARTKEY": "jpk"}},
              16: {"PARTSUPP": {"PS_PARTKEY":"jpk", "PS_SUPPKEY":"jpk2"},
                   "PART": {"P_PARTKEY": "jpk", "P_SIZE": "e", "P_BRAND": "r", "P_TYPE": "c"},
                   "SUPPLIER": {"S_COMMENT":"c"}},
              17: {"PART": {"P_PARTKEY": "jpk", "P_CONTAINER": "e", "P_BRAND": "e"}, "LINEITEM": {"L_PARTKEY": "j"}},
              18: {"ORDERS": {"O_CUSTKEY": "j", "O_ORDERKEY": "jpk"}, "CUSTOMER": {"C_CUSTKEY": "jpk"},
                   "LINEITEM": {"L_ORDERKEY": "jpk"}},
              19: {"LINEITEM": {"L_SHIPMODE": "e", "L_PARTKEY": "j", "L_QUANTITY": "r", "L_SHIPINSTRUCT": "e"},
                   "PART": {"P_SIZE": "r", "P_PARTKEY": "jpk", "P_BRAND": "e", "P_CONTAINER": "e"}},
              20: {"SUPPLIER": {"S_SUPPKEY": "jpk", "S_NATIONKEY": "j"},
                   "PARTSUPP": {"PS_PARTKEY": "jpk", "PS_AVAILQTY": "r", "PS_SUPPKEY": "jpk2"},
                   "LINEITEM": {"L_SUPPKEY": "j", "L_PARTKEY": "j", "L_SHIPDATE": "r"},
                   "NATION": {"N_NATIONKEY": "jpk", "N_NAME": "e"}, "PART": {"P_NAME": "r"}},
              21: {"SUPPLIER": {"S_SUPPKEY": "jpk", "S_NATIONKEY": "j"},
                   "ORDERS": {"O_ORDERKEY": "jpk", "O_ORDERSTATUS": "e"},
                   "LINEITEM": {"L_SUPPKEY": "j", "L_ORDERKEY": "jpk", "L_RECEIPTDATE": "c", "L_COMMITDATE": "c"},
                   "NATION": {"N_NATIONKEY": "jpk", "N_NAME": "e"}},
              22: {"CUSTOMER": {"C_ACCTBAL": "r", "C_PHONE": "c", "C_CUSTKEY": "jpk"}, "ORDERS": {"O_CUSTKEY": "j"}}}

# Per-template ordering metadata, attached to every generated Query.
# Structure: {template number: {table name: [(column name, "ASC" | "DESC"), ...]}}.
# A template mapped to an empty dict has no ordering columns recorded here.
order_bys = {1: {"LINEITEM": [("L_RETURNFLAG", "ASC"), ("L_LINESTATUS", "ASC")]},
             2: {"SUPPLIER": [("S_ACCTBAL", "DESC"), ("S_NAME", "ASC")], "NATION": [("N_NAME", "ASC")], "PART": [("P_PARTKEY", "ASC")]},
             3: {"ORDERS": [("O_ORDERDATE", "ASC")]},
             4: {"ORDERS": [("O_ORDERPRIORITY", "ASC")]},
             5: {},
             6: {},
             7: {},
             8: {},
             9: {},
             10: {},
             11: {},
             12: {"LINEITEM": [("L_SHIPMODE", "ASC")]},
             13: {},
             14: {},
             16: {},
             17: {"LINEITEM": [("L_EXTENDEDPRICE", "ASC")]},
             18: {"ORDERS": [("O_TOTALPRICE", "DESC"), ("O_ORDERDATE", "ASC")]},
             19: {},
             20: {"SUPPLIER": [("S_NAME", "ASC")]},
             21: {"SUPPLIER": [("S_NAME", "ASC")]},
             22: {}}


# define a query object class
class Query:
    """One generated benchmark query together with its template metadata.

    The instance is a plain attribute container; its ``__dict__`` is what gets
    serialized when a workload is written, so attribute names and their
    assignment order are part of the output format.

    Args:
        template_id: Number of the query template the query was generated from.
        query_string: SQL text of the generated query.
        payload: Payload metadata for the template.
        predicates: Predicate metadata for the template.
        order_bys: Ordering metadata for the template.
        benchmark: Name of the benchmark the query belongs to.
    """

    def __init__(self, template_id, query_string, payload, predicates, order_bys, benchmark="TPC-H"):
        self.template_id = template_id
        self.query_string = query_string
        self.payload = payload
        self.predicates = predicates
        self.order_bys = order_bys
        self.benchmark = benchmark
        # Always starts empty; nothing in the constructor populates it.
        self.group_by = {}

    def __str__(self):
        """Return a multi-line, human-readable summary of the query."""
        # Uses `template_id` (the attribute actually set in __init__) and a
        # plain newline before "query string" instead of the stray "\q".
        return (
            f"template: {self.template_id}\n"
            f"query string: {self.query_string}\n"
            f"payload: {self.payload}\n"
            f"predicates: {self.predicates}\n"
            f"order_bys: {self.order_bys}"
        )


In [65]:
# Workload generation parameters. generate_workload() reads these module-level
# names when it is called, and embeds them in the output file name.

# static mode params
num_rounds_static = 100  # each round will contain a query for each template in static mode

# random mode params
num_queries = 1000  # total number of queries in workload

# dynamic mode params
num_shifts = 5  # number of phases/shifts in workload
num_rounds_per_shift = 20  # number of rounds in each shift
num_templates_per_shift = 10  # number of templates in each shift (drawn at random from `templates`)


#### Generate the workload

In [66]:
# Working directory captured when this cell is executed. generate_workload()
# no longer changes the process working directory, but the name is kept in
# case other code refers to it.
original_dir = os.getcwd()


def _extract_query_string(qgen_output):
    """Return the query text contained in raw qgen output.

    Everything before the first run of three newlines is dropped. Of what
    remains, only the last two ';'-separated segments are kept (per the
    original author's note, earlier segments may hold an erroneous
    statement), and a terminating ';' is appended.
    """
    body = ''.join(qgen_output.split('\n\n\n')[1:])
    return ''.join(body.split(';')[-2:]) + ';'


def _write_query(output_file, seed, template_num):
    """Run qgen for one template and append the query to output_file as a JSON line.

    A failing qgen invocation is reported on stdout and skipped, so one bad
    query does not abort the whole workload.
    """
    # qgen is run with `cwd=` rather than os.chdir(), so the caller's working
    # directory is never modified, even when an exception is raised.
    qgen_command = [os.path.join(qgen_root_path, 'qgen'), '-r', str(seed), str(template_num)]
    try:
        output = subprocess.check_output(qgen_command, cwd=qgen_root_path).decode('utf-8')
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable binary, which the former
        # shell-based invocation surfaced as a non-zero exit status.
        print(f"Error executing qgen with template {template_num}: {e}")
        return

    query = Query(template_num, _extract_query_string(output), payloads[template_num],
                  predicates[template_num], order_bys[template_num])
    json.dump(query.__dict__, output_file)
    output_file.write('\n')


def _static_schedule():
    """Yield (seed, template) pairs: every round runs all templates in fixed order."""
    for _ in range(num_rounds_static):
        seed = random.randint(0, 10000000)  # one seed shared by the whole round
        for template_num in templates:
            yield seed, template_num


def _random_schedule():
    """Yield (seed, template) pairs with no periodicity: each query is drawn independently."""
    for _ in range(num_queries):
        seed = random.randint(0, 10000000)
        yield seed, random.choice(templates)


def _dynamic_schedule():
    """Yield (seed, template) pairs in shifts; each shift repeats its own template subset."""
    for _ in range(num_shifts):
        shift_templates = random.sample(templates, num_templates_per_shift)
        for _ in range(num_rounds_per_shift):
            seed = random.randint(0, 10000000)  # one seed shared by the whole round
            for template_num in shift_templates:
                yield seed, template_num


def generate_workload(workload_type='static'):
    """Generate a workload with qgen and write it as one JSON object per line.

    Modes:
        'static':  the templates appear periodically across rounds, i.e. each
                   round contains every template in a fixed order.
        'random':  there is no periodicity and hence no rounds; each query
                   uses a randomly chosen template.
        'dynamic': the workload has multiple phases/shifts. Within a shift,
                   each round contains that shift's randomly sampled
                   templates in a fixed order.

    The output file is created under ``workload_output_path`` and its name
    encodes the mode and the mode's size parameters.

    Args:
        workload_type: One of 'static', 'random' or 'dynamic'.

    Raises:
        ValueError: If workload_type is not one of the supported modes
            (previously such a call silently did nothing).
    """
    if workload_type == 'static':
        size_tag = f"{num_rounds_static}"
        schedule = _static_schedule()
    elif workload_type == 'random':
        size_tag = f"{num_queries}"
        schedule = _random_schedule()
    elif workload_type == 'dynamic':
        size_tag = f"{num_shifts}_{num_templates_per_shift}_{num_rounds_per_shift}"
        schedule = _dynamic_schedule()
    else:
        raise ValueError(
            f"Unknown workload_type {workload_type!r}; expected 'static', 'random' or 'dynamic'"
        )

    output_filename = f"{workload_output_path}TPCH_{workload_type}_{size_tag}_workload.json"
    with open(output_filename, 'w') as output_file:
        # The schedules are lazy generators, so random draws are interleaved
        # with the qgen runs in the same order as before.
        for seed, template_num in schedule:
            _write_query(output_file, seed, template_num)


In [67]:
# Produce one workload file per supported mode, in this order.
for workload_mode in ('static', 'random', 'dynamic'):
    generate_workload(workload_mode)