1. Add operation (+/-) and table identifiers to each row

In [6]:
import csv
import random
import numpy as np

# Fraction of each table's rows that the generation loop samples as random
# insert operations for the update stream.
rand_insert_ratio = 0.5

def read_txt_file(fpath):
    """Return all lines of the text file at ``fpath`` as a list of strings.

    Line terminators are preserved, exactly as ``readlines()`` yields them.
    """
    with open(fpath, 'r') as handle:
        lines = handle.readlines()
    return lines
    
def read_raw_data(fpath):
    """Parse the pipe-delimited file at ``fpath`` into a list of rows.

    Each row is the list of string fields produced by ``csv.reader`` with
    ``'|'`` as the delimiter; a line that ends with ``|`` therefore yields a
    trailing empty-string field.
    """
    with open(fpath, 'r') as source:
        rows = [record for record in csv.reader(source, delimiter='|')]
    return rows

# Scale factors to process. Only 0.1 is generated here; the full sweep would
# be ['0.2', '0.4', '0.6', '0.8', '1.0'].
# NOTE(review): no random seed is set in this cell, so every run produces a
# different update stream.
for sf in ['0.1']:

    folder = "tpch_datasets/data_sf" + sf
    files = ['nation.tbl', 'customer.tbl', 'orders.tbl', 'lineitem.tbl']

    init_ops = []
    update_ops = []

    for file in files:
        data = read_raw_data(folder + "/" + file)
        table = file.split('.')[0]

        # Every op line is "<tag>|<table>|<fields...>"; insert: +, delete: -
        insert_tag = f"+|{table}|"
        delete_tag = f"-|{table}|"

        # insert-only operations: one insert per row of the raw table
        init_ops += [insert_tag + '|'.join(row) for row in data]

        # random-update operations
        # At least 10 rows are requested, but never more than the table
        # holds: random.sample raises ValueError when asked for more items
        # than the population contains.
        rand_insert_num = min(len(data), max(10, int(len(data) * rand_insert_ratio)))
        rand_delete_half = int(rand_insert_num * 0.5)

        rand_insert_ops = random.sample(data, rand_insert_num)
        # Half of the deletes target rows that are also in the insert sample,
        # the other half are drawn from the whole table (the two halves may
        # overlap).
        rand_delete_ops = random.sample(rand_insert_ops, rand_delete_half)
        rand_delete_ops += random.sample(data, rand_delete_half)

        update_ops += [insert_tag + '|'.join(row) for row in rand_insert_ops]
        update_ops += [delete_tag + '|'.join(row) for row in rand_delete_ops]

    with open(folder + f"/ops_sf{sf}_init.txt", 'w') as op_file:
        op_file.writelines(op + '\n' for op in init_ops)

    # Interleave inserts and deletes of all tables in random order.
    random.shuffle(update_ops)
    with open(folder + f"/ops_sf{sf}_update.txt", 'w') as op_file:
        op_file.writelines(op + '\n' for op in update_ops)

print("Done!")

Done!


2. Split Dataset

3. Convert the operations to SQL INSERT/DELETE statements

In [19]:
# Column names of each table, listed in the positional order of the
# pipe-separated fields of an operation line. convert_to_sql indexes into
# these lists by field position when it builds DELETE conditions.
fieldnames = {
    "nation": [
        "N_NATIONKEY", "N_NAME", "N_REGIONKEY", "N_COMMENT",
    ],
    "customer": [
        "C_CUSTKEY", "C_NAME", "C_ADDRESS", "C_NATIONKEY",
        "C_PHONE", "C_ACCTBAL", "C_MKTSEGMENT", "C_COMMENT",
    ],
    "orders": [
        "O_ORDERKEY", "O_CUSTKEY", "O_ORDERSTATUS", "O_TOTALPRICE",
        "O_ORDERDATE", "O_ORDERPRIORITY", "O_CLERK", "O_SHIPPRIORITY",
        "O_COMMENT",
    ],
    "lineitem": [
        "L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER",
        "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX",
        "L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE",
        "L_RECEIPTDATE", "L_SHIPINSTRUCT", "L_SHIPMODE", "L_COMMENT",
    ],
}

def convert_to_sql(ops):
    """Translate tagged operation lines into SQL statements.

    Each op has the form ``<tag>|<table>|<field>|<field>|...`` where the tag
    is ``+`` (insert) or ``-`` (delete). A trailing line terminator and the
    empty field produced by a trailing ``|`` are ignored.

    Args:
        ops: iterable of operation strings (with or without a trailing
            newline). Blank lines are skipped.

    Returns:
        A list of SQL statement strings, one per non-blank op, in input
        order. Inserts become ``INSERT INTO <table> VALUES (...)``; deletes
        become ``DELETE FROM <table> WHERE <col> = <value> AND ...`` using
        the module-level ``fieldnames`` mapping for column names.

    Raises:
        ValueError: if an op has no table field or does not start with
            ``+`` or ``-``.
        KeyError: if a delete op names a table missing from ``fieldnames``.
    """

    def render(value):
        # Only strings made purely of digits are emitted bare; everything
        # else (decimals, negative numbers, dates, text) becomes a quoted
        # string literal. Embedded single quotes are doubled so the value
        # cannot terminate the literal early.
        if value.isdigit():
            return value
        return "'" + value.replace("'", "''") + "'"

    sql_statements = []

    for op in ops:
        line = op.rstrip('\r\n')
        if not line.strip():
            continue

        parts = line.split('|')
        if len(parts) < 2:
            raise ValueError(f"Malformed operation (no table name): {line!r}")

        table_name = parts[1]
        values = parts[2:]
        # A line ending in '|' leaves an empty last field that does not
        # correspond to any column.
        if values and values[-1] == '':
            values.pop()

        if line.startswith("+"):
            rendered = [render(v) for v in values]
            sql = f"INSERT INTO {table_name} VALUES ({', '.join(rendered)});"
        elif line.startswith("-"):
            columns = fieldnames[table_name]
            conditions = [f"{columns[i]} = {render(v)}" for i, v in enumerate(values)]
            sql = f"DELETE FROM {table_name} WHERE {' AND '.join(conditions)};"
        else:
            # Previously an unknown tag silently re-emitted the preceding
            # statement (or hit an unbound variable on the first op).
            raise ValueError(f"Unknown operation tag in: {line!r}")

        sql_statements.append(sql)

    return sql_statements


In [None]:
# Convert the init and update operation files of every scale factor into
# MySQL scripts under mysql_test/.
# NOTE(review): the ops files for these scale factors must already exist in
# their data folders — confirm they have been generated before running.
for sf in ['0.2', '0.4', '0.6', '0.8', '1.0']:
    folder = "tpch_datasets/data_sf" + sf

    for fname in [f'ops_sf{sf}_init.txt', f'ops_sf{sf}_update.txt']:
        # convert_to_sql expects the operation lines themselves, not a file
        # path, so the file has to be read first.
        ops = read_txt_file(folder + "/" + fname)
        sql_statements = convert_to_sql(ops)

        with open("mysql_test/" + fname.replace("txt", "sql"), 'w') as sql_file:
            sql_file.write("USE tpch;\n")
            for statement in sql_statements:
                sql_file.write(statement + '\n')


In [21]:
# How many monitoring points the update stream is divided into.
n_split = 5

# Scale factor whose update stream gets split.
sf = 0.1

# Location of the operation files for that scale factor, and the name of the
# update file inside it (note that fname carries its own leading slash).
folder = f"tpch_datasets/data_sf{sf}"
fname = f"/ops_sf{sf}_update.txt"

def split_data(data, n_subset=5):
    """Split ``data`` into ``n_subset`` consecutive, nearly equal chunks.

    Chunk sizes follow the same rule as ``numpy.array_split``: the first
    ``len(data) % n_subset`` chunks hold one extra element. The split is done
    with plain list slicing, so the elements are returned unchanged (no
    conversion to a fixed-width NumPy string array).

    Args:
        data: sequence (or iterable) of items to split.
        n_subset: number of chunks to produce; must be positive.

    Returns:
        A list of ``n_subset`` lists. Trailing chunks are empty when there
        are fewer items than chunks.

    Raises:
        ValueError: if ``n_subset`` is not greater than 0.
    """
    if n_subset <= 0:
        raise ValueError("number sections must be larger than 0.")

    items = list(data)
    base, extra = divmod(len(items), n_subset)

    chunks = []
    start = 0
    for index in range(n_subset):
        # The first `extra` chunks absorb the remainder, one element each.
        stop = start + base + (1 if index < extra else 0)
        chunks.append(items[start:stop])
        start = stop
    return chunks

update_ops = read_txt_file(folder + fname)
# Use n_split everywhere (instead of a literal 5) so the number of chunks,
# the loop bound and the fraction in the file names always agree.
subsets = split_data(update_ops, n_split)

for i in range(n_split):
    # Fraction of the update stream covered once chunk i has been applied;
    # it becomes the suffix of both output file names.
    fraction = (i + 1) / n_split

    # The ops file is cumulative: it holds every chunk up to and including i.
    with open(folder + fname.replace(".txt", f"_{fraction}.txt"), 'w') as op_file:
        for s in subsets[:i+1]:
            for op in s:
                op_file.write(op)

    # The SQL file is incremental: it holds only chunk i.
    with open("mysql_test/" + fname.replace(".txt", f"_{fraction}.sql"), 'w') as sql_file:
        ops = subsets[i]
        sql_statements = convert_to_sql(ops)
        sql_file.write("USE tpch;\n")
        for statement in sql_statements:
            sql_file.write(statement + '\n')