In [1]:
import csv
import re
import os
import random

folder = "src/main/resources/data"

# Tables that take part in the generated operations, listed in dependency
# order. nation.tbl only contributes initial insertions; the loop below
# skips it when emitting update operations.
files = ['nation.tbl', 'customer.tbl', 'orders.tbl', 'lineitem.tbl']

# Size of the update sample drawn from each table. select_updates() reads a
# value below 1 as a fraction of the table's rows and a value of 1 or more
# as an absolute row count.
n_updates = 0.1

def read_file(file):
    """Read a pipe-delimited table file from ``folder`` into a list of rows.

    Args:
        file: File name relative to the module-level ``folder``.

    Returns:
        A list with one entry per parsed row; each row is a list of the
        string fields obtained by splitting on ``|``.

    Raises:
        OSError: if the file cannot be opened.
    """
    # newline='' hands line-ending handling to the csv module, which is what
    # the csv documentation asks for when passing a file object to a reader.
    with open(folder + "/" + file, 'r', newline='') as tbl_file:
        return list(csv.reader(tbl_file, delimiter='|'))
    
def select_updates(data, n_updates):
    """Pick a random subset of ``data`` to use as update tuples.

    Args:
        data: Sequence of rows to sample from.
        n_updates: Requested sample size. A value below 1 is read as a
            fraction of ``len(data)`` (truncated toward zero); a value of 1
            or more is read as an absolute row count, so ``1`` means one
            row, not 100%. A count larger than ``len(data)`` is capped at
            ``len(data)``.

    Returns:
        A new list of elements of ``data`` sampled without replacement.

    Raises:
        ValueError: if the resolved sample size is negative (raised by
            ``random.sample``).
    """
    if n_updates < 1:
        sample_size = int(n_updates * len(data))
    else:
        sample_size = int(n_updates)

    # random.sample raises ValueError when asked for more items than the
    # population holds, so an oversized absolute count selects every row.
    sample_size = min(sample_size, len(data))

    return random.sample(data, sample_size)

# Operation logs accumulated across all tables.
init_ops = []    # one insertion per tuple read from the table files
update_ops = []  # insertions/deletions built from the sampled tuples

for file in files:
    input_file = file
    print("Processing " + input_file)

    # load every tuple of the table
    data = read_file(input_file)

    # draw the sample used for update operations; half of it (at least one
    # tuple when the sample is non-empty) is reused for the invalid operations
    updates = select_updates(data, n_updates)
    n_invalid_ops = max(1, int(len(updates)*0.5))

    # operation prefixes: "+|<table>|" marks an insert, "-|<table>|" a delete
    table = file.split('.')[0]
    insert_tag = f"+|{table}|"
    delete_tag = f"-|{table}|"

    # initialize the database (insert all tuples)
    init_ops.extend(insert_tag + '|'.join(row) for row in data)

    # nation.tbl only takes part in the initial load
    if input_file == "nation.tbl":
        continue

    repeated = updates[:n_invalid_ops]
    # invalid insertions: tuples that init_ops already inserted
    update_ops.extend(insert_tag + '|'.join(row) for row in repeated)
    # valid deletions: every sampled tuple
    update_ops.extend(delete_tag + '|'.join(row) for row in updates)
    # invalid deletions: tuples removed by the deletions just above
    update_ops.extend(delete_tag + '|'.join(row) for row in repeated)

# Write each operation log to its own file, one operation per line:
# the initial load, the updates alone, and both concatenated.
for suffix, ops in (
    ("/ops_init.txt", init_ops),
    ("/ops_update.txt", update_ops),
    ("/ops_all.txt", init_ops + update_ops),
):
    with open(folder + suffix, 'w') as op_file:
        op_file.writelines(op + '\n' for op in ops)

print("Done!")

Processing nation.tbl
Processing customer.tbl
Processing orders.tbl
Processing lineitem.tbl
Done!


In [52]:
import csv
import re
import os
import random

folder = "src/main/resources/data"

# list all the files in the folder
files = os.listdir(folder)
n_insert = 10  # number of rows per file to tag as insertions ('+'); passed to generate_update_tags
n_delete = 10  # number of rows per file to tag as deletions ('-'); passed to generate_update_tags

def generate_update_tags(k, insert_count=10, delete_count=10):
    """Return a shuffled list of exactly ``k`` row tags.

    Tags are ``'+'`` (insert), ``'-'`` (delete) and ``'0'`` (untouched).

    Args:
        k: Number of tags to produce (one per row). Negative values are
            treated as 0.
        insert_count: Requested number of ``'+'`` tags (non-negative).
        delete_count: Requested number of ``'-'`` tags (non-negative).

    Returns:
        A list of length ``k`` in random order. When ``k`` is smaller than
        ``insert_count + delete_count`` the two counts are scaled down
        proportionally so the list never holds more than ``k`` tags.
    """
    k = max(k, 0)
    requested = insert_count + delete_count
    if requested > k:
        # Not enough rows for every requested tag: keep the insert/delete
        # ratio and give the rounding remainder to the deletions.
        insert_count = k * insert_count // requested
        delete_count = k - insert_count

    tags = ['+'] * insert_count + ['-'] * delete_count + ['0'] * (k - insert_count - delete_count)
    random.shuffle(tags)
    return tags

# Name of the SQL script (inside `folder`) that collects the generated
# INSERT/DELETE statements.
# NOTE(review): `update_sql_file` is not assigned anywhere in the visible
# notebook, so this cell raised NameError on a fresh kernel. The default
# below is a placeholder used only when no value exists yet - confirm the
# intended file name.
if "update_sql_file" not in globals():
    update_sql_file = "update.sql"

# Placeholder column names used in the DELETE predicates.
# NOTE(review): zip() stops at the shorter sequence, so only the first ten
# fields of a row end up in the WHERE clause - confirm this is intended.
delete_columns = ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10"]

# For each .tbl file: tag its rows at random, write an INSERT statement for
# every '+' row (the row is left out of the .csv), write a DELETE statement
# for every '-' row (the row stays in the .csv), and copy all other rows to
# the .csv unchanged.
with open(folder + "/" + update_sql_file, 'w') as sql_file:
    for file in files:
        if not re.search(r'\.tbl$', file):
            continue

        input_file = file
        output_file = re.sub(r'\.tbl$', '.csv', input_file)
        # Computed once per file, outside any f-string: a backslash inside an
        # f-string expression is a SyntaxError before Python 3.12.
        table_name = re.sub(r'\.tbl$', '', input_file)

        with open(folder + "/" + input_file, 'r') as tbl_file:
            print("Reading file: " + input_file)

            # count the number of rows in the file
            n_rows = sum(1 for _ in tbl_file)
            print("Number of rows: " + str(n_rows))

            # generate a list of random tags
            tags = generate_update_tags(n_rows, n_insert, n_delete)

            # reset the file pointer to the beginning of the file
            tbl_file.seek(0)

            with open(folder + "/" + output_file, 'w', newline='') as csv_file:
                tbl_reader = csv.reader(tbl_file, delimiter='|')
                csv_writer = csv.writer(csv_file)

                for row in tbl_reader:
                    # Fall back to "untouched" if the tag list runs out.
                    tag = tags.pop() if tags else '0'

                    # The tags are '+', '-' and '0'; the previous comparison
                    # against 'i' / 'd' never matched, so no SQL was written.
                    # NOTE(review): values are written verbatim (unquoted),
                    # so text fields will not form valid SQL literals.
                    if tag == '+':
                        insert_query = f"INSERT INTO {table_name} VALUES ({','.join(row)});"
                        sql_file.write(insert_query + "\n")
                    elif tag == '-':
                        predicates = [f + "=" + v for f, v in zip(delete_columns, row)]
                        delete_query = "DELETE FROM " + table_name + " WHERE " + " AND ".join(predicates) + ";"
                        sql_file.write(delete_query + "\n")
                        csv_writer.writerow(row)
                    else:
                        csv_writer.writerow(row)


Reading file: region.tbl
Number of rows: 5
Reading file: supplier.tbl
Number of rows: 100
Reading file: nation.tbl
Number of rows: 25
Reading file: customer.tbl
Number of rows: 1500
Reading file: lineitem.tbl
Number of rows: 60175
Reading file: orders.tbl
Number of rows: 15000
Reading file: partsupp.tbl
Number of rows: 8000
Reading file: part.tbl
Number of rows: 2000


In [None]:
import csv
import re
import os

folder = "src/main/resources/data"

# every entry of the data folder; entries not ending in .tbl are skipped below
files = os.listdir(folder)

# Convert each pipe-delimited .tbl file into a comma-separated .csv file
# with the same base name, written next to the source file.
for file in files:
    if not re.search(r'\.tbl$', file):
        continue

    input_file = file
    output_file = re.sub(r'\.tbl$', '.csv', input_file)

    with open(folder + "/" + input_file, 'r') as tbl_file, \
            open(folder + "/" + output_file, 'w', newline='') as csv_file:
        # copy all rows across, changing only the delimiter
        csv.writer(csv_file).writerows(csv.reader(tbl_file, delimiter='|'))