#### Test simple cost model

In [68]:
# auto reload all modules
%load_ext autoreload
%autoreload 2

from simple_cost_model import *
from ssb_qgen_class import *

# Standard-library imports come after the star imports so that the names
# below always refer to the stdlib modules, whatever the star imports export.
import pickle
import time


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
 # load tpch static workload from a file
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load workload files that were generated locally or come from a trusted source.
WORKLOAD_PATH = '../PostgreSQL/tpch_static_workload_1.pkl'

with open(WORKLOAD_PATH, 'rb') as f:
    workload_dict = pickle.load(f)

# The pickled object is a dict holding run-level 'metadata' and the 'workload' itself.
workload_metadata = workload_dict['metadata']
workload = workload_dict['workload']

print(f"Loaded static workload from file with {len(workload)} queries.")
print(f"Num rounds: {workload_metadata['num_rounds']}, Templates per round: {workload_metadata['template_sequence']}")



Loaded static workload from file with 1600 queries.
Num rounds: 100, Templates per round: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


In [70]:
def actual_cost(query_object, indexes, dbname="tpch10"):
    """Print the optimizer's estimated cost of a query without and with hypothetical indexes.

    The function opens its own database connection, drops the existing secondary
    indexes, asks for the baseline cost estimate, creates every index in
    ``indexes`` as a hypothetical index, asks for the cost estimate again and
    prints both costs, the speedup and the per-table / per-index access details.

    Args:
        query_object: Query wrapper; only its ``query_string`` attribute is read.
        indexes: Mapping whose values are the candidate index objects to create
            hypothetically (the keys are not used here).
        dbname: Name of the database to connect to. Defaults to ``"tpch10"``.

    Returns:
        None. All results are written to stdout.

    Raises:
        Whatever the underlying database helpers raise. The hypothetical indexes
        are dropped and the connection is closed before the error propagates.
    """
    candidate_indexes = list(indexes.values())
    conn = create_connection(dbname=dbname)
    try:
        # drop all existing secondary indexes
        drop_all_indexes(conn)

        cost_wo_indexes, scan_costs = get_query_cost_estimate(conn, query_object.query_string, show_plan=False)
        print(f"Estimated cost without hypothetical indexes: {cost_wo_indexes}")
        print(f"Scan costs: {scan_costs}")

        try:
            # create hypothetical indexes for candidate indexes; the returned
            # (oid, size) pairs are not needed for the report below
            bulk_create_hypothetical_indexes(conn, candidate_indexes, return_size=True)

            # get the cost of the query with the hypothetical indexes
            print()
            (
                cost_w_indexes,
                table_access_info,
                index_access_info,
                bitmap_heapscan_info,
            ) = get_query_cost_estimate_hypo_indexes(conn, query_object.query_string, show_plan=False)
            print(f"\nEstimated cost with hypothetical indexes: {cost_w_indexes}")

            # guard against a zero estimate so the report never dies on ZeroDivisionError
            speedup = cost_wo_indexes / cost_w_indexes if cost_w_indexes else float("inf")
            print(f"Speedup: {speedup:.4f}")

            print("\nTable access info:")
            for table, info in table_access_info.items():
                print(f"\tTable --> {table}, Access info --> {info}")
            print("\nIndex access info:")
            for index_name, info in index_access_info.items():
                print(f"\tIndex --> {index_name}, Index scan type --> {info['scan_type']}, Cost --> {info['total_cost']}")
            print("\nBitmap heap scan info:")
            for index_name, info in bitmap_heapscan_info.items():
                print(f"\tIndex --> {index_name}, Bitmap Heapscan Cost --> {info['total_cost']}")
        finally:
            # always remove the hypothetical indexes, even if costing failed
            bulk_drop_hypothetical_indexes(conn)
    finally:
        # always release the connection, even if any step above raised
        close_connection(conn)

In [79]:
# Pick one query from the static workload and fetch the primary-key index objects.
example_query = workload[4]
pk_indexes = tpch_pk_index_objects()

# Candidate secondary indexes extracted from the query (primary-key indexes excluded).
candidate_indexes = extract_query_indexes(
    example_query,
    max_key_columns=4,
    include_cols=True,
    dbname='tpch10',
    exclude_pk_indexes=True,
)

# Candidates followed by the primary-key indexes, keyed by index_id.
indexes = {index.index_id: index for index in candidate_indexes + pk_indexes}

In [80]:
# Show the template id and SQL text, then every predicate recorded for each table.
print(example_query.template_id)
print(example_query.query_string)

for table, predicates in example_query.predicate_dict.items():
    for predicate in predicates:
        print(predicate)

5

                SELECT
                    o_orderpriority,
                    COUNT(*) AS order_count
                FROM
                    orders
                WHERE
                    o_totalprice > 80566.87270233559
                    AND o_orderstatus = 'O'
                    AND o_orderdate BETWEEN DATE '1996-03-25' AND DATE '1996-03-31'
                    AND o_orderpriority IN ('1-URGENT', '2-HIGH')
                GROUP BY
                    o_orderpriority
                ORDER BY
                    order_count DESC;
            
{'column': 'o_totalprice', 'operator': '>', 'value': 535662.545875736, 'join': False}
{'column': 'o_orderstatus', 'operator': 'eq', 'value': 'F', 'join': False}
{'column': 'o_orderdate', 'operator': 'range', 'value': ('1995-10-20', '1995-10-21'), 'join': False}
{'column': 'o_orderpriority', 'operator': 'in', 'value': "('1-URGENT', '2-HIGH')", 'join': False}


In [73]:
# Collect per-table statistics and row estimates for every table in the TPC-H schema.
tables, pk_columns = get_tpch_schema()
table_names = list(tables)

stats = {}
estimated_rows = {}
for table_name in table_names:
    table_stats, row_estimate = get_table_stats(table_name, dbname="tpch10")
    stats[table_name] = table_stats
    estimated_rows[table_name] = row_estimate

In [74]:
# Dump the collected statistics: one line per column, then the table's row estimate.
for table_name in stats:
    print(f"Table: {table_name}")
    for col, val in stats[table_name].items():
        print(f"\t{col}: {val}")
    print(f"Estimated rows: {estimated_rows[table_name]}")
    print()

Table: customer
	c_custkey: {'schemaname': 'public', 'tablename': 'customer', 'attname': 'c_custkey', 'inherited': False, 'null_frac': 0.0, 'avg_width': 4, 'n_distinct': -1.0, 'most_common_vals': None, 'most_common_freqs': None, 'histogram_bounds': '{48,13739,28223,43114,57647,74287,87624,103132,119400,135228,150438,165980,181707,197286,213683,228465,245088,260147,275241,291486,305590,319866,335190,350842,366747,381974,396273,411192,426374,441429,456241,471719,487613,503205,518941,534294,548605,563807,578671,592825,606748,622128,636525,650015,666988,680955,696531,714006,727653,740578,755468,770369,785605,800377,813602,829219,844642,860319,874997,890631,905998,920811,935492,949887,966870,984997,1000467,1014853,1027314,1043015,1057349,1073347,1087579,1101793,1116896,1130966,1144762,1159285,1173847,1187824,1201999,1214813,1230499,1245383,1259974,1274445,1289841,1304176,1318745,1335112,1350092,1365688,1381557,1396734,1412034,1424915,1439243,1454374,1470172,1484537,1499994}', 'correlation':

In [81]:
verbose = True

# Instantiate the simple cost model against "tpch10", the same database the
# statistics, candidate indexes and what-if costs in this notebook come from.
# NOTE(review): this previously passed dbname="tpch4"; confirm that a different
# database was not intended for the model.
model = SimpleCost(stats, estimated_rows, index_scan_cost_multiplier=1.0, dbname="tpch10")

# make prediction for example query (last expression, so the result is displayed)
model.predict(example_query, indexes, verbose=verbose)

Tables and columns: {'orders': ['o_orderdate', 'o_totalprice', 'o_orderstatus', 'o_orderpriority']}
Payload: {'orders': ['o_orderpriority']}
Predicates:

orders
	{'column': 'o_totalprice', 'operator': '>', 'value': 535662.545875736, 'join': False}
	{'column': 'o_orderstatus', 'operator': 'eq', 'value': 'F', 'join': False}
	{'column': 'o_orderdate', 'operator': 'range', 'value': ('1995-10-20', '1995-10-21'), 'join': False}
	{'column': 'o_orderpriority', 'operator': 'in', 'value': "('1-URGENT', '2-HIGH')", 'join': False}
Predicates: {'orders': [0, 1, 2, 3]}
Join predicates: {}
Updated predicates: {'orders': [0, 1, 2, 3]}

Table predicate columns for orders: ['o_totalprice', 'o_orderstatus', 'o_orderdate', 'o_orderpriority']
Relevant predicate columns for orders: {'o_orderdate', 'o_totalprice', 'o_orderstatus', 'o_orderpriority'}
Payload columns for orders: ['o_orderpriority']
Checking index:  ix_orders_o_totalprice
Index scan possible!
Checking index:  ixn_orders_o_totalprice_o_or
Index 

(1.0, ['ix_orders_o_totalprice_o_orderstatus_o_orderpriority_o_orderdate'])

In [76]:
# What-if cost report for the example query over the combined candidate + primary-key index set.
actual_cost(query_object=example_query, indexes=indexes)

Estimated cost without hypothetical indexes: 1423985.21
Scan costs: {'Seq Scan': 329562.0}


Estimated cost with hypothetical indexes: 1423425.57
Speedup: 1.0004

Table access info:
	Table --> part, Access info --> {'total_cost': 70992.0, 'actual_rows': None, 'actual_startup_time': None, 'actual_total_time': None, 'shared_hit_blocks': None, 'shared_read_blocks': None, 'local_hit_blocks': None, 'local_read_blocks': None}
	Table --> partsupp, Access info --> {'total_cost': 255056.0, 'actual_rows': None, 'actual_startup_time': None, 'actual_total_time': None, 'shared_hit_blocks': None, 'shared_read_blocks': None, 'local_hit_blocks': None, 'local_read_blocks': None}

Index access info:
	Index --> <13548>btree_supplier_s_acctbal_s_name_s_suppkey, Index scan type --> Index Only Scan, Cost --> 2954.36

Bitmap heap scan info:


#### Compare predicted and actual access paths for all query templates

In [82]:
# Number of leading workload entries to evaluate.
# NOTE(review): presumably one query per template from the first round (16 templates) - confirm.
NUM_TEMPLATES = 16

simple_model_time = 0.0
whatif_time = 0.0

for query_object in workload[:NUM_TEMPLATES]:
    print(query_object)
    candidate_indexes = extract_query_indexes(query_object, max_key_columns=4, include_cols=True, max_include_columns=3, exclude_pk_indexes=True, dbname='tpch10')
    indexes = {index.index_id: index for index in candidate_indexes + pk_indexes}

    # Predict the cost with the already-instantiated simple model.
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # intervals are not distorted by system clock adjustments.
    t1 = time.perf_counter()
    cost, used = model.predict(query_object, indexes, verbose=False)
    t2 = time.perf_counter()
    simple_model_time += t2 - t1
    print(f"\nSimple model cost: {cost}")
    print(f"Predicted indexes used in the query plan: {used}")
    print("\n")

    # Time the full what-if call, including everything actual_cost does internally.
    t1 = time.perf_counter()
    actual_cost(query_object, indexes)
    t2 = time.perf_counter()
    whatif_time += t2 - t1
    print("\n")

print(f"Simple model time: {simple_model_time}")
print(f"What-if time: {whatif_time}")


template id: 1, query: 
                SELECT
                    c.c_custkey,
                    o.o_orderdate,
                    SUM(l.l_extendedprice * (1 - l.l_discount)) AS total_spent
                FROM
                    customer c
                JOIN
                    orders o ON c.c_custkey = o.o_custkey
                JOIN
                    lineitem l ON o.o_orderkey = l.l_orderkey
                JOIN
                    nation n ON c.c_nationkey = n.n_nationkey
                JOIN
                    region r ON n.n_regionkey = r.r_regionkey
                WHERE
                    r.r_name = 'AMERICA'
                    AND l.l_shipdate BETWEEN DATE '1997-02-01' AND DATE '1997-03-02'
                GROUP BY
                    c.c_custkey,
                    o.o_orderdate
                ORDER BY
                    total_spent DESC;
            , payload: {'customer': ['c_custkey'], 'orders': ['o_orderdate'], 'lineitem': ['l_extendedprice', 'l_discount']