#### Test simple cost model

In [11]:
# auto reload all modules
%load_ext autoreload
%autoreload 2

from simple_cost_model import *
from ssb_qgen_class import *

import time


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Build the query generator and instantiate one query from template 1.
qgen = QGEN()
example_query = qgen.generate_query(1)

# Show the query object's string form, then its per-table predicate dictionary.
print(example_query, example_query.predicate_dict, sep="\n")

template id: 1, query: 
                SELECT SUM(lo_extendedprice * lo_discount) AS revenue
                FROM lineorder, dwdate
                WHERE lo_orderdate = d_datekey
                AND d_year = 1998
                AND lo_discount BETWEEN 3 AND 5 
                AND lo_quantity < 25;
            , payload: {'lineorder': ['lo_extendedprice', 'lo_discount']}, predicates: {'lineorder': ['lo_orderdate', 'lo_discount', 'lo_quantity'], 'dwdate': ['d_datekey', 'd_year']}, order by: {}, group by: {}
{'lineorder': [{'column': 'lo_orderdate', 'operator': 'eq', 'value': 'd_datekey', 'join': True}, {'column': 'lo_discount', 'operator': 'range', 'value': (3, 5), 'join': False}, {'column': 'lo_quantity', 'operator': '<', 'value': 25, 'join': False}], 'dwdate': [{'column': 'd_year', 'operator': 'eq', 'value': 1998, 'join': False}]}


In [13]:
def actual_cost(query_object, indexes):
    """Print the planner's cost estimates for a query with and without hypothetical indexes.

    The function opens a connection, drops the existing indexes via
    ``drop_all_indexes``, reports the baseline estimate, creates one
    hypothetical index per candidate, and reports the new estimate together
    with the table / index / bitmap-heap-scan access information returned by
    ``get_query_cost_estimate_hypo_indexes``.

    Cleanup is guaranteed: the hypothetical indexes are dropped and the
    connection is closed even if one of the helper calls raises.

    Args:
        query_object: Object exposing a ``query_string`` attribute holding the
            SQL text to be costed.
        indexes: Mapping whose values are the candidate index objects passed
            to ``bulk_create_hypothetical_indexes``.

    Returns:
        None. All results are printed.
    """
    candidate_indexes = list(indexes.values())
    conn = create_connection()
    try:
        # drop all existing secondary indexes
        drop_all_indexes(conn)

        cost_wo_indexes, scan_costs = get_query_cost_estimate(
            conn, query_object.query_string, show_plan=False
        )
        print(f"Estimated cost without hypothetical indexes: {cost_wo_indexes}")
        print(f"Scan costs: {scan_costs}")

        try:
            # Create hypothetical indexes for the candidates. The returned
            # (oid, size) pairs are not needed for the report below.
            bulk_create_hypothetical_indexes(conn, candidate_indexes, return_size=True)

            # get the cost of the query with the hypothetical indexes
            print()
            (
                cost_w_indexes,
                table_access_info,
                index_access_info,
                bitmap_heapscan_info,
            ) = get_query_cost_estimate_hypo_indexes(
                conn, query_object.query_string, show_plan=False
            )
            print(f"\nEstimated cost with hypothetical indexes: {cost_w_indexes}")
            if cost_w_indexes:
                print(f"Speedup: {cost_wo_indexes/cost_w_indexes:.4f}")
            else:
                # Avoid a ZeroDivisionError when the estimate is 0.
                print("Speedup: undefined (estimated cost with hypothetical indexes is 0)")

            print("\nTable access info:")
            for table, info in table_access_info.items():
                print(f"\tTable --> {table}, Access info --> {info}")
            print("\nIndex access info:")
            for index_name, info in index_access_info.items():
                print(f"\tIndex --> {index_name}, Index scan type --> {info['scan_type']}, Cost --> {info['total_cost']}")
            print("\nBitmap heap scan info:")
            for index_name, info in bitmap_heapscan_info.items():
                print(f"\tIndex --> {index_name}, Bitmap Heapscan Cost --> {info['total_cost']}")
        finally:
            # Always remove the hypothetical indexes, even after a failure,
            # so they cannot influence later estimates.
            bulk_drop_hypothetical_indexes(conn)
    finally:
        # Always release the connection.
        close_connection(conn)

In [14]:
# Derive the candidate indexes for the example query.
candidate_indexes = extract_query_indexes(
    example_query,
    max_key_columns=3,
    include_cols=True,
    exclude_pk_indexes=True,
)
# Key every candidate by its index_id.
indexes = dict((candidate.index_id, candidate) for candidate in candidate_indexes)

In [15]:
verbose = True

# Gather the statistics and estimated row counts of every table in the SSB database.
table_names = ["customer", "dwdate", "lineorder", "part", "supplier"]
stats, estimated_rows = {}, {}
for table_name in table_names:
    stats[table_name], estimated_rows[table_name] = get_table_stats(table_name)

# Build the simple cost model from the collected statistics.
model = SimpleCost(stats, estimated_rows, index_scan_cost_multiplier=1.5)

# Predict for the example query; as the cell's last expression, the result is displayed.
model.predict(example_query, indexes, verbose=verbose)

Tables and columns: {'lineorder': ['lo_orderdate', 'lo_discount', 'lo_extendedprice', 'lo_quantity'], 'dwdate': ['d_year', 'd_datekey']}
Payload: {'lineorder': ['lo_extendedprice', 'lo_discount']}
Predicates:

lineorder
	{'column': 'lo_orderdate', 'operator': 'eq', 'value': 'd_datekey', 'join': True}
	{'column': 'lo_discount', 'operator': 'range', 'value': (3, 5), 'join': False}
	{'column': 'lo_quantity', 'operator': '<', 'value': 25, 'join': False}

dwdate
	{'column': 'd_year', 'operator': 'eq', 'value': 1998, 'join': False}
Predicates: {'lineorder': [0, 1, 2], 'dwdate': [3]}
Join predicates: {'dwdate': [4]}
Updated predicates: {'lineorder': [0, 1, 2], 'dwdate': [3, 4]}

Table predicate columns for lineorder: ['lo_orderdate', 'lo_discount', 'lo_quantity']
Relevant predicate columns for lineorder: {'lo_orderdate', 'lo_discount', 'lo_quantity'}
Payload columns for lineorder: ['lo_extendedprice', 'lo_discount']
Checking index:  ix_lineorder_lo_orderdate
Index scan possible!
Checking inde

(97455.75,
 ['ixn_lineorder_lo_discount_lo_orderdate_lo_quantity_lo_e',
  'ix_dwdate_d_year_d_datekey'])

In [16]:
actual_cost(example_query, indexes)

Estimated cost without hypothetical indexes: 1972556.69
Scan costs: {'Seq Scan': 1946433.73}


Estimated cost with hypothetical indexes: 119311.26
Speedup: 16.5329

Table access info:

Index access info:
	Index --> <13572>btree_lineorder_lo_orderdate_lo_discount_lo_quantity_lo_, Index scan type --> Index Only Scan, Cost --> 279.96
	Index --> <13585>btree_dwdate_d_year_d_datekey, Index scan type --> Index Only Scan, Cost --> 14.4

Bitmap heap scan info:


#### Compare predicted and actual access paths for all query templates

In [18]:
# Accumulated elapsed seconds spent in each estimator across all templates.
simple_model_time = 0.0
whatif_time = 0.0

# Walk query templates 1 through 14 and compare the simple model's prediction
# with the estimate obtained from hypothetical indexes.
for i in range(1, 15):
    query_object = qgen.generate_query(i)
    print(query_object)
    # NOTE: this rebinds the notebook-level `candidate_indexes` and `indexes`,
    # so after the loop they describe the last template, not `example_query`.
    candidate_indexes = extract_query_indexes(query_object, max_key_columns=3, include_cols=True, exclude_pk_indexes=True)
    indexes = {index.index_id: index for index in candidate_indexes}

    # time.perf_counter() is a monotonic, high-resolution clock meant for
    # measuring short intervals; time.time() can jump with clock adjustments.
    t1 = time.perf_counter()
    # predict the cost of the query with the already-instantiated model
    cost, used = model.predict(query_object, indexes, verbose=False)
    t2 = time.perf_counter()
    simple_model_time += t2 - t1
    print(f"\nSimple model cost: {cost}")
    print(f"Predicted indexes used in the query plan: {used}")
    print("\n")

    # The what-if timing covers the whole of actual_cost, including its printing.
    t1 = time.perf_counter()
    actual_cost(query_object, indexes)
    t2 = time.perf_counter()
    whatif_time += t2 - t1
    print("\n")

print(f"Simple model time: {simple_model_time}")
print(f"What-if time: {whatif_time}")


template id: 1, query: 
                SELECT SUM(lo_extendedprice * lo_discount) AS revenue
                FROM lineorder, dwdate
                WHERE lo_orderdate = d_datekey
                AND d_year = 1997
                AND lo_discount BETWEEN 5 AND 7 
                AND lo_quantity < 25;
            , payload: {'lineorder': ['lo_extendedprice', 'lo_discount']}, predicates: {'lineorder': ['lo_orderdate', 'lo_discount', 'lo_quantity'], 'dwdate': ['d_datekey', 'd_year']}, order by: {}, group by: {}

Simple model cost: 97324.65
Predicted indexes used in the query plan: ['ixn_lineorder_lo_discount_lo_orderdate_lo_quantity_lo_e', 'ix_dwdate_d_year_d_datekey']


Estimated cost without hypothetical indexes: 1972536.97
Scan costs: {'Seq Scan': 1946433.73}


Estimated cost with hypothetical indexes: 119540.67
Speedup: 16.5010

Table access info:

Index access info:
	Index --> <13572>btree_lineorder_lo_orderdate_lo_discount_lo_quantity_lo_, Index scan type --> Index Only Scan, Cost --