#### What-if Query Execution Cost estimation for Hypothetical indexes

In [11]:
%load_ext autoreload
%autoreload 2

from ssb_qgen_class import *
from pg_utils import *

import time


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Generate an SSB query

In [2]:
# create a query generator object
# QGEN is not defined in this notebook; it arrives via one of the star-imports
# above (presumably ssb_qgen_class — TODO confirm). The later cells reuse this
# single `qg` instance through qg.generate_query(<template id>).
qg = QGEN()

In [3]:
# Generate one query from template 9. Printing the query object shows the
# template id and SQL text together with the per-table column metadata it
# carries (payload, predicates, order by, group by), as seen in the output below.
query = qg.generate_query(9)
print(query)

template id: 9, query: 
                SELECT c_city, s_city, d_year, SUM(lo_revenue) AS revenue
                FROM customer, lineorder, supplier, dwdate
                WHERE lo_custkey = c_custkey
                AND lo_suppkey = s_suppkey
                AND lo_orderdate = d_datekey
                AND (c_city = 'IRAQ     0' OR c_city = 'ROMANIA  1')
                AND (s_city = 'IRAQ     0' OR s_city = 'ROMANIA  1')
                AND d_year >= 1995 AND d_year <= 1997
                GROUP BY c_city, s_city, d_year
                ORDER BY d_year ASC, revenue DESC;
            , payload: {'lineorder': ['lo_revenue'], 'dwdate': ['d_year'], 'customer': ['c_city'], 'supplier': ['s_city']}, predicates: {'lineorder': ['lo_custkey', 'lo_suppkey', 'lo_orderdate'], 'dwdate': ['d_year', 'd_datekey'], 'customer': ['c_custkey', 'c_city'], 'supplier': ['s_suppkey', 's_city']}, order by: {'lineorder': ['lo_revenue'], 'dwdate': ['d_year']}, group by: {'customer': ['c_city'], 'supplier': ['s

#### Generate candidate indexes (with include columns) for this query

In [4]:
# Derive candidate indexes from the query's column metadata. With
# include_cols=True the result also contains variants that carry include
# columns (the "IXN_..." entries in the output below) next to the
# key-column-only "IX_..." entries.
candidate_indexes = extract_query_indexes(query, include_cols=True)

# Print one candidate per line (name, key columns, include columns).
for index in candidate_indexes:
    print(index)

print(f"Total number of candidate indexes: {len(candidate_indexes)}")

Index name: IX_lineorder_lo_custkey, Key cols: ('lo_custkey',), Include cols: ()
Index name: IXN_lineorder_lo_custkey_lo_r, Key cols: ('lo_custkey',), Include cols: ('lo_revenue',)
Index name: IX_lineorder_lo_suppkey, Key cols: ('lo_suppkey',), Include cols: ()
Index name: IXN_lineorder_lo_suppkey_lo_r, Key cols: ('lo_suppkey',), Include cols: ('lo_revenue',)
Index name: IX_lineorder_lo_orderdate, Key cols: ('lo_orderdate',), Include cols: ()
Index name: IXN_lineorder_lo_orderdate_lo_r, Key cols: ('lo_orderdate',), Include cols: ('lo_revenue',)
Index name: IX_lineorder_lo_custkey_lo_suppkey, Key cols: ('lo_custkey', 'lo_suppkey'), Include cols: ()
Index name: IXN_lineorder_lo_custkey_lo_suppkey_lo_r, Key cols: ('lo_custkey', 'lo_suppkey'), Include cols: ('lo_revenue',)
Index name: IX_lineorder_lo_custkey_lo_orderdate, Key cols: ('lo_custkey', 'lo_orderdate'), Include cols: ()
Index name: IXN_lineorder_lo_custkey_lo_orderdate_lo_r, Key cols: ('lo_custkey', 'lo_orderdate'), Include cols:

#### Estimate cost with and without hypothetical indexes to get the estimated speedup

In [8]:
# What-if experiment for the single `query` generated above:
#   1. estimate the plan cost with no hypothetical indexes,
#   2. create one hypothetical index per candidate,
#   3. re-estimate the cost and report which hypothetical indexes the plan uses.
# The try/finally blocks guarantee that the hypothetical indexes are dropped and
# the connection is closed even if one of the estimation steps raises.
conn = create_connection()
try:
    # Baseline cost estimate, before any hypothetical index exists.
    cost_wo_indexes = get_query_cost_estimate(conn, query.query_string)
    print(f"Estimated cost without hypothetical indexes: {cost_wo_indexes}")

    # create hypothetical indexes for candidate indexes
    indexes = bulk_create_hypothetical_indexes(conn, candidate_indexes)
    try:
        # Map hypothetical index oid -> (candidate index, index size) so the
        # oids reported by the planner can be translated back to candidates.
        # NOTE(review): assumes `indexes` holds one (oid, size) pair per
        # candidate, in the same order as `candidate_indexes` — confirm against
        # bulk_create_hypothetical_indexes.
        hypothetical_indexes = {
            index_oid: (candidate, index_size_mb)
            for candidate, (index_oid, index_size_mb) in zip(candidate_indexes, indexes)
        }

        # get the cost of the query with the hypothetical indexes
        print()
        cost_w_indexes, indexes_used = get_query_cost_estimate_hypo_indexes(
            conn, query.query_string, show_plan=False
        )
        print(f"\nEstimated cost with hypothetical indexes: {cost_w_indexes}")
        print(f"Speedup: {cost_wo_indexes/cost_w_indexes:.4f}")

        print("\nIndexes used in the query plan:")
        for oid, scan_type in indexes_used:
            print(hypothetical_indexes[oid][0], ", Scan type: ", scan_type)
    finally:
        # Always remove the hypothetical indexes created above.
        bulk_drop_hypothetical_indexes(conn)
finally:
    close_connection(conn)

Estimated cost without hypothetical indexes: 1308572.55


Estimated cost with hypothetical indexes: 25424.82
Speedup: 51.4683

Indexes used in the query plan:
Index name: IX_dwdate_d_year_d_datekey, Key cols: ('d_year', 'd_datekey'), Include cols: () , Scan type:  Index Only Scan
Index name: IX_supplier_s_suppkey_s_city, Key cols: ('s_suppkey', 's_city'), Include cols: () , Scan type:  Index Only Scan
Index name: IXN_lineorder_lo_suppkey_lo_orderdate_lo_custkey_lo_r, Key cols: ('lo_suppkey', 'lo_orderdate', 'lo_custkey'), Include cols: ('lo_revenue',) , Scan type:  Index Only Scan


In [None]:
# Sanity check: list whatever hypothetical indexes are visible on a fresh
# connection. The connection is closed in `finally` so it is not leaked if the
# listing call raises.
# NOTE(review): if pg_utils is backed by HypoPG, hypothetical indexes are local
# to the session that created them, so a new connection would report none
# regardless of whether the previous cell cleaned up — confirm.
conn = create_connection()
try:
    list_hypothetical_indexes(conn)
finally:
    close_connection(conn)

No hypothetical indexes found.


#### Generate a query from each of the 13 templates, estimate the speedup of each query with hypothetical candidate indexes, and determine which indexes are used in the query plan

In [13]:
# Repeat the what-if experiment for every query template (ids 1..13): generate
# a query, derive its candidate indexes, then compare the estimated cost with
# and without hypothetical indexes and report which indexes the plan uses.
# Each iteration uses its own connection; try/finally guarantees that the
# hypothetical indexes are dropped and the connection is closed even when an
# iteration fails, so one bad template cannot leak state into the next.
for t in range(1, 14):
    print("\n------------------------------------------------------------------------")
    # generate query
    query = qg.generate_query(t)
    print(query)
    # generate candidate indexes
    candidate_indexes = extract_query_indexes(query, include_cols=True)
    print(f"\nTotal number of candidate indexes generated: {len(candidate_indexes)}")

    # measure time taken to obtain estimated speedup with hypothetical indexes
    # (includes opening the connection, excludes closing it)
    start_time = time.perf_counter()

    conn = create_connection()
    try:
        cost_wo_indexes = get_query_cost_estimate(conn, query.query_string)
        print(f"\nEstimated cost without hypothetical indexes: {cost_wo_indexes}")

        # create hypothetical indexes for candidate indexes
        indexes = bulk_create_hypothetical_indexes(conn, candidate_indexes)
        try:
            # Map hypothetical index oid -> (candidate index, index size).
            # NOTE(review): assumes `indexes` holds one (oid, size) pair per
            # candidate, in the same order as `candidate_indexes` — confirm
            # against bulk_create_hypothetical_indexes.
            hypothetical_indexes = {
                index_oid: (candidate, index_size_mb)
                for candidate, (index_oid, index_size_mb) in zip(candidate_indexes, indexes)
            }

            # get the cost of the query with the hypothetical indexes
            print()
            cost_w_indexes, indexes_used = get_query_cost_estimate_hypo_indexes(
                conn, query.query_string, show_plan=False
            )
            print(f"Estimated cost with hypothetical indexes: {cost_w_indexes}")
            print(f"Speedup: {cost_wo_indexes/cost_w_indexes:.4f}")

            print("\nIndexes used in the query plan:")
            for oid, scan_type in indexes_used:
                print(hypothetical_indexes[oid][0], ", Scan type: ", scan_type)
        finally:
            # Always remove the hypothetical indexes created for this template.
            bulk_drop_hypothetical_indexes(conn)

        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"\nTotal time to obtain estimated speedup: {execution_time:.4f} seconds")
    finally:
        close_connection(conn)


------------------------------------------------------------------------
template id: 1, query: 
                SELECT SUM(lo_extendedprice * lo_discount) AS revenue
                FROM lineorder, dwdate
                WHERE lo_orderdate = d_datekey
                AND d_year = 1998
                AND lo_discount BETWEEN 0 AND 2 
                AND lo_quantity < 50;
            , payload: {'lineorder': ['lo_extendedprice', 'lo_discount']}, predicates: {'lineorder': ['lo_orderdate', 'lo_discount', 'lo_quantity'], 'dwdate': ['d_datekey', 'd_year']}, order by: {}, group by: {}

Total number of candidate indexes generated: 53

Estimated cost without hypothetical indexes: 1444418.9

Estimated cost with hypothetical indexes: 202199.83
Speedup: 7.1435

Indexes used in the query plan:
Index name: IXN_lineorder_lo_orderdate_lo_discount_lo_quantity_lo_e_lo_d, Key cols: ('lo_orderdate', 'lo_discount', 'lo_quantity'), Include cols: ('lo_extendedprice',) , Scan type:  Index Only Scan
Index na