#### What-if Query Execution Cost estimation for Hypothetical indexes

In [32]:
%load_ext autoreload
%autoreload 2

from ssb_qgen_class import *
from pg_utils import *

import time


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Generate an SSB query

In [33]:
# Create the SSB query generator; later cells call qg.generate_query(<template id>)
# on this object to produce queries.
qg = QGEN()

In [3]:
# Generate one query from template 14 and print it; the printed form shows the
# SQL text together with the payload / predicate / order-by / group-by columns.
query = qg.generate_query(14)
print(query)

template id: 14, query: 
                SELECT lo_linenumber, lo_quantity, lo_orderdate  
                FROM lineorder
                WHERE lo_linenumber >= 4 AND lo_linenumber <= 5
                AND lo_quantity = 26;
            , payload: {'lineorder': ['lo_linenumber', 'lo_quantity', 'lo_orderdate']}, predicates: {'lineorder': ['lo_linenumber', 'lo_quantity']}, order by: {}, group by: {}


#### Generate candidate indexes (with include columns) for this query

In [4]:
# Derive the candidate indexes for the generated query. include_cols=True
# presumably asks for variants with include columns as well (the printed list
# shows both empty and non-empty "Include cols").
candidate_indexes = extract_query_indexes(query, include_cols=True)

# Print every candidate, one per line.
for index in candidate_indexes:
    print(index)

print(f"Total number of candidate indexes: {len(candidate_indexes)}")

Index name: IX_lineorder_lo_linenumber, Key cols: ('lo_linenumber',), Include cols: ()
Index name: IXN_lineorder_lo_linenumber_lo_q, Key cols: ('lo_linenumber',), Include cols: ('lo_quantity',)
Index name: IXN_lineorder_lo_linenumber_lo_o, Key cols: ('lo_linenumber',), Include cols: ('lo_orderdate',)
Index name: IXN_lineorder_lo_linenumber_lo_q_lo_o, Key cols: ('lo_linenumber',), Include cols: ('lo_quantity', 'lo_orderdate')
Index name: IX_lineorder_lo_quantity, Key cols: ('lo_quantity',), Include cols: ()
Index name: IXN_lineorder_lo_quantity_lo_l, Key cols: ('lo_quantity',), Include cols: ('lo_linenumber',)
Index name: IXN_lineorder_lo_quantity_lo_o, Key cols: ('lo_quantity',), Include cols: ('lo_orderdate',)
Index name: IXN_lineorder_lo_quantity_lo_l_lo_o, Key cols: ('lo_quantity',), Include cols: ('lo_linenumber', 'lo_orderdate')
Index name: IX_lineorder_lo_linenumber_lo_quantity, Key cols: ('lo_linenumber', 'lo_quantity'), Include cols: ()
Index name: IXN_lineorder_lo_linenumber_l

#### Estimate cost with and without hypothetical indexes to get estimated speedup

In [8]:
# Compare the planner's estimated cost for `query` with no secondary indexes
# against its estimated cost when every candidate exists as a hypothetical index.
conn = create_connection()
try:
    # drop all existing secondary indexes so the baseline estimate is index-free
    drop_all_indexes(conn)

    cost_wo_indexes, scan_costs = get_query_cost_estimate(conn, query.query_string, show_plan=False)
    print(f"Estimated cost without hypothetical indexes: {cost_wo_indexes}")
    print(f"Scan costs: {scan_costs}")

    # Create one hypothetical index per candidate. Each returned entry is an
    # (oid, size_mb) pair, assumed to be in the same order as candidate_indexes,
    # so build a lookup: oid -> (candidate index, size in MB).
    indexes = bulk_create_hypothetical_indexes(conn, candidate_indexes, return_size=True)
    hypothetical_indexes = {
        created[0]: (candidate, created[1])
        for candidate, created in zip(candidate_indexes, indexes)
    }

    # get the cost of the query with the hypothetical indexes
    print()
    cost_w_indexes, indexes_used = get_query_cost_estimate_hypo_indexes(conn, query.query_string, show_plan=False)
    print(f"\nEstimated cost with hypothetical indexes: {cost_w_indexes}")
    print(f"Speedup: {cost_wo_indexes/cost_w_indexes:.4f}")

    # Report which hypothetical indexes the plan actually picked.
    print("\nIndexes used in the query plan:")
    for oid, scan_type, scan_cost in indexes_used:
        print(hypothetical_indexes[oid][0], ", Scan type: ", scan_type, ", Scan cost: ", scan_cost)

    bulk_drop_hypothetical_indexes(conn)
finally:
    # Always release the connection, even if one of the steps above raised.
    close_connection(conn)

Estimated cost without hypothetical indexes: 1450573.39
Scan costs: {'Seq Scan': 1420780.49}


Estimated cost with hypothetical indexes: 16030.47
Speedup: 90.4885

Indexes used in the query plan:
Index name: IXN_lineorder_lo_quantity_lo_linenumber_lo_o, Key cols: ('lo_quantity', 'lo_linenumber'), Include cols: ('lo_orderdate',) , Scan type:  Index Only Scan , Scan cost:  16030.47


In [None]:
# List the hypothetical indexes visible through a newly opened connection.
conn = create_connection()
try:
    list_hypothetical_indexes(conn)
finally:
    # Close the connection even if the listing fails.
    close_connection(conn)

No hypothetical indexes found.


#### Generate a query from each of the 13 templates and estimate speedup for each query with hypothetical candidate indexes, also determine which indexes get used in query plan 

In [11]:
# For each template: generate a query, build its candidate indexes, and time how
# long it takes to obtain the estimated speedup from hypothetical indexes.
# NOTE(review): range(1,2) runs template 1 only, while the section heading
# talks about every template -- widen the range if that is the intent.
for t in range(1,2):
    print("\n------------------------------------------------------------------------")
    # generate query
    query = qg.generate_query(t)
    print(query)
    # generate candidate indexes
    candidate_indexes = extract_query_indexes(query, include_cols=True)
    print(f"\nTotal number of candidate indexes generated: {len(candidate_indexes)}")

    # measure time taken to obtain estimated speedup with hypothetical indexes
    start_time = time.perf_counter()

    conn = create_connection()
    try:
        cost_wo_indexes, scan_costs = get_query_cost_estimate(conn, query.query_string)
        print(f"Estimated cost without hypothetical indexes: {cost_wo_indexes}")
        print(f"Scan costs: {scan_costs}")

        # Create one hypothetical index per candidate. return_size=True is passed
        # (as in the other cells of this notebook) because the index size is
        # printed below. Entries are (oid, size_mb) pairs, assumed to be in the
        # same order as candidate_indexes.
        indexes = bulk_create_hypothetical_indexes(conn, candidate_indexes, return_size=True)
        hypothetical_indexes = {
            created[0]: (candidate, created[1])
            for candidate, created in zip(candidate_indexes, indexes)
        }

        # get the cost of the query with the hypothetical indexes
        print()
        cost_w_indexes, indexes_used = get_query_cost_estimate_hypo_indexes(conn, query.query_string, show_plan=False)
        print(f"Estimated cost with hypothetical indexes: {cost_w_indexes}")
        print(f"Speedup: {cost_wo_indexes/cost_w_indexes:.4f}")

        print("\nIndexes used in the query plan:")
        for oid, scan_type, scan_cost in indexes_used:
            print(hypothetical_indexes[oid][0], ", Index Size (Mb): ", hypothetical_indexes[oid][1],", Scan type: ", scan_type, ", Scan cost: ", scan_cost)

        bulk_drop_hypothetical_indexes(conn)

        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"\nTotal time to obtain estimated speedup: {execution_time:.4f} seconds")
    finally:
        # Release this iteration's connection even if a step above raised.
        close_connection(conn)


------------------------------------------------------------------------
template id: 1, query: 
                SELECT SUM(lo_extendedprice * lo_discount) AS revenue
                FROM lineorder, dwdate
                WHERE lo_orderdate = d_datekey
                AND d_year = 1997
                AND lo_discount BETWEEN 0 AND 2 
                AND lo_quantity < 50;
            , payload: {'lineorder': ['lo_extendedprice', 'lo_discount']}, predicates: {'lineorder': ['lo_orderdate', 'lo_discount', 'lo_quantity'], 'dwdate': ['d_datekey', 'd_year']}, order by: {}, group by: {}

Total number of candidate indexes generated: 53
Estimated cost without hypothetical indexes: 1444432.12
Scan costs: {'Seq Scan': 1420859.44}

Estimated cost with hypothetical indexes: 202749.31
Speedup: 7.1242

Indexes used in the query plan:
Index name: IXN_lineorder_lo_orderdate_lo_discount_lo_quantity_lo_e_lo_d, Key cols: ('lo_orderdate', 'lo_discount', 'lo_quantity'), Include cols: ('lo_extendedprice',) ,

#### Now for each query, materialize the hypothetical indexes that get used in query plan, and compare hypothetical speedup with actual speedup

In [9]:
# Start from a clean slate: drop all existing secondary indexes so the
# "without indexes" measurements are not influenced by earlier runs.
conn = create_connection()
try:
    drop_all_indexes(conn)
finally:
    close_connection(conn)

# For each template: estimate the speedup with hypothetical indexes, then
# materialize the indexes the plan used and measure the actual speedup.
for t in range(1,15):
    print("\n------------------------------------------------------------------------")
    # generate query
    query = qg.generate_query(t)
    print(query)
    # generate candidate indexes
    candidate_indexes = extract_query_indexes(query, include_cols=True)
    print(f"\nTotal number of candidate indexes generated: {len(candidate_indexes)}")

    # measure time taken to obtain estimated speedup with hypothetical indexes
    start_time = time.perf_counter()

    conn = create_connection()
    try:
        cost_wo_indexes, scan_costs = get_query_cost_estimate(conn, query.query_string)
        print(f"Estimated cost without hypothetical indexes: {cost_wo_indexes}")
        print(f"Scan costs: {scan_costs}")

        # Create one hypothetical index per candidate. Entries are (oid, size_mb)
        # pairs, assumed to be in the same order as candidate_indexes; build a
        # lookup: oid -> (candidate index, size in MB).
        indexes = bulk_create_hypothetical_indexes(conn, candidate_indexes, return_size=True)
        hypothetical_indexes = {
            created[0]: (candidate, created[1])
            for candidate, created in zip(candidate_indexes, indexes)
        }

        # get the cost of the query with the hypothetical indexes
        print()
        cost_w_indexes, indexes_used = get_query_cost_estimate_hypo_indexes(conn, query.query_string, show_plan=False)
        print(f"Estimated cost with hypothetical indexes: {cost_w_indexes}")
        print(f"Estimated Speedup: {cost_wo_indexes/cost_w_indexes:.4f}")

        print("\nIndexes used in the query plan:")
        # indexes_used may contain duplicates, so collect by OID first
        used_by_oid = {}
        for oid, scan_type, scan_cost in indexes_used:
            print(hypothetical_indexes[oid][0], ", Index Size (Mb): ", hypothetical_indexes[oid][1],", Scan type: ", scan_type, ", Scan cost: ", scan_cost)
            used_by_oid[oid] = hypothetical_indexes[oid][0]
        indexes_to_materialize = list(used_by_oid.values())

        bulk_drop_hypothetical_indexes(conn)

        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"\nTotal time to obtain estimated speedup: {execution_time:.4f} seconds")

        # get query execution time without indexes; the result is divided by
        # 1000 to print seconds, i.e. it is treated as milliseconds
        query_execution_time_wo_indexes, _ = execute_query(conn, query.query_string)
        print(f"\nTotal time to execute query without indexes: {query_execution_time_wo_indexes/1000:.4f} seconds")

        # materialize the indexes
        start_time = time.perf_counter()
        print(f"\nMaterializing {len(indexes_to_materialize)} indexes...")
        try:
            bulk_create_indexes(conn, indexes_to_materialize)
            end_time = time.perf_counter()
            execution_time = end_time - start_time
            print(f"\nTotal time to materialize indexes: {execution_time:.4f} seconds")

            # get query execution time with indexes
            query_execution_time_w_indexes, _ = execute_query(conn, query.query_string)
            print(f"\nTotal time to execute query with indexes: {query_execution_time_w_indexes/1000:.4f} seconds")
            print(f"Actual Speedup with materialized indexes: {query_execution_time_wo_indexes/query_execution_time_w_indexes:.4f}\n")
        finally:
            # Drop the indexes even when the steps above fail, so leftovers
            # cannot skew the next template's "without indexes" baseline.
            start_time = time.perf_counter()
            bulk_drop_indexes(conn, indexes_to_materialize)
            end_time = time.perf_counter()
            execution_time = end_time - start_time
            print(f"\nTotal time to drop indexes: {execution_time:.4f} seconds")
    finally:
        # Release this iteration's connection no matter what happened above.
        close_connection(conn)


------------------------------------------------------------------------
template id: 1, query: 
                SELECT SUM(lo_extendedprice * lo_discount) AS revenue
                FROM lineorder, dwdate
                WHERE lo_orderdate = d_datekey
                AND d_year = 1993
                AND lo_discount BETWEEN 2 AND 4 
                AND lo_quantity < 12;
            , payload: {'lineorder': ['lo_extendedprice', 'lo_discount']}, predicates: {'lineorder': ['lo_orderdate', 'lo_discount', 'lo_quantity'], 'dwdate': ['d_datekey', 'd_year']}, order by: {}, group by: {}

Total number of candidate indexes generated: 42
Estimated cost without hypothetical indexes: 1426949.8
Scan costs: {'Seq Scan': 1420859.44}

Estimated cost with hypothetical indexes: 128779.18
Estimated Speedup: 11.0806

Indexes used in the query plan:
Index name: IXN_lineorder_lo_orderdate_lo_quantity_lo_discount_lo_e, Key cols: ('lo_orderdate', 'lo_quantity', 'lo_discount'), Include cols: ('lo_extendedprice

In [17]:
# Report the overall database size and the size of every table (printed in MB).
conn = create_connection()
try:
    # get the size of the database
    database_size = get_database_size(conn)
    print(f"\nDatabase size: {database_size} MB")

    # get sizes of all tables
    tables_sizes = get_all_table_sizes(conn)
    print(f"\nTable sizes in MB: {tables_sizes}")
finally:
    # Close the connection even if one of the size queries fails.
    close_connection(conn)



Database size: 9140 MB

Table sizes in MB: {'lineorder': 8969, 'part': 111, 'customer': 46, 'supplier': 3, 'dwdate': 0}


#### Test Hypothetical Hiding and Unhiding of Indexes

In [34]:
# Generate a test query from template 14; its candidate indexes are used by the
# hide/unhide experiment in the following cells.
query = qg.generate_query(14)
print(query, "\n")

# Generate the candidate indexes for that query and print them one per line.
candidate_indexes = extract_query_indexes(query, include_cols=True)
for index in candidate_indexes:
    print(index)

print(f"Total number of candidate indexes: {len(candidate_indexes)}")

template id: 14, query: 
                SELECT lo_linenumber, lo_quantity, lo_orderdate  
                FROM lineorder
                WHERE lo_linenumber >= 3 AND lo_linenumber <= 4
                AND lo_quantity = 12;
            , payload: {'lineorder': ['lo_linenumber', 'lo_quantity', 'lo_orderdate']}, predicates: {'lineorder': ['lo_linenumber', 'lo_quantity']}, order by: {}, group by: {} 

Index name: ix_lineorder_lo_linenumber, Key cols: ('lo_linenumber',), Include cols: (), Current OID: None
Index name: ixn_lineorder_lo_linenumber_lo_o, Key cols: ('lo_linenumber',), Include cols: ('lo_orderdate',), Current OID: None
Index name: ixn_lineorder_lo_linenumber_lo_q, Key cols: ('lo_linenumber',), Include cols: ('lo_quantity',), Current OID: None
Index name: ixn_lineorder_lo_linenumber_lo_o_lo_q, Key cols: ('lo_linenumber',), Include cols: ('lo_orderdate', 'lo_quantity'), Current OID: None
Index name: ix_lineorder_lo_quantity, Key cols: ('lo_quantity',), Include cols: (), Current O

In [36]:
# Check which hypothetical indexes the plan uses for `query` and collect them
# in indexes_to_materialize for the following cells.

conn = create_connection()
try:
    # drop all existing secondary indexes so the baseline estimate is index-free
    drop_all_indexes(conn)

    cost_wo_indexes, scan_costs = get_query_cost_estimate(conn, query.query_string, show_plan=False)
    print(f"Estimated cost without hypothetical indexes: {cost_wo_indexes}")
    print(f"Scan costs: {scan_costs}")

    # Create one hypothetical index per candidate. Entries are (oid, size_mb)
    # pairs, assumed to be in the same order as candidate_indexes; build a
    # lookup: oid -> (candidate index, size in MB).
    indexes = bulk_create_hypothetical_indexes(conn, candidate_indexes, return_size=True)
    hypothetical_indexes = {
        created[0]: (candidate, created[1])
        for candidate, created in zip(candidate_indexes, indexes)
    }

    # get the cost of the query with the hypothetical indexes
    print()
    cost_w_indexes, indexes_used = get_query_cost_estimate_hypo_indexes(conn, query.query_string, show_plan=False)
    print(f"\nEstimated cost with hypothetical indexes: {cost_w_indexes}")
    print(f"Speedup: {cost_wo_indexes/cost_w_indexes:.4f}")

    print("\nIndexes used in the query plan:")
    # indexes_used may contain duplicates, so collect by OID first
    used_by_oid = {}
    for oid, scan_type, scan_cost in indexes_used:
        print(hypothetical_indexes[oid][0], ", Index Size (Mb): ", hypothetical_indexes[oid][1],", Scan type: ", scan_type, ", Scan cost: ", scan_cost)
        used_by_oid[oid] = hypothetical_indexes[oid][0]
    indexes_to_materialize = list(used_by_oid.values())

    bulk_drop_hypothetical_indexes(conn)
finally:
    # Always release the connection, even if one of the steps above raised.
    close_connection(conn)

Estimated cost without hypothetical indexes: 1460325.89
Scan costs: {'Seq Scan': 1420791.49}


Estimated cost with hypothetical indexes: 15242.3
Speedup: 95.8074

Indexes used in the query plan:
Index name: ixn_lineorder_lo_quantity_lo_linenumber_lo_o, Key cols: ('lo_quantity', 'lo_linenumber'), Include cols: ('lo_orderdate',), Current OID: None , Index Size (Mb):  1992.9140625 , Scan type:  Index Only Scan , Scan cost:  15242.3


In [37]:
# Show the first index selected for materialization; this raises IndexError
# when the list built in the previous cell is empty.
print(indexes_to_materialize[0])

Index name: ixn_lineorder_lo_quantity_lo_linenumber_lo_o, Key cols: ('lo_quantity', 'lo_linenumber'), Include cols: ('lo_orderdate',), Current OID: None


In [38]:
# Materialize every index in indexes_to_materialize, i.e. the indexes the plan
# used in the cost-estimation cell above. Which indexes these are depends on
# the generated query.
conn = create_connection()
try:
    bulk_create_indexes(conn, indexes_to_materialize)
finally:
    # Close the connection even if index creation fails.
    close_connection(conn)

Successfully created index 'ixn_lineorder_lo_quantity_lo_linenumber_lo_o': 1804.4843750000000000, creation time: 25.04 seconds


In [40]:
# Hide the materialized indexes, then estimate what the remaining candidates
# would offer as hypothetical indexes, and finally unhide the materialized ones.
conn = create_connection()
try:
    # hide the materialized indexes
    bulk_hide_indexes(conn, indexes_to_materialize)
    try:
        # only the candidates that have not been materialized are re-created
        # as hypothetical indexes
        candidate_indexes_wo_materialized = [index for index in candidate_indexes if index not in indexes_to_materialize]

        cost_wo_indexes, scan_costs = get_query_cost_estimate(conn, query.query_string, show_plan=True)
        print(f"Estimated cost without hypothetical indexes: {cost_wo_indexes}")
        print(f"Scan costs: {scan_costs}")

        # Create one hypothetical index per remaining candidate. Entries are
        # (oid, size_mb) pairs, assumed to be in the same order as the
        # candidate list; build a lookup: oid -> (candidate index, size in MB).
        indexes = bulk_create_hypothetical_indexes(conn, candidate_indexes_wo_materialized, return_size=True)
        hypothetical_indexes = {
            created[0]: (candidate, created[1])
            for candidate, created in zip(candidate_indexes_wo_materialized, indexes)
        }

        # get the cost of the query with the hypothetical indexes
        print()
        cost_w_indexes, indexes_used = get_query_cost_estimate_hypo_indexes(conn, query.query_string, show_plan=False)
        print(f"\nEstimated cost with hypothetical indexes: {cost_w_indexes}")
        print(f"Speedup: {cost_wo_indexes/cost_w_indexes:.4f}")

        print("\nIndexes used in the query plan:")
        for oid, scan_type, scan_cost in indexes_used:
            print(hypothetical_indexes[oid][0], ", Index Size (Mb): ", hypothetical_indexes[oid][1],", Scan type: ", scan_type, ", Scan cost: ", scan_cost)
    finally:
        # Undo the hide and remove the hypothetical indexes even if a step
        # above raised, so the materialized indexes are not left hidden.
        bulk_unhide_indexes(conn, indexes_to_materialize)
        bulk_drop_hypothetical_indexes(conn)
finally:
    # Always release the connection.
    close_connection(conn)

Index with OID 16493 hidden: True
[
  {
    "Plan": {
      "Node Type": "Gather",
      "Parallel Aware": false,
      "Async Capable": false,
      "Startup Cost": 1000.0,
      "Total Cost": 1460325.89,
      "Plan Rows": 385344,
      "Plan Width": 12,
      "Workers Planned": 2,
      "Single Copy": false,
      "Plans": [
        {
          "Node Type": "Seq Scan",
          "Parent Relationship": "Outer",
          "Parallel Aware": true,
          "Async Capable": false,
          "Relation Name": "lineorder",
          "Alias": "lineorder",
          "Startup Cost": 0.0,
          "Total Cost": 1420791.49,
          "Plan Rows": 160560,
          "Plan Width": 12,
          "Filter": "((lo_linenumber >= 3) AND (lo_linenumber <= 4) AND (lo_quantity = 12))"
        }
      ]
    }
  }
]
Estimated cost without hypothetical indexes: 1460325.89
Scan costs: {'Seq Scan': 1420791.49}


Estimated cost with hypothetical indexes: 47757.79
Speedup: 30.5778

Indexes used in the query plan