#### Simple experiments to study the benefits of Non-clustered indices

In [1]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET
import itertools
import math
from collections import defaultdict
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

import IPython
# Make the project's helper module importable: take the directory IPython was
# started in, go one level up, and add the sibling `database` directory to
# sys.path so that `utils` can be found.
notebook_path = IPython.get_ipython().starting_dir
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
sys.path.append(target_subdirectory_path)
# NOTE(review): the wildcard import hides where helper names come from; the
# helpers called in later cells (start_connection, execute_query, Index, ...)
# presumably come from here -- consider importing them explicitly.
from utils import *

In [2]:
# List every table in the database and print its columns, primary key,
# row count and a small sample of rows.

connection = start_connection()
try:
    tables = get_all_tables(connection)

    # Only the table objects are needed here; the mapping keys are not used.
    for table in tables.values():
        print(f"\nTable: {table.table_name}")
        print(f"row count: {table.row_count}")
        print(f"Columns: {[c.column_name for c in table.columns.values()]}")
        print(f"Primary Key: {table.pk_columns}")

        # display the first 5 rows
        display_table(connection, table, max_tuples=5)
finally:
    # Release the connection even if one of the helpers raises part-way
    # through, so a failed run does not leave a connection open in the kernel.
    close_connection(connection)


Table: customer
row count: 150000
Columns: ['c_custkey', 'c_name', 'c_address', 'c_nationkey', 'c_phone', 'c_acctbal', 'c_mktsegment', 'c_comment']
Primary Key: ['c_custkey']
   c_custkey              c_name                       c_address  c_nationkey          c_phone c_acctbal c_mktsegment                                          c_comment
0          1  Customer#000000001               IVhzIApeRb ot,c,E           15  25-989-741-2988    711.56   BUILDING    to the even, regular platelets. regular, ironi...
1          2  Customer#000000002  XSTf4,NCwDVaWNe6tEgvwfmRchLXak           13  23-768-687-3665    121.65   AUTOMOBILE  l accounts. blithely ironic theodolites integr...
2          3  Customer#000000003                    MG9kdTD2WBHm            1  11-719-748-3364   7498.12   AUTOMOBILE   deposits eat slyly ironic, even instructions....
3          4  Customer#000000004                     XxVSJsLAGtn            4  14-128-190-5944   2866.83   MACHINERY    requests. final, regular ide

In [26]:
def create_index(table_name, index_columns, include_columns, conn=None):
    """Create a non-clustered index on ``table_name`` and report its details.

    Args:
        table_name: Name of the table to index.
        index_columns: Key columns of the index (list or tuple of column names).
        include_columns: Non-key columns stored with the index; may be empty.
        conn: Open database connection to use. When omitted, the notebook-level
            ``connection`` variable is used, so existing three-argument calls
            keep working (that variable must already hold an open connection).

    Returns:
        The value returned by ``create_nonclustered_index_object`` (used by the
        notebook as the index creation time).
    """
    if conn is None:
        # Backward-compatible fallback to the connection opened by the calling cell.
        conn = connection

    index_id = get_index_id(index_columns, table_name, include_columns)

    # Convert both operands so tuples and lists can be mixed freely.
    all_columns = list(index_columns) + list(include_columns)
    index_size = get_estimated_index_size(conn, table_name, all_columns)
    print(f"index_id: {index_id}, index columns: {index_columns}, include columns: {include_columns}, index size: {index_size:.2f} Mb")

    # create the index
    index = Index(table_name, index_id, index_columns, index_size, tuple(include_columns))
    return create_nonclustered_index_object(conn, index, verbose=True)

#### Experiment 1: Simple Point Queries

In this simple experiment, we will execute this simple point query: `SELECT c_name, c_address, c_phone FROM customer WHERE c_name = 'Customer#000000123'`

Without an index on the customer table, the query execution will involve a sequential scan over the table heap to find the tuple that matches the predicate, i.e. customer name = `Customer#000000123`. This is significantly slower compared to if we have a non-clustered index on the `c_name` column, where the index (B+ tree) is sorted by customer name so we can locate the customer name by performing binary search on the tree, then following the record_id pointer to directly find the tuple in the table.

With additional include columns on `('c_address', 'c_phone')`, we then have a covering index, which further improves performance because we don't need to retrieve any data from the table: all the attributes are available in the index itself (the index leaf nodes store all three attributes `c_name, c_address, c_phone`).


In [29]:
# Point query: name, address and phone of one specific customer.
query = """
        SELECT c_name, c_address, c_phone 
        FROM customer 
        WHERE c_name = 'Customer#000000123';
        """

# Index configurations compared against the no-index baseline, as
# (key columns, include columns, label used in the printed result).
table_name = 'customer'
index_configs = [
    # non-clustered index on c_name with no include columns
    (['c_name'], (), "index and no include columns"),
    # c_name plus include columns c_address, c_phone: a covering index for the query
    (['c_name'], ('c_address', 'c_phone'), "index and include columns"),
]

connection = start_connection()
try:
    # first, clear out all existing non clustered indices so the baseline
    # run cannot use a leftover index
    remove_all_nonclustered_indexes(connection)

    # Query execution with no index
    cost, _, _ = execute_query(query, connection)
    print(f"Cost of query with no index: {cost} seconds")

    for index_columns, include_columns, label in index_configs:
        # create_index reads the notebook-level `connection` opened above
        create_index(table_name, index_columns, include_columns)
        cost, _, _ = execute_query(query, connection)
        print(f"\nCost of query with {label}: {cost} seconds")

        # drop the index again so the next configuration starts clean
        remove_all_nonclustered_indexes(connection)
finally:
    # Always release the connection, even if a step above raised.
    close_connection(connection)

All non-clustered indexes --> []
All nonclustered indexes removed.
Cost of query with no index: 0.109 seconds
index_id: IX_customer_c_name, index columns: ['c_name'], include columns: (), index size: 4.72 Mb
Created index --> [dbo].[customer].[IX_customer_c_name], Indexed Columns --> ['c_name'], Included Columns --> (), index creation time: 0.304 seconds

Cost of query with index and no include columns: 0.0 seconds
All non-clustered indexes --> [('dbo', 'customer', 'IX_customer_c_name')]
All nonclustered indexes removed.
index_id: IXN_customer_c_name_c_ad_c_ph, index columns: ['c_name'], include columns: ('c_address', 'c_phone'), index size: 10.73 Mb
Created index --> [dbo].[customer].[IXN_customer_c_name_c_ad_c_ph], Indexed Columns --> ['c_name'], Included Columns --> ('c_address', 'c_phone'), index creation time: 0.519 seconds

Cost of query with index and include columns: 0.0 seconds
All non-clustered indexes --> [('dbo', 'customer', 'IXN_customer_c_name_c_ad_c_ph')]
All nonclustere

#### Experiment 2: Range Queries

Now we will test a query which has a range predicate: `SELECT c_name, c_address, c_nationkey FROM customer WHERE c_nationkey BETWEEN 10 AND 20`

Again, with no index, a full table scan will occur to find tuples that match the predicate, which is very slow.

Then, with a non-clustered index on the `c_nationkey` column, we don't see any benefit. Even though the index is sorted by `c_nationkey` and we can get the matching tuple record ids with a sequential scan of the index, the tuples are still unordered in the table w.r.t. `c_nationkey`, so we have to make a random disk I/O for each tuple.

However, with additional include columns on `('c_name', 'c_address')`, we see a speedup: the index is already sorted by `c_nationkey`, which means we can find all matching records in the index in a single sequential scan, and the leaf nodes already contain all attributes needed for the result, so the table itself never has to be read.





In [47]:
# Range query: nation key, name and address of all customers whose
# c_nationkey lies between 10 and 20.
query = """ SELECT c_nationkey, c_name, c_address 
            FROM customer 
            WHERE c_nationkey BETWEEN 10 AND 20; 
            """

# Index configurations compared against the no-index baseline, as
# (key columns, include columns, label used in the printed result).
table_name = 'customer'
index_configs = [
    # non-clustered index on c_nationkey with no include columns
    (['c_nationkey'], (), "index and no include columns"),
    # c_nationkey plus include columns c_name, c_address: a covering index for the query
    (['c_nationkey'], ('c_name', 'c_address'), "index and include columns"),
]

connection = start_connection()
try:
    # first, clear out all existing non clustered indices so the baseline
    # run cannot use a leftover index
    remove_all_nonclustered_indexes(connection)

    # Query execution with no index
    cost, _, _ = execute_query(query, connection)
    print(f"Cost of query with no index: {cost} seconds\n")

    for index_columns, include_columns, label in index_configs:
        # create_index reads the notebook-level `connection` opened above
        create_index(table_name, index_columns, include_columns)
        cost, _, _ = execute_query(query, connection)
        print(f"Cost of query with {label}: {cost} seconds\n")

        # drop the index again so the next configuration starts clean
        remove_all_nonclustered_indexes(connection)
finally:
    # Always release the connection, even if a step above raised.
    close_connection(connection)

All non-clustered indexes --> []
All nonclustered indexes removed.
Cost of query with no index: 0.176 seconds

index_id: IX_customer_c_nationkey, index columns: ['c_nationkey'], include columns: (), index size: 2.15 Mb
Created index --> [dbo].[customer].[IX_customer_c_nationkey], Indexed Columns --> ['c_nationkey'], Included Columns --> (), index creation time: 0.25 seconds
Cost of query with index and no include columns: 0.193 seconds

All non-clustered indexes --> [('dbo', 'customer', 'IX_customer_c_nationkey')]
All nonclustered indexes removed.
index_id: IXN_customer_c_nationkey_c_na_c_ad, index columns: ['c_nationkey'], include columns: ('c_name', 'c_address'), index size: 9.16 Mb
Created index --> [dbo].[customer].[IXN_customer_c_nationkey_c_na_c_ad], Indexed Columns --> ['c_nationkey'], Included Columns --> ('c_name', 'c_address'), index creation time: 0.286 seconds
Cost of query with index and include columns: 0.063 seconds

All non-clustered indexes --> [('dbo', 'customer', 'IX

#### Experiment 3: Join Queries

We will now test on the following join query: `SELECT c.c_custkey, c.c_name, o.o_orderkey, o.o_orderdate FROM customer c JOIN orders o ON c.c_custkey = o.o_custkey;`

The non-clustered index on the `c_custkey` column of the customer table (with or without the include column) does not benefit the join operation. Also, `c_custkey` is the primary key of the customer table, so the table is already organized as a clustered index on that column.


In [55]:
# Join query: every customer joined with their orders on the customer key.
query = """ SELECT c.c_custkey, c.c_name, o.o_orderkey, o.o_orderdate 
            FROM customer c 
            JOIN orders o ON c.c_custkey = o.o_custkey;
            """

# Index configurations compared against the no-index baseline, as
# (key columns, include columns, label used in the printed result).
table_name = 'customer'
index_configs = [
    # non-clustered index on c_custkey with no include columns
    (['c_custkey'], (), "index and no include columns"),
    # non-clustered index on c_custkey with include column c_name
    (['c_custkey'], ('c_name',), "index and include columns"),
]

connection = start_connection()
try:
    # first, clear out all existing non clustered indices so the baseline
    # run cannot use a leftover index
    remove_all_nonclustered_indexes(connection)

    # Query execution with no index
    cost, _, _ = execute_query(query, connection)
    print(f"Cost of query with no index: {cost} seconds\n")

    for index_columns, include_columns, label in index_configs:
        # create_index reads the notebook-level `connection` opened above
        create_index(table_name, index_columns, include_columns)
        cost, _, _ = execute_query(query, connection)
        print(f"Cost of query with {label}: {cost} seconds\n")

        # drop the index again so the next configuration starts clean
        remove_all_nonclustered_indexes(connection)
finally:
    # Always release the connection, even if a step above raised.
    close_connection(connection)

All non-clustered indexes --> []
All nonclustered indexes removed.
Cost of query with no index: 4.051 seconds

index_id: IX_customer_c_custkey, index columns: ['c_custkey'], include columns: (), index size: 1.57 Mb
Created index --> [dbo].[customer].[IX_customer_c_custkey], Indexed Columns --> ['c_custkey'], Included Columns --> (), index creation time: 0.251 seconds
Cost of query with index and no include columns: 2.95 seconds

All non-clustered indexes --> [('dbo', 'customer', 'IX_customer_c_custkey')]
All nonclustered indexes removed.
index_id: IXN_customer_c_custkey_c_na, index columns: ['c_custkey'], include columns: ('c_name',), index size: 4.72 Mb
Created index --> [dbo].[customer].[IXN_customer_c_custkey_c_na], Indexed Columns --> ['c_custkey'], Included Columns --> ('c_name',), index creation time: 0.253 seconds
Cost of query with index and include columns: 3.366 seconds

All non-clustered indexes --> [('dbo', 'customer', 'IXN_customer_c_custkey_c_na')]
All nonclustered indexe

#### Experiment 4: Aggregation Queries

We will now test the following aggregation query, which retrieves the number of orders placed by each customer from the orders table:

`SELECT o_custkey, COUNT(*) AS orderCount FROM orders GROUP BY o_custkey`

Since the orders table is not sorted by `o_custkey`, without an index, a full table scan occurs which is slow.

With index on `o_custkey`, we might see a speed up since all records with the same customer id key are next to each other in the index, so the aggregation can be done in one pass.

Additional include columns in the index won't make any difference. 

In [58]:
# Aggregation query: number of orders placed by each customer.
query = """ SELECT o_custkey, COUNT(*) AS orderCount 
            FROM orders 
            GROUP BY o_custkey;
            """

# Index configurations compared against the no-index baseline, as
# (key columns, include columns, label used in the printed result).
table_name = 'orders'
index_configs = [
    # non-clustered index on o_custkey with no include columns
    (['o_custkey'], (), "index and no include columns"),
    # non-clustered index on o_custkey with include column o_orderdate
    (['o_custkey'], ('o_orderdate',), "index and include columns"),
]

connection = start_connection()
try:
    # first, clear out all existing non clustered indices so the baseline
    # run cannot use a leftover index
    remove_all_nonclustered_indexes(connection)

    # Query execution with no index
    cost, _, _ = execute_query(query, connection)
    print(f"Cost of query with no index: {cost} seconds\n")

    for index_columns, include_columns, label in index_configs:
        # create_index reads the notebook-level `connection` opened above
        create_index(table_name, index_columns, include_columns)
        cost, _, _ = execute_query(query, connection)
        print(f"Cost of query with {label}: {cost} seconds\n")

        # drop the index again so the next configuration starts clean
        remove_all_nonclustered_indexes(connection)
finally:
    # Always release the connection, even if a step above raised.
    close_connection(connection)

All non-clustered indexes --> []
All nonclustered indexes removed.
Cost of query with no index: 0.315 seconds

index_id: IX_orders_o_custkey, index columns: ['o_custkey'], include columns: (), index size: 21.46 Mb
Created index --> [dbo].[orders].[IX_orders_o_custkey], Indexed Columns --> ['o_custkey'], Included Columns --> (), index creation time: 1.82 seconds
Cost of query with index and no include columns: 0.658 seconds

All non-clustered indexes --> [('dbo', 'orders', 'IX_orders_o_custkey')]
All nonclustered indexes removed.
index_id: IXN_orders_o_custkey_o_or, index columns: ['o_custkey'], include columns: ('o_orderdate',), index size: 25.75 Mb
Created index --> [dbo].[orders].[IXN_orders_o_custkey_o_or], Indexed Columns --> ['o_custkey'], Included Columns --> ('o_orderdate',), index creation time: 1.945 seconds
Cost of query with index and include columns: 0.514 seconds

All non-clustered indexes --> [('dbo', 'orders', 'IXN_orders_o_custkey_o_or')]
All nonclustered indexes remove

#### We will now estimate the benefit of having a non-clustered index without actually creating the index. We will use a "what-if" analysis: we create a "hypothetical index" and then, given a query, ask the query optimizer to generate a query plan that takes the hypothetical index into account. From that plan we obtain an estimated subtree cost, which serves as a proxy for the cost of executing the query with the index available. We will also obtain a query plan that does not take the hypothetical index into account and get its cost estimate, for baseline comparison.

We define the `estimated benefit` as the ratio of estimated cost without the hypothetical index to estimated cost with the index.

In [33]:
# What-if analysis: compare the optimizer's estimated cost of a query without
# and with a hypothetical (statistics-only) index on customer(c_nationkey).
average_cost_wo_index = 0
average_cost_w_index = 0
num_trials = 1

query = """
SELECT c.c_name, c.c_nationkey
FROM customer c
WHERE c.c_nationkey = 1;
"""

# The index name is defined once and reused by every statement that refers to it.
hyp_index_id = "idx_hypothetical"
create_index_query = f""" 
                        CREATE NONCLUSTERED INDEX {hyp_index_id} 
                        ON customer(c_nationkey) 
                        INCLUDE(c_name)
                        WITH STATISTICS_ONLY = -1;
                        """
drop_index_query = f"DROP INDEX customer.{hyp_index_id}"

# Catalog lookups for the database, table and index ids passed to DBCC AUTOPILOT.
dbid_query = "SELECT DB_ID('TPCH1')"
tabid_query = "SELECT OBJECT_ID('customer')"
indexid_query = f"SELECT INDEXPROPERTY(OBJECT_ID('customer'), '{hyp_index_id}', 'IndexID')"

connection = start_connection()
try:
    drop_all_hypothetical_indexes(connection)

    for _ in range(num_trials):
        cursor = connection.cursor()
        try:
            ### estimate the query execution cost without the hypothetical index
            (estimated_cost_wo_index,
             non_clustered_index_usage_wo_index,
             clustered_index_usage_wo_index) = estimate_query_cost(connection, query)
            average_cost_wo_index += estimated_cost_wo_index

            ### create a hypothetical index on the customer table
            cursor.execute(create_index_query)
            connection.commit()
            try:
                ### enable the hypothetical index
                cursor.execute(dbid_query)
                dbid = cursor.fetchone()[0]
                cursor.execute(tabid_query)
                tabid = cursor.fetchone()[0]
                cursor.execute(indexid_query)
                indexid = cursor.fetchone()[0]

                enable_index_query = f"DBCC AUTOPILOT(0, {dbid}, {tabid}, {indexid}, 0, 0, 0)"
                cursor.execute(enable_index_query)

                ### estimate the cost of the test query with the hypothetical index
                (estimated_cost_w_index,
                 non_clustered_index_usage_w_index,
                 clustered_index_usage_w_index) = hypothetical_execute_query(connection, query)
                average_cost_w_index += estimated_cost_w_index
            finally:
                ### drop the hypothetical index, even if a step above failed,
                ### so a later CREATE with the same name does not collide
                cursor.execute(drop_index_query)
                connection.commit()
        finally:
            cursor.close()
finally:
    # Always release the connection, even if a step above raised.
    close_connection(connection)

average_cost_wo_index /= num_trials
average_cost_w_index /= num_trials
# Guard the ratio: a with-index cost of exactly 0 would raise ZeroDivisionError.
if average_cost_w_index:
    estimated_benefit = average_cost_wo_index / average_cost_w_index
else:
    estimated_benefit = float("inf")
print(f"Average cost without index: {average_cost_wo_index}")
print(f"Average cost with index: {average_cost_w_index}")
print(f"Estimated benefit: {estimated_benefit}")

# show index usage statistics (values from the last trial)
print(f"\nNon clustered index usage without hypothetical index: {non_clustered_index_usage_wo_index}")
print(f"Clustered index usage without hypothetical index: {clustered_index_usage_wo_index}")
print(f"\nNon clustered index usage with hypothetical index: {non_clustered_index_usage_w_index}")
print(f"Clustered index usage with hypothetical index: {clustered_index_usage_w_index}")


Average cost without index: 2.60458
Average cost with index: 0.00979298
Estimated benefit: 265.96398644743476

Non clustered index usage without hypothetical index: []
Clustered index usage without hypothetical index: [('customer', 0.0, 0, 2.60458, 150000.0, 5919.07)]

Non clustered index usage with hypothetical index: [('idx_hypothetical', 0.0, 0, 0.00979298, 5919.07, 5919.07)]
Clustered index usage with hypothetical index: []


#### Now let's compare the estimated benefit with the actual benefit.

In [40]:
# Measure the actual benefit: run the same query without and with a real
# non-clustered index on customer(c_nationkey) INCLUDE(c_name).
# NOTE: the `estimated_*` variable names are kept unchanged so that any other
# cell reading them keeps working; in this cell they hold the values returned
# by execute_query (measured runs), not optimizer estimates.
average_cost_wo_index = 0
average_cost_w_index = 0
num_trials = 1

query = """
SELECT c.c_name, c.c_nationkey
FROM customer c
WHERE c.c_nationkey = 1;
"""

real_index_id = "idx_real"
create_index_query = f""" 
                        CREATE NONCLUSTERED INDEX {real_index_id} 
                        ON customer(c_nationkey) 
                        INCLUDE(c_name)
                        """
drop_index_query = f"DROP INDEX customer.{real_index_id}"

connection = start_connection()
try:
    remove_all_nonclustered_indexes(connection)

    for _ in range(num_trials):
        cursor = connection.cursor()
        try:
            ### run the query without the index
            (estimated_cost_wo_index,
             non_clustered_index_usage_wo_index,
             clustered_index_usage_wo_index) = execute_query(query, connection)
            average_cost_wo_index += estimated_cost_wo_index

            ### create the real index on the customer table
            cursor.execute(create_index_query)
            connection.commit()
            try:
                ### run the query again, now with the index in place
                (estimated_cost_w_index,
                 non_clustered_index_usage_w_index,
                 clustered_index_usage_w_index) = execute_query(query, connection)
                average_cost_w_index += estimated_cost_w_index
            finally:
                ### drop the real index, even if the run above failed, so
                ### later cells do not see it
                cursor.execute(drop_index_query)
                connection.commit()
        finally:
            cursor.close()
finally:
    # Always release the connection, even if a step above raised.
    close_connection(connection)

average_cost_wo_index /= num_trials
average_cost_w_index /= num_trials
# Guard the ratio: a very fast indexed run can be reported as 0.0 (as seen in
# Experiment 1), which would otherwise raise ZeroDivisionError.
if average_cost_w_index:
    estimated_benefit = average_cost_wo_index / average_cost_w_index
else:
    estimated_benefit = float("inf")
print(f"Average cost without index: {average_cost_wo_index}")
print(f"Average cost with index: {average_cost_w_index}")
print(f"Actual benefit: {estimated_benefit}")

# show index usage statistics (values from the last trial)
print(f"\nNon clustered index usage without index: {non_clustered_index_usage_wo_index}")
print(f"Clustered index usage without index: {clustered_index_usage_wo_index}")
print(f"\nNon clustered index usage with index: {non_clustered_index_usage_w_index}")
print(f"Clustered index usage with index: {clustered_index_usage_w_index}")


All non-clustered indexes --> []
All nonclustered indexes removed.
Average cost without index: 0.066
Average cost with index: 0.005
Actual benefit: 13.200000000000001

Non clustered index usage without index: []
Clustered index usage without index: [('customer', 0.062, 66.0, 2.60458, 150000, 5919.07)]

Non clustered index usage with index: [('idx_real', 0.002, 5.0, 0.0290304, 5975, 5975.0)]
Clustered index usage with index: []
