#### Simple experiments with Non-clustered indices

In [15]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET
import itertools
import math
from collections import defaultdict
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

import IPython
# Read the `starting_dir` attribute of the active IPython shell.
notebook_path = IPython.get_ipython().starting_dir
# Build the absolute path of a 'database' directory located next to (i.e. under the
# parent of) the starting directory: dirname() strips the last path component.
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
# Extend the module search path so the `utils` module below can be imported from there.
sys.path.append(target_subdirectory_path)
# NOTE(review): star import — presumably this is where helpers used by later cells
# (start_connection, get_all_tables, execute_query, Index, ...) come from; confirm
# against utils.py and consider importing the names explicitly.
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# First get a list of all tables in the database and print out their columns.
connection = start_connection()
try:
    tables = get_all_tables(connection)

    # Show details about columns, primary keys and row count for every table.
    for _, table in tables.items():
        print(f"\nTable: {table.table_name}")
        print(f"row count: {table.row_count}")
        print(f"Columns: {[c.column_name for _, c in table.columns.items()]}")
        print(f"Primary Key: {table.pk_columns}")

        # Display the first 5 rows.
        display_table(connection, table, max_tuples=5)
finally:
    # Always release the connection, even if a helper above raises,
    # so a failed run does not leave an open database session behind.
    close_connection(connection)


Table: customer
row count: 150000
Columns: ['c_custkey', 'c_name', 'c_address', 'c_nationkey', 'c_phone', 'c_acctbal', 'c_mktsegment', 'c_comment']
Primary Key: ['c_custkey']
   c_custkey              c_name                       c_address  c_nationkey          c_phone c_acctbal c_mktsegment                                          c_comment
0          1  Customer#000000001               IVhzIApeRb ot,c,E           15  25-989-741-2988    711.56   BUILDING    to the even, regular platelets. regular, ironi...
1          2  Customer#000000002  XSTf4,NCwDVaWNe6tEgvwfmRchLXak           13  23-768-687-3665    121.65   AUTOMOBILE  l accounts. blithely ironic theodolites integr...
2          3  Customer#000000003                    MG9kdTD2WBHm            1  11-719-748-3364   7498.12   AUTOMOBILE   deposits eat slyly ironic, even instructions....
3          4  Customer#000000004                     XxVSJsLAGtn            4  14-128-190-5944   2866.83   MACHINERY    requests. final, regular ide

In [26]:
def create_index(table_name, index_columns, include_columns, db_connection=None):
    """Create a non-clustered index on a table and return its creation time.

    Prints the generated index id, the key/include columns and the estimated
    index size before creating the index.

    Args:
        table_name: Name of the table to index.
        index_columns: Sequence of key column names for the index.
        include_columns: Iterable of column names stored as include columns;
            may be empty.
        db_connection: Optional database connection to use. When omitted, the
            notebook-level ``connection`` global is used (the original
            behavior), so a cell must have opened one beforehand.

    Returns:
        The value returned by ``create_nonclustered_index_object`` (used by
        callers as the index creation time).
    """
    # Prefer an explicitly supplied connection; fall back to the notebook global
    # so existing three-argument calls keep working unchanged.
    conn = connection if db_connection is None else db_connection

    index_id = get_index_id(index_columns, table_name, include_columns)

    # The size estimate covers every column stored in the index: keys + includes.
    # list(...) on both sides lets index_columns be a tuple as well as a list.
    all_index_columns = list(index_columns) + list(include_columns)
    index_size = get_estimated_index_size(conn, table_name, all_index_columns)
    print(f"index_id: {index_id}, index columns: {index_columns}, include columns: {include_columns}, index size: {index_size:.2f} Mb")

    # create the index
    index = Index(table_name, index_id, index_columns, index_size, tuple(include_columns))
    index_creation_time = create_nonclustered_index_object(conn, index, verbose=True)
    return index_creation_time

#### Exp1: Simple SELECT queries

In this experiment, we execute a simple select query: `SELECT c_name, c_address, c_phone FROM customer WHERE c_name = 'Customer#000000123'`

Without an index on the customer table, query execution involves a sequential scan over the table heap to find the tuple that matches the predicate, i.e. customer name = `Customer#000000123`. This is significantly slower than with a non-clustered index on the c_name column: the index (a B+ tree) is sorted by customer name, so we can locate the customer name with a binary-search-style lookup on the tree and then follow the record_id pointer to find the tuple directly in the table.

With include columns on `('c_address', 'c_phone')`, we then have a covering index, which further improves performance because we don't need to retrieve any data from the table itself: all the attributes are available from the index, since its leaf nodes store all three attributes `c_name, c_address, c_phone`.


In [29]:
connection = start_connection()
try:
    # First, clear out all existing non-clustered indexes so every measurement
    # starts from the same state, regardless of what earlier runs left behind.
    remove_all_nonclustered_indexes(connection)

    # Test query: select customer name, address and phone for a specific customer.
    query = """
        SELECT c_name, c_address, c_phone FROM customer WHERE c_name = 'Customer#000000123';
        """

    # Baseline: query execution with no index.
    cost , _, _ = execute_query(query, connection)
    print(f"Cost of query with no index: {cost} seconds")

    # Both indexed variants use a non-clustered index keyed on c_name; they differ
    # only in their include columns:
    #   1. no include columns -> the base table is still read for c_address/c_phone
    #   2. include c_address, c_phone -> covering index for this query
    table_name = 'customer'
    index_columns = ['c_name']
    index_variants = [
        ((), "index and no include columns"),
        (('c_address', 'c_phone'), "index and include columns"),
    ]

    for include_columns, variant_label in index_variants:
        # Create the index for this variant (create_index uses the notebook-level
        # `connection` opened above).
        create_index(table_name, index_columns, include_columns)
        # Execute the query against the freshly created index.
        cost , _, _ = execute_query(query, connection)
        print(f"\nCost of query with {variant_label}: {cost} seconds")

        # Clear out all non-clustered indexes so the next variant is measured alone.
        remove_all_nonclustered_indexes(connection)
finally:
    # Always release the connection, even if an experiment step raises.
    close_connection(connection)

All non-clustered indexes --> []
All nonclustered indexes removed.
Cost of query with no index: 0.109 seconds
index_id: IX_customer_c_name, index columns: ['c_name'], include columns: (), index size: 4.72 Mb
Created index --> [dbo].[customer].[IX_customer_c_name], Indexed Columns --> ['c_name'], Included Columns --> (), index creation time: 0.304 seconds

Cost of query with index and no include columns: 0.0 seconds
All non-clustered indexes --> [('dbo', 'customer', 'IX_customer_c_name')]
All nonclustered indexes removed.
index_id: IXN_customer_c_name_c_ad_c_ph, index columns: ['c_name'], include columns: ('c_address', 'c_phone'), index size: 10.73 Mb
Created index --> [dbo].[customer].[IXN_customer_c_name_c_ad_c_ph], Indexed Columns --> ['c_name'], Included Columns --> ('c_address', 'c_phone'), index creation time: 0.519 seconds

Cost of query with index and include columns: 0.0 seconds
All non-clustered indexes --> [('dbo', 'customer', 'IXN_customer_c_name_c_ad_c_ph')]
All nonclustere