#### Index Creation Cost Model -  Online Ridge Regression

In [1]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET
import itertools
import math
from collections import defaultdict
from tqdm import tqdm
import pickle
import numpy as np

%load_ext autoreload
%autoreload 2

import IPython
# Directory reported by the running IPython shell as its starting directory
# (presumably where the notebook kernel was launched - TODO confirm).
notebook_path = IPython.get_ipython().starting_dir
# Absolute path of the "database" directory that sits next to the starting
# directory (i.e. <parent of starting_dir>/database).
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
# Make modules in that directory importable.
sys.path.append(target_subdirectory_path)
# NOTE(review): the DB helpers used below (start_connection, get_all_tables,
# Index, ...) are not defined in this notebook and presumably come from here.
from utils import *

#### Generate all possible indexes for each table.

In [2]:
def generate_all_configurations(connection, MAX_COLS=3, with_includes=False, verbose=False):
    """Generate all candidate indexes for every table.

    For each table returned by ``get_all_tables`` every ordered permutation of
    1..``MAX_COLS`` columns is used as a set of index key columns.  When
    ``with_includes`` is true, each key permutation is additionally paired
    with every combination of ``MAX_COLS - len(key)`` of the remaining table
    columns as include columns (so key + include columns total ``MAX_COLS``).

    Args:
        connection: Open database connection, passed through to the helpers.
        MAX_COLS: Maximum number of key columns per index.
        with_includes: Also enumerate include-column combinations.
        verbose: Print each table's columns while enumerating.

    Returns:
        Tuple ``(all_indices, tables)`` where ``all_indices`` maps index id to
        its ``Index`` object and ``tables`` is the mapping returned by
        ``get_all_tables``.
    """
    tables = get_all_tables(connection)
    all_indices = {}

    for table_name, table in tqdm(tables.items(), desc="Generating all indices for table:"):
        columns = table.get_columns()
        if verbose:
            print(f"Table --> {table_name} with columns --> {columns}")

        # all ordered key-column choices, from 1 up to MAX_COLS columns
        for num_columns in range(1, MAX_COLS + 1):
            for cp in itertools.permutations(columns, num_columns):
                if not with_includes:
                    index_id = get_index_id(cp, table_name)
                    if index_id not in all_indices:
                        index_size = get_estimated_index_size(connection, table_name, list(cp))
                        all_indices[index_id] = Index(table_name, index_id, cp, index_size)
                    continue

                # Columns not used as keys, kept in table order so the
                # generated combinations are deterministic across runs.
                key_columns = set(cp)
                remaining_columns = [col for col in columns if col not in key_columns]

                for icp in itertools.combinations(remaining_columns, MAX_COLS - num_columns):
                    # The id and the size estimate must describe THIS include
                    # combination; using all remaining columns made every
                    # combination collide on a single id.
                    include_columns = list(icp)
                    index_id = get_index_id(cp, table_name, include_columns)
                    if index_id not in all_indices:
                        index_size = get_estimated_index_size(connection, table_name, list(cp) + include_columns)
                        all_indices[index_id] = Index(table_name, index_id, cp, index_size, tuple(icp))

    print(f"Total number of indices generated: {len(all_indices)}, total estimated size: {sum(i.size for i in all_indices.values()):.2f} Mb")

    return all_indices, tables

In [3]:
# Enumerate all candidate indexes and collect the column names of every table.
connection = start_connection()
try:
    all_indices, tables = generate_all_configurations(connection, MAX_COLS=4, with_includes=False)

    # get all columns
    all_columns, num_columns = get_all_columns(connection)
finally:
    # always release the connection, even if enumeration fails part-way
    close_connection(connection)

# Assign every column a position in the encoding vector, walking the tables
# in the order all_columns yields them.
columns_to_idx = {}
for i, column in enumerate(itertools.chain.from_iterable(all_columns.values())):
    columns_to_idx[column] = i

# reverse lookup: vector position -> column name
idx_to_columns = {v: k for k, v in columns_to_idx.items()}

Generating all indices for table:: 100%|██████████| 8/8 [00:00<00:00, 39.01it/s]

Total number of indices generated: 57977, total estimated size: 125146572.31 Mb





#### Create feature vector for each index:

The feature vector consists of two parts:

    (1) Encoding of the index key columns and include columns
    (2) Table/index stats: row_count * log(row_count), number of index key columns, number of table columns, index size, table size, average table fragmentation

In [71]:
def get_table_stats(connection, table_name):
    """Return size and fragmentation statistics for one table.

    Reads the summed row count and used size from
    ``sys.dm_db_partition_stats`` and the average fragmentation from
    ``sys.dm_db_index_physical_stats`` (``'LIMITED'`` mode).

    Args:
        connection: Open database connection providing ``cursor()``.
        table_name: Table name; it is interpolated directly into the SQL
            text, so only pass trusted names taken from the catalog.

    Returns:
        Tuple ``(num_columns, row_count, table_size_mb, avg_fragmentation)``.
        ``num_columns`` comes from the module-level ``all_columns`` mapping;
        ``row_count`` is returned exactly as delivered by the driver;
        the size and fragmentation are floats, with SQL NULL mapped to 0.0.
    """
    size_query = f"""
                        SELECT SUM(row_count) as total_rows, SUM(used_page_count) * 8 / 1024.0 as size_mb
                        FROM sys.dm_db_partition_stats
                        WHERE object_id = OBJECT_ID('{table_name}')
                    """
    fragmentation_query = f"""
            SELECT AVG(avg_fragmentation_in_percent)
            FROM sys.dm_db_index_physical_stats(DB_ID(), OBJECT_ID('{table_name}'), NULL, NULL, 'LIMITED')
        """

    cursor = connection.cursor()
    try:
        cursor.execute(size_query)
        row_count, table_size_mb = cursor.fetchone()
        table_size_mb = float(table_size_mb) if table_size_mb is not None else 0.0

        cursor.execute(fragmentation_query)
        avg_fragmentation = cursor.fetchone()[0]
        avg_fragmentation = float(avg_fragmentation) if avg_fragmentation is not None else 0.0
    finally:
        # the cursor was previously never closed
        cursor.close()

    num_columns = len(all_columns[table_name])
    return (num_columns, row_count, table_size_mb, avg_fragmentation)
    

def column_feature_encoding(index, c):
    """Encode the columns of ``index`` as a numeric vector.

    Key columns are written into a vector with one slot per known column
    (positions taken from the module-level ``columns_to_idx``); the j-th key
    column (0-based) gets the weight ``1/(c**j)``.  If the index has include
    columns, a second vector of the same width marks each of them with 1 and
    is appended to the first.

    Note that the result therefore has ``len(columns_to_idx)`` entries for an
    index without include columns and twice that many otherwise.

    Args:
        index: Object exposing ``index_columns`` and ``include_columns``.
        c: Base of the positional decay applied to key columns.

    Returns:
        1-D float ``numpy`` array.
    """
    width = len(columns_to_idx)

    key_part = np.zeros(width, dtype=float)
    for position, name in enumerate(index.index_columns):
        key_part[columns_to_idx[name]] = 1/(c**position)

    # no include columns: the key encoding alone is the result
    if len(index.include_columns) == 0:
        return key_part

    include_part = np.zeros(width, dtype=float)
    for name in index.include_columns:
        include_part[columns_to_idx[name]] = 1

    return np.hstack((key_part, include_part))
    

def create_feature_vectors(all_indices, connection, c=10):
    """Build the feature vector of every index.

    Each vector is the concatenation of six table/index statistics
    (``row_count*log(row_count)``, number of index key columns, number of
    table columns, index size, table size, average fragmentation) followed by
    the column encoding from ``column_feature_encoding``.

    Args:
        all_indices: Mapping of index id to index object.
        connection: Open database connection used to fetch table statistics.
        c: Positional decay base forwarded to ``column_feature_encoding``.

    Returns:
        Dict mapping index id to a 1-D ``numpy`` feature vector.
    """
    print(f"Obtain table stats...")
    # query each table only once, however many indexes reference it
    table_stats = {}
    for index in all_indices.values():
        table_name = index.table_name
        if table_name not in table_stats:
            table_stats[table_name] = get_table_stats(connection, table_name)

    print(f"Creating feature vectors...")
    feature_vectors = {}
    for index in all_indices.values():
        num_table_columns, row_count, table_size_mb, avg_fragmentation = table_stats[index.table_name]

        # n*log(n) tends to 0 as n -> 0.  Without the guard an empty table
        # gives 0 * -inf = nan (and a missing row count raises), which would
        # poison every model update that uses the vector.
        rows_log_rows = row_count * np.log(row_count) if row_count else 0.0

        table_features = np.array([
            rows_log_rows,
            len(index.index_columns),
            num_table_columns,
            index.size,
            table_size_mb,
            avg_fragmentation,
        ])
        columns_encoding = column_feature_encoding(index, c)
        feature_vectors[index.index_id] = np.concatenate((table_features, columns_encoding))

    return feature_vectors


In [72]:
# Build the feature vector of every candidate index.
connection = start_connection()
try:
    feature_vectors = create_feature_vectors(all_indices, connection, c=10)
finally:
    # release the connection even if a statistics query fails
    close_connection(connection)

Obtain table stats...
Creating feature vectors...


In [6]:
# group the indexes by table
indexes_by_table = defaultdict(list)
for index in all_indices.values():
    indexes_by_table[index.table_name].append(index)

for table_name, indexes in indexes_by_table.items():
    print(f"Table: {table_name}, Number of indexes: {len(indexes)}")

# for each table, group the indexes by number of key columns
indexes_by_table_and_num_columns = defaultdict(dict)
for index in all_indices.values():
    # Append in place: rebuilding the list with `old + [index]` on every
    # iteration copies the whole group each time (quadratic in group size).
    indexes_by_table_and_num_columns[index.table_name].setdefault(len(index.index_columns), []).append(index)

for table_name, indexes_by_num_columns in indexes_by_table_and_num_columns.items():
    print(f"Table: {table_name}")
    for num_columns, indexes in indexes_by_num_columns.items():
        print(f"Number of columns: {num_columns}, Number of indexes: {len(indexes)}")


Table: customer, Number of indexes: 2080
Table: orders, Number of indexes: 3609
Table: lineitem, Number of indexes: 47296
Table: part, Number of indexes: 3609
Table: supplier, Number of indexes: 1099
Table: partsupp, Number of indexes: 205
Table: nation, Number of indexes: 64
Table: region, Number of indexes: 15
Table: customer
Number of columns: 1, Number of indexes: 8
Number of columns: 2, Number of indexes: 56
Number of columns: 3, Number of indexes: 336
Number of columns: 4, Number of indexes: 1680
Table: orders
Number of columns: 1, Number of indexes: 9
Number of columns: 2, Number of indexes: 72
Number of columns: 3, Number of indexes: 504
Number of columns: 4, Number of indexes: 3024
Table: lineitem
Number of columns: 1, Number of indexes: 16
Number of columns: 2, Number of indexes: 240
Number of columns: 3, Number of indexes: 3360
Number of columns: 4, Number of indexes: 43680
Table: part
Number of columns: 1, Number of indexes: 9
Number of columns: 2, Number of indexes: 72
Num

In [7]:
# Train/test split.  For every table and every key-column count, a fixed
# number of randomly chosen indexes goes to the training set (8, 4, 2 and 1
# for 1, 2, 3 and 4 key columns); all remaining indexes form the test set.
split_idx = {1:8, 2:4, 3:2, 4:1}
train_indexes, test_indexes = [], []

for indexes_by_num_columns in indexes_by_table_and_num_columns.values():
    for num_columns, indexes in indexes_by_num_columns.items():
        # shuffles the grouped list in place
        random.shuffle(indexes)
        cut = split_idx[num_columns]
        train_indexes += indexes[:cut]
        test_indexes += indexes[cut:]

# mix the tables so neither set is ordered by table
random.shuffle(train_indexes)
random.shuffle(test_indexes)

print(f"Number of training indexes: {len(train_indexes)}, number of test indexes: {len(test_indexes)}")

Number of training indexes: 106, number of test indexes: 57871


In [8]:

# measure actual creation times of training indexes
connection = start_connection()
try:
    # start from a clean slate so existing indexes do not influence the run
    remove_all_nonclustered_indexes(connection)

    index_creation_cost = {}
    for index in tqdm(train_indexes, desc="Measuring index creation cost"):
        # create the index, record the value returned by the helper, then
        # drop the index again before the next measurement
        index_creation_cost[index.index_id] = create_nonclustered_index_object(connection, index)
        drop_noncluster_index_object(connection, index)
finally:
    # this loop runs for a long time; make sure a failure part-way through
    # does not leave the connection open
    close_connection(connection)


All 0 nonclustered indexes removed.


Measuring index creation cost:: 100%|██████████| 106/106 [1:03:49<00:00, 36.12s/it]


#### Online Ridge Regression Model

In [142]:
#from sklearn.preprocessing import StandardScaler

class Model:
    """Online ridge regression over per-index feature vectors.

    The model keeps ``V = lambda_reg * I + sum(x x^T)`` and ``b = sum(y * x)``
    and, after every update, sets ``theta`` to the solution of
    ``V theta = b``.  Predictions are ``theta . x`` clipped below at 0.

    Attributes:
        feature_vectors: Mapping of index id to 1-D feature vector.
        all_indices: Mapping of index id to index object (stored only).
        feature_dims: Length of the feature vectors.
        loss_history: Every loss value ever returned by ``compute_loss``.
    """

    def __init__(self, feature_vectors, all_indices, lambda_reg=0.1, epsilon=1e-8):
        # the dimensionality is read off the first stored feature vector
        sample_vector = list(feature_vectors.values())[0]
        dims = sample_vector.shape[0]

        self.feature_vectors = feature_vectors
        self.all_indices = all_indices
        self.feature_dims = dims
        self.lambda_reg = lambda_reg
        # stored but not used by any method of this class
        self.epsilon = epsilon

        self.V = lambda_reg * np.eye(dims)
        self.b = np.zeros(dims)
        self.theta = np.zeros(dims)
        self.loss_history = []

    def update(self, index, cost, verbose=False):
        """Fold one observed ``cost`` for ``index`` into the model.

        The reported prediction and loss are computed with the freshly
        re-solved ``theta``, i.e. after this sample has been absorbed.
        """
        features = self.feature_vectors[index.index_id]
        self.V += np.outer(features, features)
        self.b += cost * features
        self.theta = np.linalg.solve(self.V, self.b)

        loss, predicted = self.compute_loss(features, cost)
        if not verbose:
            return
        print(f"Update for index: {index.index_id}, actual cost: {cost}, predicted cost: {predicted:.3f}, loss incurred: {loss}")

    def predict(self, x):
        """Return the predicted cost for feature vector ``x``, never below 0."""
        raw = np.dot(self.theta, x)
        return raw if raw > 0 else 0

    def compute_loss(self, x, y):
        """Return ``(loss, prediction)`` for one sample.

        The loss is the squared error plus ``lambda_reg * ||theta||^2``.
        Every call also appends the loss to ``loss_history``.
        """
        prediction = self.predict(x)
        squared_error = (y - prediction)**2
        penalty = self.lambda_reg * np.dot(self.theta, self.theta)
        total = squared_error + penalty
        self.loss_history.append(total)
        return total, prediction
        

#### Train the model

In [150]:
# Fresh model over the precomputed feature vectors, regularisation lambda_reg=0.3.
model = Model(feature_vectors, all_indices, lambda_reg=0.3)

In [151]:
# Feed the previously measured creation costs into the model, one update per
# training index.
# NOTE(review): the loop below only reads cached costs; the connection is
# needed only if the commented-out measurement lines are re-enabled.
connection = start_connection()
try:
    #index_creation_cost = {}
    for index in tqdm(train_indexes, desc="Update step:"):
        #cost = create_nonclustered_index_object(connection, index)
        #index_creation_cost[index.index_id] = cost
        #drop_noncluster_index_object(connection, index)

        cost = index_creation_cost[index.index_id]
        model.update(index, cost, verbose=True)
finally:
    # release the connection even if an update raises (e.g. a missing cost)
    close_connection(connection)

Update step:: 100%|██████████| 106/106 [00:00<00:00, 27306.00it/s]

Update for index: IX_lineitem_l_tax_l_suppkey, actual cost: 181.741, predicted cost: 181.741, loss incurred: 8.585452097000329e-15
Update for index: IX_orders_o_clerk, actual cost: 50.787, predicted cost: 50.787, loss incurred: 0.0010132232991589445
Update for index: IX_nation_n_name, actual cost: 0.0, predicted cost: 0.000, loss incurred: 0.0010141193996218536
Update for index: IX_nation_n_comment, actual cost: 0.0, predicted cost: 0.000, loss incurred: 0.0010141490685663824
Update for index: IX_part_p_container, actual cost: 4.25, predicted cost: 4.173, loss incurred: 0.23569508312346144
Update for index: IX_lineitem_l_comment_l_returnflag, actual cost: 264.728, predicted cost: 264.728, loss incurred: 0.3677798989247556
Update for index: IX_supplier_s_phone, actual cost: 0.301, predicted cost: 0.514, loss incurred: 0.6206971608631267
Update for index: IX_orders_o_orderpriority_o_shippriority_o_orderstatus, actual cost: 48.401, predicted cost: 48.760, loss incurred: 2.6759669201412994




In [146]:
# save the trained model and index_creation_cost to pickle files
#with open('model.pkl', 'wb') as f:
#    pickle.dump(model, f)

#with open('index_creation_cost.pkl', 'wb') as f:
#    pickle.dump(index_creation_cost, f)


# Restore a previously saved model and its measured costs; this replaces the
# in-memory `model` and `index_creation_cost`.
# NOTE: unpickling can execute arbitrary code - only load files written by
# this notebook.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('index_creation_cost.pkl', 'rb') as cost_file:
    index_creation_cost = pickle.load(cost_file)

In [155]:
# evaluate the model on the training set
# NOTE: compute_loss also appends each loss to model.loss_history.
for index in train_indexes:
    x = model.feature_vectors[index.index_id]
    # single lookup of the measured cost (the duplicate, unused `cost` lookup
    # was removed)
    y = index_creation_cost[index.index_id]
    loss, y_pred = model.compute_loss(x, y)
    print(f"Index: {index.index_id}, actual cost: {y:.3f}, predicted cost: {y_pred:.3f}, loss: {loss:.3f}")
    

Index: IX_lineitem_l_tax_l_suppkey, actual cost: 181.741, predicted cost: 152.273, loss: 15325.171
Index: IX_orders_o_clerk, actual cost: 50.787, predicted cost: 56.277, loss: 14486.974
Index: IX_nation_n_name, actual cost: 0.000, predicted cost: 2.004, loss: 14460.848
Index: IX_nation_n_comment, actual cost: 0.000, predicted cost: 1.915, loss: 14460.501
Index: IX_part_p_container, actual cost: 4.250, predicted cost: 4.064, loss: 14456.867
Index: IX_lineitem_l_comment_l_returnflag, actual cost: 264.728, predicted cost: 258.509, loss: 14495.509
Index: IX_supplier_s_phone, actual cost: 0.301, predicted cost: 1.883, loss: 14459.335
Index: IX_orders_o_orderpriority_o_shippriority_o_orderstatus, actual cost: 48.401, predicted cost: 46.589, loss: 14460.114
Index: IX_supplier_s_suppkey_s_acctbal_s_nationkey, actual cost: 0.113, predicted cost: 0.000, loss: 14456.845
Index: IX_supplier_s_name, actual cost: 0.265, predicted cost: 1.953, loss: 14459.680
Index: IX_orders_o_orderdate_o_orderstatus

#### Make predictions for some test indexes.

In [137]:
# pick 3 random test indexes for each table
# Index ids are the keys of all_indices and therefore unique, so a set of ids
# gives an O(1) "was this index used for training?" test instead of scanning
# the training list for every candidate.
train_index_ids = {index.index_id for index in train_indexes}

test_indexes_sample = []
for table_name, indexes in indexes_by_table.items():
    # exclude indexes that were used in training
    indexes_filtered = [index for index in indexes if index.index_id not in train_index_ids]
    random.shuffle(indexes_filtered)
    test_indexes_sample.extend(indexes_filtered[:3])

for index in test_indexes_sample:
    print(f"Index: {index.index_id}, index columns: {index.index_columns}")

Index: IX_customer_c_phone_c_mktsegment_c_custkey_c_nationkey, index columns: ('c_phone', 'c_mktsegment', 'c_custkey', 'c_nationkey')
Index: IX_customer_c_mktsegment_c_name_c_acctbal_c_phone, index columns: ('c_mktsegment', 'c_name', 'c_acctbal', 'c_phone')
Index: IX_customer_c_custkey_c_address_c_comment_c_acctbal, index columns: ('c_custkey', 'c_address', 'c_comment', 'c_acctbal')
Index: IX_orders_o_shippriority_o_custkey_o_totalprice_o_clerk, index columns: ('o_shippriority', 'o_custkey', 'o_totalprice', 'o_clerk')
Index: IX_orders_o_totalprice_o_orderpriority_o_clerk_o_custkey, index columns: ('o_totalprice', 'o_orderpriority', 'o_clerk', 'o_custkey')
Index: IX_orders_o_comment_o_orderpriority_o_clerk_o_shippriority, index columns: ('o_comment', 'o_orderpriority', 'o_clerk', 'o_shippriority')
Index: IX_lineitem_l_shipmode_l_tax_l_returnflag_l_discount, index columns: ('l_shipmode', 'l_tax', 'l_returnflag', 'l_discount')
Index: IX_lineitem_l_discount_l_commitdate_l_receiptdate_l_lin

In [140]:
# measure the actual creation cost of the sampled test indexes, to compare
# against the model's predictions
connection = start_connection()
try:
    remove_all_nonclustered_indexes(connection)

    test_index_creation_cost = {}
    for index in tqdm(test_indexes_sample, desc="Measuring index creation cost:"):
        # create, record the value returned by the helper, drop again
        test_index_creation_cost[index.index_id] = create_nonclustered_index_object(connection, index)
        drop_noncluster_index_object(connection, index)
finally:
    # long-running loop: never leave the connection open on failure
    close_connection(connection)

All 0 nonclustered indexes removed.


Measuring index creation cost:: 100%|██████████| 24/24 [16:52<00:00, 42.21s/it] 


In [148]:
# save test index creation costs to pickle file
#with open('test_index_creation_cost.pkl', 'wb') as f:
#    pickle.dump(test_index_creation_cost, f)


In [154]:
# Score the model on the measured test indexes and collect the losses.
# (compute_loss also appends each value to model.loss_history.)
test_losses = []
for index_id, y in test_index_creation_cost.items():
    x = model.feature_vectors[index_id]
    loss, y_pred = model.compute_loss(x, y)
    test_losses.append(loss)
    print(f"Index: {index_id}, actual cost: {y:.3f}, predicted cost: {y_pred:.3f}, loss: {loss:.3f}")


Index: IX_lineitem_l_shipmode_l_tax_l_shipinstruct_l_linestatus, actual cost: 331.400, predicted cost: 287.372, loss: 16395.289
Index: IX_lineitem_l_linenumber_l_discount_l_returnflag_l_comment, actual cost: 269.460, predicted cost: 266.468, loss: 14465.783
Index: IX_lineitem_l_shipmode_l_linestatus_l_orderkey_l_shipinstruct, actual cost: 260.106, predicted cost: 255.374, loss: 14479.223
Index: IX_orders_o_orderstatus_o_orderdate_o_orderkey_o_comment, actual cost: 62.328, predicted cost: 62.741, loss: 14457.003
Index: IX_customer_c_acctbal_c_phone_c_comment_c_custkey, actual cost: 9.133, predicted cost: 9.045, loss: 14456.840
Index: IX_part_p_brand_p_retailprice_p_comment, actual cost: 8.959, predicted cost: 5.815, loss: 14466.717
Index: IX_part_p_partkey_p_retailprice_p_type_p_container, actual cost: 5.834, predicted cost: 4.054, loss: 14460.001
Index: IX_lineitem_l_shipmode_l_shipinstruct_l_partkey_l_quantity, actual cost: 326.493, predicted cost: 292.624, loss: 15603.966
Index: IX_l

In [153]:
# Continue the online training with the measured test indexes.
for index_id, cost in test_index_creation_cost.items():
    index = all_indices[index_id]
    model.update(index, cost, verbose=True)

Update for index: IX_lineitem_l_shipmode_l_tax_l_shipinstruct_l_linestatus, actual cost: 331.4, predicted cost: 298.664, loss incurred: 13874.36529263613
Update for index: IX_lineitem_l_linenumber_l_discount_l_returnflag_l_comment, actual cost: 269.46, predicted cost: 271.416, loss incurred: 12845.640091649868
Update for index: IX_lineitem_l_shipmode_l_linestatus_l_orderkey_l_shipinstruct, actual cost: 260.106, predicted cost: 260.901, loss incurred: 12844.227116464752
Update for index: IX_orders_o_orderstatus_o_orderdate_o_orderkey_o_comment, actual cost: 62.328, predicted cost: 66.021, loss incurred: 12896.087664158993
Update for index: IX_customer_c_acctbal_c_phone_c_comment_c_custkey, actual cost: 9.133, predicted cost: 9.438, loss incurred: 12882.909036535366
Update for index: IX_part_p_brand_p_retailprice_p_comment, actual cost: 8.959, predicted cost: 5.793, loss incurred: 12892.183017545238
Update for index: IX_part_p_partkey_p_retailprice_p_type_p_container, actual cost: 5.834,