#### Index Creation Cost Model -  Online Ridge Regression

In [2]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET
import itertools
import math
from collections import defaultdict
from tqdm import tqdm
import pickle
import numpy as np

%load_ext autoreload
%autoreload 2

import IPython
notebook_path = IPython.get_ipython().starting_dir
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
sys.path.append(target_subdirectory_path)
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Generate all possible indexes for each table.

In [3]:
"""
    Function for generating all possible configurations (i.e. subsets of indexes)
"""
def generate_all_configurations(connection, MAX_COLS=3, with_includes=False, verbose=False):
    """Enumerate candidate indexes for every table returned by ``get_all_tables``.

    For each table, every ordered permutation of 1..MAX_COLS columns is used as
    an index key. When ``with_includes`` is true, each key permutation is paired
    with every combination of ``MAX_COLS - len(key)`` of the remaining columns as
    include columns, so key + include columns always total ``MAX_COLS``.

    Args:
        connection: open database connection, passed through to the helpers.
        MAX_COLS: maximum number of columns per index (key + include columns).
        with_includes: also generate include-column variants.
        verbose: print the column list of each table while enumerating.

    Returns:
        ``(all_indices, tables)`` where ``all_indices`` maps index id -> ``Index``
        and ``tables`` is the mapping returned by ``get_all_tables``.

    Note:
        ``get_all_tables``, ``get_index_id``, ``get_estimated_index_size`` and
        ``Index`` are not defined in this notebook (presumably provided by the
        star-import from ``utils``).
    """
    tables = get_all_tables(connection)
    all_indices = {}
    for table_name, table in tqdm(tables.items(), desc="Generating all indices for table:"):
        columns = table.get_columns()
        if verbose:
            print(f"Table --> {table_name} with columns --> {columns}")
        # all ordered key-column permutations, up to MAX_COLS columns
        for num_columns in range(1, MAX_COLS+1):
            for cp in itertools.permutations(columns, num_columns):
                if with_includes:
                    # columns not used as key columns, kept in table order so the
                    # generated ids are deterministic across runs
                    remaining_columns = [col for col in columns if col not in cp]
                    for icp in itertools.combinations(remaining_columns, MAX_COLS - num_columns):
                        include_cols = list(icp)
                        # Identify and size the index by the include columns of THIS
                        # combination; using every remaining column collapsed all
                        # combinations onto a single id and over-estimated the size.
                        if include_cols:
                            index_id = get_index_id(cp, table_name, include_cols)
                        else:
                            # key already uses MAX_COLS columns: plain index, no includes
                            index_id = get_index_id(cp, table_name)
                        if index_id not in all_indices:
                            index_size = get_estimated_index_size(connection, table_name, list(cp) + include_cols)
                            all_indices[index_id] = Index(table_name, index_id, cp, index_size, tuple(icp))
                else:
                    index_id = get_index_id(cp, table_name)
                    if index_id not in all_indices:
                        index_size = get_estimated_index_size(connection, table_name, list(cp))
                        all_indices[index_id] = Index(table_name, index_id, cp, index_size)

    print(f"Total number of indices generated: {len(all_indices)}, total estimated size: {sum(i.size for i in all_indices.values()):.2f} Mb")

    return all_indices, tables


In [4]:
# Enumerate all candidate indexes and read the per-table column lists, then build
# the column-name <-> feature-position lookup tables used by the encoders below.
connection = start_connection()
try:
    all_indices, tables = generate_all_configurations(connection, MAX_COLS=4, with_includes=False)

    # get all columns
    all_columns, num_columns = get_all_columns(connection)
finally:
    # release the connection even if enumeration fails part-way
    close_connection(connection)

columns_to_idx = {}
for table_name, columns in all_columns.items():
    for column in columns:
        # setdefault keeps positions dense (0..len-1): a column name that occurs in
        # more than one table keeps its first position instead of leaving a gap that
        # would index past the end of a len(columns_to_idx)-sized vector
        columns_to_idx.setdefault(column, len(columns_to_idx))

idx_to_columns = {v: k for k, v in columns_to_idx.items()}

Generating all indices for table:: 100%|██████████| 8/8 [00:00<00:00, 39.68it/s]

Total number of indices generated: 57977, total estimated size: 125146572.31 Mb





#### Create feature vector for each index:

Each feature vector consists of two parts: 

    (1) Encoding of the index key columns and include columns
    (2) Table stats: column count, row_count, row_count * log(row_count), ratio of index size to table size, average table fragmentation 

In [24]:
def get_table_stats(connection, table_name):
    """Collect the table-level statistics used as regression features.

    Runs two queries: one against ``sys.dm_db_partition_stats`` for the summed
    row count and used size, one against ``sys.dm_db_index_physical_stats`` for
    the average fragmentation.

    Args:
        connection: open database connection exposing ``cursor()``.
        table_name: name of the table; interpolated into ``OBJECT_ID('...')``
            and used as key into the notebook-level ``all_columns`` mapping.

    Returns:
        ``(num_columns, row_count, table_size_mb, avg_fragmentation)``. Any
        aggregate that comes back as NULL is reported as 0 / 0.0 so callers never
        receive ``None``.

    Raises:
        KeyError: if ``table_name`` is not present in ``all_columns``.
    """
    cursor = connection.cursor()
    try:
        cursor.execute(f"""
                        SELECT SUM(row_count) as total_rows, SUM(used_page_count) * 8 / 1024.0 as size_mb
                        FROM sys.dm_db_partition_stats
                        WHERE object_id = OBJECT_ID('{table_name}')
                    """)
        row_count, table_size_mb = cursor.fetchone()

        cursor.execute(f"""
            SELECT AVG(avg_fragmentation_in_percent)
            FROM sys.dm_db_index_physical_stats(DB_ID(), OBJECT_ID('{table_name}'), NULL, NULL, 'LIMITED')
        """)
        avg_fragmentation = cursor.fetchone()[0]
    finally:
        # do not keep the cursor open once the two result rows have been read
        cursor.close()

    # SUM/AVG yield NULL when no rows match; normalise all three the same way
    row_count = int(row_count) if row_count is not None else 0
    table_size_mb = float(table_size_mb) if table_size_mb is not None else 0.0
    avg_fragmentation = float(avg_fragmentation) if avg_fragmentation is not None else 0.0

    num_columns = len(all_columns[table_name])
    return (num_columns, row_count, table_size_mb, avg_fragmentation)
    

def column_feature_encoding(index, c):
    """Encode the columns of ``index`` as a dense vector over ``columns_to_idx``.

    Key columns get a weight that decays with their position in the key:
    ``1/(c**0)`` for the leading column, ``1/(c**1)`` for the next, and so on.
    Include columns, if any, are one-hot encoded in a second vector of the same
    length that is appended to the key encoding.

    Args:
        index: object exposing ``index_columns`` and ``include_columns``.
        c: base of the positional decay.

    Returns:
        A vector of length ``len(columns_to_idx)`` when the index has no include
        columns, otherwise one of twice that length (key part, then include part).
    """
    n_slots = len(columns_to_idx)

    key_encoding = np.zeros(n_slots, dtype=float)
    for position, column_name in enumerate(index.index_columns):
        key_encoding[columns_to_idx[column_name]] = 1/(c**position)

    # no include columns: the key encoding alone is the result
    if len(index.include_columns) == 0:
        return key_encoding

    include_encoding = np.zeros(n_slots, dtype=float)
    for column_name in index.include_columns:
        include_encoding[columns_to_idx[column_name]] = 1

    return np.hstack((key_encoding, include_encoding))
    

def create_feature_vectors(all_indices, connection, c=10):
    """Build one feature vector per index in ``all_indices``.

    Each vector is the concatenation of five table-level features
    ``[row_count, row_count*log(row_count), num_columns,
    index_size/table_size, avg_fragmentation]`` followed by the column
    encoding produced by ``column_feature_encoding(index, c)``.

    Table statistics are fetched once per distinct table via
    ``get_table_stats``.

    Args:
        all_indices: mapping of index id -> index object (needs ``table_name``,
            ``index_id`` and ``size``).
        connection: open database connection, forwarded to ``get_table_stats``.
        c: positional decay base forwarded to ``column_feature_encoding``.

    Returns:
        dict mapping index id -> 1-D numpy array.
    """
    print("Obtain table stats...")
    table_stats = {}
    for index in all_indices.values():
        table_name = index.table_name
        if table_name not in table_stats:
            table_stats[table_name] = get_table_stats(connection, table_name)

    print("Creating feature vectors...")
    feature_vectors = {}
    for index in all_indices.values():
        num_columns, row_count, table_size_mb, avg_fragmentation = table_stats[index.table_name]

        # Empty (or unknown) tables: n*log(n) would be 0 * -inf = NaN and the size
        # ratio would divide by zero; both features are defined as 0 in that case so
        # the vector stays finite.
        row_count = row_count or 0
        n_log_n = row_count*np.log(row_count) if row_count > 0 else 0.0
        size_ratio = index.size/table_size_mb if table_size_mb else 0.0

        table_features = np.array([row_count, n_log_n, num_columns, size_ratio, avg_fragmentation])
        columns_encoding = column_feature_encoding(index, c)
        feature_vectors[index.index_id] = np.concatenate((table_features, columns_encoding))

    return feature_vectors


In [25]:
# Build the feature vectors for every candidate index (decay base c=2).
connection = start_connection()
try:
    feature_vectors = create_feature_vectors(all_indices, connection, c=2)
finally:
    # always release the connection, even if a stats query fails
    close_connection(connection)

Obtain table stats...
Creating feature vectors...


In [8]:
# group the indexes by table
indexes_by_table = defaultdict(list)
for index in all_indices.values():
    indexes_by_table[index.table_name].append(index)


# Create train/test splits for the indexes: after shuffling, the first `split_idx`
# indexes of each table go to training and the remainder to test.
# A dedicated seeded generator makes the split reproducible across kernel restarts
# (measured costs are looked up by index id later, so the training set must be the
# same every run) without touching the global `random` state.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

train_indexes = []
test_indexes = []
split_idx = 20
for table_name, indexes in indexes_by_table.items():
    split_rng.shuffle(indexes)
    train_indexes.extend(indexes[:split_idx])
    test_indexes.extend(indexes[split_idx:])

split_rng.shuffle(train_indexes)
split_rng.shuffle(test_indexes)

print(f"Number of training indexes: {len(train_indexes)}, number of test indexes: {len(test_indexes)}")

Number of training indexes: 155, number of test indexes: 57822


In [7]:
"""
# measure actual creation times of training indexes
connection = start_connection()

index_creation_cost = {}
for index in tqdm(train_indexes, desc="Measuring index creation cost:"):
    index_creation_cost[index.index_id] = create_nonclustered_index_object(connection, index)
    drop_noncluster_index_object(connection, index)

close_connection(connection)
"""

Measuring index creation cost::   0%|          | 0/155 [00:00<?, ?it/s]

Measuring index creation cost:: 100%|██████████| 155/155 [09:56<00:00,  3.85s/it]


#### Online Ridge Regression Model

In [26]:
#from sklearn.preprocessing import StandardScaler

class Model:
    """Ridge regression that is re-fitted after every observed (index, cost) pair.

    State kept between updates:
        V     -- ``lambda_reg * I`` plus the sum of outer products of all
                 feature vectors seen so far
        b     -- sum of ``cost * feature_vector`` over all observations
        theta -- current weight vector, the solution of
                 ``(V + epsilon * I) theta = b``

    NOTE: a StandardScaler-based feature normalisation existed only as
    commented-out code and is not applied.
    """

    def __init__(self, feature_vectors, all_indices, lambda_reg=0.1, epsilon=1e-8):
        """Store the inputs and initialise V, b and theta.

        The feature dimension is read from the first entry of ``feature_vectors``.
        """
        self.feature_vectors = feature_vectors
        self.all_indices = all_indices
        first_key = list(feature_vectors.keys())[0]
        self.feature_dims = feature_vectors[first_key].shape[0]
        dims = self.feature_dims
        self.V = lambda_reg * np.eye(dims)
        self.b = np.zeros(dims)
        self.theta = np.zeros(dims)
        self.lambda_reg = lambda_reg
        self.epsilon = epsilon
        self.loss_history = []

    def update(self, index, cost, verbose=False):
        """Fold one observation into V and b, re-solve for theta and log the loss.

        The loss is computed with the already updated theta, i.e. after the
        observation has been incorporated.
        """
        features = self.feature_vectors[index.index_id]
        self.V += np.outer(features, features)
        self.b += cost * features
        # a small epsilon on the diagonal keeps the system well conditioned
        conditioned = self.V + self.epsilon*np.eye(self.feature_dims)
        self.theta = np.linalg.solve(conditioned, self.b)
        loss, y_pred = self.compute_loss(features, cost)
        if verbose:
            print(f"Update for index: {index.index_id}, actual cost: {cost}, predicted cost: {y_pred:.3f}, loss incurred: {loss}")

    def predict(self, x):
        """Return the predicted cost ``theta . x`` for feature vector ``x``."""
        return np.dot(self.theta, x)

    def compute_loss(self, x, y):
        """Return ``(loss, prediction)`` for one sample and record the loss.

        The loss is the squared error plus ``lambda_reg * ||theta||^2``; every
        call appends it to ``loss_history``.
        """
        y_pred = self.predict(x)
        residual = y - y_pred
        penalty = self.lambda_reg * np.dot(self.theta, self.theta)
        loss = residual**2 + penalty
        self.loss_history.append(loss)
        return loss, y_pred
        

#### Train the model

In [28]:
# Fresh, untrained model over the precomputed feature vectors; lambda_reg=0.01
# overrides the constructor default of 0.1, epsilon keeps its default.
model = Model(feature_vectors, all_indices, lambda_reg=0.01)

In [29]:
# Replay the already measured creation costs through the model, one update per
# training index. `index_creation_cost` must already be populated (measured or
# loaded from its pickle). The connection is only needed by the commented-out
# live-measurement lines but is kept open so they can be re-enabled.
connection = start_connection()
try:
    #index_creation_cost = {}
    for index in tqdm(train_indexes, desc="Update step:"):
        #cost = create_nonclustered_index_object(connection, index)
        #index_creation_cost[index.index_id] = cost
        #drop_noncluster_index_object(connection, index)

        cost = index_creation_cost[index.index_id]
        model.update(index, cost, verbose=True)
finally:
    # release the connection even if a cost lookup or update fails
    close_connection(connection)

Update step:: 100%|██████████| 155/155 [00:00<00:00, 14151.75it/s]

Update for index: IX_nation_n_name_n_nationkey_n_regionkey_n_comment, actual cost: 0.003, predicted cost: 0.003, loss incurred: 1.3379563729456233e-10
Update for index: IX_nation_n_nationkey_n_name_n_comment, actual cost: 0.001, predicted cost: 0.001, loss incurred: 6.243372767953436e-08
Update for index: IX_nation_n_nationkey_n_regionkey_n_comment, actual cost: 0.001, predicted cost: 0.001, loss incurred: 6.67973479198802e-08
Update for index: IX_orders_o_orderpriority_o_shippriority_o_orderkey_o_totalprice, actual cost: 42.696, predicted cost: 42.696, loss incurred: 6.79269600415264e-08
Update for index: IX_part_p_size_p_partkey_p_brand_p_mfgr, actual cost: 2.39, predicted cost: 2.392, loss incurred: 0.006133436987082972
Update for index: IX_orders_o_orderpriority_o_orderstatus_o_orderkey, actual cost: 58.229, predicted cost: 57.986, loss incurred: 3.6265377348115875
Update for index: IX_orders_o_orderstatus_o_clerk_o_orderdate_o_shippriority, actual cost: 109.002, predicted cost: 10




In [33]:
# Inspect the fitted weight vector. Following create_feature_vectors, the first five
# entries weight the table-level features and the rest weight the column-encoding slots.
model.theta

array([-1.73468369e-01,  2.76803497e-06, -4.03269614e-03,  3.58760797e-01,
       -4.27611631e-01,  6.01552409e-01,  2.75417817e+00, -5.14255722e-01,
       -5.62274485e-01,  1.22893127e+00,  2.95039355e-01,  1.82161976e+00,
        1.89698809e+01,  1.57692618e+01, -1.58712901e+01,  2.70196853e+01,
       -5.22789252e+01,  1.94212716e+01,  4.72628925e+01,  3.95143512e+01,
       -1.66965996e+01,  5.04230850e+01, -4.70750477e+01,  1.08312105e+02,
        2.81590208e+00,  3.18922953e+01,  1.67195731e+02, -1.03062919e+02,
        1.58172554e+02, -6.29848832e+01, -9.59775068e+00,  2.39230342e+02,
       -1.37446283e+01,  6.16808706e+01,  5.12606256e+01, -7.88247494e+01,
       -2.15780475e+01,  2.90235997e+00,  8.55043119e-01,  1.48706116e+00,
        6.07300269e+00,  2.19323415e+00, -1.74145465e+00, -4.73164228e-01,
        9.83171746e-02,  1.28436359e+00,  6.76253058e-01,  7.34296486e-01,
        8.99537868e-01,  5.68492811e-01,  6.78309245e-01,  8.25319516e-01,
        6.33230493e-01,  

In [30]:
# save the trained model and index_creation_cost to pickle files
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('index_creation_cost.pkl', 'wb') as f:
    pickle.dump(index_creation_cost, f)


# load the trained model and index_creation_cost from pickle files
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
 
with open('index_creation_cost.pkl', 'rb') as f:
   index_creation_cost = pickle.load(f)    

In [36]:
# evaluate the model on the training set
for index in train_indexes:
    cost = index_creation_cost[index.index_id]
    x = model.feature_vectors[index.index_id]
    y = index_creation_cost[index.index_id]
    loss, y_pred = model.compute_loss(x, y)
    print(f"Index: {index.index_id}, actual cost: {y:.3f}, predicted cost: {y_pred:.3f}, loss: {loss:.3f}")
    

Index: IX_nation_n_name_n_nationkey_n_regionkey_n_comment, actual cost: 0.003, predicted cost: 0.013, loss: 1937.490
Index: IX_nation_n_nationkey_n_name_n_comment, actual cost: 0.001, predicted cost: 0.014, loss: 1937.490
Index: IX_nation_n_nationkey_n_regionkey_n_comment, actual cost: 0.001, predicted cost: 0.030, loss: 1937.491
Index: IX_orders_o_orderpriority_o_shippriority_o_orderkey_o_totalprice, actual cost: 42.696, predicted cost: 64.013, loss: 2391.895
Index: IX_part_p_size_p_partkey_p_brand_p_mfgr, actual cost: 2.390, predicted cost: 4.648, loss: 1942.590
Index: IX_orders_o_orderpriority_o_orderstatus_o_orderkey, actual cost: 58.229, predicted cost: 69.796, loss: 2071.281
Index: IX_orders_o_orderstatus_o_clerk_o_orderdate_o_shippriority, actual cost: 109.002, predicted cost: 108.452, loss: 1937.792
Index: IX_part_p_name_p_brand_p_type_p_container, actual cost: 6.547, predicted cost: 8.102, loss: 1939.907
Index: IX_lineitem_l_quantity_l_returnflag_l_receiptdate_l_extendedprice,

#### Make predictions for some test indexes.

In [62]:
# make predictions for some test indexes and compare with actual costs
connection = start_connection()

remove_all_nonclustered_indexes(connection)

test_indexes_sample = random.sample(test_indexes, 10)
test_index_creation_cost = {}
for index in tqdm(test_indexes_sample, desc="Measuring index creation cost:"):
    test_index_creation_cost[index.index_id] = create_nonclustered_index_object(connection, index)
    drop_noncluster_index_object(connection, index)

close_connection(connection)

All 0 nonclustered indexes removed.


Measuring index creation cost::  20%|██        | 2/10 [00:11<00:39,  5.00s/it]

In [None]:
# Score the sampled test indexes against their measured creation costs and keep
# the per-index losses (compute_loss also records them in model.loss_history).
test_losses = []
for index in test_indexes_sample:
    x, y = model.feature_vectors[index.index_id], test_index_creation_cost[index.index_id]
    loss, y_pred = model.compute_loss(x, y)
    test_losses += [loss]
    print(f"Index: {index.index_id}, actual cost: {y:.3f}, predicted cost: {y_pred:.3f}, loss: {loss:.3f}")


Index: IX_lineitem_l_comment_l_receiptdate_l_linenumber, actual cost: 274.219, predicted cost: 286.416, loss: 2086.250
Index: IX_lineitem_l_shipdate_l_shipinstruct_l_linestatus_l_comment, actual cost: 346.617, predicted cost: 305.457, loss: 3631.622
Index: IX_lineitem_l_commitdate_l_extendedprice_l_comment_l_shipinstruct, actual cost: 146.592, predicted cost: 149.150, loss: 1944.033
Index: IX_lineitem_l_orderkey_l_linestatus_l_returnflag_l_receiptdate, actual cost: 108.434, predicted cost: 141.336, loss: 3020.012
Index: IX_lineitem_l_returnflag_l_shipinstruct_l_comment_l_shipdate, actual cost: 663.585, predicted cost: 519.633, loss: 22659.682
Index: IX_lineitem_l_returnflag_l_commitdate_l_shipdate_l_linestatus, actual cost: 291.688, predicted cost: 408.001, loss: 15466.309
Index: IX_lineitem_l_tax_l_shipmode_l_orderkey_l_comment, actual cost: 195.396, predicted cost: 163.028, loss: 2985.174
Index: IX_lineitem_l_comment_l_receiptdate_l_orderkey_l_linestatus, actual cost: 218.527, predic

In [34]:
# Feed the measured costs of the sampled test indexes back into the model, one
# update per index; verbose=True prints the cost, the prediction and the loss.
for index in test_indexes_sample:
    cost = test_index_creation_cost[index.index_id]
    model.update(index, cost, verbose=True)  

Update for index: IX_lineitem_l_comment_l_receiptdate_l_linenumber, actual cost: 274.219, predicted cost: 269.908, loss incurred: 1525.446012703173
Update for index: IX_lineitem_l_shipdate_l_shipinstruct_l_linestatus_l_comment, actual cost: 346.617, predicted cost: 315.305, loss incurred: 2921.7615791977755
Update for index: IX_lineitem_l_commitdate_l_extendedprice_l_comment_l_shipinstruct, actual cost: 146.592, predicted cost: 152.170, loss incurred: 1973.3132510139199
Update for index: IX_lineitem_l_orderkey_l_linestatus_l_returnflag_l_receiptdate, actual cost: 108.434, predicted cost: 146.769, loss incurred: 3067.965257182248
Update for index: IX_lineitem_l_returnflag_l_shipinstruct_l_comment_l_shipdate, actual cost: 663.585, predicted cost: 591.249, loss incurred: 8147.090937752186
Update for index: IX_lineitem_l_returnflag_l_commitdate_l_shipdate_l_linestatus, actual cost: 291.688, predicted cost: 405.763, loss incurred: 14996.917072220216
Update for index: IX_lineitem_l_tax_l_shi