#### Index Creation Cost Model -  Online Ridge Regression

In [13]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET
import itertools
import math
from collections import defaultdict
from tqdm import tqdm
import pickle
import numpy as np

%load_ext autoreload
%autoreload 2

import IPython
# Directory recorded by the running IPython shell as its starting directory.
notebook_path = IPython.get_ipython().starting_dir
# Absolute path of the 'database' directory located in the PARENT of the starting
# directory (os.path.dirname strips the last path component), i.e. a sibling folder.
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
# Make modules inside that directory importable from this notebook.
sys.path.append(target_subdirectory_path)
# NOTE(review): presumably `utils` lives in the directory added above and supplies the
# helpers used by later cells; the star import hides which names it provides — confirm.
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Generate all possible indexes for each table.

In [14]:
def generate_all_configurations(connection, MAX_COLS=2, with_includes=False, verbose=False):
    """
    Generate all candidate indexes for every table reachable through `connection`.

    For each table, every ordered permutation of 1..MAX_COLS columns becomes a
    candidate index key. When `with_includes` is True, each key is additionally
    paired with every combination of (MAX_COLS - number of key columns) of the
    remaining columns as include columns, so key + include columns always
    total MAX_COLS.

    Args:
        connection: open database connection; passed to `get_all_tables` and
            `get_estimated_index_size`.
        MAX_COLS: maximum number of key columns per index.
        with_includes: if True, also generate include-column variants.
        verbose: if True, print each table name with its columns.

    Returns:
        (all_indices, tables): a dict mapping index id -> Index object, and the
        table mapping returned by `get_all_tables`.
    """
    # first, generate all possible indices
    tables = get_all_tables(connection)
    all_indices = {}

    for table_name, table in tqdm(tables.items(), desc="Generating all indices for table:"):
        columns = table.get_columns()
        if verbose:
            print(f"Table --> {table_name} with columns --> {columns}")

        # all ordered selections of key columns, up to MAX_COLS columns
        for num_columns in range(1, MAX_COLS + 1):
            for cp in itertools.permutations(columns, num_columns):
                if with_includes:
                    # Columns not used in the key, kept in table order so that the
                    # generated combinations are deterministic between runs.
                    remaining_columns = [col for col in columns if col not in cp]
                    for icp in itertools.combinations(remaining_columns, MAX_COLS - num_columns):
                        # The id and the size estimate must describe THIS include
                        # combination; using all remaining columns would give every
                        # combination the same id and silently drop all but the first.
                        chosen_includes = list(icp)
                        index_id = get_index_id(cp, table_name, chosen_includes)
                        if index_id not in all_indices:
                            index_size = get_estimated_index_size(
                                connection, table_name, list(cp) + chosen_includes
                            )
                            all_indices[index_id] = Index(table_name, index_id, cp, index_size, tuple(icp))
                else:
                    index_id = get_index_id(cp, table_name)
                    if index_id not in all_indices:
                        index_size = get_estimated_index_size(connection, table_name, list(cp))
                        all_indices[index_id] = Index(table_name, index_id, cp, index_size)

    total_size = sum(candidate.size for candidate in all_indices.values())
    print(f"Total number of indices generated: {len(all_indices)}, total estimated size: {total_size:.2f} Mb")

    return all_indices, tables


In [15]:
connection = start_connection()
try:
    all_indices, tables = generate_all_configurations(connection, MAX_COLS=3, with_includes=False)

    # get all columns
    all_columns, num_columns = get_all_columns(connection)
finally:
    # release the connection even if index generation or the column lookup fails
    close_connection(connection)

# Give every column name a dense position 0..N-1 in the column-encoding vector.
# Using the current dict size as the next position guarantees that every stored
# position is < len(columns_to_idx), even if a column name shows up in more than
# one table (a running counter would leave gaps in that case).
columns_to_idx = {}
for table_name, columns in all_columns.items():
    for column in columns:
        columns_to_idx.setdefault(column, len(columns_to_idx))

# reverse lookup: vector position -> column name
idx_to_columns = {v: k for k, v in columns_to_idx.items()}

Generating all indices for table:: 100%|██████████| 8/8 [00:00<00:00, 504.51it/s]

Total number of indices generated: 5585, total estimated size: 824332.61 Mb





#### Create feature vector for each index.

In [19]:
# Inspect a single entry of the table metadata: print the 'customer' table object.
print(tables['customer'])

Table: customer, Row Count: 150000, PK Columns: ['c_custkey']


In [31]:
def get_table_stats(connection, table_name):
    """
    Collect size and fragmentation statistics for a single table.

    Args:
        connection: open database connection exposing `cursor()`.
        table_name: name of the table. It is interpolated directly into the SQL
            text, so it must come from trusted metadata, never from user input.

    Returns:
        Tuple (num_columns, row_count, table_size_mb, avg_fragmentation).
        `num_columns` is read from the notebook-level `all_columns` mapping, which
        must already be defined. Statistics the server reports as NULL are
        returned as 0 (row_count) or 0.0 (size, fragmentation).
    """
    cursor = connection.cursor()
    try:
        cursor.execute(f"""
                        SELECT SUM(row_count) as total_rows, SUM(used_page_count) * 8 / 1024.0 as size_mb
                        FROM sys.dm_db_partition_stats
                        WHERE object_id = OBJECT_ID('{table_name}')
                    """)
        row_count, table_size_mb = cursor.fetchone()
        # SUM over zero rows yields NULL; normalise both values the same way
        row_count = int(row_count) if row_count is not None else 0
        table_size_mb = float(table_size_mb) if table_size_mb is not None else 0.0

        cursor.execute(f"""
            SELECT AVG(avg_fragmentation_in_percent)
            FROM sys.dm_db_index_physical_stats(DB_ID(), OBJECT_ID('{table_name}'), NULL, NULL, 'LIMITED')
        """)
        avg_fragmentation = cursor.fetchone()[0]
        avg_fragmentation = float(avg_fragmentation) if avg_fragmentation is not None else 0.0
    finally:
        # do not leave the cursor open, whether or not a query failed
        cursor.close()

    num_columns = len(all_columns[table_name])
    return (num_columns, row_count, table_size_mb, avg_fragmentation)
    

def column_feature_encoding(index, c=10):
    """
    Encode the columns of an index as a numeric vector.

    Each key column gets the weight 1 / c**position (position 0 for the leading
    key column) at its slot from the notebook-level `columns_to_idx` mapping.
    Include columns, if any, are encoded as 1 in a second vector of the same
    width that is appended to the key-column vector.

    Args:
        index: object exposing `index_columns` and `include_columns`.
        c: base of the positional decay applied to key columns.

    Returns:
        1-D float array. Its length is len(columns_to_idx) when the index has no
        include columns and 2 * len(columns_to_idx) when it has some.
    """
    width = len(columns_to_idx)

    # key columns: weight shrinks by a factor of c per position in the key
    key_vector = np.zeros(width, dtype=float)
    for position, name in enumerate(index.index_columns):
        key_vector[columns_to_idx[name]] = 1 / (c ** position)

    # nothing to append when there are no include columns
    if len(index.include_columns) == 0:
        return key_vector

    # include columns: plain membership flags
    include_vector = np.zeros(width, dtype=float)
    for name in index.include_columns:
        include_vector[columns_to_idx[name]] = 1

    return np.hstack((key_vector, include_vector))
    


def create_feature_vectors(all_indices, connection):
    """
    Build one feature vector per candidate index.

    Table statistics are fetched once per distinct table via `get_table_stats`.
    Each vector is laid out as
    [num_columns, row_count, table_size_mb / index.size, avg_fragmentation]
    followed by the output of `column_feature_encoding(index)`.

    Args:
        all_indices: mapping of index id -> index object exposing `table_name`,
            `size` and `index_id`. `size` is used as a divisor, so it must be non-zero.
        connection: open database connection used for the statistics queries.

    Returns:
        dict mapping index_id -> 1-D numpy feature vector.
    """
    print("Obtain table stats...")
    # distinct table names in first-seen order, so each table is queried exactly once
    distinct_tables = dict.fromkeys(candidate.table_name for candidate in all_indices.values())
    table_stats = {name: get_table_stats(connection, name) for name in distinct_tables}

    print("Creating feature vectors...")
    feature_vectors = {}
    for candidate in all_indices.values():
        n_cols, n_rows, table_mb, fragmentation = table_stats[candidate.table_name]
        # table size relative to the estimated index size
        size_ratio = table_mb / candidate.size
        table_part = np.array([n_cols, n_rows, size_ratio, fragmentation])
        column_part = column_feature_encoding(candidate)
        feature_vectors[candidate.index_id] = np.concatenate((table_part, column_part))

    return feature_vectors



In [32]:
connection = start_connection()
try:
    feature_vectors = create_feature_vectors(all_indices, connection)
finally:
    # always release the connection, even if feature extraction raises
    close_connection(connection)

Obtain table stats...
Creating feature vectors...
