#### MAB Index Selection (v2)

In [29]:
%load_ext autoreload
%autoreload 2

import os, sys
import IPython
notebook_path = IPython.get_ipython().starting_dir
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'PostgreSQL'))
sys.path.append(target_subdirectory_path)

from pg_utils import *
from ssb_qgen_class import *

import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Load the SSB schema: `tables` maps each table name to its (column, type) pairs,
# `pk_columns` is indexed by table name and holds that table's primary-key columns.
tables, pk_columns = get_ssb_schema()
print(tables)
print(pk_columns)

{'lineorder': [('lo_orderkey', 'INT'), ('lo_linenumber', 'INT'), ('lo_custkey', 'INT'), ('lo_partkey', 'INT'), ('lo_suppkey', 'INT'), ('lo_orderdate', 'DATE'), ('lo_orderpriority', 'CHAR(15)'), ('lo_shippriority', 'CHAR(1)'), ('lo_quantity', 'INT'), ('lo_extendedprice', 'DECIMAL(18,2)'), ('lo_ordtotalprice', 'DECIMAL(18,2)'), ('lo_discount', 'DECIMAL(18,2)'), ('lo_revenue', 'DECIMAL(18,2)'), ('lo_supplycost', 'DECIMAL(18,2)'), ('lo_tax', 'INT'), ('lo_commitdate', 'DATE'), ('lo_shipmode', 'CHAR(10)')], 'part': [('p_partkey', 'INT'), ('p_name', 'VARCHAR(22)'), ('p_mfgr', 'CHAR(6)'), ('p_category', 'CHAR(7)'), ('p_brand', 'CHAR(9)'), ('p_color', 'VARCHAR(11)'), ('p_type', 'VARCHAR(25)'), ('p_size', 'INT'), ('p_container', 'CHAR(15)')], 'supplier': [('s_suppkey', 'INT'), ('s_name', 'CHAR(25)'), ('s_address', 'VARCHAR(25)'), ('s_city', 'CHAR(10)'), ('s_nation', 'CHAR(15)'), ('s_region', 'CHAR(12)'), ('s_phone', 'CHAR(20)')], 'customer': [('c_custkey', 'INT'), ('c_name', 'VARCHAR(25)'), ('c_

In [25]:
# get table size, row count and column info
def get_table_and_column_details():
    """Collect size statistics and column names for every table in the SSB schema.

    Returns:
        tuple: ``(tables, columns)``.
            ``tables`` maps each table name to the dict returned by
            ``get_table_size_and_row_count`` extended with a ``"pk_columns"`` entry.
            ``columns`` maps each table name to the list of its column names
            (the first element of every schema entry).
    """
    ssb_schema, pk_columns = get_ssb_schema()
    tables = {}
    columns = {}
    for table_name in ssb_schema:
        conn = create_connection()
        try:
            table_info = get_table_size_and_row_count(conn, table_name)
        finally:
            # release the connection even when the statistics query fails
            close_connection(conn)

        print(f"Table name : {table_name}, table info : {table_info}")
        tables[table_name] = {**table_info, "pk_columns": pk_columns[table_name]}
        columns[table_name] = [column_def[0] for column_def in ssb_schema[table_name]]

    return tables, columns

In [26]:
# Gather per-table statistics and column names.
# NOTE: this rebinds `tables` (previously the raw schema) to the per-table stats dict.
tables, columns = get_table_and_column_details()
print(tables)
print(columns)

Table name : lineorder, table info : {'size': 8969, 'row_count': 59986214}
Table name : part, table info : {'size': 112, 'row_count': 800000}
Table name : supplier, table info : {'size': 3, 'row_count': 20000}
Table name : customer, table info : {'size': 47, 'row_count': 300000}
Table name : dwdate, table info : {'size': 0, 'row_count': 2556}
{'lineorder': {'size': 8969, 'row_count': 59986214, 'pk_columns': ['lo_orderkey', 'lo_linenumber']}, 'part': {'size': 112, 'row_count': 800000, 'pk_columns': ['p_partkey']}, 'supplier': {'size': 3, 'row_count': 20000, 'pk_columns': ['s_suppkey']}, 'customer': {'size': 47, 'row_count': 300000, 'pk_columns': ['c_custkey']}, 'dwdate': {'size': 0, 'row_count': 2556, 'pk_columns': ['d_datekey']}}
{'lineorder': ['lo_orderkey', 'lo_linenumber', 'lo_custkey', 'lo_partkey', 'lo_suppkey', 'lo_orderdate', 'lo_orderpriority', 'lo_shippriority', 'lo_quantity', 'lo_extendedprice', 'lo_ordtotalprice', 'lo_discount', 'lo_revenue', 'lo_supplycost', 'lo_tax', 'lo_c

In [28]:


class MAB:

    def __init__(self, alpha=1.0, vlambda=0.5, alpha_decay_rate=1.0, config_memory_MB=128, qoi_memory=5):
        """Initialise the bandit state and reset the database to an index-free baseline.

        Construction has side effects on the database: all indexes removed by
        ``drop_all_indexes`` are dropped, and table/database statistics are queried.

        Args:
            alpha: UCB exploration parameter.
            vlambda: regularization parameter; scales the initial identity matrix ``V``.
            alpha_decay_rate: decay rate for ``alpha``.
            config_memory_MB: memory budget for storing indexes.
            qoi_memory: how many rounds back to look for queries of interest (QoIs).
        """
        # imported locally so the class does not rely on a star-import to provide it
        from collections import defaultdict

        # define Lin UCB parameters
        self.alpha = alpha     # UCB exploration parameter
        self.vlambda = vlambda # regularization parameter
        self.alpha_decay_rate = alpha_decay_rate  # decay rate for alpha
        self.config_memory_MB = config_memory_MB  # memory budget for storing indexes
        self.qoi_memory = qoi_memory  # how far back to look for queries of interest (QoIs)

        # get all columns
        self.tables, self.all_columns = get_table_and_column_details()
        self.num_columns = sum(len(columns) for columns in self.all_columns.values())

        # drop all non clustered indices
        conn = create_connection()
        try:
            drop_all_indexes(conn)
        finally:
            close_connection(conn)

        # get database size
        conn = create_connection()
        try:
            self.database_size = get_database_size(conn)
        finally:
            close_connection(conn)

        # context vector dims: 2 derived features + index-column segment + include-column segment
        # (generate_context_vectors stacks the derived features first)
        self.context_size = self.num_columns + self.num_columns + 2

        # create a mapping from column name to integer (position across all tables)
        all_column_names = (column for columns in self.all_columns.values() for column in columns)
        self.columns_to_idx = {column: i for i, column in enumerate(all_column_names)}
        self.idx_to_columns = {v: k for k, v in self.columns_to_idx.items()}

        # initialize matrix V and vector b
        self.V = np.identity(self.context_size) * self.vlambda
        self.b = np.zeros(shape=(self.context_size, 1))
        self.context_vectors = None
        self.upper_bounds = None
        self.index_selection_count = defaultdict(int)
        self.table_scan_times = defaultdict(list)

        # query store, keyed by query template id
        self.query_store = {}
        # track current round
        self.current_round = 0

        # create a cache for column context vectors
        self.column_context_cache = {}

        # cache for storing index stats
        self.index_average_reward = defaultdict(float)
        self.index_size = {}

        # keep copy of indexes selected in previous round
        self.selected_indexes_last_round = {}
        # legacy spelling of the attribute above; not read anywhere in this class,
        # kept only so external code that referenced it keeps working
        self.selected_indices_last_round = {}

        # constants
        self.MAX_INDEX_COLUMNS = 3
        self.MAX_INCLUDE_COLUMNS = 4
        self.SMALL_TABLE_IGNORE = 1000
        self.TABLE_MIN_SELECTIVITY = 0.5
        self.MAX_INDEXES_PER_TABLE = 3
        self.TABLE_SCAN_TIME_LENGTH = 1000
        self.INCLUDE_COLS = False

        
    # step through a round of the MAB
    def step_round(self, mini_workload, verbose=False):
        self.current_round += 1

        # identify new query templates from the mini workload and update stats    
        if verbose: print(f"Identifying new query templates from the mini workload...")
        self.identify_new_query_templates(mini_workload)

        # select queries of interest (QoIs) from past workload and use them to extract candidate indexes, i.e. bandit arms
        if verbose: print(f"Selecting QoIs and extracting candidate indexes...")
        QoIs = self.select_queries_of_interest(mini_workload)
        candidate_indexes = self.extract_candidate_indexes(QoIs)

        # generate context vectors for each candidate index
        if verbose: print(f"Generating context vectors...")
        self.context_vectors = self.generate_context_vectors(candidate_indexes)

        # select best configuration/super-arm based on C^2 LinUCB
        if verbose: print(f"Selecting best configuration...")
        selected_indexes = self.select_best_configuration(self.context_vectors, candidate_indexes)

        # materialize the selected indexes
        if verbose: print(f"Materializing selected indexes...")
        self.materialize_indexes(selected_indexes)

        # execute the mini workload and observe bandit arm rewards
        if verbose: print(f"Executing mini workload...")
        ...

        # update the LinUCB model parameters
        ...



    # identify new query templates from the mini workload and update stats
    def identify_new_query_templates(self, mini_workload):
        for query in mini_workload:
            if query.template_id not in self.query_store:
                # add to query store
                self.query_store[query.template_id] = query
                self.query_store[query.template_id].frequency = 1
                self.query_store[query.template_id].first_seen = self.current_round
            else:
                # update stats    
                self.query_store[query.template_id].frequency += 1
                self.query_store[query.template_id].query_string = query.query_string   # keep most recent query string
            
            self.query_store[query.template_id].last_seen = self.current_round


    # select queries of interest (QoIs) from past workload and use them to extract candidate indexes, i.e. bandit arms
    def select_queries_of_interest(self, mini_workload):
        # select queries of interest (QoIs) from past workload and use them to extract candidate indexes, i.e. bandit arms
        QoIs = []
        for query in self.query_store.values():
            # select queries that have been seen in the last qoi_memory rounds, excluding the current round
            if self.current_round - query.last_seen <= self.qoi_memory and query.last_seen != self.current_round:
                QoIs.append(query)

        return QoIs        


    # extract candidate indexes from QoIs
    def extract_candidate_indexes(self, QoIs):
        # extract candidate indexes from QoIs
        candidate_indexes = {}
        for query_object in QoIs:
            # extract indexes from the query
            indexes = extract_query_indexes(query_object,  self.MAX_INDEX_COLUMNS, self.INCLUDE_COLS)
            for index in indexes:
                if index not in candidate_indexes:
                    candidate_indexes[index.index_id] = index

        return candidate_indexes   


    # generate context vectors for each candidate index
    def generate_context_vectors(self, candidate_indexes):
        # generate column context
        column_context_vectors = self.generate_column_context(candidate_indexes)
        # generate derived context
        derived_context_vectors = self.generate_derived_context(candidate_indexes)
        # concatenate column and derived context vectors
        context_vectors = np.hstack((derived_context_vectors, column_context_vectors))

        return context_vectors     


    # generate column context vectors
    def generate_column_context(self, candidate_indexes):
        column_context_vectors = []
        for index in candidate_indexes.values():
            if index.index_id in self.column_context_cache:
                # check if column context is already cached
                column_context = self.column_context_cache[index.index_id]
            else:
                # create separate encoding segments for index columns and include columns    
                index_column_context_vector = np.zeros(len(columns_to_idx), dtype=float)
                include_column_context_vector = np.zeros(len(columns_to_idx), dtype=float)

                for position, column in enumerate(index.index_columns):
                    # encode index columns with exponentially decreasing weight based on position (since order matters)
                    index_column_context_vector[self.columns_to_idx[column]] = 1 / (10**position)

                for position, column in enumerate(index.include_columns):
                    # encode include columns with uniform weights (since order doesn't matter)
                    include_column_context_vector[self.columns_to_idx[column]] = 1

                # concatenate index columns and include columns
                column_context = np.hstack((index_column_context_vector, include_column_context_vector))    

                # cache the column context
                self.column_context_cache[index.index_id] = column_context

            column_context_vectors.append(column_context)
    
        column_context_vectors = np.vstack(column_context_vectors)

        return column_context_vectors


    # generate derived context vectors
    def generate_derived_context(self, candidate_indexes):
        # get hypothetical sizes of all new candidate indexes not in the cache
        new_indexes = [index for index in candidate_indexes.values() if index.index_id not in self.index_size]
        if new_indexes:
            conn = create_connection()
            self.index_size[index.index_id] = get_hypothetical_index_sizes(conn, new_indexes)
            close_connection(conn)    

        derived_context_vectors = []
        for index in candidate_indexes.values():
            derived_context = np.zeros(2, dtype=float)
            derived_context[0] = self.index_average_reward[index.index_id]

            if index.index_id not in self.selected_indexes_last_round:
                derived_context[1] = self.index_size[index.index_id] / self.database_size
            else:
                # if candidate index was selected in the last round, then set to 0   
                derived_context[1] = 0
            derived_context_vectors.append(derived_context)

        derived_context_vectors = np.vstack(derived_context_vectors)

        return derived_context_vectors


    # select best configuration/super-arm using C^2 LinUCB
    def select_best_configuration(self, context_vectors, candidate_indexes, creation_cost_reduction_factor=1):
        # compute linUCB parameters
        V_inv = np.linalg.inv(self.V)
        theta = V_inv @ self.b
        # rescale the parameter corresponding to second component of the derived context vector
        theta[1] = theta[1]/creation_cost_reduction_factor
        # compute expected rewards
        expected_rewards = (context_vectors @ theta).reshape(-1)
        # estimate upper confidence bound/variance
        variances = self.alpha * np.sqrt(np.diag(context_vectors @ V_inv @ context_vectors.T))
        # compute upper bounds
        upper_bounds = expected_rewards + variances
        # convert to dict
        upper_bounds = {index_id: upper_bound for index_id, upper_bound in zip(candidate_indexes.keys(), upper_bounds)}
        # solve 0-1 knapsack problem to select best configuration
        selected_indexes = self.solve_knapsack(upper_bounds, candidate_indexes)

        return selected_index


    # greedy 1/2 approximation algorithm for knapsack problem
    def solve_knapsack(self, upper_bounds, candidate_indexes):
        # compute estimated reward upper bound to index size ratio
        ratios = {index_id: upper_bound / self.index_size[index_id] for index_id, upper_bound in upper_bounds.items()}
        # sort indexes by decreasing order of ratio
        sorted_indexes = sorted(ratios, key=ratios.get, reverse=True)
        # select indexes greedily to fit within memory budget
        selected_indexes = []
        memory_used = 0
        for index_id in sorted_indexes:
            if memory_used + self.index_size[index_id] <= self.config_memory_MB:
                selected_indexes.append(candidate_indexes[index_id])
                memory_used += self.index_size[index_id]
            if memory_used >= self.config_memory_MB:
                break 

        return selected_indexes


    # materialize the selected indexes
    def materialize_indexes(self, selected_indexes):
        indexes_added = set(selected_indexes) - set(self.selected_indexes_last_round)
        indexes_dropped = set(self.selected_indexes_last_round) - set(selected_indexes)

        # drop indexes that are no longer selected
        conn = create_connection()
        drop_index(conn, indexes_dropped)
        close_connection(conn)

        # materialize the selected indexes
        conn = create_connection()
        bulk_create_indexes(conn, indexes_added)
        close_connection(conn)

        # update the cached index sizes with actual sizes
        for index in indexes_added:
            self.index_size[index.index_id] = index.size












