#### Online Index Selection using the Work Function Algorithm (WFA)

In [None]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET
import itertools
import math
from collections import defaultdict
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

# Make the project's helper module importable from this notebook.
import IPython
# Directory reported by the running IPython shell as its `starting_dir`
# (presumably the folder the notebook was launched from — confirm).
notebook_path = IPython.get_ipython().starting_dir
# Absolute path of the `database` directory located next to (i.e. under the
# parent of) the starting directory.
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
# Appended (not prepended), so entries already on sys.path take precedence.
sys.path.append(target_subdirectory_path)
# NOTE(review): `utils` is presumably resolved from the directory added above;
# the star import pulls every public name from it into this namespace.
from utils import *

#### First, we will define some helper functions for generating the list of all possible configurations subject to constraints (i.e. max memory usage, max number of columns per index), along with helpers for cost estimation (such as transition costs and query execution cost in a hypothetical configuration). We will also precompute estimates of all index creation costs.

In [None]:
class Index:
    """Plain record describing one candidate index on a table.

    The constructor stores its arguments unchanged (no copying, no
    validation):

    * ``table_name``      -- name of the table the index belongs to.
    * ``index_id``        -- identifier used to key the index in lookups.
    * ``index_columns``   -- the index's key columns.
    * ``size``            -- size figure supplied by the caller.
    * ``include_columns`` -- extra included columns; empty tuple by default.
    * ``value``           -- optional caller-defined payload; ``None`` by default.
    """

    def __init__(self, table_name, index_id, index_columns, size, include_columns=(), value=None):
        # Grouped assignments run left to right, so the attributes are still
        # created in the order: table_name, index_id, index_columns, size,
        # include_columns, value.
        self.table_name, self.index_id, self.index_columns = table_name, index_id, index_columns
        self.size, self.include_columns, self.value = size, include_columns, value

    def __str__(self):
        """Return ``Index(table, id, key columns, include columns, size, value)``."""
        # Note the display order: include_columns is shown before size.
        shown = (
            self.table_name,
            self.index_id,
            self.index_columns,
            self.include_columns,
            self.size,
            self.value,
        )
        # format(x) with no spec is what an f-string placeholder applies.
        body = ", ".join(format(part) for part in shown)
        return "Index(" + body + ")"


"""
    Function for generating all possible configurations (i.e. subsets of indexes) and also precomputing index creation cost estimates
"""
def generate_all_configurations(connection, MAX_SIZE=1024, MAX_KEY_COLS=3, MAX_INCLUDE_COLS=3, verbose=False):
    # first, generate all possible indices
    tables = get_all_tables(connection)
    all_indices = {} 
    # tqdm bar around table loop
    for table_name, table in tqdm(tables.items(), desc="Generating all indices for table:"):
        #print(f"Table --> {table}")
        columns = table.get_columns()
        #print(f"Columns --> {columns}")    
        if verbose:
            print(f"Table --> {table_name} with columns --> {columns}")
        # get all possible permutations of columns, up to MAX_KEY_COLS columns
        for num_columns in range(1, min(MAX_KEY_COLS, len(columns)+1)):
            col_permutations = list(itertools.permutations(columns, num_columns))
            # also generate permutations of columns with include columns
            for cp in col_permutations:
                # get columns not in cp
                include_columns = list(set(columns) - set(cp))
                # get all permutations of include columns up to MAX_INCLUDE_COLS columns
                include_col_permutations = list(itertools.permutations(include_columns, MAX_INCLUDE_COLS))
                for icp in include_col_permutations:
                    index_id = get_index_id(cp, table_name, include_columns)
                    if index_id not in all_indices:
                        index_size = get_estimated_index_size(connection, table_name, list(cp) + include_columns)
                        # create index object
                        all_indices[index_id] = Index(table_name, index_id, cp, index_size, tuple(icp))

    print(f"Total number of indices generated: {len(all_indices)}, total estimated size: {sum([i.size for i in all_indices.values()])} Mb")

    # now estimate the creation cost of each index (we do this by creating the index and then dropping it, which is potentially 
    # very expensive, but I don't know a more efficient way)
    index_creation_cost = {}
    for index_id, index in tqdm(all_indices.items(), desc="Estimating index creation cost:"):
        index_creation_cost[index_id] = create_nonclustered_index_object(connection, index)
        drop_noncluster_index_object(connection, index)


    # now generate all possible configurations within the MAX_SIZE limit
    all_configurations = []





