#### Generating index selection recommendations via the MS SQL Server Database Engine Tuning Advisor (DTA)

We will use DTA to generate index recommendations on some sample TPC-H OLAP workloads. 

Given a workload containing N queries, we will split it into m rounds. For simplicity, the rounds are equally sized, disjoint subsets of the workload; overlapping rounds are also possible.

* Experiment 1: On each round, we will execute the queries in that round and measure performance.

* Experiment 2: On each round, we will first use DTA to obtain recommendations, implement those recommendations (i.e. create/drop indexes etc.), then execute the queries in that round and measure performance. To generate recommendations, we will use all queries that have been seen up to and including the queries in the current round.


In [1]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import os
import random
import pandas as pd
import time
from tqdm import tqdm
import logging
import re
import json
import xml.etree.ElementTree as ET

%load_ext autoreload
%autoreload 2

import IPython
# Directory that IPython reports as its starting directory for this kernel
notebook_path = IPython.get_ipython().starting_dir
# Absolute path of the `database` directory that sits next to the starting directory
target_subdirectory_path = os.path.abspath(os.path.join(os.path.dirname(notebook_path), 'database'))
# Make modules in that directory importable from this notebook
sys.path.append(target_subdirectory_path)
# NOTE(review): the wildcard import hides which helpers come from `utils`
# (presumably start_connection, execute_query, close_connection, ... — confirm
# against utils.py) and should eventually become an explicit import list.
from utils import *

In [2]:
# read workload queries from JSON file
def read_workload(workload_filepath):
    """Load a workload stored as JSON Lines (one JSON document per line).

    Args:
        workload_filepath: Path of the file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    workload = []
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(workload_filepath, encoding='utf-8') as f:
        for line in f:
            # Blank/whitespace-only lines (e.g. a trailing newline or a spacer
            # between records) carry no record and would make json.loads raise.
            if not line.strip():
                continue
            workload.append(json.loads(line))

    return workload

# Path of the JSON-lines file holding the generated TPC-H workload queries
workload_filepath = '../datagen/TPCH_workloads/TPCH_static_100_workload.json'

# Read the workload queries from file and report how many records were loaded
workload = read_workload(workload_filepath)
print(len(workload))

2100


#### Define DTA recommender class.

In [3]:
# test - execute a single query from the workload
query = workload[0]['query_string']
connection = start_connection()
try:
    execute_query(query, connection, cost_type='elapsed_time', verbose=True)
finally:
    # release the connection even if the query fails
    close_connection(connection)

QUERY: 
select
	l_returnflag,
	l_linestatus,
	sum(l_quantity) as sum_qty,
	sum(l_extendedprice) as sum_base_price,
	sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
	sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
	avg(l_quantity) as avg_qty,
	avg(l_extendedprice) as avg_price,
	avg(l_discount) as avg_disc,
	count(*) as count_order
from
	lineitem
where
	l_shipdate <= DATEADD(dd, -84, CAST('1998-12-01' AS date))
group by
	l_returnflag,
	l_linestatus
order by
	l_returnflag,
	l_linestatus
;

ELAPSED TIME: 
2.014

CPU TIME: 
11727.0

SUBTREE COST: 
78.4947

NON CLUSTERED INDEX USAGE: 
[]

CLUSTERED INDEX USAGE: 
[('lineitem', 1.797, 11727.0, 78.4947, 6001215, 5924530.0)]



In [4]:
class DTA_recommender:
    """Run a query workload in rounds, optionally tuning indexes with DTA between rounds.

    The workload is cut into equally sized, consecutive rounds. For each round the
    queries are (optionally) appended to a DTA workload file, DTA is invoked on the
    rounds listed in ``invoke_ta_rounds``, its recommendations are applied, and
    finally the round's queries are executed and timed.
    """

    def __init__(self, queries, invoke_ta_rounds, verbose=False):
        """
        Args:
            queries: List of query records; each must provide a 'query_string' key.
            invoke_ta_rounds: Zero-based round indices on which DTA is invoked.
            verbose: Forwarded to the database helpers and enables extra prints.
        """
        self.queries = queries # list of queries to be used as workload
        self.invoke_ta_rounds = invoke_ta_rounds # list specifying the rounds to invoke TA
        self.verbose = verbose
        # NOTE(review): credentials are hard-coded in the notebook; they should be
        # loaded from the environment or a secrets store before this is shared.
        self.server="172.16.6.196,1433"
        self.database="TPCH1"
        self.username="wsl"
        self.password="greatpond501"


    def run_dta(self, num_rounds=1, invoke_DTA=True, clear_indexes_start=False, clear_indexes_end=True):
        """Execute the workload in ``num_rounds`` rounds, tuning with DTA when requested.

        Each round holds ``len(self.queries) // num_rounds`` queries; queries left
        over by that integer division are not executed. Nothing is run when
        ``num_rounds`` is not positive.

        Args:
            num_rounds: Number of rounds to split the workload into.
            invoke_DTA: When True, queries are accumulated in the workload file and
                DTA is invoked on the rounds listed in ``self.invoke_ta_rounds``.
            clear_indexes_start: Remove all non-clustered indexes before round 1.
            clear_indexes_end: Remove all non-clustered indexes after the last round.
        """
        # establish connection to the database
        self.conn = start_connection()
        try:
            # clear all non-clustered indexes at the start
            if clear_indexes_start:
                remove_all_nonclustered_indexes(self.conn, self.verbose)

            if num_rounds > 0:

                # reset workload file
                self.workload_file = "workload_tpch1.sql"
                open(self.workload_file, 'w').close()

                num_queries_per_round = len(self.queries) // num_rounds

                # number of queries currently sitting in the workload file
                counter = 0
                for i in range(num_rounds):
                    print(f"Round {i+1} of {num_rounds}")
                    current_round_queries = self.queries[i*num_queries_per_round:(i+1)*num_queries_per_round]

                    if invoke_DTA:
                        counter += self._append_to_workload_file(current_round_queries)
                        print(f"{counter} queries written on workload file")

                        # invoke DTA if current round is in invoke_ta_rounds
                        if i in self.invoke_ta_rounds:
                            _, recommendation_output_file = self.get_recommendations()
                            # DTA only produces the script when it has something to
                            # recommend; otherwise keep accumulating queries.
                            if os.path.isfile(recommendation_output_file):
                                self.implement_recommendations(recommendation_output_file)
                                # reset the workload file
                                open(self.workload_file, 'w').close()
                                counter = 0

                    # now execute the workload for current round
                    self.execute_workload(current_round_queries)

                # clear all indexes
                if clear_indexes_end:
                    remove_all_nonclustered_indexes(self.conn, self.verbose)
                # clear out all the recommendations and session files from directory
                for file in os.listdir():
                    if file.startswith("recommendations") or file.startswith("session_output"):
                        os.remove(file)
        finally:
            # close the connection even when a round fails part-way through
            close_connection(self.conn)


    def _append_to_workload_file(self, queries):
        """Append the given queries to the DTA workload file and return how many were written."""
        written = 0
        with open(self.workload_file, 'a+') as file:
            for query in queries:
                query_string = query['query_string']
                # exclude queries with "view" in them, otherwise DTA will throw a syntax error
                if "view" not in query_string.lower():
                    file.write(query_string)
                    file.write('\n\n\n')
                    written += 1
        return written


    def get_recommendations(self):
        """Invoke DTA.exe on the current workload file.

        Returns:
            A ``(elapsed_seconds, recommendation_output_file)`` tuple. The file is
            the path DTA was asked to write its SQL script to; callers must check
            that it exists, since the DTA exit status is not inspected here.
        """
        session_name = f"session_{uuid.uuid4()}"
        max_storage_mb = 4*1024 # passed to -B: space budget (MB) for the recommendation
        max_time = 1 # passed to -A: tuning time limit in minutes
        recommendation_output_file = f"recommendations_{session_name}.sql"
        session_output_xml_file = f"session_output_{session_name}.xml"
        dta_exe_path = '"/mnt/c/Program Files (x86)/Microsoft SQL Server Management Studio 20/Common7/DTA.exe"'
        # self.server carries a ",port" suffix; the DTA command has always been
        # given the bare host, so keep passing only that part.
        host = self.server.split(',')[0]
        # NOTE(review): the password is passed on the command line (visible in the
        # process list) and the command is run through the shell unquoted.
        dta_command = f'{dta_exe_path} -S {host} -U {self.username} -P {self.password} -D {self.database} -d {self.database} ' \
                    f'-if "{self.workload_file}" -s {session_name} ' \
                    f'-of "{recommendation_output_file}" ' \
                    f'-ox "{session_output_xml_file}" ' \
                    f'-fa NCL_IDX -fp NONE -fk CL_IDX -B {max_storage_mb} -A {max_time} -F'

        start_time = datetime.datetime.now()
        subprocess.run(dta_command, shell=True)
        end_time = datetime.datetime.now()
        time_elapsed = (end_time - start_time).total_seconds()

        print(f"DTA recommendation time --> {time_elapsed} seconds.")

        return time_elapsed, recommendation_output_file


    def implement_recommendations(self, recommendation_output_file):
        """Apply the create/drop index statements found in a DTA recommendation script.

        Args:
            recommendation_output_file: Path of the UTF-16 SQL script written by DTA.

        Returns:
            The summed value returned by ``create_nonclustered_index_query`` for every
            created index (reported as seconds), or 0 if the file cannot be read.
        """
        if self.verbose: print("Implementing recommendations...")
        try:
            with open(recommendation_output_file, 'r', encoding="utf-16") as file:
                sql = ' '.join(file.readlines())
        except Exception as e:
            print(f"Error reading recommendations file: {e}")
            return 0

        # treat each `go` batch separator as a statement terminator, then split
        recommendation_queries = sql.replace('go\n', ';').split(';')

        total_index_creation_cost = 0
        # the first statement is skipped (presumably the `use <database>` preamble
        # of the generated script — confirm against a real DTA output file)
        for query in recommendation_queries[1:]:
            if query.isspace():
                continue
            statement = query.lower()
            if "create nonclustered index" in statement:
                total_index_creation_cost += create_nonclustered_index_query(query, self.conn, verbose=self.verbose)
            elif "drop index" in statement:
                drop_nonclustered_index(self.conn, query=query, verbose=self.verbose)

        print("Implemented recommendations.")
        print(f"Total index creation time --> {total_index_creation_cost} seconds. Total size of configuration --> {get_current_pds_size(self.conn)} MB")

        return total_index_creation_cost


    def execute_workload(self, workload):
        """Execute every query in ``workload`` and return the summed cost.

        Args:
            workload: Iterable of query records providing a 'query_string' key.

        Returns:
            The sum of the first element of each ``execute_query`` result
            (reported as elapsed seconds).
        """
        if self.verbose:
            print(f"Executing workload of {len(workload)} queries")
        total_elapsed_time = 0
        # execute the workload
        for query in workload:
            # execute_query also returns index usage details, which are not needed here
            cost, _index_seeks, _clustered_index_scans = execute_query(query['query_string'], self.conn)
            total_elapsed_time += cost
        print(f"Current round workload execution time --> {total_elapsed_time} seconds.")

        return total_elapsed_time



In [5]:
# test dta: first 105 queries over 5 rounds, invoking DTA on rounds with index 0 and 2
dta_recommender = DTA_recommender(
    queries=workload[:105],
    invoke_ta_rounds=[0, 2],
    verbose=False,
)
dta_recommender.run_dta(
    num_rounds=5,
    invoke_DTA=True,
    clear_indexes_start=True,
    clear_indexes_end=True,
)

All non-clustered indexes --> []
Round 1 of 5
21 queries written on workload file
Microsoft (R) SQL Server dta
Version 20.2.30.0
Copyright (c) Microsoft. All rights reserved.

Tuning session successfully created. Session ID is 55.

Time elapsed: 00:00:10            
Workload consumed:  100%, Estimated improvement:    0%                         
Time elapsed: 00:00:40            
Workload consumed:  100%, Estimated improvement:    0%                         
Time elapsed: 00:00:53            
Workload consumed:  100%, Estimated improvement:   75%                         
Time elapsed: 00:01:02            
Workload consumed:  100%, Estimated improvement:   79%                         
Total time used: 00:01:02                                


                                                                               
                                                                                
All events in the workload were not analysed. Check tuning log for more information
Tun

In [36]:
# baseline: same rounds without invoking DTA, the query execution times should be higher
dta_recommender.run_dta(
    num_rounds=5,
    invoke_DTA=False,
    clear_indexes_start=True,
)

All non-clustered indexes --> []
All nonclustered indexes removed.
Round 1 of 5
Executing workload of 21 queries
Current round workload execution time --> 24.636 seconds.
Round 2 of 5
Executing workload of 21 queries
Current round workload execution time --> 26.837000000000007 seconds.
Round 3 of 5
Executing workload of 21 queries
Current round workload execution time --> 25.334 seconds.
Round 4 of 5
Executing workload of 21 queries
Current round workload execution time --> 25.929999999999996 seconds.
Round 5 of 5
Executing workload of 21 queries
Current round workload execution time --> 25.415 seconds.
All non-clustered indexes --> []
All nonclustered indexes removed.
