#### Generating index selection recommendations via MS SQL Database Tuning Adviser (DTA)

We will use DTA to generate index recommendations on some sample TPC-H OLAP workloads. 

Given a workload containing N queries, we will split it up into m rounds (for simplicity we will split up evenly into disjoint subsets, we can also do overlapping). 

* Experiment 1: On each round, we will execute the queries in that round and measure performance.

* Experiment 2: On each round, we will first use DTA to obtain recommendations, implement those recommendation (i.e. create/drop indices etc.), then execute the queries in that round and measure performance. To generate recommendations, we will use all queries that have been seen up to and including queries in the current round.  


In [1]:
import logging
import datetime
import os
import subprocess
import uuid

import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re


In [2]:
def read_sql_files(base_dir, instances_per_template):
    queries = []
    
    # Regex to find erroneous "where rownum <=" lines
    erroneous_line_pattern = re.compile(r'^\s*where\s+rownum\s*<=\s*\d+\s*;\s*$', re.IGNORECASE)

    # Loop through each query directory
    for query_dir in sorted(os.listdir(base_dir)):
        query_path = os.path.join(base_dir, query_dir)
        
        if os.path.isdir(query_path):
            # Initialize a counter for the number of instances read from this template
            instance_count = 0
            
            # Loop through each SQL file in the query directory
            for sql_file in sorted(os.listdir(query_path)):
                if instance_count >= instances_per_template:
                    break  # Stop reading more files from this template
                
                sql_file_path = os.path.join(query_path, sql_file)
                
                if sql_file_path.endswith('.sql'):
                    with open(sql_file_path, 'r') as file:
                        lines = file.readlines()
                        
                        # Filter out the erroneous "where rownum <=" lines
                        filtered_lines = [line for line in lines if not erroneous_line_pattern.match(line)]
                        
                        # Extract the query from the filtered lines
                        query = ''.join(filtered_lines[3:]).strip()
                        
                        queries.append(query)
                        instance_count += 1  # Increment the counter
    
    return queries


# Base directory containing the generated queries
base_dir = '../TPCH_generated_queries'

# Read the SQL files and store the queries in a list
queries = read_sql_files(base_dir, instances_per_template=20)

print(len(queries))

440


#### Create a workload file with all the queries.

In [6]:
workload_filename = 'workload_tpch_20.sql'

# Write the queries to file
with open(workload_filename, 'w') as file:
    for query in queries:
        file.write(query + '\n\n')
        

#### Define DTA recommender class.

In [3]:
conn_str = (
    "Driver={ODBC Driver 17 for SQL Server};"
    "Server=172.16.6.196,1433;"  # Use the IP address and port directly
    "Database=TPCH1;"  
    "UID=wsl;" 
    "PWD=greatpond501;"  
)

conn = pyodbc.connect(conn_str)
cursor = conn.cursor()
# test the connection
print(cursor.execute("SELECT @@version;"))

<pyodbc.Cursor object at 0x7f144f9c9530>


In [8]:
class DTA_recommender:
    def __init__(self, queries, invoke_ta_rounds, verbose=False):
        self.queries = queries # list of queries to be used as workload
        self.invoke_ta_rounds = invoke_ta_rounds # list specifying the rounds to invoke TA
        self.verbose = verbose
        self.conn_string = ("Driver={ODBC Driver 17 for SQL Server};"
                            "Server=172.16.6.196,1433;"  # Use the IP address and port directly
                            "Database=TPCH1;"  
                            "UID=wsl;" 
                            "PWD=greatpond501;")
        
        self.server = "172.16.6.196,1433"
        self.database = "TPCH1"
        self.username = "wsl"
        self.password = "greatpond501"

        
    def run_dta(self, num_rounds=1):

        # establish connection to the database
        self.conn = pyodbc.connect(self.conn_string)

        # reset workload file
        self.workload_file = "workload_tpch1.sql"
        open(self.workload_file, 'w').close()

        num_queries_per_round = len(self.queries) // num_rounds

        # iterate over rounds
        for i in range(num_rounds):
            current_round_queries = self.queries[i*num_queries_per_round:(i+1)*num_queries_per_round]
            # write the queries for current round to the workload file
            with open(self.workload_file, 'a') as file:
                for query in current_round_queries:
                    file.write(query + '\n\n')

            # invoke DTA if current round is in invoke_ta_rounds
            if i in self.invoke_ta_rounds:
                recommendation_cost_round, recommmendation_output_file = self.get_recommendations()      
                if os.path.isfile(recommmendation_output_file):
                    self.implement_recommendations(recommmendation_output_file)


        # close the connection
        self.conn.close()

        
    def get_recommendations(self):
        session_name = f"session_{uuid.uuid4()}"
        max_memory = 1024 # MB
        max_time = 60 # seconds
        recommendation_output_file = f"recommendations_{session_name}.sql"
        session_output_xml_file = f"session_output_{session_name}.xml"        
        dta_exe_path = '"/mnt/c/Program Files (x86)/Microsoft SQL Server Management Studio 20/Common7/DTA.exe"'
        dta_command = f'{dta_exe_path} -S 172.16.6.196 -U wsl -P greatpond501 -D {self.database} -d {self.database} ' \
                    f'-if "{self.workload_file}" -s {session_name} ' \
                    f'-of "{recommendation_output_file}" ' \
                    f'-ox "{session_output_xml_file}" ' \
                    f'-fa NCL_IDX -fp NONE -fk CL_IDX -B {max_memory} -A {max_time} -F'

        start_time = datetime.datetime.now()
        subprocess.run(dta_command, shell=True)
        end_time = datetime.datetime.now()
        time_elapsed = (end_time - start_time).total_seconds()
        
        if self.verbose:
            print(f"DTA took {time_elapsed} seconds. Recommendations:\n")
            # print the recommendations
            with open(recommendation_output_file, 'r') as file:
                recommendations = file.readlines()
                for recommendation in recommendations:
                    print(recommendation)
                  
        return time_elapsed, recommendation_output_file     


    def implement_recommendations(self, recommendation_output_file):
        with open(recommendation_output_file, 'r', encoding="utf-16") as file:
            query_lines = file.readlines()
        sql = ' '.join(query_lines)
        sql = sql.replace('go\n', ';')            


        #cursor = self.conn.cursor()
        #cursor.execute(sql_query)


In [9]:
# test dta
dta_recommender = DTA_recommender(queries, [0], verbose=True)
dta_recommender.run_dta(num_rounds=1)

Microsoft (R) SQL Server dta
Version 20.2.30.0
Copyright (c) Microsoft. All rights reserved.

Tuning session successfully created. Session ID is 1.

Time elapsed: 00:00:00            
Workload consumed:    0%, Estimated improvement:    0%                         
Total time used: 00:00:00                                


                                                                               
                                                                                
Tuning process finished.


The minimum storage space required for the existing physical design structures (PDS) you have selected to keep is larger than the storage space provided. Choose fewer PDS to keep, or set the storage space to be larger than 1173 MBs.


DTA took 18.633039 seconds. Recommendations:



FileNotFoundError: [Errno 2] No such file or directory: 'recommendations_session_d98079ec-9406-4724-8988-e05e64f33b96.sql'