#### Generating index selection recommendations via the MS SQL Server Database Engine Tuning Advisor (DTA)

We will use DTA to generate index recommendations on some sample TPC-H OLAP workloads. 

Given a workload containing N queries, we will split it into m rounds. For simplicity, we split the queries evenly into disjoint subsets, although overlapping subsets are also possible.

* Experiment 1: On each round, we will execute the queries in that round and measure performance.

* Experiment 2: On each round, we will first use DTA to obtain recommendations, implement those recommendations (i.e. create/drop indices, etc.), then execute the queries in that round and measure performance. To generate the recommendations, we will use all queries seen so far, up to and including the queries in the current round.


In [2]:
import logging
import datetime
import os
import subprocess
import uuid
import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re


In [3]:
def read_sql_files(base_dir, instances_per_template):
    """Collect query texts from the ``.sql`` files under each sub-directory of *base_dir*.

    Sub-directories are visited in sorted name order, and so are the entries
    inside each of them. From every sub-directory at most
    *instances_per_template* ``.sql`` files are read; entries whose path does
    not end in ``.sql`` are skipped and do not count towards that limit.

    For each file, lines consisting solely of a ``where rownum <= N;`` clause
    are dropped, the first three remaining lines are discarded, and the rest is
    joined and stripped of surrounding whitespace.

    Returns:
        A list of query strings in the order they were read.
    """
    # Matches a whole line that is nothing but an erroneous "where rownum <= N;"
    rownum_line = re.compile(r'^\s*where\s+rownum\s*<=\s*\d+\s*;\s*$', re.IGNORECASE)
    collected = []

    for template_name in sorted(os.listdir(base_dir)):
        template_dir = os.path.join(base_dir, template_name)
        if not os.path.isdir(template_dir):
            continue  # Only directories hold query instances

        taken = 0  # Number of instances read from this template so far
        for entry in sorted(os.listdir(template_dir)):
            if taken >= instances_per_template:
                break  # Enough instances from this template

            candidate = os.path.join(template_dir, entry)
            if not candidate.endswith('.sql'):
                continue

            with open(candidate, 'r') as handle:
                kept = [text for text in handle.readlines() if rownum_line.match(text) is None]

            # The query body starts after the first three retained lines
            collected.append(''.join(kept[3:]).strip())
            taken += 1

    return collected


# Base directory containing the generated queries; read_sql_files scans each
# sub-directory of it for .sql files.
base_dir = '../TPCH_generated_queries'

# Read at most 20 .sql files per sub-directory and store the queries in a list
queries = read_sql_files(base_dir, instances_per_template=20)

# Report how many queries were loaded
print(len(queries))

440


#### Create a workload file with all the queries.

In [6]:
workload_filename = 'workload_tpch_20.sql'

# Dump every query into the workload file, each one followed by a blank line
with open(workload_filename, 'w') as workload_file:
    workload_file.writelines(sql + '\n\n' for sql in queries)
        

#### Define DTA recommender class.

In [None]:
# Connection settings for the SQL Server instance that holds the TPC-H data.
# Each value can be overridden through an environment variable; the fallbacks
# are the values this notebook has always used, so existing runs are unchanged.
# NOTE(review): the fallback password is stored here in plain text -- prefer
# setting TPCH_DB_PWD in the environment and removing the literal.
conn_str = (
    "Driver={ODBC Driver 17 for SQL Server};"
    f"Server={os.environ.get('TPCH_DB_SERVER', '172.16.6.196,1433')};"  # IP address and port, used directly
    f"Database={os.environ.get('TPCH_DB_NAME', 'TPCH1')};"
    f"UID={os.environ.get('TPCH_DB_UID', 'wsl')};"
    f"PWD={os.environ.get('TPCH_DB_PWD', 'greatpond501')};"
)

conn = pyodbc.connect(conn_str)
cursor = conn.cursor()
# Test the connection. execute() hands back the cursor itself, so the row has
# to be fetched to actually display the server version string.
print(cursor.execute("SELECT @@version;").fetchone()[0])

In [None]:
class DTA_recommender:
    def __init__(self, workload_filename, dta_script_path, dta_output_path, dta_log_path):
        self.
        
    def run_dta(self):
        # Run the DTA script
        subprocess.run(['python', self.dta_script_path, self.workload_filename, self.dta_output_path], check=True)
        
    def get_recommendations(self):
        # Read the recommendations from the DTA output file
        recommendations = []
        with open(self.dta_output_path, 'r') as file:
            lines = file.readlines()
            for line in lines:
                if line.startswith('Recommendation:'):
                    query_id = int(line