#### Measuring No-Index Query Execution Time

For the TPCH-1GB database, we will now run batches of queries, i.e. small workloads, and measure the query execution time per workload. 


In [1]:
import pyodbc
import sys
import random
import pandas as pd
import time
import os
from tqdm import tqdm
import logging
import re



#### Read all the generated sample queries and store them in a list.

In [2]:
def read_sql_files(base_dir, instances_per_template):
    queries = []
    
    # Regex to find erroneous "where rownum <=" lines
    erroneous_line_pattern = re.compile(r'^\s*where\s+rownum\s*<=\s*\d+\s*;\s*$', re.IGNORECASE)

    # Loop through each query directory
    for query_dir in sorted(os.listdir(base_dir)):
        query_path = os.path.join(base_dir, query_dir)
        
        if os.path.isdir(query_path):
            # Initialize a counter for the number of instances read from this template
            instance_count = 0
            
            # Loop through each SQL file in the query directory
            for sql_file in sorted(os.listdir(query_path)):
                if instance_count >= instances_per_template:
                    break  # Stop reading more files from this template
                
                sql_file_path = os.path.join(query_path, sql_file)
                
                if sql_file_path.endswith('.sql'):
                    with open(sql_file_path, 'r') as file:
                        lines = file.readlines()
                        
                        # Filter out the erroneous "where rownum <=" lines
                        filtered_lines = [line for line in lines if not erroneous_line_pattern.match(line)]
                        
                        # Extract the query from the filtered lines
                        query = ''.join(filtered_lines[3:]).strip()
                        
                        queries.append(query)
                        instance_count += 1  # Increment the counter
    
    return queries


# Base directory containing the generated queries
base_dir = './TPCH_generated_queries'

# Read the SQL files and store the queries in a list
queries = read_sql_files(base_dir, instances_per_template=20)

print(len(queries))

440


In [3]:
conn_str = (
    "Driver={ODBC Driver 17 for SQL Server};"
    "Server=172.16.6.196,1433;"  # Use the IP address and port directly
    "Database=TPCH1;"  
    "UID=wsl;" 
    "PWD=greatpond501;"  
)

conn = pyodbc.connect(conn_str)
cursor = conn.cursor()
# test the connection
print(cursor.execute("SELECT @@version;"))


def execute_queries(workload):
    # Establish a connection to the database
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    start_time = time.time()
    for i, query in enumerate(workload):
        try:
            cursor.execute(query)
            conn.commit()
            #print(f"Query {i+1} executed successfully.")
        except Exception as e:
            print(f"Error executing Query {i+1}: {e}")
    
    end_time = time.time()
    elapsed_time = end_time - start_time

    cursor.close()
    conn.close()
    return elapsed_time

<pyodbc.Cursor object at 0x7fdc7aab91b0>


In [4]:
# shuffle the queries list
random.shuffle(queries)

batch_size = 10

# now run the queries in batches and get execution times
workload_times = []
for i in tqdm(range(0, len(queries), batch_size), desc="Executing Batches"):
    batch = queries[i:i+batch_size]
    elapsed_time = execute_queries(batch)
    workload_times.append(elapsed_time)


Executing Batches:   7%|▋         | 3/44 [00:58<13:37, 19.93s/it]