#### Simple prediction of query execution cost (assume disk IO dominates) and access paths 

In [1]:
import psycopg2
import numpy as np
from pg_utils import *

#### Getting Table Statistics

In [2]:
table_name = 'dwdate' #'customer'

# Column order of the pg_stats view; rows from "SELECT *" are mapped onto
# these names positionally.
column_names = [
                "schemaname", "tablename", "attname", "inherited", "null_frac",
                "avg_width", "n_distinct", "most_common_vals", "most_common_freqs",
                "histogram_bounds", "correlation", "most_common_elems",
                "most_common_elem_freqs", "elem_count_histogram"
               ]

conn = create_connection()
try:
    cur = conn.cursor()
    try:
        # Estimated row count of `table_name` as recorded in pg_class.
        # The table name is passed as a bound parameter rather than being
        # interpolated into the SQL text.
        cur.execute(
            """
            SELECT reltuples::bigint AS estimated_rows
            FROM pg_class
            WHERE relname = %s;
            """,
            (table_name,),
        )
        row = cur.fetchone()
        if row is None:
            # Without this check a missing table surfaces as an opaque
            # "'NoneType' object is not subscriptable" error.
            raise ValueError(f"Table '{table_name}' not found in pg_class")
        estimated_rows = row[0]

        # Per-column statistics for the same table
        cur.execute("SELECT * FROM pg_stats WHERE tablename = %s;", (table_name,))
        column_stats = cur.fetchall()
    finally:
        # Release the cursor even if one of the queries fails
        cur.close()
finally:
    # Release the connection even if one of the queries fails
    close_connection(conn)

# Organize the results into a dictionary keyed by column name ('attname',
# the third field of each pg_stats row)
stats_dict = {}
for row in column_stats:
    column_name = row[2]
    stats_dict[column_name] = dict(zip(column_names, row))


# Print the organized statistics dictionary
for key, value in stats_dict.items():
    print(f"{key}")
    for k, v in value.items():
        print(f"    {k}: {v}")

# Report the table that was actually queried (previously hard-coded to 'customer')
print(f"\nEstimated number of rows in the '{table_name}' table: {estimated_rows}")

d_datekey
    schemaname: public
    tablename: dwdate
    attname: d_datekey
    inherited: False
    null_frac: 0.0
    avg_width: 4
    n_distinct: -1.0
    most_common_vals: None
    most_common_freqs: None
    histogram_bounds: {1992-01-01,1992-01-26,1992-02-21,1992-03-17,1992-04-12,1992-05-07,1992-06-02,1992-06-27,1992-07-23,1992-08-17,1992-09-12,1992-10-08,1992-11-02,1992-11-28,1992-12-23,1993-01-18,1993-02-12,1993-03-10,1993-04-04,1993-04-30,1993-05-26,1993-06-20,1993-07-16,1993-08-10,1993-09-05,1993-09-30,1993-10-26,1993-11-20,1993-12-16,1994-01-10,1994-02-05,1994-03-03,1994-03-28,1994-04-23,1994-05-18,1994-06-13,1994-07-08,1994-08-03,1994-08-28,1994-09-23,1994-10-19,1994-11-13,1994-12-09,1995-01-03,1995-01-29,1995-02-23,1995-03-21,1995-04-15,1995-05-11,1995-06-05,1995-07-01,1995-07-27,1995-08-21,1995-09-16,1995-10-11,1995-11-06,1995-12-01,1995-12-27,1996-01-21,1996-02-16,1996-03-13,1996-04-07,1996-05-03,1996-05-28,1996-06-23,1996-07-18,1996-08-13,1996-09-07,1996-10-03,1996-10

#### Estimating selectivity for a value range on a particular column, i.e. what fraction of the tuples falls in the given range, using the table statistics

In [3]:
def estimate_selectivity_range(attribute, value_range, stats_dict, total_rows, data_type='numeric'):
    """Estimate the fraction of rows with ``min <= attribute <= max``.

    The estimate is the sum of two parts:

    * the frequencies of all most-common values (MCVs) inside the range;
    * the share of histogram bins covered by the range, scaled by the
      fraction of rows the histogram describes
      (``1 - sum(most_common_freqs) - null_frac``).

    Args:
        attribute: Column name; must be a key of ``stats_dict``.
        value_range: ``(min_value, max_value)`` pair, both ends inclusive.
        stats_dict: Mapping of column name to a dict holding the pg_stats
            fields ``histogram_bounds``, ``n_distinct``, ``most_common_vals``
            and ``most_common_freqs`` (``null_frac`` is used when present).
            The array-valued fields ``most_common_vals`` and
            ``histogram_bounds`` are expected as ``"{a,b,c}"`` strings.
        total_rows: Estimated row count of the table; used to turn a negative
            (fractional) ``n_distinct`` into an absolute count.
        data_type: ``'numeric'`` (values parsed as floats) or ``'char'``
            (values kept as strings and compared lexicographically).

    Returns:
        Estimated selectivity, capped at 1.0. If the range matches neither an
        MCV nor any histogram bin, ``1 / n_distinct`` is returned (0.0 when
        the number of distinct values is not positive).

    Raises:
        ValueError: If statistics have to be parsed and ``data_type`` is
            neither ``'numeric'`` nor ``'char'``.
    """
    def parse_array(raw):
        # Split the "{a,b,c}" text form of a statistics array into a list
        if data_type == 'numeric':
            return [float(item) for item in raw.strip('{}').split(',')]
        if data_type == 'char':
            return raw.strip('{}').split(',')
        raise ValueError("Data type not supported, needs ot be either numeric or char")

    stats = stats_dict[attribute]
    histogram_bounds = stats['histogram_bounds']
    n_distinct = stats['n_distinct']
    most_common_vals = stats['most_common_vals']
    most_common_freqs = stats['most_common_freqs']
    null_frac = stats.get('null_frac') or 0.0

    # Negative n_distinct is a fraction of the row count; make it absolute
    if n_distinct < 0:
        n_distinct = -n_distinct * total_rows

    min_value, max_value = value_range[0], value_range[1]
    selectivity = 0.0
    mcv_total = 0.0

    # Part 1: most common values that fall inside the range
    if most_common_vals:
        most_common_vals = parse_array(most_common_vals)
        most_common_freqs = most_common_freqs or []
        mcv_total = sum(most_common_freqs)
        for val, freq in zip(most_common_vals, most_common_freqs):
            if min_value <= val <= max_value:
                selectivity += freq

    # Part 2: histogram bins overlapped by the range
    if histogram_bounds is not None:
        bounds = parse_array(histogram_bounds)
        total_bins = len(bounds) - 1

        # The histogram only describes rows that are neither NULL nor MCVs
        histogram_weight = max(0.0, 1.0 - mcv_total - null_frac)

        covered_bins = 0.0
        for lower, upper in zip(bounds, bounds[1:]):
            overlap_min = max(min_value, lower)
            overlap_max = min(max_value, upper)
            if overlap_max <= overlap_min:
                # Range and bin are disjoint or only touch at a single point
                continue

            if data_type == 'numeric':
                # Linear interpolation inside the bin; upper > lower is
                # guaranteed here because the overlap has positive width
                covered_bins += (overlap_max - overlap_min) / (upper - lower)
            elif overlap_min == lower and overlap_max == upper:
                # Non-numeric bin fully inside the range
                covered_bins += 1.0
            else:
                # Strings cannot be interpolated; assume the range boundary
                # sits in the middle of a partially covered bin
                covered_bins += 0.5

        if total_bins > 0:
            # Each bin represents an equal share of the histogram population
            selectivity += histogram_weight * covered_bins / total_bins

    if selectivity == 0.0:
        # Nothing matched: assume a uniform distribution over distinct values,
        # guarding against a zero/unknown distinct count
        selectivity = 1.0 / n_distinct if n_distinct > 0 else 0.0

    return min(selectivity, 1.0)


def estimate_selectivity_eq(attribute, value, stats_dict, data_type='numeric', total_rows=None):
    """Estimate the fraction of rows with ``attribute == value``.

    If ``value`` is one of the column's most-common values (MCVs), its
    recorded frequency is returned. Otherwise the estimate starts from
    ``1 / n_distinct`` and, for numeric columns with a histogram, is lowered
    to ``1 / (bin_width * total_bins)`` for the bin containing ``value`` when
    that is smaller.

    Args:
        attribute: Column name; must be a key of ``stats_dict``.
        value: The value compared for equality.
        stats_dict: Mapping of column name to a dict holding the pg_stats
            fields ``histogram_bounds``, ``n_distinct``, ``most_common_vals``
            and ``most_common_freqs``. The array-valued fields
            ``most_common_vals`` and ``histogram_bounds`` are expected as
            ``"{a,b,c}"`` strings.
        data_type: ``'numeric'`` (values parsed as floats) or ``'char'``
            (values kept as strings).
        total_rows: Optional estimated row count of the table. When given, a
            negative (fractional) ``n_distinct`` is converted into an
            absolute count by multiplying with it. When omitted, the
            magnitude of ``n_distinct`` is used as-is, which is the behavior
            this function had before the parameter existed.

    Returns:
        Estimated selectivity, capped at 1.0.

    Raises:
        ValueError: If statistics have to be parsed and ``data_type`` is
            neither ``'numeric'`` nor ``'char'``.
    """
    def parse_array(raw):
        # Split the "{a,b,c}" text form of a statistics array into a list
        if data_type == 'numeric':
            return [float(item) for item in raw.strip('{}').split(',')]
        if data_type == 'char':
            return raw.strip('{}').split(',')
        raise ValueError("Data type not supported, needs ot be either numeric or char")

    stats = stats_dict[attribute]
    histogram_bounds = stats['histogram_bounds']
    n_distinct = stats['n_distinct']
    most_common_vals = stats['most_common_vals']
    most_common_freqs = stats['most_common_freqs']

    # An MCV hit gives the frequency directly
    if most_common_vals:
        most_common_vals = parse_array(most_common_vals)
        if value in most_common_vals:
            return most_common_freqs[most_common_vals.index(value)]

    # Negative n_distinct is a fraction of the row count; an absolute count
    # can only be derived when the caller supplies total_rows
    if n_distinct < 0:
        if total_rows is not None:
            n_distinct = -n_distinct * total_rows
        else:
            n_distinct = -n_distinct

    # Uniform assumption over distinct values. With no usable distinct count
    # start from the upper bound 1.0 so the histogram can still refine it.
    selectivity = min(1.0, 1.0 / n_distinct) if n_distinct > 0 else 1.0

    if histogram_bounds is not None:
        bounds = parse_array(histogram_bounds)

        # A bin width can only be computed for numeric bounds; string bounds
        # cannot be subtracted, so char columns keep the n_distinct estimate
        if data_type == 'numeric':
            total_bins = len(bounds) - 1
            for lower, upper in zip(bounds, bounds[1:]):
                if lower <= value <= upper:
                    bin_width = upper - lower
                    if bin_width > 0:
                        # assume uniform distribution within this bin
                        selectivity = min(selectivity, 1.0 / (bin_width * total_bins))
                    break

    return selectivity


In [4]:
# Exercise both estimators on a numeric column first, then on a char column.
# Each case is (column, inclusive range to test, single value to test, data type).
selectivity_cases = [
    ('d_year', (1992, 1994), 1992, 'numeric'),
    ('d_dayofweek', ('Monday', 'Wednesday'), 'Monday', 'char'),
]

for attribute, value_range, value, column_type in selectivity_cases:
    # range selectivity estimation
    selectivity = estimate_selectivity_range(
        attribute, value_range, stats_dict, estimated_rows, data_type=column_type
    )
    print(f"Estimated selectivity for range {value_range}: {selectivity}")

    # equality selectivity estimation
    selectivity = estimate_selectivity_eq(
        attribute, value, stats_dict, data_type=column_type
    )
    print(f"Estimated selectivity for value {value}: {selectivity}")




Estimated selectivity for range (1992, 1994): 0.42879498
Estimated selectivity for value 1992: 0.14319248
Estimated selectivity for range ('Monday', 'Wednesday'): 0.8571987299999999
Estimated selectivity for value Monday: 0.14280125
