### Selectivity estimation using Postgres Internal Statistics:


In [1]:
# auto reload all modules
%load_ext autoreload
%autoreload 2

from simple_cost_model import *
from ssb_qgen_class import *
import time
import pickle
import numpy as np
import hashlib


In [2]:
# Instantiate the SSB query generator used by the later cells.
qgen = QGEN()

# For every SSB table, fetch its statistics and estimated row count, and
# record which attributes the statistics cover.
table_names = ["customer", "dwdate", "lineorder", "part", "supplier"]
pg_stats = {}
estimated_rows = {}
table_attributes = {}
for table_name in table_names:
    pg_stats[table_name], estimated_rows[table_name] = get_table_stats(table_name)
    table_attributes[table_name] = list(pg_stats[table_name].keys())

ssb_tables, pk_columns = get_ssb_schema()

# Map each column name to "numeric" or "char", based on whether its declared
# type mentions INT, DECIMAL or BIT.
data_type_dict = {}
for table_name in table_names:
    for column_name, column_type in ssb_tables[table_name]:
        is_numeric = any(tag in column_type for tag in ("INT", "DECIMAL", "BIT"))
        data_type_dict[column_name] = "numeric" if is_numeric else "char"

#### Selectivity Estimator

In [6]:
# Build the statistics-based selectivity estimator, handing it the
# column-name -> "numeric"/"char" mapping.
selectivity_estimator = SelectivityEstimatorStats(data_type_dict)

In [10]:
# Generate one example query (argument 1 is passed to the generator) and
# take the per-table predicate lists it carries.
example_query = qgen.generate_query(1)
predicate_dict = example_query.predicate_dict

# Dump the predicates grouped by the table they belong to.
print("Predicates:")
for table_name in predicate_dict:
    predicates = predicate_dict[table_name]
    print(f"  Table: {table_name}")
    for predicate in predicates:
        print(f"    {predicate}")
        

Predicates:
  Table: lineorder
    {'column': 'lo_orderdate', 'operator': 'eq', 'value': 'd_datekey', 'join': True}
    {'column': 'lo_discount', 'operator': 'range', 'value': (5, 7), 'join': False}
    {'column': 'lo_quantity', 'operator': '<', 'value': 25, 'join': False}
  Table: dwdate
    {'column': 'd_year', 'operator': 'eq', 'value': 1997, 'join': False}


In [11]:
# Try the selectivity estimator on every non-join predicate of the example query.
for table_name, predicates in predicate_dict.items():
    print(f"Table: {table_name}\n")
    # Use the row estimate of the table the predicates belong to, so it is
    # consistent with the per-table statistics passed in the same call
    # (previously the lineorder row count was used for every table).
    total_rows = estimated_rows[table_name]
    for predicate in predicates:
        # Join predicates compare two columns; only filter predicates are estimated here.
        if not predicate['join']:
            attribute = predicate['column']
            operator = predicate['operator']
            value = predicate['value']
            selectivity = selectivity_estimator.estimate_selectivity(
                attribute, operator, value, pg_stats[table_name], total_rows
            )
            print(f"  Predicate: {predicate}")
            print(f"  Selectivity: {selectivity}\n")


Table: lineorder

  Predicate: {'column': 'lo_discount', 'operator': 'range', 'value': (5, 7), 'join': False}
  Selectivity: 0.273233336

  Predicate: {'column': 'lo_quantity', 'operator': '<', 'value': 25, 'join': False}
  Selectivity: 0.4762666669999999

Table: dwdate

  Predicate: {'column': 'd_year', 'operator': 'eq', 'value': 1997, 'join': False}
  Selectivity: 0.14280125

