In [121]:
%pip install psycopg2-binary

Note: you may need to restart the kernel to use updated packages.


In [122]:
import psycopg2
import logging
from collections import defaultdict
import re

In [123]:
try:
  conn = psycopg2.connect(
      dbname="tpc",
      user="postgres",
      password="password",
      host="localhost",
      port="5432"
  )
  logging.info("Database connection established.")
except Exception as e:
  print(f"An error occurred while connecting to the database: {e}")

In [124]:
def execute_query(conn, query):
  try:
    # Create a cursor object
    cur = conn.cursor()

    # Execute the query
    cur.execute(query)

    # Fetch all rows from the result set
    rows = cur.fetchall()
    return rows
  except psycopg2.Error as e:
    print("Error executing query:", e)

In [125]:
tableSet = {'lineitem', 'orders','customer','partsupp','supplier','part','nation','region'}
tableToProperties = defaultdict(lambda: {})

In [126]:
"""
    Refresh the file metadata as it is updated lazily.
"""

UPDATE_TABLE_STATS_QUERY = """
DO $$
DECLARE
    table_name TEXT;
BEGIN
    FOR table_name IN
        SELECT tablename
        FROM pg_catalog.pg_tables
        WHERE schemaname = 'public' -- Specify the schema if needed
    LOOP
        EXECUTE format('ANALYZE %I', table_name);
    END LOOP;
END $$;
"""
execute_query(conn, UPDATE_TABLE_STATS_QUERY)

Error executing query: no results to fetch


In [127]:
"""
  Get and store table metadata 
"""

RELATION_PROPERTIES_QUERY = """
SELECT relname, reltuples, relpages 
FROM pg_class 
WHERE relkind IN ('r');
"""
result = execute_query(conn, RELATION_PROPERTIES_QUERY)
for name, tuple_count, page_count in result:
  if name in tableSet:
    tableToProperties[name]['tuple_count'] = tuple_count
    tableToProperties[name]['page_count'] = page_count

print(tableToProperties)

defaultdict(<function <lambda> at 0x7f77e0cf4280>, {'region': {'tuple_count': 5.0, 'page_count': 1}, 'nation': {'tuple_count': 25.0, 'page_count': 1}, 'supplier': {'tuple_count': 10000.0, 'page_count': 223}, 'part': {'tuple_count': 200000.0, 'page_count': 4128}, 'partsupp': {'tuple_count': 800000.0, 'page_count': 17552}, 'customer': {'tuple_count': 150000.0, 'page_count': 3600}, 'orders': {'tuple_count': 1500000.0, 'page_count': 26136}, 'lineitem': {'tuple_count': 6002286.0, 'page_count': 112600}})


In [132]:
SEQ_PAGE_COST = 1
CPU_TUPLE_COST = 0.01
CPU_OPERATOR_COST = 0.0025

def scan_cost_function(seq_pages_accessed: int, rows: int) -> float:
  return (seq_pages_accessed * SEQ_PAGE_COST) + (rows * CPU_TUPLE_COST)

operator_to_cost_function = { 'Seq Scan': scan_cost_function }


In [133]:
"""
  Test Selection
"""


SELECT_PART_QUERY="""
SELECT p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment
FROM public.part;
"""

class Explainer:
  EXPLAIN_REGEX = r"(\w+\s+\w+)\s+on\s+(\w+)\s+\(cost=(\d+\.\d+)\.\.(\d+\.\d+) rows=(\d+) width=(\d+)\)"
  def __init__(self):
    self.result = {}

  def toExplainResult(self, explanation: str):
    match = re.match(self.EXPLAIN_REGEX, explanation)
    if match:
      self.result['operator'] = match.group(1)
      self.result['table_name'] = match.group(2)
      self.result['starting_cost'] = float(match.group(3))
      self.result['total_cost'] = float(match.group(4))
      self.result['rows'] = int(match.group(5))
      self.result['width'] = int(match.group(6))
    else:
      print(f"No match found for {explanation}")
  
  def exec(self, query: str):
    query_result = execute_query(conn, "EXPLAIN " + query)
    for res in query_result:
      self.toExplainResult(res[0])
    return self.result
  
  def explain(self):
    if self.result['operator'] not in operator_to_cost_function:
      raise Exception(f"Cost function not defined for {self.result['operator']}")
    
    tableName = self.result['table_name']
    operator = self.result['operator']
    pagesAccessed = tableToProperties[tableName]['page_count'] # change to use block/buffer wtv Alex said
    expected_cost = operator_to_cost_function[operator](pagesAccessed, self.result['rows'])
    print(f"Actual total cost: {self.result['total_cost']}; Expected total cost: {expected_cost}")

select_part_explainer = Explainer()
select_part_explainer.exec(SELECT_PART_QUERY)
select_part_explainer.explain()


Actual total cost: 6128.0; Expected total cost: 6128.0
