#### Test postgres DB, collect table statistics

In [1]:
import psycopg2
import numpy as np
from pg_utils import *
import pickle


#### Connect to the postgres database

In [2]:
# Name of the PostgreSQL database that every cell below connects to.
dbname = "tpch10_skew"

In [3]:
# Connectivity smoke test: open a connection to `dbname` and close it again
# right away. No query is issued here; an error raised by create_connection
# surfaces in this cell before any of the later cells run.
conn = create_connection(dbname=dbname)
close_connection(conn)

### Fetch some data and print it out

In [4]:
def execute_query(conn, query, print_results=True):
    """Run ``query`` on ``conn`` and return every row of its result set.

    Args:
        conn: Open DB-API connection (any object exposing ``cursor()``).
        query: SQL text to execute. The rows are fetched unconditionally,
            so the statement is expected to produce a result set.
        print_results: When true, print each fetched row on its own line.

    Returns:
        The rows returned by ``cursor.fetchall()``.

    Raises:
        Whatever the driver raises from ``execute()`` / ``fetchall()``; the
        cursor is closed before the exception propagates.
    """
    cur = conn.cursor()
    try:
        cur.execute(query)
        rows = cur.fetchall()

        if print_results:
            for row in rows:
                print(row)

        return rows
    finally:
        # Release the cursor on every path, including a failed execute/fetch,
        # so a bad query does not leave an open cursor on the connection.
        cur.close()


# Peek at the first few customer rows to confirm the table is readable.
query = """
        SELECT * 
        FROM customer
        LIMIT 10 
        """

conn = create_connection(dbname=dbname)
try:
    execute_query(conn, query)
finally:
    # Close the connection even when the query fails, so a broken cell run
    # does not leave a connection open for the lifetime of the kernel.
    close_connection(conn)


(1, 'Customer#000000001', 'j5BkijBM1PkCy0O1m', 15, '25-989-989-0989', Decimal('711.56'), 'BUILDING  ', 'AN0PL zMkz5kz6N64kM wnAxz6MMgx5B6Nl1ljnM4i QmA4ixB nMgB11jQRi61z70 76NlmCMNPg1yhQjR7myP0')
(2, 'Customer#000000002', 'z7wz775S7P5k5Clzm', 15, '25-989-989-0989', Decimal('711.56'), 'BUILDING  ', 'yzM547O2hQQAmzC0mzMxQCnAxNkAPQ40Ly RlM4ng7gB6y0gRP7h OwL3M6CCSxySRk4hym nP5lhRMO0wP7Q75M')
(3, 'Customer#000000003', 'QR2zl407LO15gnn6Q', 15, '25-989-989-0989', Decimal('711.56'), 'AUTOMOBILE', 'C06n3gk4zlwnLgN24yB61SgOA2QwgM 2xlnnPlAxNQ25l14x213wnAkBB64C1ml7PROh40lCjSPyLRSnzyjjjB5m')
(4, 'Customer#000000004', 'NymCwwLx1OC73QzSk', 15, '25-989-989-0989', Decimal('711.56'), 'AUTOMOBILE', '3S4xklC4LiQlzh RjCLNOBBgl74L2wj1mN3LPzli3iPNjBSg0R707CSn5QAlLSyAnx6nw4nNQw OA2z5333j6lmA')
(5, 'Customer#000000005', 'yzwwR1BS537kRzN0n', 15, '25-989-989-0989', Decimal('711.56'), 'MACHINERY ', 'C367xn3nSN2RxkRRM  1j Rz44Lm7gNjl0Pj7g1h2NhPAyMNShgxO4P1nh3Lnyn4zzx310QR1550w6zz4xOg4BCg')
(6, 'Customer#000000006',

#### Gathering table statistics.

In [5]:
# TPC-H schema: table name -> ordered list of (column_name, sql_type) pairs.
# The sql_type strings are matched by substring when statistics are collected,
# so they are kept exactly as written.
tpch_schema = dict(
    customer=[
        ("c_custkey", "INT"), ("c_name", "VARCHAR(25)"),
        ("c_address", "VARCHAR(40)"), ("c_nationkey", "INT"),
        ("c_phone", "CHAR(15)"), ("c_acctbal", "DECIMAL(15, 2)"),
        ("c_mktsegment", "CHAR(10)"), ("c_comment", "VARCHAR(117)"),
    ],
    orders=[
        ("o_orderkey", "INT"), ("o_custkey", "INT"),
        ("o_orderstatus", "CHAR(1)"), ("o_totalprice", "DECIMAL(15, 2)"),
        ("o_orderdate", "DATE"), ("o_orderpriority", "CHAR(15)"),
        ("o_clerk", "CHAR(15)"), ("o_shippriority", "INT"),
        ("o_comment", "VARCHAR(79)"),
    ],
    lineitem=[
        ("l_orderkey", "INT"), ("l_partkey", "INT"),
        ("l_suppkey", "INT"), ("l_linenumber", "INT"),
        ("l_quantity", "DECIMAL(15, 2)"), ("l_extendedprice", "DECIMAL(15, 2)"),
        ("l_discount", "DECIMAL(15, 2)"), ("l_tax", "DECIMAL(15, 2)"),
        ("l_returnflag", "CHAR(1)"), ("l_linestatus", "CHAR(1)"),
        ("l_shipdate", "DATE"), ("l_commitdate", "DATE"),
        ("l_receiptdate", "DATE"), ("l_shipinstruct", "CHAR(25)"),
        ("l_shipmode", "CHAR(10)"), ("l_comment", "VARCHAR(44)"),
    ],
    part=[
        ("p_partkey", "INT"), ("p_name", "VARCHAR(55)"),
        ("p_mfgr", "CHAR(25)"), ("p_brand", "CHAR(10)"),
        ("p_type", "VARCHAR(25)"), ("p_size", "INT"),
        ("p_container", "CHAR(10)"), ("p_retailprice", "DECIMAL(15, 2)"),
        ("p_comment", "VARCHAR(23)"),
    ],
    supplier=[
        ("s_suppkey", "INT"), ("s_name", "CHAR(25)"),
        ("s_address", "VARCHAR(40)"), ("s_nationkey", "INT"),
        ("s_phone", "CHAR(15)"), ("s_acctbal", "DECIMAL(15, 2)"),
        ("s_comment", "VARCHAR(101)"),
    ],
    partsupp=[
        ("ps_partkey", "INT"), ("ps_suppkey", "INT"),
        ("ps_availqty", "INT"), ("ps_supplycost", "DECIMAL(15, 2)"),
        ("ps_comment", "VARCHAR(199)"),
    ],
    nation=[
        ("n_nationkey", "INT"), ("n_name", "CHAR(25)"),
        ("n_regionkey", "INT"), ("n_comment", "VARCHAR(152)"),
    ],
    region=[
        ("r_regionkey", "INT"), ("r_name", "CHAR(25)"),
        ("r_comment", "VARCHAR(152)"),
    ],
)

# Show one table's column list as a quick sanity check of the mapping.
print(tpch_schema["customer"])


[('c_custkey', 'INT'), ('c_name', 'VARCHAR(25)'), ('c_address', 'VARCHAR(40)'), ('c_nationkey', 'INT'), ('c_phone', 'CHAR(15)'), ('c_acctbal', 'DECIMAL(15, 2)'), ('c_mktsegment', 'CHAR(10)'), ('c_comment', 'VARCHAR(117)')]


In [6]:
def _summarize_column(cursor, table_name, column_name, value_expr, max_histogram_distinct):
    """Collect min/max/row-count/distinct-count (and a small histogram) for one column.

    ``value_expr`` is the SQL expression being aggregated: the bare column
    name for numeric columns, ``TRIM(column)`` for character columns.
    Returns the stats dict for the column.
    """
    cursor.execute(f"""
            SELECT MIN({value_expr}), MAX({value_expr}), COUNT(*), COUNT(DISTINCT {value_expr})
            FROM {table_name};
            """)
    min_val, max_val, total_count, distinct_count = cursor.fetchone()
    column_stats = {
        'min': min_val,
        'max': max_val,
        'total_count': total_count,
        'distinct_count': distinct_count,
        'histogram': None,
    }

    # Only low-cardinality columns get an exact value -> frequency histogram;
    # for the rest the 'histogram' entry stays None.
    histogram = None
    if distinct_count <= max_histogram_distinct:
        cursor.execute(f"""
                SELECT {value_expr}, COUNT(*) AS frequency
                FROM {table_name}
                GROUP BY {value_expr}
                ORDER BY {value_expr};
                """)
        histogram = cursor.fetchall()
        column_stats['histogram'] = {value: count for value, count in histogram}

    print(f"Collected stats for {column_name}: min={min_val}, max={max_val}, total_count={total_count}, distinct_count={distinct_count}, histogram={histogram}")
    return column_stats


def collect_stats(table_name, columns, max_histogram_distinct=1000):
    """Collect per-column statistics for one table of the ``dbname`` database.

    Args:
        table_name: Name of the table to scan. It is interpolated directly
            into the SQL text, so it must come from a trusted source.
        columns: Iterable of ``(column_name, data_type)`` pairs. Columns whose
            name contains ``"comment"`` are skipped. ``data_type`` is matched
            by substring: INT/DECIMAL/NUMERIC are treated as numeric, anything
            containing CHAR (which includes VARCHAR) as text, DATE as date.
            Columns matching none of these are skipped silently.
        max_histogram_distinct: A value -> frequency histogram is collected
            only for columns with at most this many distinct values.

    Returns:
        Dict mapping column name to its stats. Numeric and text columns have
        ``min``, ``max``, ``total_count``, ``distinct_count`` and ``histogram``
        (a dict, or ``None`` above the threshold); date columns have only
        ``min`` and ``max``.

    Raises:
        Whatever the driver raises for a failing query; the cursor and the
        connection are closed before the exception propagates.
    """
    # Connect to the PostgreSQL database
    conn = create_connection(dbname=dbname)
    try:
        print("Collecting stats...")
        print(f"Table --> {table_name}")

        cursor = conn.cursor()
        try:
            stats = {}

            for column_name, data_type in columns:
                # Free-text comment columns are excluded from the statistics.
                if 'comment' in column_name:
                    continue

                print(f"Column --> {column_name}")

                if any(tag in data_type for tag in ('INT', 'DECIMAL', 'NUMERIC')):
                    stats[column_name] = _summarize_column(
                        cursor, table_name, column_name,
                        column_name, max_histogram_distinct)

                elif 'CHAR' in data_type:
                    # TRIM so that CHAR(n) blank padding does not leak into
                    # the min/max values or the histogram keys.
                    stats[column_name] = _summarize_column(
                        cursor, table_name, column_name,
                        f"TRIM({column_name})", max_histogram_distinct)

                elif 'DATE' in data_type:
                    cursor.execute(f"""
                    SELECT MIN({column_name}), MAX({column_name})
                    FROM {table_name};
                    """)
                    min_date, max_date = cursor.fetchone()
                    stats[column_name] = {
                        'min': min_date,
                        'max': max_date
                    }

                    print(f"Collected stats for {column_name}: min={min_date}, max={max_date}")

            return stats
        finally:
            cursor.close()
    finally:
        # Always close the connection, even if one of the queries failed.
        close_connection(conn)


In [7]:
# Collect statistics table by table, keyed by table name, and echo each
# table's result followed by blank lines as a visual separator.
tpch_skew_stats = {}

for table_name in tpch_schema:
    tpch_skew_stats[table_name] = table_stats = collect_stats(
        table_name, columns=tpch_schema[table_name]
    )
    print(table_stats)
    print("\n\n")


Collecting stats...
Table --> customer
Column --> c_custkey
Collected stats for c_custkey: min=1, max=1500000, total_count=1500000, distinct_count=1500000, histogram=None
Column --> c_name
Collected stats for c_name: min=Customer#000000001, max=Customer#001500000, total_count=1500000, distinct_count=1500000, histogram=None
Column --> c_address
Collected stats for c_address: min=0    C73nLQ17 xiS737Sx0RM, max=zzzy6024nhLxmMlNl, total_count=1500000, distinct_count=1500000, histogram=None
Column --> c_nationkey
Collected stats for c_nationkey: min=0, max=24, total_count=1500000, distinct_count=25, histogram=[(0, 47704), (1, 86742), (2, 53076), (3, 84877), (4, 70086), (5, 53448), (6, 44748), (7, 44405), (8, 61934), (9, 50617), (10, 57095), (11, 42172), (12, 51829), (13, 103247), (14, 36509), (15, 153124), (16, 46098), (17, 58485), (18, 61718), (19, 44427), (20, 56483), (21, 44522), (22, 51881), (23, 58816), (24, 35957)]
Column --> c_phone
Collected stats for c_phone: min=10-100-129-1604, m

In [8]:
# Persist the TPC-H schema and the collected tpch10_skew statistics as pickle
# files in the current working directory.
for pkl_name, payload in (
    ('tpch_schema.pkl', tpch_schema),
    ('tpch10_skew_stats.pkl', tpch_skew_stats),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f)
    
