#### Test the PostgreSQL database and collect table statistics

In [30]:
import psycopg2
import numpy as np
from pg_utils import *
import pickle


#### Connect to the PostgreSQL database

In [15]:
# Connectivity smoke test: open a connection to the 'tpch10' database and
# close it again without running any query.
conn = create_connection(dbname='tpch10')
close_connection(conn)

### Fetch some data and print it out

In [20]:
def execute_query(conn, query, print_results=True):
    """Run ``query`` on ``conn`` and return all result rows.

    Args:
        conn: An open DB-API connection (anything exposing ``cursor()``).
        query: SQL text to execute. It must produce a result set, because
            ``fetchall()`` is always called on the cursor.
        print_results: When true (the default), print every row, one per line.

    Returns:
        The list of rows returned by ``cursor.fetchall()``.

    Raises:
        Whatever the driver raises from ``execute()`` / ``fetchall()``; the
        cursor is closed before the exception propagates.
    """
    cur = conn.cursor()
    try:
        cur.execute(query)
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails, so a bad statement
        # does not leave an open cursor behind on the connection.
        cur.close()

    if print_results:
        for row in rows:
            print(row)

    return rows


# Sample query: the first ten rows of the customer table.
query = """
        SELECT * 
        FROM customer
        LIMIT 10 
        """

conn = create_connection(dbname='tpch10')
try:
    execute_query(conn, query)
finally:
    # Close the connection even if the query raises, so a failed cell run
    # does not leave a dangling connection to the database.
    close_connection(conn)


(1, 'Customer#000000001', 'IVhzIApeRb ot,c,E', 15, '25-989-741-2988', Decimal('711.56'), 'BUILDING  ', 'to the even, regular platelets. regular, ironic epitaphs nag e')
(2, 'Customer#000000002', 'XSTf4,NCwDVaWNe6tEgvwfmRchLXak', 13, '23-768-687-3665', Decimal('121.65'), 'AUTOMOBILE', 'l accounts. blithely ironic theodolites integrate boldly: caref')
(3, 'Customer#000000003', 'MG9kdTD2WBHm', 1, '11-719-748-3364', Decimal('7498.12'), 'AUTOMOBILE', ' deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov')
(4, 'Customer#000000004', 'XxVSJsLAGtn', 4, '14-128-190-5944', Decimal('2866.83'), 'MACHINERY ', ' requests. final, regular ideas sleep final accou')
(5, 'Customer#000000005', 'KvpyuHCplrB84WgAiGV6sYpZq7Tj', 3, '13-750-942-6364', Decimal('794.47'), 'HOUSEHOLD ', 'n accounts will have to unwind. foxes cajole accor')
(6, 'Customer#000000006', 'sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn', 20, '30-114-968-4951', Decimal('7638.57'), 'AUTOMOBILE', 'tio

#### Gathering table statistics.

In [17]:
# TPC-H schema: table name -> ordered list of (column_name, data_type) pairs.
tpch_schema = dict(
    customer=[
        ('c_custkey', 'INT'), ('c_name', 'VARCHAR(25)'), ('c_address', 'VARCHAR(40)'),
        ('c_nationkey', 'INT'), ('c_phone', 'CHAR(15)'), ('c_acctbal', 'DECIMAL(15, 2)'),
        ('c_mktsegment', 'CHAR(10)'), ('c_comment', 'VARCHAR(117)'),
    ],
    orders=[
        ('o_orderkey', 'INT'), ('o_custkey', 'INT'), ('o_orderstatus', 'CHAR(1)'),
        ('o_totalprice', 'DECIMAL(15, 2)'), ('o_orderdate', 'DATE'),
        ('o_orderpriority', 'CHAR(15)'), ('o_clerk', 'CHAR(15)'),
        ('o_shippriority', 'INT'), ('o_comment', 'VARCHAR(79)'),
    ],
    lineitem=[
        ('l_orderkey', 'INT'), ('l_partkey', 'INT'), ('l_suppkey', 'INT'),
        ('l_linenumber', 'INT'),
        ('l_quantity', 'DECIMAL(15, 2)'), ('l_extendedprice', 'DECIMAL(15, 2)'),
        ('l_discount', 'DECIMAL(15, 2)'), ('l_tax', 'DECIMAL(15, 2)'),
        ('l_returnflag', 'CHAR(1)'), ('l_linestatus', 'CHAR(1)'),
        ('l_shipdate', 'DATE'), ('l_commitdate', 'DATE'), ('l_receiptdate', 'DATE'),
        ('l_shipinstruct', 'CHAR(25)'), ('l_shipmode', 'CHAR(10)'),
        ('l_comment', 'VARCHAR(44)'),
    ],
    part=[
        ('p_partkey', 'INT'), ('p_name', 'VARCHAR(55)'), ('p_mfgr', 'CHAR(25)'),
        ('p_brand', 'CHAR(10)'), ('p_type', 'VARCHAR(25)'), ('p_size', 'INT'),
        ('p_container', 'CHAR(10)'), ('p_retailprice', 'DECIMAL(15, 2)'),
        ('p_comment', 'VARCHAR(23)'),
    ],
    supplier=[
        ('s_suppkey', 'INT'), ('s_name', 'CHAR(25)'), ('s_address', 'VARCHAR(40)'),
        ('s_nationkey', 'INT'), ('s_phone', 'CHAR(15)'), ('s_acctbal', 'DECIMAL(15, 2)'),
        ('s_comment', 'VARCHAR(101)'),
    ],
    partsupp=[
        ('ps_partkey', 'INT'), ('ps_suppkey', 'INT'), ('ps_availqty', 'INT'),
        ('ps_supplycost', 'DECIMAL(15, 2)'), ('ps_comment', 'VARCHAR(199)'),
    ],
    nation=[
        ('n_nationkey', 'INT'), ('n_name', 'CHAR(25)'), ('n_regionkey', 'INT'),
        ('n_comment', 'VARCHAR(152)'),
    ],
    region=[
        ('r_regionkey', 'INT'), ('r_name', 'CHAR(25)'), ('r_comment', 'VARCHAR(152)'),
    ],
)

# Show one table's column list to illustrate the structure.
print(tpch_schema['customer'])


[('c_custkey', 'INT'), ('c_name', 'VARCHAR(25)'), ('c_address', 'VARCHAR(40)'), ('c_nationkey', 'INT'), ('c_phone', 'CHAR(15)'), ('c_acctbal', 'DECIMAL(15, 2)'), ('c_mktsegment', 'CHAR(10)'), ('c_comment', 'VARCHAR(117)')]


In [25]:
def collect_stats(table_name, columns, dbname='tpch10', histogram_max_distinct=1000):
    """Collect per-column summary statistics for one table.

    Opens its own connection with ``create_connection``, runs aggregate
    queries column by column while printing progress, and always closes the
    cursor and the connection before returning -- including when a query
    raises.

    Args:
        table_name: Table to scan. It is interpolated into the SQL text
            as-is, so it must come from a trusted source (not user input).
        columns: Iterable of ``(column_name, data_type)`` pairs. Columns
            whose name contains ``"comment"`` are skipped. Column names are
            interpolated into the SQL text as-is as well.
        dbname: Database name passed to ``create_connection``. Defaults to
            ``'tpch10'``, the value that used to be hard-coded.
        histogram_max_distinct: A value -> frequency histogram is collected
            only for columns with at most this many distinct values
            (default 1000, the previous fixed threshold).

    Returns:
        dict mapping column name to a stats dict:

        * INT / DECIMAL / NUMERIC and CHAR / VARCHAR columns: ``min``,
          ``max``, ``total_count``, ``distinct_count`` and ``histogram``
          (a ``{value: count}`` dict, or ``None`` above the threshold).
          Character columns are aggregated on ``TRIM(column)``.
        * DATE columns: ``min`` and ``max`` only.
        * Columns of any other type get no entry.
    """
    conn = create_connection(dbname=dbname)
    cursor = None
    stats = {}

    try:
        print("Collecting stats...")
        print(f"Table --> {table_name}")

        cursor = conn.cursor()

        for column_name, data_type in columns:
            # Free-text comment columns carry no useful statistics.
            if 'comment' in column_name:
                continue

            print(f"Column --> {column_name}")

            # Same precedence as before: numeric, then character, then date.
            if any(tag in data_type for tag in ('INT', 'DECIMAL', 'NUMERIC')):
                value_expr = column_name
            elif 'CHAR' in data_type:
                # 'CHAR' also matches 'VARCHAR'. TRIM removes the blank
                # padding of fixed-width CHAR(n) values.
                value_expr = f"TRIM({column_name})"
            elif 'DATE' in data_type:
                cursor.execute(f"""
                SELECT MIN({column_name}), MAX({column_name})
                FROM {table_name};
                """)
                min_date, max_date = cursor.fetchone()
                stats[column_name] = {'min': min_date, 'max': max_date}
                print(f"Collected stats for {column_name}: min={min_date}, max={max_date}")
                continue
            else:
                # Unrecognised type: nothing is collected for this column.
                continue

            # Numeric and character columns share the same pair of queries;
            # only the aggregated expression differs.
            cursor.execute(f"""
            SELECT MIN({value_expr}), MAX({value_expr}), COUNT(*), COUNT(DISTINCT {value_expr})
            FROM {table_name};
            """)
            min_val, max_val, total_count, distinct_count = cursor.fetchone()

            histogram = None
            if distinct_count <= histogram_max_distinct:
                cursor.execute(f"""
                SELECT {value_expr}, COUNT(*) AS frequency
                FROM {table_name}
                GROUP BY {value_expr}
                ORDER BY {value_expr};
                """)
                histogram = cursor.fetchall()

            stats[column_name] = {
                'min': min_val,
                'max': max_val,
                'total_count': total_count,
                'distinct_count': distinct_count,
                'histogram': (
                    None if histogram is None
                    else {value: count for value, count in histogram}
                ),
            }

            print(f"Collected stats for {column_name}: min={min_val}, max={max_val}, total_count={total_count}, distinct_count={distinct_count}, histogram={histogram}")
    finally:
        # Release the cursor and the connection even when a query fails.
        try:
            if cursor is not None:
                cursor.close()
        finally:
            close_connection(conn)

    return stats


In [28]:
# Collect statistics for every table in the schema, keyed by table name.
tpch_stats = {}

for table_name, table_columns in tpch_schema.items():
    table_stats = collect_stats(table_name, columns=table_columns)
    tpch_stats[table_name] = table_stats
    print(table_stats)
    print("\n\n")


# Spot-check one table. Its statistics were already gathered by the loop
# above, so reuse them instead of scanning the table a second time.
table = 'supplier'
supplier_stats = tpch_stats[table]
# Earlier versions bound this result to `customer_stats` even though the table
# is 'supplier'; the old name stays bound in case another cell still uses it.
customer_stats = supplier_stats

print(supplier_stats)

Collecting stats...
Table --> customer
Column --> c_custkey
Collected stats for c_custkey: min=1, max=1500000, total_count=1500000, distinct_count=1500000, histogram=None
Column --> c_name
Collected stats for c_name: min=Customer#000000001, max=Customer#001500000, total_count=1500000, distinct_count=1500000, histogram=None
Column --> c_address
Collected stats for c_address: min=,  RRl6F,PWq, max=zzzbtVPaB5eL7AFB07nVjHFMa51j2UMU, total_count=1500000, distinct_count=1500000, histogram=None
Column --> c_nationkey
Collected stats for c_nationkey: min=0, max=24, total_count=1500000, distinct_count=25, histogram=[(0, 59916), (1, 59841), (2, 59952), (3, 59849), (4, 59969), (5, 60471), (6, 60316), (7, 60153), (8, 60215), (9, 60236), (10, 60101), (11, 60056), (12, 59757), (13, 59909), (14, 59476), (15, 59834), (16, 59796), (17, 59788), (18, 60065), (19, 60048), (20, 59803), (21, 59997), (22, 60065), (23, 60381), (24, 60006)]
Column --> c_phone
Collected stats for c_phone: min=10-100-106-1617, m

In [31]:
# Save the TPC-H schema and the collected TPC-H statistics to pickle files
# in the current working directory.

# Schema dictionary: table name -> list of (column_name, data_type) tuples.
with open('tpch_schema.pkl', 'wb') as f:
    pickle.dump(tpch_schema, f)

# Per-table statistics gathered by collect_stats, keyed by table name.
with open('tpch10_stats.pkl', 'wb') as f:  
    pickle.dump(tpch_stats, f)    
    
