#### Test postgres DB, collect table statistics

In [1]:
import psycopg2
import numpy as np
from pg_utils import *

#### Connect to the postgres database

In [2]:
conn = create_connection()
close_connection(conn)

### Fetch some data and print it out

In [3]:
def execute_query(conn, query, print_results=True):
    # Create a cursor object
    cur = conn.cursor()

    # Execute a query
    cur.execute(query)

    # Fetch and print the results
    rows = cur.fetchall()

    if print_results:
        for row in rows:
            print(row)

    # Close the cursor
    cur.close()

    return rows


query = """
        SELECT * 
        FROM lineorder
        LIMIT 10 
        """

conn = create_connection()
execute_query(conn, query)
close_connection(conn)


(1, 1, 257368, 465569, 8273, datetime.date(1995, 2, 18), '2-HIGH', '0', 17, Decimal('2608718'), Decimal('9783671'), Decimal('4'), Decimal('2504369'), Decimal('92072'), Decimal('2'), datetime.date(1995, 3, 31), 'TRUCK')
(1, 2, 257368, 201928, 1630, datetime.date(1995, 2, 18), '2-HIGH', '0', 36, Decimal('6587676'), Decimal('9783671'), Decimal('9'), Decimal('5994785'), Decimal('109794'), Decimal('6'), datetime.date(1995, 4, 16), 'MAIL')
(1, 3, 257368, 191100, 709, datetime.date(1995, 2, 18), '2-HIGH', '0', 8, Decimal('952880'), Decimal('9783671'), Decimal('10'), Decimal('857592'), Decimal('71466'), Decimal('2'), datetime.date(1995, 4, 22), 'REG AIR')
(2, 1, 182365, 318510, 10657, datetime.date(1996, 1, 30), '2-HIGH', '0', 38, Decimal('5808300'), Decimal('22125006'), Decimal('0'), Decimal('5808300'), Decimal('91710'), Decimal('5'), datetime.date(1996, 3, 14), 'RAIL')
(2, 2, 182365, 583525, 6015, datetime.date(1996, 1, 30), '2-HIGH', '0', 30, Decimal('4825500'), Decimal('22125006'), Decimal

#### Gathering table statistics.

In [4]:
# Dictionary mapping table names to a list of tuples (column_name, data_type)
ssb_schema = {
    "lineorder": [
        ("lo_orderkey", "INT"),
        ("lo_linenumber", "INT"),
        ("lo_custkey", "INT"),
        ("lo_partkey", "INT"),
        ("lo_suppkey", "INT"),
        ("lo_orderdate", "DATE"),
        ("lo_orderpriority", "VARCHAR(15)"),
        ("lo_shippriority", "CHAR(1)"),
        ("lo_quantity", "INT"),
        ("lo_extendedprice", "NUMERIC"),
        ("lo_ordtotalprice", "NUMERIC"),
        ("lo_discount", "NUMERIC"),
        ("lo_revenue", "NUMERIC"),
        ("lo_supplycost", "NUMERIC"),
        ("lo_tax", "INT"),
        ("lo_commitdate", "DATE"),
        ("lo_shipmode", "VARCHAR(7)")
    ],
    "part": [
        ("p_partkey", "INT"),
        ("p_name", "VARCHAR(22)"),
        ("p_mfgr", "CHAR(6)"),
        ("p_category", "CHAR(7)"),
        ("p_brand", "CHAR(9)"),
        ("p_color", "VARCHAR(11)"),
        ("p_type", "VARCHAR(25)"),
        ("p_size", "INT"),
        ("p_container", "VARCHAR(10)")
    ],
    "supplier": [
        ("s_suppkey", "INT"),
        ("s_name", "CHAR(25)"),
        ("s_address", "VARCHAR(25)"),
        ("s_city", "CHAR(10)"),
        ("s_nation", "VARCHAR(15)"),
        ("s_region", "VARCHAR(12)"),
        ("s_phone", "CHAR(15)")
    ],
    "customer": [
        ("c_custkey", "INT"),
        ("c_name", "CHAR(18)"),
        ("c_address", "VARCHAR(25)"),
        ("c_city", "CHAR(10)"),
        ("c_nation", "VARCHAR(15)"),
        ("c_region", "VARCHAR(12)"),
        ("c_phone", "CHAR(15)"),
        ("c_mktsegment", "VARCHAR(10)")
    ],
    "dwdate": [
        ("d_datekey", "DATE"),
        ("d_date", "VARCHAR(18)"),
        ("d_dayofweek", "VARCHAR(9)"),
        ("d_month", "VARCHAR(9)"),
        ("d_year", "INT"),
        ("d_yearmonthnum", "INT"),
        ("d_yearmonth", "CHAR(7)"),
        ("d_daynuminweek", "INT"),
        ("d_daynuminmonth", "INT"),
        ("d_daynuminyear", "INT"),
        ("d_monthnuminyear", "INT"),
        ("d_weeknuminyear", "INT"),
        ("d_sellingseason", "VARCHAR(9)"),
        ("d_lastdayinweekfl", "BIT"),
        ("d_lastdayinmonthfl", "BIT"),
        ("d_holidayfl", "BIT"),
        ("d_weekdayfl", "BIT")
    ]
}

# Example usage: print the schema for the 'lineorder' table
print(ssb_schema["lineorder"])


[('lo_orderkey', 'INT'), ('lo_linenumber', 'INT'), ('lo_custkey', 'INT'), ('lo_partkey', 'INT'), ('lo_suppkey', 'INT'), ('lo_orderdate', 'DATE'), ('lo_orderpriority', 'VARCHAR(15)'), ('lo_shippriority', 'CHAR(1)'), ('lo_quantity', 'INT'), ('lo_extendedprice', 'NUMERIC'), ('lo_ordtotalprice', 'NUMERIC'), ('lo_discount', 'NUMERIC'), ('lo_revenue', 'NUMERIC'), ('lo_supplycost', 'NUMERIC'), ('lo_tax', 'INT'), ('lo_commitdate', 'DATE'), ('lo_shipmode', 'VARCHAR(7)')]


In [12]:
def collect_stats(table_name, columns):

    # Connect to the PostgreSQL database
    conn =create_connection()
    
    print(f"Collecting stats...")
    print(f"Table --> {table_name}")

    # Create a cursor
    cursor = conn.cursor()

    # Prepare a dictionary to store statistics
    stats = {}

    # Iterate over each column and its data type
    for column_name, data_type in columns:
        print(f"Column --> {column_name}")

        if 'INT' in data_type or 'DECIMAL' in data_type or 'NUMERIC' in data_type:
            query = f"""
            SELECT MIN({column_name}), MAX({column_name}), COUNT(*), COUNT(DISTINCT {column_name})
            FROM {table_name};
            """
            cursor.execute(query)
            min_val, max_val, total_count, distinct_count = cursor.fetchone()
            stats[column_name] = {
                'min': min_val,
                'max': max_val,
                'total_count': total_count,
                'distinct_count': distinct_count}

            if distinct_count <= 1000:
                histogram_query = f"""
                SELECT {column_name}, COUNT(*) AS frequency
                FROM {table_name}
                GROUP BY {column_name}
                ORDER BY {column_name};
                """
                cursor.execute(histogram_query)
                histogram = cursor.fetchall()
                stats[column_name]['histogram'] = {value: count for value, count in histogram}
                
            else:
                histogram = None
                stats[column_name]['histogram'] = None

            print(f"Collected stats for {column_name}: min={min_val}, max={max_val}, total_count={total_count}, distinct_count={distinct_count}, histogram={histogram}")

        
        elif 'CHAR' in data_type or 'VARCHAR' in data_type:
            query = f"""
            SELECT MIN({column_name}), MAX({column_name}), COUNT(*), COUNT(DISTINCT {column_name})
            FROM {table_name};
            """
            cursor.execute(query)
            min_val, max_val, total_count, distinct_count = cursor.fetchone()
            stats[column_name] = {
                'min': min_val,
                'max': max_val,
                'total_count': total_count,
                'distinct_count': distinct_count}
            
            if distinct_count <= 1000:
                histogram_query = f"""
                SELECT {column_name}, COUNT(*) AS frequency
                FROM {table_name}
                GROUP BY {column_name}
                ORDER BY {column_name};
                """
                cursor.execute(histogram_query)
                histogram = cursor.fetchall()
                stats[column_name]['histogram'] = {value: count for value, count in histogram}
                
            else:
                histogram = None
                stats[column_name]['histogram'] = None

            print(f"Collected stats for {column_name}: min={min_val}, max={max_val}, total_count={total_count}, distinct_count={distinct_count}, histogram={histogram}")
       
        elif 'DATE' in data_type:
            query = f"""
            SELECT MIN({column_name}), MAX({column_name})
            FROM {table_name};
            """
            cursor.execute(query)
            min_date, max_date = cursor.fetchone()
            stats[column_name] = {
                'min': min_date,
                'max': max_date
            }

            print(f"Collected stats for {column_name}: min={min_date}, max={max_date}")


    # Close the database connection
    cursor.close()
    close_connection(conn)

    return stats


In [6]:
table = 'customer'
customer_stats = collect_stats(table, columns=ssb_schema[table])

print(customer_stats)

Collecting stats...
Table --> customer
Column --> c_custkey
Collected stats for c_custkey: min=1, max=300000, total_count=300000, distinct_count=300000, histogram=None
Column --> c_name
Collected stats for c_name: min=Customer#000000001, max=Customer#000300000, total_count=300000, distinct_count=300000, histogram=None
Column --> c_address
Collected stats for c_address: min=   2Ksa,h, max=zzyQcZpC50YD, total_count=300000, distinct_count=300000, histogram=None
Column --> c_city
Collected stats for c_city: min=ALGERIA  0, max=VIETNAM  9, total_count=300000, distinct_count=250, histogram=[('ALGERIA  0', 1161), ('ALGERIA  1', 1181), ('ALGERIA  2', 1173), ('ALGERIA  3', 1225), ('ALGERIA  4', 1191), ('ALGERIA  5', 1208), ('ALGERIA  6', 1176), ('ALGERIA  7', 1187), ('ALGERIA  8', 1154), ('ALGERIA  9', 1186), ('ARGENTINA0', 1229), ('ARGENTINA1', 1155), ('ARGENTINA2', 1218), ('ARGENTINA3', 1203), ('ARGENTINA4', 1166), ('ARGENTINA5', 1205), ('ARGENTINA6', 1166), ('ARGENTINA7', 1190), ('ARGENTINA8

In [13]:
table = 'lineorder'
lineorder_stats = collect_stats(table, columns=ssb_schema[table])

print(lineorder_stats)

Collecting stats...
Table --> lineorder
Column --> lo_orderkey
Collected stats for lo_orderkey: min=1, max=60000000, total_count=59986214, distinct_count=15000000, histogram=None
Column --> lo_linenumber
Collected stats for lo_linenumber: min=1, max=7, total_count=59986214, distinct_count=7, histogram=[(1, 15000000), (2, 12856079), (3, 10710991), (4, 8569709), (5, 6425628), (6, 4283621), (7, 2140186)]
Column --> lo_custkey
Collected stats for lo_custkey: min=1, max=299999, total_count=59986214, distinct_count=200000, histogram=None
Column --> lo_partkey
Collected stats for lo_partkey: min=1, max=600000, total_count=59986214, distinct_count=600000, histogram=None
Column --> lo_suppkey
Collected stats for lo_suppkey: min=1, max=20000, total_count=59986214, distinct_count=20000, histogram=None
Column --> lo_orderdate
Collected stats for lo_orderdate: min=1992-01-01, max=1998-08-02
Column --> lo_orderpriority
Collected stats for lo_orderpriority: min=1-URGENT, max=5-LOW, total_count=599862

In [8]:
table = 'part'
part_stats = collect_stats(table, columns=ssb_schema[table])

print(part_stats)

Collecting stats...
Table --> part
Column --> p_partkey
Collected stats for p_partkey: min=1, max=800000, total_count=800000, distinct_count=800000, histogram=None
Column --> p_name
Collected stats for p_name: min=almond antique, max=yellow white, total_count=800000, distinct_count=8372, histogram=None
Column --> p_mfgr
Collected stats for p_mfgr: min=MFGR#1, max=MFGR#5, total_count=800000, distinct_count=5, histogram=[('MFGR#1', 160027), ('MFGR#2', 159744), ('MFGR#3', 160085), ('MFGR#4', 159933), ('MFGR#5', 160211)]
Column --> p_category
Collected stats for p_category: min=MFGR#11, max=MFGR#55, total_count=800000, distinct_count=25, histogram=[('MFGR#11', 31934), ('MFGR#12', 31882), ('MFGR#13', 32100), ('MFGR#14', 31901), ('MFGR#15', 32210), ('MFGR#21', 31757), ('MFGR#22', 31939), ('MFGR#23', 32265), ('MFGR#24', 32029), ('MFGR#25', 31754), ('MFGR#31', 31908), ('MFGR#32', 32065), ('MFGR#33', 31934), ('MFGR#34', 31984), ('MFGR#35', 32194), ('MFGR#41', 32323), ('MFGR#42', 31967), ('MFGR#

In [9]:
table = 'supplier'
supplier_stats = collect_stats(table, columns=ssb_schema[table])

print(supplier_stats)

Collecting stats...
Table --> supplier
Column --> s_suppkey
Collected stats for s_suppkey: min=1, max=20000, total_count=20000, distinct_count=20000, histogram=None
Column --> s_name
Collected stats for s_name: min=Supplier#000000001       , max=Supplier#000020000       , total_count=20000, distinct_count=20000, histogram=None
Column --> s_address
Collected stats for s_address: min=  3TLxhVMg, max=zzl8o8MHkQAs, total_count=20000, distinct_count=20000, histogram=None
Column --> s_city
Collected stats for s_city: min=ALGERIA  0, max=VIETNAM  9, total_count=20000, distinct_count=250, histogram=[('ALGERIA  0', 70), ('ALGERIA  1', 91), ('ALGERIA  2', 86), ('ALGERIA  3', 67), ('ALGERIA  4', 68), ('ALGERIA  5', 98), ('ALGERIA  6', 80), ('ALGERIA  7', 66), ('ALGERIA  8', 71), ('ALGERIA  9', 90), ('ARGENTINA0', 87), ('ARGENTINA1', 92), ('ARGENTINA2', 84), ('ARGENTINA3', 79), ('ARGENTINA4', 76), ('ARGENTINA5', 86), ('ARGENTINA6', 77), ('ARGENTINA7', 102), ('ARGENTINA8', 76), ('ARGENTINA9', 89), 

In [10]:
table = 'dwdate'
dwdate_stats = collect_stats(table, columns=ssb_schema[table])

print(dwdate_stats)

Collecting stats...
Table --> dwdate
Column --> d_datekey
Collected stats for d_datekey: min=1992-01-01, max=1998-12-30
Column --> d_date
Collected stats for d_date: min=April 1, 1992, max=September 9, 1998, total_count=2556, distinct_count=2556, histogram=None
Column --> d_dayofweek
Collected stats for d_dayofweek: min=Friday, max=Wednesday, total_count=2556, distinct_count=7, histogram=[('Friday', 365), ('Monday', 365), ('Saturday', 365), ('Sunday', 365), ('Thursday', 366), ('Tuesday', 365), ('Wednesday', 365)]
Column --> d_month
Collected stats for d_month: min=April, max=September, total_count=2556, distinct_count=12, histogram=[('April', 210), ('Augest', 217), ('December', 216), ('February', 198), ('January', 217), ('July', 217), ('June', 210), ('March', 217), ('May', 217), ('November', 210), ('Octorber', 217), ('September', 210)]
Column --> d_year
Collected stats for d_year: min=1992, max=1998, total_count=2556, distinct_count=7, histogram=[(1992, 366), (1993, 365), (1994, 365), 

In [14]:

# create an ssb stats dictionary
ssb_stats = {
    "customer": customer_stats,
    "lineorder": lineorder_stats,
    "part": part_stats,
    "supplier": supplier_stats,
    "dwdate": dwdate_stats
}

# save ssb schema and ssb stats to pickle files
import pickle

with open('ssb_schema.pkl', 'wb') as f:
    pickle.dump(ssb_schema, f)

with open('ssb_stats.pkl', 'wb') as f:  
    pickle.dump(ssb_stats, f)    
    
