In [9]:
from pathlib import Path
import duckdb
import pandas as pd
from project_config import DUCKDB_FILE


def describe_duckdb_table(db_path: str | Path, table_name: str) -> pd.DataFrame:
    con = duckdb.connect(database=db_path, read_only=True)

    columns_df = con.execute(f"PRAGMA table_info('{table_name}')").fetchdf()
    columns = columns_df["name"].tolist()

    summary_rows = []

    for col in columns:
        row = con.execute(
            f"""
            SELECT
                '{col}' AS column,
                COUNT("{col}") AS non_null,
                COUNT(*) - COUNT("{col}") AS nulls,
                MIN(LENGTH(CAST("{col}" AS VARCHAR))) AS min_len,
                MAX(LENGTH(CAST("{col}" AS VARCHAR))) AS max_len,
                AVG(LENGTH(CAST("{col}" AS VARCHAR))) AS avg_len
            FROM {table_name}
        """
        ).fetchone()
        summary_rows.append(row)

    summary_df = pd.DataFrame(
        summary_rows,
        columns=["column", "non_null", "nulls", "min_len", "max_len", "avg_len"],
    )
    con.close()
    return summary_df


# Example usage: profile each column of the job_urls table and print the result.
summary = describe_duckdb_table(db_path=DUCKDB_FILE, table_name="job_urls")
print(summary)

        column  non_null  nulls  min_len  max_len     avg_len
0          url        19      0       47      241  122.684211
1      company        19      0        3       18    9.578947
2    job_title        19      0       19       71   40.684211
3  source_file        19      0       67       67   67.000000
4        stage        19      0       13       13   13.000000
5    timestamp        19      0       26       26   26.000000


In [22]:
from pathlib import Path
import duckdb
from project_config import DUCKDB_FILE as db_path


def describe_duckdb_table(db_path: str | Path, table_name: str) -> pd.DataFrame:
    con = duckdb.connect(database=db_path, read_only=True)

    columns_df = con.execute(f"PRAGMA table_info('{table_name}')").fetchdf()
    columns = columns_df["name"].tolist()

    summary_rows = []

    for col in columns:
        row = con.execute(
            f"""
            SELECT
                '{col}' AS column,
                COUNT("{col}") AS non_null,
                COUNT(*) - COUNT("{col}") AS nulls,
                MIN(LENGTH(CAST("{col}" AS VARCHAR))) AS min_len,
                MAX(LENGTH(CAST("{col}" AS VARCHAR))) AS max_len,
                AVG(LENGTH(CAST("{col}" AS VARCHAR))) AS avg_len
            FROM {table_name}
        """
        ).fetchone()
        summary_rows.append(row)

    summary_df = pd.DataFrame(
        summary_rows,
        columns=["column", "non_null", "nulls", "min_len", "max_len", "avg_len"],
    )
    con.close()
    return summary_df


def print_columns(
    db_path: Path | str, table_name: str, columns: list[str], limit: int = 10
):
    con = duckdb.connect(str(db_path))

    col_str = ", ".join(columns)
    query = f"SELECT {col_str} FROM {table_name} LIMIT {limit}"

    df = con.execute(query).fetchdf()
    print(df)

    con.close()


# Profile the job_postings table, then preview its url and content columns.
summary = describe_duckdb_table(db_path=db_path, table_name="job_postings")
print(summary)

print_columns(db_path, table_name="job_postings", columns=["url", "content"])

         column  non_null  nulls  min_len  max_len      avg_len
0           url        38      0       44      377   145.631579
1        status        38      0        7        7     7.000000
2       message        30      8       37       37    37.000000
3     job_title        38      0       18       71    39.973684
4       company        38      0        3       25     9.921053
5      location        27     11       10      693    74.592593
6   salary_info        25     13       20      323    81.840000
7   posted_date         9     29        0       19     8.777778
8       content        38      0      229     9045  3754.789474
9   source_file        38      0       61       61    61.000000
10        stage        38      0       13       13    13.000000
11    timestamp        38      0       26       26    26.000000
                                                 url  \
0  https://www.google.com/about/careers/applicati...   
1  https://www.capitalonecareers.com/job/-/-/234/...   
