In [1]:
# Reload imported modules before each cell executes, so edits to library code are picked up.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# pip install dwutils@git+ssh://git@gitlab.data.trade.gov.uk/ddatdatascienceteam/data-workspace-utilities.git@latest

In [43]:
import logging

# import connectorx as cx
from pandas import DataFrame
import pandas as pd
from typing import Optional
from sqlglot import parse_one

from sqlalchemy import select
from sqlalchemy.dialects import postgresql

import cmf
from cmf.helpers import selector
from cmf.data.utils import sqa_profiled
from cmf.helpers.selector import _parent_to_tree, _tree_to_reachable_stmt, _reachable_to_parent_data_stmt, _selector_to_data
from cmf.data import ENGINE

def create_cmf_pipelines_logger() -> logging.Logger:
    """Configure the ``cmf_pipelines`` and ``cmf_logic`` loggers for notebook use.

    Both loggers are set to ``INFO`` and share one ``StreamHandler`` whose
    format shows timestamp, level, logger name and module.

    The function is idempotent: the shared handler is tagged with a name and
    reused on later calls, so re-running the defining cell does not attach a
    second handler and duplicate every log line.

    Returns:
        The ``cmf_pipelines`` logger.
    """
    handler_name = "cmf_notebook_stream"
    pipeline_logger = logging.getLogger("cmf_pipelines")
    logic_logger = logging.getLogger("cmf_logic")
    targets = (pipeline_logger, logic_logger)

    # Reuse the handler installed by a previous call, if there is one.
    handler = next(
        (
            existing
            for target in targets
            for existing in target.handlers
            if existing.get_name() == handler_name
        ),
        None,
    )
    if handler is None:
        handler = logging.StreamHandler()
        handler.set_name(handler_name)
        handler.setFormatter(
            logging.Formatter(
                "[%(asctime)s: %(levelname)s] %(name)s %(module)s: %(message)s"
            )
        )

    for target in targets:
        target.setLevel(logging.INFO)
        if handler not in target.handlers:
            target.addHandler(handler)

    return pipeline_logger

# Notebook-level handle on the configured "cmf_pipelines" logger.
logger = create_cmf_pipelines_logger()

# Speeding up queries

Everything is slower than I thought. Let's profile it.

When I compiled the statement and ran it directly, it was faster. Now when I run it through SQLAlchemy, it's faster as well. All I can conclude is that overall database load is what's distorting these timings.

In [44]:
%time

# SAMPLE = 10_000
_SOURCE_L = "naive_data_hub_v1"

dh_selector = selector(
    table="dbt.data_hub__companies",
    fields=["name", "company_number", "address_postcode"],
)

with sqa_profiled():
    dh_raw = cmf.query(
        selector=dh_selector, return_type="pandas", model=_SOURCE_L#, limit=SAMPLE
    )

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs


NameError: name 'ew_selector' is not defined

In [10]:
# Preview the first three rows of the Data Hub query result.
dh_raw.head(3)

Unnamed: 0,cluster_sha1,data_sha1,dbt_data_hub__companies_name,dbt_data_hub__companies_company_number,dbt_data_hub__companies_address_postcode
0,b'\x00\x00\x8a\x9c\x95\xa8\xf2\xd4c=:pa\x86\xa...,b'\x00\x00\x8a\x9c\x95\xa8\xf2\xd4c=:pa\x86\xa...,COLOUR TONE MASTERBATCH LIMITED,3176153.0,CF83 8YE
1,"b'\x00\x05\xce\xde\r\xf1\xa4""$\x94b6\xd7g\xe0\...","b'\x00\x05\xce\xde\r\xf1\xa4""$\x94b6\xd7g\xe0\...",Fourays Fashion,,NG7 5GQ
2,"b'\x00\x07\x93\xb9,\xcf\xcd\xe1h\xad\x8f\x88!\...","b'\x00\x07\x93\xb9,\xcf\xcd\xe1h\xad\x8f\x88!\...",JFL BROACHES & BROACHING LIMITED,6917128.0,MK17 8UR


In [None]:
# Same Data Hub query as the profiled cell, run without the profiler.
# Relies on dh_selector and _SOURCE_L from that cell.
dh_raw = cmf.query(
    selector=dh_selector,
    return_type="pandas",
    model=_SOURCE_L,  # limit=SAMPLE
)

In [6]:
%time

SAMPLE = 10_000
_SOURCE_L = "naive_export_wins_v1"

ew_selector = selector(
    table="dbt.export_wins__wins_dataset",
    fields=["company_name", "cdms_reference"],
)

# with sqa_profiled():
#     ew_raw = cmf.query(
#         selector=ew_selector, return_type="sqlalchemy", model=_SOURCE_L, limit=SAMPLE
#     )

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs


In [12]:
# Check what cmf.query hands back for return_type="sqlalchemy".
type(ew_raw)

sqlalchemy.engine.result.ChunkedIteratorResult

# Export wins

Takes ages here, but runs VERY fast in PG directly.

In [4]:
# Assemble the "raw data with clusters attached" statement for Export Wins by
# calling the selector helpers directly, so the SQL can be inspected and timed.
model = "naive_export_wins_v1"
ew_selector = selector(
    fields=["company_name", "cdms_reference"],
    table="dbt.export_wins__wins_dataset",
)

# An empty `parent` means the model name did not resolve.
parent, child = _parent_to_tree(model, engine=ENGINE)
if not len(parent):
    raise ValueError(f"Model {model} not found")
tree = [parent] + child

reachable_stmt = _tree_to_reachable_stmt(tree)
lookup_stmt = _reachable_to_parent_data_stmt(reachable_stmt, parent)
data_stmt = _selector_to_data(ew_selector, engine=ENGINE).cte()

# Label the lookup's parent as cluster_sha1 and join on child == data_sha1.
final_stmt = select(
    lookup_stmt.c.parent.label("cluster_sha1"),
    data_stmt,
).join(
    lookup_stmt,
    lookup_stmt.c.child == data_stmt.c.data_sha1,
)

In [6]:
# Render final_stmt as PostgreSQL with its bound parameters inlined, then parse
# the resulting text with sqlglot.
compiled = final_stmt.compile(
    dialect=postgresql.dialect(),
    compile_kwargs={"render_postcompile": True},
)
with ENGINE.connect() as conn:
    # The raw DBAPI cursor is only needed for mogrify(). Using it as a context
    # manager closes it instead of leaving it open.
    with conn.connection.cursor() as cursor:
        compiled_bound = cursor.mogrify(str(compiled), compiled.params)
sql = parse_one(compiled_bound.decode("utf-8"))


In [14]:
from cmf.data.utils import get_schema_table_names, string_to_dataset, string_to_table
from sqlalchemy import LABEL_STYLE_TABLENAME_PLUS_COL
from sqlalchemy.orm import Session

def selector_to_datatypes(selector, engine):
    """Collect the pyarrow-backed pandas dtype name of every selected column.

    For each table in ``selector`` one row of the requested fields is read
    and passed through ``convert_dtypes(dtype_backend="pyarrow")``. The dtype
    names are gathered under the column labels produced by
    ``LABEL_STYLE_TABLENAME_PLUS_COL``.

    Args:
        selector: Mapping whose keys are table identifiers accepted by
            ``get_schema_table_names`` and whose values are the fields to
            read from that table.
        engine: SQLAlchemy engine, passed to ``string_to_table`` and used to
            run the one-row probe query.

    Returns:
        dict mapping each labelled column to its dtype name.
    """
    types_dict = {}
    for schema_table, fields in selector.items():
        db_schema, db_table_name = get_schema_table_names(schema_table)
        table = string_to_table(db_schema, db_table_name, engine=engine)
        stmt = (
            select(table.c[tuple(fields)])
            .limit(1)
            .set_label_style(LABEL_STYLE_TABLENAME_PLUS_COL)
        )
        # Use the engine the caller passed in, not the notebook-global ENGINE.
        with Session(engine) as session:
            res = pd.read_sql(stmt, session.bind).convert_dtypes(
                dtype_backend="pyarrow"
            )
        types_dict |= res.dtypes.apply(lambda x: x.name).to_dict()

    return types_dict


In [14]:
from sqlalchemy.orm import Session

# Pull a single Export Wins row and convert it to pyarrow-backed dtypes.
with Session(ENGINE) as session:
    one_row_stmt = _selector_to_data(ew_selector, engine=ENGINE).limit(1)
    res = pd.read_sql(one_row_stmt, session.bind).convert_dtypes(
        dtype_backend="pyarrow"
    )

res

Unnamed: 0,data_sha1,dbt_export_wins__wins_dataset_company_name,dbt_export_wins__wins_dataset_cdms_reference
0,b'O\xa7\xd1k\x0f\xed\xb6R\xe4X-w\x01ag\xaam\xa...,Veolia Nuclear Solutions,ORG-10039882


In [41]:
%time

import io

selector_dtypes = selector_to_datatypes(ew_selector, ENGINE)
default_dtypes =  {
    "cluster_sha1": "string[pyarrow]",
    "data_sha1": "string[pyarrow]"
}

with ENGINE.connect() as conn:
    cursor = conn.connection.cursor()
    compiled = final_stmt.compile(
        dialect=postgresql.dialect(),
        compile_kwargs={"render_postcompile": True}
    )
    compiled_bound = cursor.mogrify(str(compiled), compiled.params)
    sql = compiled_bound.decode("utf-8")
    copy_sql = f"copy ({sql}) to stdout with csv header"

    store = io.StringIO()
    cursor.copy_expert(copy_sql, store)
    store.seek(0)
    
    res = pd.read_csv(store, dtype=default_dtypes | selector_dtypes)

    if "data_sha1" in res.columns:
        res.data_sha1 = res.data_sha1.str[2:].apply(bytes.fromhex)
        res.data_sha1 = res.data_sha1.astype("binary[pyarrow]")
    if "cluster_sha1" in res.columns:
        res.cluster_sha1 = res.cluster_sha1.str[2:].apply(bytes.fromhex)
        res.cluster_sha1 = res.cluster_sha1.astype("binary[pyarrow]")
    
res.head(3)
res.info()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs


0

Unnamed: 0,cluster_sha1,data_sha1,dbt_export_wins__wins_dataset_company_name,dbt_export_wins__wins_dataset_cdms_reference
0,"b'\x02\xd3\xc3\xfb\x01KnG\xc9,\x07\xb1\xc1\x11...","b'\x02\xd3\xc3\xfb\x01KnG\xc9,\x07\xb1\xc1\x11...",ETA Green Power Limited,Companies House ref: 12359858
1,b'\x04\xdfY\xad\xadtT\x1b\xed\xfd\x06w\xe9J\xf...,b'&\x04\x9a\xda~v\xbeu?F\xf0\xfd\x92\xa7IP\xfa...,Med-Eq (Europe) Ltd,ORG-10109781
2,b'\x06\xc1S\xb5p\x88SZ\xbcV\xd0a\xfbT\xad\xd3g...,"b'\x8cV\xb8[\xac\xa6K,]\xb1\x96\xbf\xfe\x1a\x9...",Silver Lined Horizons Ltd,ORG-10170829


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57658 entries, 0 to 57657
Data columns (total 4 columns):
 #   Column                                        Non-Null Count  Dtype          
---  ------                                        --------------  -----          
 0   cluster_sha1                                  57658 non-null  binary[pyarrow]
 1   data_sha1                                     57658 non-null  binary[pyarrow]
 2   dbt_export_wins__wins_dataset_company_name    57658 non-null  string         
 3   dbt_export_wins__wins_dataset_cdms_reference  57571 non-null  string         
dtypes: binary[pyarrow](2), string(2)
memory usage: 5.2 MB


20

In [36]:
# Drop the two-character prefix of the stored text and decode the remaining hex digits.
# NOTE(review): `df` is not defined by any earlier cell visible here; presumably a
# frame holding cluster_sha1 as text. Confirm where it comes from.
bytes.fromhex(df.cluster_sha1[0][2:])

b'\x02\xd3\xc3\xfb\x01KnG\xc9,\x07\xb1\xc1\x11N\xd7\xb6\xcb\x1bq'

In [23]:
# Encoding the stored text directly keeps the literal "\x" prefix and hex digits,
# so it does not give the digest bytes.
x = df.cluster_sha1[0]
bytes(x.encode("utf-8"))

b'\\x02d3c3fb014b6e47c92c07b1c1114ed7b6cb1b71'

In [30]:
# Same value without the bytes() wrapper: str.encode() already returns bytes.
x.encode("utf-8")

b'\\x02d3c3fb014b6e47c92c07b1c1114ed7b6cb1b71'

In [29]:
import hashlib
# Length in bytes of a SHA-1 digest, for comparison with the decoded *_sha1 values.
len(hashlib.sha1().digest())

20

In [16]:
%time

df = cx.read_sql(
    f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}", 
    compiled_bound.decode("utf-8"), 
    return_type="arrow"
)
df

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 10.5 µs


pyarrow.Table
cluster_sha1: large_binary
data_sha1: large_binary
dbt_export_wins__wins_dataset_company_name: string
dbt_export_wins__wins_dataset_cdms_reference: string
----
cluster_sha1: [[02D3C3FB014B6E47C92C07B1C1114ED7B6CB1B71,04DF59ADAD74541BEDFD0677E94AF4097B808FD0,06C153B57088535ABC56D061FB54ADD36795CF5A,0AA1B08A949FA0743EE7175AF7ED5FCF315397FC,0F4ED090B49C9AB5DBF7F9AA4849F466B4F654CB,...,F86BCAFA29797916160A37F25ED8AA34B70C0FCE,F8FB7C510BF37E463D0BE09101BC29EE13CA8A71,FB74D423C8D75AAAFF822C60A44BB7E704820A3E,FC2D37EF721A256FD4C88CD1F5D3722C7192C047,FFA426C5EEC58E7630A0849F1039416B3C071AE8]]
data_sha1: [[02D3C3FB014B6E47C92C07B1C1114ED7B6CB1B71,26049ADA7E76BE753F46F0FD92A74950FAD49762,8C56B85BACA64B2C5DB196BFFE1A952B65B5A039,4B6181C1E38124BBED5DB64F3B74C6278531A824,0F4ED090B49C9AB5DBF7F9AA4849F466B4F654CB,...,7CBF24EED28CF959FAE9B1250518B59847643755,330BB80215A604659B5F3D897B26AE7C5C88E220,87581D5F0D58AF9E1A40DBDF2AF711C534A6AC81,1129391B5B703884860DC15E27386F9DC7A0B41B,37EEEADA

In [None]:
%time

df = cx.read_sql(
    f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}", 
    sql.sql(dialect="postgres"), 
    return_type="polars"
)
df.head(3)

In [42]:
# Pretty-print the parsed statement as PostgreSQL for reading and copy-pasting.
print(sql.sql(dialect="postgres", pretty=True))

WITH RECURSIVE source_data_unnested AS (
  SELECT
    _team_cmf.cmf__source_data.sha1 AS sha1,
    UNNEST(_team_cmf.cmf__source_data.id) AS id,
    _team_cmf.cmf__source_data.dataset AS dataset
  FROM _team_cmf.cmf__source_data
), anon_1 AS (
  SELECT
    source_data_unnested.sha1 AS data_sha1,
    dbt.export_wins__wins_dataset.company_name AS dbt_export_wins__wins_dataset_company_name,
    dbt.export_wins__wins_dataset.cdms_reference AS dbt_export_wins__wins_dataset_cdms_reference
  FROM source_data_unnested
  LEFT OUTER JOIN dbt.export_wins__wins_dataset
    ON source_data_unnested.id = CAST(dbt.export_wins__wins_dataset.id AS VARCHAR)
    AND source_data_unnested.dataset = CAST(CAST('cc89099f-d065-49cc-aa45-e08e1db6653a' AS UUID) AS UUID)
  WHERE
    NOT dbt.export_wins__wins_dataset.id IS NULL
), allowed AS (
  SELECT
    _team_cmf.cmf__ddupe_contains.parent AS parent,
    _team_cmf.cmf__ddupe_contains.child AS child
  FROM _team_cmf.cmf__ddupe_contains
  JOIN _team_cmf.cmf__cluste

In [41]:
from sqlglot.optimizer import optimize

# Column types for each table referenced by the statement, handed to sqlglot's
# optimiser. The "_team_cmf.cmf__clusters" entry used to appear twice in this
# literal; the second copy silently replaced the first, so one is enough.
# NOTE(review): sqlglot documents nested {db: {table: {...}}} schemas. Confirm
# that these dotted "schema.table" keys are matched as intended.
cmf_schema = {
    "_team_cmf.cmf__source_data": {
        "sha1": "BINARY",
        "id": "STRING",
        "dataset": "UUID",
    },
    "_team_cmf.cmf__clusters": {
        "sha1": "BINARY",
    },
    "_team_cmf.cmf__models_create_clusters": {
        "parent": "BINARY",
        "child": "BINARY",
    },
    "_team_cmf.cmf__models": {
        "sha1": "BINARY",
    },
    "_team_cmf.cmf__link_contains": {
        "parent": "BINARY",
        "child": "BINARY",
    },
    "_team_cmf.cmf__ddupe_contains": {
        "parent": "BINARY",
        "child": "BINARY",
    },
    "dbt.export_wins__wins_dataset": {
        "id": "STRING",
        "company_name": "STRING",
        "cdms_reference": "STRING",
    },
}

optimised = optimize(sql, schema=cmf_schema)

print(optimised.sql(dialect="postgres", pretty=True))

WITH RECURSIVE "allowed" AS (
  SELECT
    "_team_cmf"."cmf__ddupe_contains"."parent" AS "parent",
    "_team_cmf"."cmf__ddupe_contains"."child" AS "child"
  FROM "_team_cmf"."cmf__ddupe_contains" AS "cmf__ddupe_contains"
  JOIN "_team_cmf"."cmf__clusters" AS "cmf__clusters_1"
    ON "_team_cmf"."cmf__ddupe_contains"."parent" = "cmf__clusters_1"."sha1"
  JOIN "_team_cmf"."cmf__models_create_clusters" AS "cmf__models_create_clusters"
    ON "_team_cmf"."cmf__models_create_clusters"."child" = "cmf__clusters_1"."sha1"
  JOIN "_team_cmf"."cmf__models" AS "cmf__models"
    ON "_team_cmf"."cmf__models"."sha1" = "_team_cmf"."cmf__models_create_clusters"."parent"
    AND "_team_cmf"."cmf__models"."sha1" IN (CAST('\x17a46e0ea365597922a07f6b7bb01b9956807b21' AS BYTEA))
  UNION
  SELECT
    "_team_cmf"."cmf__link_contains"."parent" AS "parent",
    "_team_cmf"."cmf__link_contains"."child" AS "child"
  FROM "_team_cmf"."cmf__link_contains" AS "cmf__link_contains"
  JOIN "_team_cmf"."cmf__clusters"

# Companies House

This takes forever from the notebook -- it timed out for me.

And yet in pgAdmin the same query finishes in about 2 minutes. Why?

In [5]:
# NOTE: this cell rebinds model, parent, child, tree, the *_stmt names,
# compiled, compiled_bound and sql, which the Export Wins cells also set.
model = "naive_companies_house_v1"
ch_selector = selector(
    table="companieshouse.companies",
    fields=["company_name", "company_number", "postcode"],
)

# We want raw data with clusters attached
parent, child = _parent_to_tree(model, engine=ENGINE)
if len(parent) == 0:
    raise ValueError(f"Model {model} not found")
tree = [parent] + child
reachable_stmt = _tree_to_reachable_stmt(tree)
lookup_stmt = _reachable_to_parent_data_stmt(reachable_stmt, parent)
data_stmt = _selector_to_data(ch_selector, engine=ENGINE).cte()

final_stmt = select(lookup_stmt.c.parent.label("cluster_sha1"), data_stmt).join(
    lookup_stmt, lookup_stmt.c.child == data_stmt.c.data_sha1
)

# Render with bound parameters inlined, then parse with sqlglot.
compiled = final_stmt.compile(
    dialect=postgresql.dialect(),
    compile_kwargs={"render_postcompile": True},
)
with ENGINE.connect() as conn:
    # The raw DBAPI cursor is only needed for mogrify(). Using it as a context
    # manager closes it instead of leaving it open.
    with conn.connection.cursor() as cursor:
        compiled_bound = cursor.mogrify(str(compiled), compiled.params)
sql = parse_one(compiled_bound.decode("utf-8"))

print(sql.sql(dialect="postgres", pretty=True))

WITH RECURSIVE source_data_unnested AS (
  SELECT
    _team_cmf.cmf__source_data.sha1 AS sha1,
    UNNEST(_team_cmf.cmf__source_data.id) AS id,
    _team_cmf.cmf__source_data.dataset AS dataset
  FROM _team_cmf.cmf__source_data
), anon_1 AS (
  SELECT
    source_data_unnested.sha1 AS data_sha1,
    companieshouse.companies.company_name AS companieshouse_companies_company_name,
    companieshouse.companies.company_number AS companieshouse_companies_company_number,
    companieshouse.companies.postcode AS companieshouse_companies_postcode
  FROM source_data_unnested
  LEFT OUTER JOIN companieshouse.companies
    ON source_data_unnested.id = CAST(companieshouse.companies.id AS VARCHAR)
    AND source_data_unnested.dataset = CAST(CAST('592b69e0-ce95-47a6-9f0a-bcd792f214a4' AS UUID) AS UUID)
  WHERE
    NOT companieshouse.companies.id IS NULL
), allowed AS (
  SELECT
    _team_cmf.cmf__ddupe_contains.parent AS parent,
    _team_cmf.cmf__ddupe_contains.child AS child
  FROM _team_cmf.cmf__dd

Maybe let's try running this compiled SQL directly with SQLAlchemy.

In [11]:
%time

from sqlalchemy.orm import Session
from sqlalchemy import text

with sqa_profiled():
    with Session(ENGINE) as session:
        res = session.execute(text(sql.sql(dialect="postgres")))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs
         13915 function calls (12898 primitive calls) in 96.436 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/orm/session.py:2245(execute)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/orm/session.py:2078(_execute_internal)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/engine/base.py:1377(execute)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/sql/elements.py:505(_execute_on_connection)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/engine/base.py:1593(_execute

In [12]:
# Fetch the first row of the executed statement to confirm data came back.
res.first()

(<memory at 0x7f29b60e8ac0>, <memory at 0x7f29b60e8940>, 'ARCADE GEEKS INT LTD', '13231865', 'DY13 9RH')