In [2]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
InteractiveShell.ast_node_interactivity = "all"
# pip install dwutils@git+ssh://git@gitlab.data.trade.gov.uk/ddatdatascienceteam/data-workspace-utilities.git@latest

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import logging

from pandas import DataFrame
from typing import Optional
from sqlglot import parse_one

from sqlalchemy import select
from sqlalchemy.dialects import postgresql

import cmf
from cmf.helpers import selector
from cmf.data.utils import sqa_profiled
from cmf.helpers.selector import _parent_to_tree, _tree_to_reachable_stmt, _reachable_to_parent_data_stmt, _selector_to_data
from cmf.data import ENGINE

def create_cmf_pipelines_logger() -> logging.Logger:
    """Configure the ``cmf_pipelines`` and ``cmf_logic`` loggers for notebook use.

    Both loggers are set to ``INFO`` and share a single ``StreamHandler``.
    The handler is tagged with a name so that calling this function again
    (for example when the cell is re-run) reuses it instead of attaching
    another copy, which would print every log record several times.

    Returns:
        The ``cmf_pipelines`` logger.
    """
    handler_name = "cmf_notebook_stream"
    log_format = "[%(asctime)s: %(levelname)s] %(name)s %(module)s: %(message)s"

    pipeline_logger = logging.getLogger("cmf_pipelines")
    logic_logger = logging.getLogger("cmf_logic")
    loggers = (pipeline_logger, logic_logger)

    # Look for a handler attached by a previous call before creating a new one.
    handler = next(
        (
            existing
            for configured in loggers
            for existing in configured.handlers
            if existing.get_name() == handler_name
        ),
        None,
    )
    if handler is None:
        handler = logging.StreamHandler()
        handler.set_name(handler_name)
        handler.setFormatter(logging.Formatter(log_format))

    for configured in loggers:
        configured.setLevel(logging.INFO)
        if handler not in configured.handlers:
            configured.addHandler(handler)

    return pipeline_logger


logger = create_cmf_pipelines_logger()

# Speeding up queries

Everything is slower than I thought. Let's profile it.

When I compiled the statement and ran it directly, it was faster. Now it is also faster when I run it through SQLAlchemy. All I can conclude is that overall database load is what is skewing these timings.

In [13]:
%time

# SAMPLE = 10_000
_SOURCE_L = "naive_data_hub_v1"

dh_selector = selector(
    table="dbt.data_hub__companies",
    fields=["name", "company_number", "address_postcode"],
)

with sqa_profiled():
    dh_raw = cmf.query(
        selector=dh_selector, return_type="pandas", model=_SOURCE_L#, limit=SAMPLE
    )

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs
         1143820 function calls (1143121 primitive calls) in 186.151 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.076    0.076  186.151  186.151 /home/theia/company-matching/cmf/helpers/selector.py:297(query)
        1    0.046    0.046  185.713  185.713 /opt/conda/envs/company_matching/lib/python3.9/site-packages/pandas/io/sql.py:570(read_sql)
        1    0.000    0.000  185.663  185.663 /opt/conda/envs/company_matching/lib/python3.9/site-packages/pandas/io/sql.py:1779(read_query)
       15    0.000    0.000  180.903   12.060 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/sql/elements.py:505(_execute_on_connection)
       15    0.000    0.000  180.903   12.060 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/engine/base.py:1593(_execute_clauseelement)
       14    0.000    0.000  180.902   12

In [10]:
# Preview the first three rows returned by the Data Hub query.
dh_raw.head(3)

Unnamed: 0,cluster_sha1,data_sha1,dbt_data_hub__companies_name,dbt_data_hub__companies_company_number,dbt_data_hub__companies_address_postcode
0,b'\x00\x00\x8a\x9c\x95\xa8\xf2\xd4c=:pa\x86\xa...,b'\x00\x00\x8a\x9c\x95\xa8\xf2\xd4c=:pa\x86\xa...,COLOUR TONE MASTERBATCH LIMITED,3176153.0,CF83 8YE
1,"b'\x00\x05\xce\xde\r\xf1\xa4""$\x94b6\xd7g\xe0\...","b'\x00\x05\xce\xde\r\xf1\xa4""$\x94b6\xd7g\xe0\...",Fourays Fashion,,NG7 5GQ
2,"b'\x00\x07\x93\xb9,\xcf\xcd\xe1h\xad\x8f\x88!\...","b'\x00\x07\x93\xb9,\xcf\xcd\xe1h\xad\x8f\x88!\...",JFL BROACHES & BROACHING LIMITED,6917128.0,MK17 8UR


In [14]:
%time

SAMPLE = 10_000
_SOURCE_L = "naive_export_wins_v1"

ew_selector = selector(
    table="dbt.export_wins__wins_dataset",
    fields=["company_name", "cdms_reference"],
)

with sqa_profiled():
    ew_raw = cmf.query(
        selector=ew_selector, return_type="sqlalchemy", model=_SOURCE_L, limit=SAMPLE
    )

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
         24468 function calls (23914 primitive calls) in 253.095 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  253.095  253.095 /home/theia/company-matching/cmf/helpers/selector.py:297(query)
       14    0.000    0.000  253.074   18.077 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/engine/base.py:1377(execute)
       14    0.000    0.000  253.074   18.077 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/sql/elements.py:505(_execute_on_connection)
       14    0.000    0.000  253.074   18.077 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/engine/base.py:1593(_execute_clauseelement)
       14    0.000    0.000  253.066   18.076 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/engine/base.py:1794(_execute_context)
       14    0.000   

In [12]:
# Check which object return_type="sqlalchemy" hands back.
type(ew_raw)

sqlalchemy.engine.result.ChunkedIteratorResult

# Export wins

This takes ages here, but runs VERY fast when executed directly in PostgreSQL.

In [15]:
# Build the Export Wins statement by hand with the private helpers from
# cmf.helpers.selector, so the generated SQL can be inspected.
model = "naive_export_wins_v1"
ew_selector = selector(
    fields=["company_name", "cdms_reference"],
    table="dbt.export_wins__wins_dataset",
)

# Resolve the model into its parent and children; an empty parent means the
# model name is unknown.
parent, child = _parent_to_tree(model, engine=ENGINE)
if not len(parent):
    raise ValueError(f"Model {model} not found")

# Lookup from each reachable child to the parent cluster.
tree = [parent] + child
reachable_stmt = _tree_to_reachable_stmt(tree)
lookup_stmt = _reachable_to_parent_data_stmt(reachable_stmt, parent)

# Raw source data for the selected fields, wrapped as a CTE.
data_stmt = _selector_to_data(ew_selector, engine=ENGINE).cte()

# Attach the cluster to each raw data row by joining on the data hash.
cluster_sha1 = lookup_stmt.c.parent.label("cluster_sha1")
matches_data_row = lookup_stmt.c.child == data_stmt.c.data_sha1
final_stmt = select(cluster_sha1, data_stmt).join(lookup_stmt, matches_data_row)

In [33]:
# Render final_stmt as literal PostgreSQL text and parse it with sqlglot.
with ENGINE.connect() as conn:
    compiled = final_stmt.compile(
        dialect=postgresql.dialect(),
        compile_kwargs={"render_postcompile": True},
    )
    # The raw DBAPI cursor is only needed for mogrify(), which inlines the
    # bound parameters; the `with` block closes it rather than leaking it.
    with conn.connection.cursor() as cursor:
        compiled_bound = cursor.mogrify(str(compiled), compiled.params)
    sql = parse_one(compiled_bound.decode("utf-8"))


In [42]:
# Pretty-print the parsed statement in the PostgreSQL dialect.
print(sql.sql(dialect="postgres", pretty=True))

WITH RECURSIVE source_data_unnested AS (
  SELECT
    _team_cmf.cmf__source_data.sha1 AS sha1,
    UNNEST(_team_cmf.cmf__source_data.id) AS id,
    _team_cmf.cmf__source_data.dataset AS dataset
  FROM _team_cmf.cmf__source_data
), anon_1 AS (
  SELECT
    source_data_unnested.sha1 AS data_sha1,
    dbt.export_wins__wins_dataset.company_name AS dbt_export_wins__wins_dataset_company_name,
    dbt.export_wins__wins_dataset.cdms_reference AS dbt_export_wins__wins_dataset_cdms_reference
  FROM source_data_unnested
  LEFT OUTER JOIN dbt.export_wins__wins_dataset
    ON source_data_unnested.id = CAST(dbt.export_wins__wins_dataset.id AS VARCHAR)
    AND source_data_unnested.dataset = CAST(CAST('cc89099f-d065-49cc-aa45-e08e1db6653a' AS UUID) AS UUID)
  WHERE
    NOT dbt.export_wins__wins_dataset.id IS NULL
), allowed AS (
  SELECT
    _team_cmf.cmf__ddupe_contains.parent AS parent,
    _team_cmf.cmf__ddupe_contains.child AS child
  FROM _team_cmf.cmf__ddupe_contains
  JOIN _team_cmf.cmf__cluste

In [41]:
from sqlglot.optimizer import optimize

# Column types passed to sqlglot's optimiser as `schema`. Each table must
# appear exactly once: a repeated key in a dict literal silently overrides the
# earlier entry.
optimiser_schema = {
    "_team_cmf.cmf__source_data": {
        "sha1": "BINARY",
        "id": "STRING",
        "dataset": "UUID",
    },
    "_team_cmf.cmf__clusters": {
        "sha1": "BINARY",
    },
    "_team_cmf.cmf__models_create_clusters": {
        "parent": "BINARY",
        "child": "BINARY",
    },
    "_team_cmf.cmf__models": {
        "sha1": "BINARY",
    },
    "_team_cmf.cmf__link_contains": {
        "parent": "BINARY",
        "child": "BINARY",
    },
    "_team_cmf.cmf__ddupe_contains": {
        "parent": "BINARY",
        "child": "BINARY",
    },
    "dbt.export_wins__wins_dataset": {
        "id": "STRING",
        "company_name": "STRING",
        "cdms_reference": "STRING",
    },
}

optimised = optimize(sql, schema=optimiser_schema)

print(optimised.sql(dialect="postgres", pretty=True))

WITH RECURSIVE "allowed" AS (
  SELECT
    "_team_cmf"."cmf__ddupe_contains"."parent" AS "parent",
    "_team_cmf"."cmf__ddupe_contains"."child" AS "child"
  FROM "_team_cmf"."cmf__ddupe_contains" AS "cmf__ddupe_contains"
  JOIN "_team_cmf"."cmf__clusters" AS "cmf__clusters_1"
    ON "_team_cmf"."cmf__ddupe_contains"."parent" = "cmf__clusters_1"."sha1"
  JOIN "_team_cmf"."cmf__models_create_clusters" AS "cmf__models_create_clusters"
    ON "_team_cmf"."cmf__models_create_clusters"."child" = "cmf__clusters_1"."sha1"
  JOIN "_team_cmf"."cmf__models" AS "cmf__models"
    ON "_team_cmf"."cmf__models"."sha1" = "_team_cmf"."cmf__models_create_clusters"."parent"
    AND "_team_cmf"."cmf__models"."sha1" IN (CAST('\x17a46e0ea365597922a07f6b7bb01b9956807b21' AS BYTEA))
  UNION
  SELECT
    "_team_cmf"."cmf__link_contains"."parent" AS "parent",
    "_team_cmf"."cmf__link_contains"."child" AS "child"
  FROM "_team_cmf"."cmf__link_contains" AS "cmf__link_contains"
  JOIN "_team_cmf"."cmf__clusters"

# Companies House

This should take forever -- it timed out for me.

And yet in pgAdmin it takes 2 minutes. Why?!

In [5]:
# Build the Companies House statement by hand with the private helpers from
# cmf.helpers.selector, then render it as literal SQL for inspection.
model = "naive_companies_house_v1"
ch_selector = selector(
    table="companieshouse.companies",
    fields=["company_name", "company_number", "postcode"],
)

# We want raw data with clusters attached
parent, child = _parent_to_tree(model, engine=ENGINE)
if len(parent) == 0:
    raise ValueError(f"Model {model} not found")
tree = [parent] + child
reachable_stmt = _tree_to_reachable_stmt(tree)
lookup_stmt = _reachable_to_parent_data_stmt(reachable_stmt, parent)
data_stmt = _selector_to_data(ch_selector, engine=ENGINE).cte()

final_stmt = select(lookup_stmt.c.parent.label("cluster_sha1"), data_stmt).join(
    lookup_stmt, lookup_stmt.c.child == data_stmt.c.data_sha1
)

with ENGINE.connect() as conn:
    compiled = final_stmt.compile(
        dialect=postgresql.dialect(),
        compile_kwargs={"render_postcompile": True},
    )
    # The raw DBAPI cursor is only needed for mogrify(), which inlines the
    # bound parameters; the `with` block closes it rather than leaking it.
    with conn.connection.cursor() as cursor:
        compiled_bound = cursor.mogrify(str(compiled), compiled.params)
    sql = parse_one(compiled_bound.decode("utf-8"))

print(sql.sql(dialect="postgres", pretty=True))

WITH RECURSIVE source_data_unnested AS (
  SELECT
    _team_cmf.cmf__source_data.sha1 AS sha1,
    UNNEST(_team_cmf.cmf__source_data.id) AS id,
    _team_cmf.cmf__source_data.dataset AS dataset
  FROM _team_cmf.cmf__source_data
), anon_1 AS (
  SELECT
    source_data_unnested.sha1 AS data_sha1,
    companieshouse.companies.company_name AS companieshouse_companies_company_name,
    companieshouse.companies.company_number AS companieshouse_companies_company_number,
    companieshouse.companies.postcode AS companieshouse_companies_postcode
  FROM source_data_unnested
  LEFT OUTER JOIN companieshouse.companies
    ON source_data_unnested.id = CAST(companieshouse.companies.id AS VARCHAR)
    AND source_data_unnested.dataset = CAST(CAST('592b69e0-ce95-47a6-9f0a-bcd792f214a4' AS UUID) AS UUID)
  WHERE
    NOT companieshouse.companies.id IS NULL
), allowed AS (
  SELECT
    _team_cmf.cmf__ddupe_contains.parent AS parent,
    _team_cmf.cmf__ddupe_contains.child AS child
  FROM _team_cmf.cmf__dd

Maybe let's try running this compiled SQL directly with SQLAlchemy.

In [11]:
%time

from sqlalchemy.orm import Session
from sqlalchemy import text

with sqa_profiled():
    with Session(ENGINE) as session:
        res = session.execute(text(sql.sql(dialect="postgres")))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs
         13915 function calls (12898 primitive calls) in 96.436 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/orm/session.py:2245(execute)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/orm/session.py:2078(_execute_internal)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/engine/base.py:1377(execute)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/sql/elements.py:505(_execute_on_connection)
        1    0.000    0.000   96.429   96.429 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/engine/base.py:1593(_execute

In [12]:
# Fetch the first row to confirm the raw SQL returned data.
res.first()

(<memory at 0x7f29b60e8ac0>, <memory at 0x7f29b60e8940>, 'ARCADE GEEKS INT LTD', '13231865', 'DY13 9RH')