In [3]:
!pip install dwutils@git+ssh://git@gitlab.data.trade.gov.uk/ddatdatascienceteam/data-workspace-utilities.git@latest

Looking in indexes: https://s3-eu-west-2.amazonaws.com/mirrors.notebook.uktrade.io/pypi/
Collecting dwutils@ git+ssh://****@gitlab.data.trade.gov.uk/ddatdatascienceteam/data-workspace-utilities.git@latest
  Cloning ssh://****@gitlab.data.trade.gov.uk/ddatdatascienceteam/data-workspace-utilities.git (to revision latest) to /tmp/pip-install-e41jcl0i/dwutils_f4b1526497354be2bfcac10880e133e4
  Running command git clone --filter=blob:none --quiet 'ssh://****@gitlab.data.trade.gov.uk/ddatdatascienceteam/data-workspace-utilities.git' /tmp/pip-install-e41jcl0i/dwutils_f4b1526497354be2bfcac10880e133e4
  Resolved ssh://****@gitlab.data.trade.gov.uk/ddatdatascienceteam/data-workspace-utilities.git to commit 20144945565fe9e71c91311da3401156e12095ed
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [1]:
# Load the autoreload extension in mode 2 so that edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
# Echo every top-level expression of a cell, not just the last one (later
# cells rely on this to show both `.head()` and `.info()` output).
InteractiveShell.ast_node_interactivity = "all"

# Massive connected components

The connected-components step crashes on 90m probabilities. We need to be able to handle that volume and more.

In [2]:
import cmf
from cmf import clean
from cmf.clean import steps
from cmf.data.utils import sqa_profiled
from cmf.dedupers import NaiveDeduper
from cmf.helpers import cleaner, cleaners, selector
from cmf.data.results import ClusterResults, ProbabilityResults

import logging

from dwutils import s3

import pandas as pd
from pandas import DataFrame
import pyarrow as pa
import rustworkx as rx

from typing import Optional

def create_cmf_pipelines_logger() -> logging.Logger:
    """Configure the ``cmf_pipelines`` and ``cmf_logic`` loggers for notebook use.

    Both loggers are set to ``INFO`` and share a single stream handler that
    formats records as ``[time: level] name module: message``.

    The function is idempotent: calling it again (for example by re-running
    the notebook cell) reuses the handler it installed earlier instead of
    attaching another one, so log lines are never duplicated.

    Returns:
        The ``cmf_pipelines`` logger.
    """
    handler_name = "cmf_notebook_stream"
    pipeline_logger = logging.getLogger("cmf_pipelines")
    logic_logger = logging.getLogger("cmf_logic")
    targets = (pipeline_logger, logic_logger)

    # Look for a handler installed by a previous call; it is tagged by name.
    handler = next(
        (
            existing
            for target in targets
            for existing in target.handlers
            if existing.get_name() == handler_name
        ),
        None,
    )

    if handler is None:
        handler = logging.StreamHandler()
        handler.set_name(handler_name)
        handler.setFormatter(
            logging.Formatter(
                "[%(asctime)s: %(levelname)s] %(name)s %(module)s: %(message)s"
            )
        )

    for target in targets:
        target.setLevel(logging.INFO)
        # addHandler ignores a handler object that is already attached, so
        # reusing the same instance keeps this loop safe to repeat.
        target.addHandler(handler)

    return pipeline_logger


logger = create_cmf_pipelines_logger()

ignoring unparsable config /home/jovyan/company-matching/pyproject.toml
ignoring unparsable config /home/jovyan/company-matching/pyproject.toml


## Pipeline

In [3]:
# Run name handed to cmf.make_deduper as `dedupe_run_name`.
_NAME = "naive_hmrc_exports_v1"
# Source table: used for the selector, the deduper's `data_source`, and the
# ad-hoc row-count query further down.
_SOURCE = "hmrc.trade__exporters"


def _query(limit: Optional[int] = None) -> DataFrame:
    """Fetch company name and postcode from the source table as a pandas frame.

    Args:
        limit: Forwarded unchanged to ``cmf.query``; defaults to ``None``.

    Returns:
        Whatever ``cmf.query`` returns for the selector with
        ``return_type="pandas"``.
    """
    raw_frame = cmf.query(
        selector=selector(table=_SOURCE, fields=["company_name", "postcode"]),
        return_type="pandas",
        limit=limit,
    )

    # Report how many distinct `data_sha1` values came back.
    unique_count = raw_frame.data_sha1.nunique()
    logger.info("Data retrieved successfully with %s unique datapoints", unique_count)

    return raw_frame


def _process(raw: DataFrame) -> DataFrame:
    """Run the company-name and postcode cleaners over the raw frame.

    Args:
        raw: Frame to clean; it must carry the
            ``hmrc_trade__exporters_company_name`` and
            ``hmrc_trade__exporters_postcode`` columns the cleaners target.

    Returns:
        The result of ``cmf.process`` for the two cleaners.
    """
    name_cleaner = cleaner(
        clean.company_name, {"column": "hmrc_trade__exporters_company_name"}
    )
    postcode_cleaner = cleaner(
        clean.postcode, {"column": "hmrc_trade__exporters_postcode"}
    )
    cleaning_steps = cleaners(name_cleaner, postcode_cleaner)

    cleaned_frame = cmf.process(raw, cleaning_steps)
    logger.info("Data cleaned successfully")

    return cleaned_frame


def _deduplicate(clean: DataFrame) -> ProbabilityResults:
    """Build a ``NaiveDeduper`` over the cleaned frame and run it.

    Note that the ``clean`` parameter shadows the imported ``clean`` module
    inside this function; the module is not needed here.

    Args:
        clean: Cleaned frame passed to the deduper as ``data``.

    Returns:
        The object returned by calling the deduper built by
        ``cmf.make_deduper``.
    """
    settings = {
        "id": "data_sha1",
        "unique_fields": [
            "hmrc_trade__exporters_company_name",
            "hmrc_trade__exporters_postcode",
        ],
    }

    run_dedupe = cmf.make_deduper(
        dedupe_run_name=_NAME,
        description="Basic cleaning of name and postcode.",
        deduper=NaiveDeduper,
        deduper_settings=settings,
        data=clean,
        data_source=_SOURCE,
    )
    probabilities = run_dedupe()

    # One row of the results frame per generated probability.
    row_count = probabilities.dataframe.shape[0]
    logger.info(
        "Data deduplicated successfully. %s probabilities generated", row_count
    )

    return probabilities


def _cluster(deduped: ProbabilityResults, clean: DataFrame) -> ClusterResults:
    """Turn pairwise probabilities into clusters via ``cmf.to_clusters``.

    Args:
        deduped: Probability results passed as ``results``.
        clean: Cleaned frame passed as the positional data argument.

    Returns:
        The cluster results produced with ``key="data_sha1"`` and
        ``threshold=1``.
    """
    clusters = cmf.to_clusters(
        clean,
        results=deduped,
        key="data_sha1",
        threshold=1,
    )

    # Count distinct `parent` values in the resulting frame.
    cluster_count = clusters.dataframe.parent.nunique()
    logger.info(
        "Clusters resolved successfully. %s clusters generated", cluster_count
    )

    return clusters


In [4]:
# End-to-end run on a 300k-row sample: query -> clean -> dedupe -> cluster.
ew_raw = _query(300_000)
ew_clean = _process(ew_raw)
ew_deduped = _deduplicate(ew_clean)
ew_clusters = _cluster(ew_deduped, ew_clean)

[2024-03-12 11:26:40,074: INFO] cmf_pipelines 2290665410: Data retrieved successfully with 300000 unique datapoints
[2024-03-12 11:26:43,184: INFO] cmf_pipelines 2290665410: Data cleaned successfully
[2024-03-12 11:26:44,519: INFO] cmf_pipelines 2290665410: Data deduplicated successfully. 567484 probabilities generated
[2024-03-12 11:26:47,066: INFO] cmf_pipelines 2290665410: Clusters resolved successfully. 109616 clusters generated


## Playing around

In [49]:
from dwutils import db

# Row count of the whole source table, for scale against the 300k sample.
source_count_sql = f"select count(*) from {_SOURCE};"
db.query(source_count_sql)

Unnamed: 0,count
0,3793000


For 567,484 probabilities using the `WriteOnlyMapped` methodology.

* 394 seconds at 500k batch size
* 585 seconds at 250k batch size
    * `execute` and `_emit_insert_statements` account for roughly 400s of that
    * 390 seconds on a second run
* 370 seconds at 100k batch size
* 370 seconds at 50k batch size
* ~370 seconds at 10k batch size

Concerned the first-run test absorbs a lot of the processing time.

For 567,484 probabilities using the `pg-bulk-ingest` methodology.

* xx seconds at 500k batch size
* xx seconds at 250k batch size
* xx seconds at 100k batch size
* xx seconds at 50k batch size
* xx seconds at 10k batch size

In [37]:
# First ten probability rows converted with `to_records`, leaving the index out.
# `index` is a boolean flag, so pass False explicitly rather than relying on
# None being falsy.
# NOTE: despite its name, `df` holds the records, not a DataFrame.
df = ew_deduped.dataframe.head(10).to_records(index=False)

In [38]:
# Peek at the first record of the ten-row sample.
df[0]

(b'\x03\xfb\xaf\xea\xb1\xe3O\xcbY\x11p\\2\x83\x19\xf8\xb4\xe1L\x1d', b']\x92\x90V\xda\xc2\xe0\xbe\t\xb385\x9bx%f{\xdc\x07O', 1, b'\x1a\x86\x83\xbf\xe8I\x8f\x14\xe7\xe8i\xe0\xa5D\x16w8\x05,R')

In [1]:
from cmf.data import Dedupes

In [2]:
# Inspect the metadata object attached to the Dedupes table definition.
Dedupes.__table__.metadata

MetaData()

In [52]:
# Prepare the probabilities for writing, keep the three columns of interest,
# and hand them to `_to_batch` together with the Dedupes table.
dedupes_frame = ew_deduped._prep_to_cmf(ew_deduped.dataframe)[["sha1", "left", "right"]]
batch = ew_deduped._to_batch(dataframe=dedupes_frame, table=Dedupes.__table__)

In [55]:
# Preview the first rows of the three columns that are handed to `_to_batch`.
ew_deduped._prep_to_cmf(ew_deduped.dataframe)[["sha1", "left", "right"]].head(3)

Unnamed: 0,sha1,left,right
0,b'\x1a\x86\x83\xbf\xe8I\x8f\x14\xe7\xe8i\xe0\x...,b'\x03\xfb\xaf\xea\xb1\xe3O\xcbY\x11p\\2\x83\x...,b']\x92\x90V\xda\xc2\xe0\xbe\t\xb385\x9bx%f{\x...
1,"b'\x08\xdd\x1e\x85r\xa6\x14""\x1b>r3\x85E\xd4|\...",b'v\xc6\xa7\tM\xbay\x96\x19\x03e\xe8\xec\xb6r\...,b'\xa5\xceT\x06\xad\x8eg\xaa\x81\xc6\n\x9bs\x9...
2,b'\xe8{J\xc0[\xfa\xe7Y\xd4M\t\xf1V\x9a\x07\x1b...,b'#\t\x8a|1\x01\xfb|u\xb3\xcdf\xf7\xa3\x97\xbf...,b'\xad>H\x14\x83\x15&\xe5\xfcn\xb3\xef\x8a\xa0...


In [None]:
# Take only the first item produced by `batch(None)` and stop.
# NOTE(review): the bare `i` sits inside the loop body, so it is presumably not
# echoed as cell output — confirm, or display it explicitly after the loop.
for i in batch(None):
    i
    break

In [5]:
import os

# Override the batch size used when writing, then profile the write inside
# the `sqa_profiled` context manager.
# NOTE(review): the recorded run of this cell ended in
# "AttributeError: 'builtin_function_or_method' object has no attribute 'upper'"
# after logging the batch size — the cause is not visible from this notebook.
ew_deduped._batch_size = 250_000

with sqa_profiled():
    ew_deduped.to_cmf()

[2024-03-12 11:32:18,530: INFO] cmf_logic results: [naive_hmrc_exports_v1, ProbabilityResults] Registering model
[2024-03-12 11:32:18,570: INFO] cmf_logic results: [naive_hmrc_exports_v1, ProbabilityResults] Writing deduplication data with batch size 250000


AttributeError: 'builtin_function_or_method' object has no attribute 'upper'

In [5]:
# Read the full probabilities table from S3 with Arrow-backed dtypes.
with s3.read(path="hmrc_exporters_probabilities.parquet") as parquet_file:
    exp_deduped = pd.read_parquet(parquet_file, dtype_backend="pyarrow")

# Cast both ID columns to Arrow binary, one column at a time.
for id_column in ("left_id", "right_id"):
    exp_deduped[id_column] = exp_deduped[id_column].astype("binary[pyarrow]")

exp_deduped.head(3)
exp_deduped.info()

Unnamed: 0,left_id,right_id,probability
0,b'\\s31\x86\xbb\xd0s\xa2\x92\x8a\xadI< \xc7^+l...,b'v\xa9=\x14\xc2\xc2~\xa7\xbe\xb9\xa2\xe6\xe2M...,1
1,b'?@\xf4\xa9\xbeBQ\xa8\x7fn\xcbT\xac\xedL\x05\...,b'\xf3\xce\xa4\xe4H\r\xcf\xaf\x11IfH\xf9\xc4\x...,1
2,b'\xfe^[\xea\xecLt\x08O\x0b\x11.\xdf*\xcb\x89K...,b'-I\xf4:\xb6\xeb\xb4\xd9\xbb\xe0\xc4\xb7V4\xc...,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90276899 entries, 0 to 90276898
Data columns (total 3 columns):
 #   Column       Dtype          
---  ------       -----          
 0   left_id      binary[pyarrow]
 1   right_id     binary[pyarrow]
 2   probability  int32[pyarrow] 
dtypes: binary[pyarrow](2), int32[pyarrow](1)
memory usage: 4.4 GB


In [13]:
# Keep only pairs with probability >= 1 and just their two ID columns.
certain_edges = exp_deduped.query("probability >= 1").filter(["left_id", "right_id"])

# Plain (left_id, right_id) tuples, without the index, for graph construction.
all_edges = certain_edges.itertuples(index=False, name=None)

In [14]:
G = rx.PyGraph()
added = {}  # sha1 -> index of the node already created for it in G

for edge in all_edges:
    endpoints = []
    for sha1 in edge:
        # Create a node the first time a sha1 is seen; reuse it afterwards.
        if sha1 not in added:
            added[sha1] = G.add_node(sha1)
        endpoints.append(added[sha1])
    # Edges carry no payload, hence the trailing None.
    _ = G.add_edge(*endpoints, None)



In [15]:
# Number of connected components in the graph built from the edges.
rx.number_connected_components(G)

187004

In [19]:
# Print the first item that `all_edges` yields, then stop.
# NOTE(review): this draws from the same `all_edges` object the graph-building
# loop iterates; if that object is a one-shot iterator it may already be
# exhausted (or lose an item here) — confirm and re-create it before reuse.
for edge in all_edges:
    print(edge)
    break

b'\\s31\x86\xbb\xd0s\xa2\x92\x8a\xadI< \xc7^+l\xdf'


In [23]:
# Same filter as the full edge list, but on the first 100k rows only, to
# inspect what a single record looks like (index included).
sample_edges = (
    exp_deduped.head(100_000)
    .query("probability >= 1")
    .filter(["left_id", "right_id"])
)
sample_edges.to_records()[0]

(0, b'\\s31\x86\xbb\xd0s\xa2\x92\x8a\xadI< \xc7^+l\xdf', b'v\xa9=\x14\xc2\xc2~\xa7\xbe\xb9\xa2\xe6\xe2M\xca\x9d\xf6(\x0b1')