In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import mlflow
import duckdb
import json
from pathlib import Path
import pandas as pd
# import dask.dataframe as dd

from splink.duckdb.linker import DuckDBLinker
from splink.postgres.linker import PostgresLinker

from src.data import utils as du
import src.locations as loc
from src.config import settings

DATA_FULL = du.build_alias_path_dict(Path(loc.DATA_SUBDIR['processed']) / 'company-matching__full')
PRED_PATH = Path(loc.DATA_SUBDIR['processed']) / 'company-matching__full' / 'predictions.parquet'

Questions:

1. Can we predict in batches?
2. If we predict in batches, do we get the same answer as when not?
3. Does this alleviate memory issues?
4. Does this work with clustering?

29/7 update: didn't even need to do batch stuff. CLUSTERING is the problem -- predict is fine. Opens up new avenues.

Let's see where clustering fails specifically.

## Repartition into multiple files

In [47]:
data_partitioned = {}

for data in data_full.keys():
    df = dd.read_parquet(data_full[data])
    df = df.repartition(partition_size="100MB")
    new_dir = Path(loc.DATA_SUBDIR['processed']) / 'company-matching__partitioned' / data
    df.to_parquet(new_dir)
    data_partitioned[data] = f"'{new_dir.as_posix()}'"
    
data_partitioned

{'hmrc_trade__exporters': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/hmrc_trade__exporters'",
 'dit_export_wins__wins_dataset': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/dit_export_wins__wins_dataset'",
 'dit_data_hub__companies': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/dit_data_hub__companies'",
 'companieshouse_companies': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/companieshouse_companies'"}

## Generate predictions and stash

In [3]:
json_raw = mlflow.artifacts.load_text(
    artifact_uri="runs:/22ce217706c54650ac34f59cb6a45960/model/companies_matching_model.json"
)
json_settings = json.loads(json_raw)

In [4]:
connection = duckdb.connect()

In [5]:
linker = DuckDBLinker(
    list(DATA_FULL.values()),
    settings_dict=settings,
    connection=connection,
    input_table_aliases=list(DATA_FULL.keys()),
)
linker.load_model(json_settings)

In [6]:
predictions = linker.predict(threshold_match_probability=0.7)

SplinkException: Error executing the following sql for table `__splink__df_concat_with_tf`(__splink__df_concat_with_tf_52d091831):
CREATE TABLE __splink__df_concat_with_tf_52d091831 AS
(
  WITH __splink__df_concat AS (
    SELECT
      'hmrc_trade__exporters' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/hmrc_trade__exporters.parquet'
    )
    UNION ALL
    SELECT
      'dit_export_wins__wins_dataset' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/dit_export_wins__wins_dataset.parquet'
    )
    UNION ALL
    SELECT
      'dit_data_hub__companies' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/dit_data_hub__companies.parquet'
    )
    UNION ALL
    SELECT
      'predictions' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/predictions.parquet'
    )
    UNION ALL
    SELECT
      'companieshouse_companies' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/companieshouse_companies.parquet'
    )
  ), __splink__df_tf_name_unusual_tokens AS (
    SELECT
      "name_unusual_tokens",
      CAST(COUNT(*) AS DOUBLE) / (
        SELECT
          COUNT("name_unusual_tokens") AS total
        FROM __splink__df_concat
      ) AS "tf_name_unusual_tokens"
    FROM __splink__df_concat
    WHERE
      NOT "name_unusual_tokens" IS NULL
    GROUP BY
      "name_unusual_tokens"
  ), __splink__df_tf_comp_num_clean AS (
    SELECT
      "comp_num_clean",
      CAST(COUNT(*) AS DOUBLE) / (
        SELECT
          COUNT("comp_num_clean") AS total
        FROM __splink__df_concat
      ) AS "tf_comp_num_clean"
    FROM __splink__df_concat
    WHERE
      NOT "comp_num_clean" IS NULL
    GROUP BY
      "comp_num_clean"
  )
  SELECT
    __splink__df_concat.*,
    __splink__df_tf_name_unusual_tokens."tf_name_unusual_tokens",
    __splink__df_tf_comp_num_clean."tf_comp_num_clean"
  FROM __splink__df_concat
  LEFT JOIN __splink__df_tf_name_unusual_tokens
    ON __splink__df_concat."name_unusual_tokens" = __splink__df_tf_name_unusual_tokens."name_unusual_tokens"
  LEFT JOIN __splink__df_tf_comp_num_clean
    ON __splink__df_concat."comp_num_clean" = __splink__df_tf_comp_num_clean."comp_num_clean"
)

Error was: Binder Error: Referenced column "postcode_area" not found in FROM clause!
Candidate bindings: "read_parquet.match_key"

In [10]:
connection.query("""
    pragma database_size;
    call pragma_database_size();
""")

┌───────────────┬───────────────┬────────────┬──────────────┬───┬─────────────┬──────────┬──────────────┬──────────────┐
│ database_name │ database_size │ block_size │ total_blocks │ … │ free_blocks │ wal_size │ memory_usage │ memory_limit │
│    varchar    │    varchar    │   int64    │    int64     │   │    int64    │ varchar  │   varchar    │   varchar    │
├───────────────┼───────────────┼────────────┼──────────────┼───┼─────────────┼──────────┼──────────────┼──────────────┤
│ memory        │ 0 bytes       │          0 │            0 │ … │           0 │ 0 bytes  │ 11.0GB       │ 26.4GB       │
├───────────────┴───────────────┴────────────┴──────────────┴───┴─────────────┴──────────┴──────────────┴──────────────┤
│ 1 rows                                                                                           9 columns (8 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘

In [18]:
connection.query(f"""
    copy {predictions.physical_name}
    to '{PRED_PATH.as_posix()}'
    (format parquet);
""")

In [10]:
connection.query(f"""
    select *
    from '{PRED_PATH.as_posix()}'
""")

┌────────────────────┬────────────────────┬───┬──────────────────────┬──────────────────────┬───────────┐
│    match_weight    │ match_probability  │ … │     unique_id_l      │     unique_id_r      │ match_key │
│       double       │       double       │   │       varchar        │       varchar        │  varchar  │
├────────────────────┼────────────────────┼───┼──────────────────────┼──────────────────────┼───────────┤
│ 1.4157836817616756 │ 0.7273753031699655 │ … │ b18c76fc-a30d-e411…  │ 11f864f6-09bc-4cf8…  │ 0         │
│ 20.075070679022463 │ 0.9999990946819749 │ … │ 7a7fd6b2-4f0e-e411…  │ 19a1c784-0e8c-4b9b…  │ 0         │
│ 18.852678257686016 │ 0.9999978875938246 │ … │ e7275be8-7e11-e411…  │ 3d895056-4ffd-4a15…  │ 0         │
│ 21.338105084856256 │  0.999999622783957 │ … │ 6afa126b-a911-e411…  │ 03ebd8a5-f065-423a…  │ 0         │
│  18.20060156110632 │ 0.9999966805085885 │ … │ f7f4ddb5-4d12-e411…  │ 600dcf7f-7087-46f3…  │ 0         │
│ 22.075070679022463 │ 0.9999997736703401 │ … 

In [None]:
du.data_workspace_write(
    schema = "_user_eaf4fd9a",
    table = "lge_all_predictions",
    df = pd.read_parquet(PRED_PATH),
    if_exists = "replace",
    chunksize = int(1e6)
)

## Fix clustering

In [5]:
df_predict = pd.read_parquet(PRED_PATH)

In [5]:
df_predict.shape

(79576965, 7)

In [6]:
df_predict.head(5)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,match_key
0,1.415784,0.727375,dit_data_hub__companies,dit_export_wins__wins_dataset,b18c76fc-a30d-e411-8a2b-e4115bead28a,11f864f6-09bc-4cf8-969e-ae790c28aec7,0
1,20.075071,0.999999,dit_data_hub__companies,dit_export_wins__wins_dataset,7a7fd6b2-4f0e-e411-8a2b-e4115bead28a,19a1c784-0e8c-4b9b-b40f-f4daa5d9bd01,0
2,18.852678,0.999998,dit_data_hub__companies,dit_export_wins__wins_dataset,e7275be8-7e11-e411-8a2b-e4115bead28a,3d895056-4ffd-4a15-91d3-05e6def6e606,0
3,21.338105,1.0,dit_data_hub__companies,dit_export_wins__wins_dataset,6afa126b-a911-e411-8a2b-e4115bead28a,03ebd8a5-f065-423a-b9f4-de78360b0faf,0
4,18.200602,0.999997,dit_data_hub__companies,dit_export_wins__wins_dataset,f7f4ddb5-4d12-e411-8a2b-e4115bead28a,600dcf7f-7087-46f3-9f6a-abdb95bab77f,0


In [18]:
df_predict.source_dataset_l.value_counts()
df_predict.source_dataset_r.value_counts()

hmrc_trade__exporters            76194847
companieshouse_companies          2155706
dit_data_hub__companies           1078847
dit_export_wins__wins_dataset      147565
Name: source_dataset_l, dtype: int64

hmrc_trade__exporters            79091872
companieshouse_companies           196660
dit_export_wins__wins_dataset      186607
dit_data_hub__companies            101826
Name: source_dataset_r, dtype: int64

### Clustering fail diagnosis

We're stepping through `linker.cluster_pairwise_predictions_at_threshold` to see what crashes the kernel. [Source](https://github.com/moj-analytical-services/splink/blob/56833b6fe6692de72530083f51dfdbad29c0fd33/splink/linker.py#L1953).

In [None]:
# linker.cluster_pairwise_predictions_at_threshold(
#     f"'{pred_path.as_posix()}'",
#     threshold_match_probability=0.7,
#     pairwise_formatting=True,
#     filter_pairwise_format_for_clusters=False,
# )

In [10]:
predictions = linker.predict(threshold_match_probability=0.7)

SplinkException: Error executing the following sql for table `__splink__df_concat_with_tf`(__splink__df_concat_with_tf_1b5daab97):
CREATE TABLE __splink__df_concat_with_tf_1b5daab97 AS
(
  WITH __splink__df_concat AS (
    SELECT
      'hmrc_trade__exporters' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/hmrc_trade__exporters.parquet'
    )
    UNION ALL
    SELECT
      'dit_export_wins__wins_dataset' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/dit_export_wins__wins_dataset.parquet'
    )
    UNION ALL
    SELECT
      'dit_data_hub__companies' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/dit_data_hub__companies.parquet'
    )
    UNION ALL
    SELECT
      'predictions' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/predictions.parquet'
    )
    UNION ALL
    SELECT
      'companieshouse_companies' AS source_dataset,
      "postcode_area",
      "unique_id",
      "comp_num_clean",
      "name_unusual_tokens",
      "secondary_name_unusual_tokens",
      "names_tokens_stopwords",
      "postcode",
      "postcode_alt",
      "name_unusual_tokens_first5",
      "name_unusual_tokens_last5",
      "name_sig",
      "name_sig_first5",
      "name_sig_last5"
    FROM READ_PARQUET(
      '/home/jovyan/company_matching/data/processed/company-matching__full/companieshouse_companies.parquet'
    )
  ), __splink__df_tf_comp_num_clean AS (
    SELECT
      "comp_num_clean",
      CAST(COUNT(*) AS DOUBLE) / (
        SELECT
          COUNT("comp_num_clean") AS total
        FROM __splink__df_concat
      ) AS "tf_comp_num_clean"
    FROM __splink__df_concat
    WHERE
      NOT "comp_num_clean" IS NULL
    GROUP BY
      "comp_num_clean"
  ), __splink__df_tf_name_unusual_tokens AS (
    SELECT
      "name_unusual_tokens",
      CAST(COUNT(*) AS DOUBLE) / (
        SELECT
          COUNT("name_unusual_tokens") AS total
        FROM __splink__df_concat
      ) AS "tf_name_unusual_tokens"
    FROM __splink__df_concat
    WHERE
      NOT "name_unusual_tokens" IS NULL
    GROUP BY
      "name_unusual_tokens"
  )
  SELECT
    __splink__df_concat.*,
    __splink__df_tf_comp_num_clean."tf_comp_num_clean",
    __splink__df_tf_name_unusual_tokens."tf_name_unusual_tokens"
  FROM __splink__df_concat
  LEFT JOIN __splink__df_tf_comp_num_clean
    ON __splink__df_concat."comp_num_clean" = __splink__df_tf_comp_num_clean."comp_num_clean"
  LEFT JOIN __splink__df_tf_name_unusual_tokens
    ON __splink__df_concat."name_unusual_tokens" = __splink__df_tf_name_unusual_tokens."name_unusual_tokens"
)

Error was: Binder Error: Referenced column "postcode_area" not found in FROM clause!
Candidate bindings: "read_parquet.match_key"

In [8]:
concat_with_tf = linker._initialise_df_concat_with_tf(df_predict)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
edges_table = _cc_create_unique_id_cols(
    concat_with_tf.physical_name,
    df_predict.physical_name,
    threshold_match_probability,
)

### Postgres fails

In [6]:
pg_con = du.sql_engine.connect()

In [15]:
pg_linker = PostgresLinker(
    input_table_or_tables=du.generate_dummy_df(),
    engine=du.sql_engine,
)
pg_linker.load_model(json_settings)

  res = con.execute(text(final_sql))


ProgrammingError: (psycopg2.errors.InvalidFunctionDefinition) return type mismatch in function declared to return double precision
DETAIL:  Actual return type is numeric.
CONTEXT:  SQL function "ave_months_between"

[SQL: 
        CREATE OR REPLACE FUNCTION ave_months_between(x date, y date)
        RETURNS float8 AS $$
        SELECT datediff(x, y)/30.4375;
        $$ LANGUAGE SQL IMMUTABLE;
        ]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [None]:
df_clusters = pg_linker.cluster_pairwise_predictions_at_threshold(
    df_predict, 
    0.7
)

In [20]:
clusters = linker.cluster_pairwise_predictions_at_threshold(
    f"'{pred_path.as_posix()}'",
    threshold_match_probability=0.7,
    pairwise_formatting=True,
    filter_pairwise_format_for_clusters=False,
)

type: 'str' object has no attribute 'physical_name'

In [None]:
lookup = linker.query_sql(
    f"""
    select
        source_dataset_l as source,
        unique_id_l as source_id,
        cluster_id_l as source_cluster,
        source_dataset_r as target,
        unique_id_r as target_id,
        cluster_id_r as target_cluster,
        match_probability
    from
        { clusters.physical_name }
    union
    select
        source_dataset_r as source,
        unique_id_r as source_id,
        cluster_id_r as source_cluster,
        source_dataset_l as target,
        unique_id_l as target_id,
        cluster_id_l as target_cluster,
        match_probability
    from
        { clusters.physical_name }
    """,
    output_type="splink_df",
)