In [2]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to show both a `.shape` and a query preview.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import mlflow
import duckdb
import json
from pathlib import Path
import pandas as pd

from splink.duckdb.linker import DuckDBLinker

from cmf.data import utils as du
import cmf.locations as loc
from cmf.config import settings, datasets

# Root of the processed-data area; each dataset location below hangs off it.
_processed_root = Path(loc.DATA_SUBDIR['processed'])
_full_run_dir = _processed_root / 'company-matching__full'
_sample_run_dir = _processed_root / 'company-matching__06-26-23_11-40-51'

# Mappings produced by the project helper for the two runs; further down the
# keys are passed to the linker as table aliases and the values as its inputs.
DATA_FULL = du.build_alias_path_dict(_full_run_dir)
DATA_100K = du.build_alias_path_dict(_sample_run_dir)
# Parquet file holding predictions for the full run.
PRED_PATH = _full_run_dir / 'predictions.parquet'

In [4]:
def _sample_company_table(dataset, sample=100_000):
    """Fetch a sample of one source table via ``du.get_company_data``.

    The column list and the filter are both read from the ``datasets``
    config entry keyed by ``dataset``.
    """
    dataset_config = datasets[dataset]
    return du.get_company_data(
        cols=dataset_config["cols"],
        dataset=dataset,
        where=dataset_config["where"],
        sample=sample,
    )


# One frame per source table; the names are referenced from SQL further down.
df_ch = _sample_company_table('"companieshouse"."companies"')
df_dh = _sample_company_table('"dit"."data_hub__companies"')
df_ex = _sample_company_table('"hmrc"."trade__exporters"')
df_ew = _sample_company_table('"dit"."export_wins__wins_dataset"')

  meta = MetaData(self.connectable, schema=schema)


In [5]:
# Fresh DuckDB connection shared by the raw tables and the Splink linker.
connection = duckdb.connect()

# Copy each sampled frame into a table; the frames are referenced by their
# Python variable names inside the SQL.
connection.query("""
    create table companieshouse_companies as select * from df_ch;
    create table dit_data_hub__companies as select * from df_dh;
    create table hmrc_trade__exporters as select * from df_ex;
    create table dit_export_wins__wins_dataset as select * from df_ew;
""")

# Fetch the saved model JSON from the MLflow run's artifacts and parse it.
model_artifact_uri = "runs:/22ce217706c54650ac34f59cb6a45960/model/companies_matching_model.json"
json_raw = mlflow.artifacts.load_text(artifact_uri=model_artifact_uri)
json_settings = json.loads(json_raw)

# The linker takes the DATA_100K values as inputs and its keys as aliases,
# then has the parsed model loaded on top of the project settings.
linker_inputs = list(DATA_100K.values())
linker_aliases = list(DATA_100K.keys())
linker = DuckDBLinker(
    linker_inputs,
    settings_dict=settings,
    connection=connection,
    input_table_aliases=linker_aliases,
)
linker.load_model(json_settings)

I've had a lot of problems with the clustering parts of Splink, so I wondered whether I could use the predictions frame in a similar way to the lookup I made before.

This notebook is to test that out.

## Production with predictions

Using only the prediction dataframe we need:

* (Dupes) For a given source and list of targets, all IDs that need to be joined on both sides, where the highest pairwise match prediction is the ONLY one that matches 
* (Deduped) As above, PLUS only the top match returned between each pair of tables

Don't forget, because we link and dedupe we also have INTERNAL matches at play.

In [10]:
# Score candidate pairs with a 0.7 match-probability threshold.
predictions = linker.predict(threshold_match_probability=.7)

# Copy Splink's output table to a stable name for the queries below.
# `create or replace` keeps this cell re-runnable: a plain `create table`
# raises once `predictions` already exists on this connection.
connection.query(f"""
    create or replace table predictions as select * from { predictions.physical_name };
""")


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'comp_num_clean':
    u values not fully trained


In [12]:
# Show the stored pairwise predictions; the statement has no placeholders,
# so a plain string is used.
all_predictions_sql = """
    select *
    from predictions
"""
connection.query(all_predictions_sql)

┌────────────────────┬────────────────────┬───┬──────────────────────┬──────────────────────┬───────────┐
│    match_weight    │ match_probability  │ … │     unique_id_l      │     unique_id_r      │ match_key │
│       double       │       double       │   │       varchar        │       varchar        │  varchar  │
├────────────────────┼────────────────────┼───┼──────────────────────┼──────────────────────┼───────────┤
│ 12.844252836761338 │ 0.9998640323566512 │ … │ 99149ffa-ab32-497b…  │ 511cde27-23ef-4a17…  │ 0         │
│ 10.674327835319026 │ 0.9993884366398315 │ … │ 6f61b183-e035-4961…  │ 6c9055b9-75d8-4bf8…  │ 0         │
│ 12.036897914703735 │ 0.9997620808861155 │ … │ 35b26d27-7e2d-e611…  │ fbf48cd3-18fc-420f…  │ 0         │
│ 15.737393039198157 │  0.999981695212484 │ … │ 27c313e0-ec36-e711…  │ 480ee73a-e97d-e311…  │ 0         │
│ 11.036897914703735 │  0.999524274956312 │ … │ f1984abb-a098-e211…  │ e18fdc4d-0c61-45b3…  │ 0         │
│ 13.259290336040182 │ 0.9998980208010205 │ … 

## Production with clusters

This is more or less lifted from WL_splink-test, with the exception that I've attached the raw data to the DuckDB to mimic the Postgres environment better.

I don't think it's quite working as it was before -- the counts on dupe/dedupe come back suspiciously similar. I don't want to spend time fixing it when I think the future is predictions, so just be careful with the below.

In [6]:
# Single threshold shared by prediction and clustering so the two cannot drift.
match_threshold = .7

predictions = linker.predict(threshold_match_probability=match_threshold)

clusters = linker.cluster_pairwise_predictions_at_threshold(
    predictions,
    threshold_match_probability=match_threshold,
    pairwise_formatting=True,
    filter_pairwise_format_for_clusters=False,
)

# Emit every clustered pair in both directions so each record can be looked up
# as `source`; `union` (rather than `union all`) also drops exact duplicate rows.
lookup_df = linker.query_sql(
    f"""
    select
        source_dataset_l as source,
        unique_id_l as source_id,
        cluster_id_l as source_cluster,
        source_dataset_r as target,
        unique_id_r as target_id,
        cluster_id_r as target_cluster,
        match_probability
    from
        { clusters.physical_name }
    union
    select
        source_dataset_r as source,
        unique_id_r as source_id,
        cluster_id_r as source_cluster,
        source_dataset_l as target,
        unique_id_l as target_id,
        cluster_id_l as target_cluster,
        match_probability
    from
        { clusters.physical_name }
    """,
)
# Keep the original name bound in case another cell reads the frame directly.
lookup = lookup_df

# The frame is read under a name distinct from the table it feeds. With both
# called `lookup`, a re-run would resolve `from lookup` to the existing table
# rather than the fresh frame. `create or replace` makes the cell re-runnable.
connection.query("""
    create or replace table lookup as select * from lookup_df;
""")


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'comp_num_clean':
    u values not fully trained
Completed iteration 1, root rows count 27
Completed iteration 2, root rows count 0


In [7]:
# Join Companies House to Data Hub and Export Wins through the `lookup` table,
# keeping every matching lookup row (no de-duplication of targets).
#
# The subquery restricts `lookup` to rows whose source is Companies House and
# whose target is one of the two target tables. The right outer join then keeps
# every Companies House row, matched or not, and each target table is
# left-joined on `target_id` guarded by the matching `target` name.
join_with_dupes = connection.sql("""
    select
        ch.unique_id,
        ch.company_name as ch_name,
        dh.company_name as dh_name,
        ew.company_name as ew_name
    from (
        select 
            *
        from
            lookup lookup
        where
            lookup.source = 'companieshouse_companies'
            and lookup.target in (
                'dit_data_hub__companies',
                'dit_export_wins__wins_dataset'
            )
    ) lookup
    right outer join companieshouse_companies ch on
        lookup.source_id = ch.unique_id 
        and lookup.source = 'companieshouse_companies'
    left join dit_data_hub__companies dh on
        lookup.target_id = dh.unique_id 
        and lookup.target = 'dit_data_hub__companies'
    left join dit_export_wins__wins_dataset ew on
        lookup.target_id = ew.unique_id
        and lookup.target = 'dit_export_wins__wins_dataset'
""")

# Both expressions are displayed: the result's (rows, columns) shape, then a
# five-row preview that references the relation above by its variable name.
join_with_dupes.df().shape
connection.sql("select * from join_with_dupes limit 5")

(100000, 4)

┌───────────┬───────────────────────────┬───────────────────────────┬─────────┐
│ unique_id │          ch_name          │          dh_name          │ ew_name │
│  varchar  │          varchar          │          varchar          │ varchar │
├───────────┼───────────────────────────┼───────────────────────────┼─────────┤
│ 02453212  │ ST HELENS CHAMBER LIMITED │ ST HELENS CHAMBER LIMITED │ NULL    │
│ 07343391  │ EMPOWER ENERGY LIMITED    │ NULL                      │ NULL    │
│ 07374749  │ AMBREY RISK LIMITED       │ NULL                      │ NULL    │
│ 11109773  │ IONIAN PELLO TECH LIMITED │ IONIAN PELLO TECH LIMITED │ NULL    │
│ 03478491  │ PREMIER PITCHES LIMITED   │ PREMIER PITCHES LIMITED   │ NULL    │
└───────────┴───────────────────────────┴───────────────────────────┴─────────┘

In [8]:
# De-duplicated variant of the Companies House -> Data Hub / Export Wins join.
#
# Innermost query: `distinct on (source_id, target, target_cluster)` with
# `match_probability desc` as the last sort key, which is intended to keep the
# single highest-probability row per source record, target table and cluster.
# Middle query: collapses those rows to one row per (source, source_id),
# collecting the target table names and target ids into arrays.
# Outer joins: every Companies House row is kept, and each target table is
# attached when its id appears in the collected `target_id` array.
#
# NOTE(review): the `where` clause in the middle query repeats the filter that
# the innermost query already applies; it looks redundant -- confirm.
# NOTE(review): the `array_has` conditions test the `target` and `target_id`
# arrays independently rather than position by position, so an id shared
# between the two target tables could join to the wrong one -- confirm whether
# ids can collide.
join_no_dupes = connection.sql("""
    select
        ch.unique_id,
        ch.company_name as ch_name,
        dh.company_name as dh_name,
        ew.company_name as ew_name
    from (
        select
            source,
            source_id,
            array_agg(target) as target, 
            array_agg(target_id) as target_id
        from (
            select distinct on (
                lookup.source_id, 
                lookup.target,
                lookup.target_cluster
            )
                *
            from
                lookup lookup
            where
                lookup.source = 'companieshouse_companies'
                and lookup.target in (
                    'dit_data_hub__companies',
                    'dit_export_wins__wins_dataset'
                )
            order by
                lookup.source_id, 
                lookup.target,
                lookup.target_cluster,
                lookup.match_probability desc
        ) lookup
        where
            lookup.source = 'companieshouse_companies'
            and lookup.target in (
                'dit_data_hub__companies',
                'dit_export_wins__wins_dataset'
            )
        group by
            source,
            source_id
    ) lookup
    right join companieshouse_companies ch on
        lookup.source_id = ch.unique_id 
        and lookup.source = 'companieshouse_companies'
    left join dit_data_hub__companies dh on
        array_has(lookup.target_id, dh.unique_id)
        and array_has(lookup.target, 'dit_data_hub__companies')
    left join dit_export_wins__wins_dataset ew on
        array_has(lookup.target_id, ew.unique_id)
        and array_has(lookup.target, 'dit_export_wins__wins_dataset')
""")

# Both expressions are displayed: the result's (rows, columns) shape, then a
# five-row preview that references the relation above by its variable name.
join_no_dupes.df().shape
connection.sql("select * from join_no_dupes limit 5")

(100000, 4)

┌───────────┬────────────────────────────────────────────┬────────────────────────────────────────────┬─────────┐
│ unique_id │                  ch_name                   │                  dh_name                   │ ew_name │
│  varchar  │                  varchar                   │                  varchar                   │ varchar │
├───────────┼────────────────────────────────────────────┼────────────────────────────────────────────┼─────────┤
│ 11109773  │ IONIAN PELLO TECH LIMITED                  │ IONIAN PELLO TECH LIMITED                  │ NULL    │
│ 02453212  │ ST HELENS CHAMBER LIMITED                  │ ST HELENS CHAMBER LIMITED                  │ NULL    │
│ 03478491  │ PREMIER PITCHES LIMITED                    │ PREMIER PITCHES LIMITED                    │ NULL    │
│ 08435515  │ THE ROYAL BUCKINGHAMSHIRE HOSPITAL LIMITED │ THE ROYAL BUCKINGHAMSHIRE HOSPITAL LIMITED │ NULL    │
│ 07343391  │ EMPOWER ENERGY LIMITED                     │ NULL                         