In [2]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import mlflow
import duckdb
import json
from pathlib import Path
import dask.dataframe as dd

from splink.duckdb.linker import DuckDBLinker

from src.data import utils as du
import src.locations as loc
from src.config import settings

Questions:

1. Can we predict in batches?
2. If we predict in batches, do we get the same answer as when not?
3. Does this alleviate memory issues?
4. Does this work with clustering?

29/7 update: didn't even need to do batch stuff. CLUSTERING is the problem -- predict is fine. Opens up new avenues.

## Repartition into multiple files

In [4]:
data_full = du.build_alias_path_dict(Path(loc.DATA_SUBDIR['processed']) / 'company-matching__full')

In [47]:
data_partitioned = {}

for data in data_full.keys():
    df = dd.read_parquet(data_full[data])
    df = df.repartition(partition_size="100MB")
    new_dir = Path(loc.DATA_SUBDIR['processed']) / 'company-matching__partitioned' / data
    df.to_parquet(new_dir)
    data_partitioned[data] = f"'{new_dir.as_posix()}'"
    
data_partitioned

{'hmrc_trade__exporters': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/hmrc_trade__exporters'",
 'dit_export_wins__wins_dataset': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/dit_export_wins__wins_dataset'",
 'dit_data_hub__companies': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/dit_data_hub__companies'",
 'companieshouse_companies': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/companieshouse_companies'"}

## Predict in batches

In [5]:
json_raw = mlflow.artifacts.load_text(
    artifact_uri="runs:/22ce217706c54650ac34f59cb6a45960/model/companies_matching_model.json"
)
json_settings = json.loads(json_raw)

In [6]:
connection = duckdb.connect()

In [7]:
linker = DuckDBLinker(
    list(data_full.values()),
    settings_dict=settings,
    connection=connection,
    input_table_aliases=list(data_full.keys()),
)
linker.load_model(json_settings)

In [8]:
predictions = linker.predict(threshold_match_probability=0.7)


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'comp_num_clean':
    u values not fully trained


In [10]:
connection.query("""
    pragma database_size;
    call pragma_database_size();
""")

┌───────────────┬───────────────┬────────────┬──────────────┬───┬─────────────┬──────────┬──────────────┬──────────────┐
│ database_name │ database_size │ block_size │ total_blocks │ … │ free_blocks │ wal_size │ memory_usage │ memory_limit │
│    varchar    │    varchar    │   int64    │    int64     │   │    int64    │ varchar  │   varchar    │   varchar    │
├───────────────┼───────────────┼────────────┼──────────────┼───┼─────────────┼──────────┼──────────────┼──────────────┤
│ memory        │ 0 bytes       │          0 │            0 │ … │           0 │ 0 bytes  │ 11.0GB       │ 26.4GB       │
├───────────────┴───────────────┴────────────┴──────────────┴───┴─────────────┴──────────┴──────────────┴──────────────┤
│ 1 rows                                                                                           9 columns (8 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘

In [17]:
pred_path = Path(loc.DATA_SUBDIR['processed']) / 'company-matching__full' / 'predictions.parquet'

In [18]:
connection.query(f"""
    copy {predictions.physical_name}
    to '{pred_path.as_posix()}'
    (format parquet);
""")

In [19]:
connection.query(f"""
    select *
    from '{pred_path.as_posix()}'
""")

┌────────────────────┬────────────────────┬───┬──────────────────────┬──────────────────────┬───────────┐
│    match_weight    │ match_probability  │ … │     unique_id_l      │     unique_id_r      │ match_key │
│       double       │       double       │   │       varchar        │       varchar        │  varchar  │
├────────────────────┼────────────────────┼───┼──────────────────────┼──────────────────────┼───────────┤
│ 1.4157836817616756 │ 0.7273753031699655 │ … │ b18c76fc-a30d-e411…  │ 11f864f6-09bc-4cf8…  │ 0         │
│ 20.075070679022463 │ 0.9999990946819749 │ … │ 7a7fd6b2-4f0e-e411…  │ 19a1c784-0e8c-4b9b…  │ 0         │
│ 18.852678257686016 │ 0.9999978875938246 │ … │ e7275be8-7e11-e411…  │ 3d895056-4ffd-4a15…  │ 0         │
│ 21.338105084856256 │  0.999999622783957 │ … │ 6afa126b-a911-e411…  │ 03ebd8a5-f065-423a…  │ 0         │
│  18.20060156110632 │ 0.9999966805085885 │ … │ f7f4ddb5-4d12-e411…  │ 600dcf7f-7087-46f3…  │ 0         │
│ 22.075070679022463 │ 0.9999997736703401 │ … 

In [20]:
clusters = linker.cluster_pairwise_predictions_at_threshold(
    f"'{pred_path.as_posix()}'",
    threshold_match_probability=0.7,
    pairwise_formatting=True,
    filter_pairwise_format_for_clusters=False,
)

type: 'str' object has no attribute 'physical_name'

In [None]:
lookup = linker.query_sql(
    f"""
    select
        source_dataset_l as source,
        unique_id_l as source_id,
        cluster_id_l as source_cluster,
        source_dataset_r as target,
        unique_id_r as target_id,
        cluster_id_r as target_cluster,
        match_probability
    from
        { clusters.physical_name }
    union
    select
        source_dataset_r as source,
        unique_id_r as source_id,
        cluster_id_r as source_cluster,
        source_dataset_l as target,
        unique_id_l as target_id,
        cluster_id_l as target_cluster,
        match_probability
    from
        { clusters.physical_name }
    """,
    output_type="splink_df",
)