In [2]:
# Reload edited project modules (e.g. src.*) automatically before each cell runs,
# so changes to helper code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [41]:
from pathlib import Path

from splink.duckdb.linker import DuckDBLinker

from src.data import utils as du
import src.locations as loc
from src.config import settings

# Destination parquet file for the exported cluster pairs (written in the Export section).
CLUSTER_PATH = Path(
    loc.DATA_SUBDIR['processed'],
    'company-matching__full',
    'clusters.parquet',
)

# Using Splink with a physical (on-disk) DuckDB database

This notebook tries running Splink against a DuckDB database stored on the file system. The raw database is about 1 GB before Splink is run.

In [4]:
# Open the DuckDB connection through the project helper; every query and the
# Splink linker below share this one connection.
con = du.get_duckdb_connection()

In [16]:
# Read the alias lookup once, then split it into two parallel lists.
# Column 0 of each row is used as the alias and column 1 as the table name.
alias_lookup_rows = con.query("select * from table_alias_lookup;").fetchall()

table_alias = [row[0] for row in alias_lookup_rows]
table_name = [row[1] for row in alias_lookup_rows]

In [17]:
# Build the Splink linker on the existing DuckDB connection.
# `table_name` and `table_alias` are the parallel lists read from table_alias_lookup;
# `settings` is the model configuration imported from src.config.
linker = DuckDBLinker(
    table_name,
    settings_dict=settings,
    connection=con,
    input_table_aliases=table_alias,
)

## Train

In [22]:
# Estimate the prior probability that two random records match, using an exact
# match on name_unusual_tokens as the deterministic rule.
# NOTE(review): recall=0.7 is an assumed share of true matches that this rule
# captures; confirm it is a reasonable guess for this data.
linker.estimate_probability_two_random_records_match(
    "l.name_unusual_tokens = r.name_unusual_tokens",
    recall=0.7,
)

Probability two random records match is estimated to be  3.24e-06.
This means that amongst all possible pairwise record comparisons, one in 309,025.51 are expected to match.  With 40,009,433,095,801 total possible comparisons, we expect a total of around 129,469,675.71 matching pairs


In [23]:
# Estimate u probabilities from randomly sampled record pairs, capped at 1e7 pairs.
# NOTE(review): no seed is passed, so the sample presumably differs between runs.
linker.estimate_u_using_random_sampling(max_pairs=1e7)

----- Estimating u probabilities using random sampling -----
u probability not trained for comp_num_clean - Exact match (comparison vector value: 2). This usually means the comparison level was never observed in the training data.

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - comp_num_clean (some u values are not trained, no m values are trained).
    - name_unusual_tokens (no m values are trained).
    - postcode (no m values are trained).


In [24]:
# Step 1: estimate m probabilities by treating comp_num_clean as a label column.
linker.estimate_m_from_label_column("comp_num_clean")
# Step 2: run an expectation-maximisation session, blocking on records that share
# both name_unusual_tokens and postcode_area.
m_by_name_and_postcode_area = """
    l.name_unusual_tokens = r.name_unusual_tokens
    and l.postcode_area = r.postcode_area
"""
linker.estimate_parameters_using_expectation_maximisation(
    m_by_name_and_postcode_area
)

---- Estimating m probabilities using from column comp_num_clean -----
m probability not trained for comp_num_clean - Jaro_winkler_similarity >= 0.75 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.
m probability not trained for comp_num_clean - All other comparisons (comparison vector value: 0). This usually means the comparison level was never observed in the training data.

Your model is not yet fully trained. Missing estimates for:
    - comp_num_clean (some u values are not trained, some m values are not trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:

    l.name_unusual_tokens = r.name_unusual_tokens
    and l.postcode_area = r.postcode_area


Parameter estimates will be made for the following comparison(s):
    - comp_num_clean
    - postcode

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
 

<EMTrainingSession, blocking on 
    l.name_unusual_tokens = r.name_unusual_tokens
    and l.postcode_area = r.postcode_area
, deactivating comparisons name_unusual_tokens>

## Predict

In [25]:
# Score candidate pairs, passing a match-probability threshold of 0.7.
# The same 0.7 threshold is passed again when clustering; keep the two in sync.
predictions = linker.predict(threshold_match_probability=0.7)


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'comp_num_clean':
    u values not fully trained


## Cluster

In [22]:
# Recover the predictions table that an earlier linker.predict() run left in the
# database, so clustering can proceed without re-running predict().
# The lookup is ordered and must resolve to exactly one table: repeated predict()
# runs can leave several '%predict%' tables behind, and picking one arbitrarily
# could cluster stale predictions.
predict_table_candidates = [
    row[0]
    for row in con.query("""
        select table_name
        from information_schema.tables
        where table_name like '%predict%'
        order by table_name;
    """).fetchall()
]
if len(predict_table_candidates) != 1:
    raise LookupError(
        "Expected exactly one table with 'predict' in its name, found "
        f"{len(predict_table_candidates)}: {predict_table_candidates}. "
        "Run linker.predict() first, or drop the stale prediction tables."
    )
predict_table = predict_table_candidates[0]

# Register the existing table with the linker under its own name.
# NOTE: this rebinds `predictions`, replacing the object returned by linker.predict().
predictions = linker.register_table(predict_table, predict_table)

In [24]:
# Cluster the pairwise predictions, passing the same 0.7 threshold used in predict().
# NOTE(review): with pairwise_formatting=True and filtering switched off, the result
# appears to keep one row per predicted pair (the row counts in Review are equal);
# confirm against the Splink docs.
clusters = linker.cluster_pairwise_predictions_at_threshold(
    predictions,
    threshold_match_probability=0.7,
    pairwise_formatting=True,
    filter_pairwise_format_for_clusters=False,
)

Completed iteration 1, root rows count 11876
Completed iteration 2, root rows count 208
Completed iteration 3, root rows count 97
Completed iteration 4, root rows count 3
Completed iteration 5, root rows count 0


In [30]:
# Show the name of the table backing `clusters`; the Review and Export queries
# interpolate this name into their SQL.
clusters.physical_name

'__splink__df_representatives_ea153478b'

## Review

In [20]:
# Row count of the predictions table, for comparison with the clusters table.
con.query(f"select count(*) from {predict_table};")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     79578102 │
└──────────────┘

In [31]:
# Row count of the clusters table, for comparison with the predictions table.
con.query(f"select count(*) from {clusters.physical_name};")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     79578102 │
└──────────────┘

In [25]:
# Check how large the database has grown after Splink created its working tables.
con.query("pragma database_size;")

┌──────────────────┬───────────────┬────────────┬───┬─────────────┬──────────┬──────────────┬──────────────┐
│  database_name   │ database_size │ block_size │ … │ free_blocks │ wal_size │ memory_usage │ memory_limit │
│     varchar      │    varchar    │   int64    │   │    int64    │ varchar  │   varchar    │   varchar    │
├──────────────────┼───────────────┼────────────┼───┼─────────────┼──────────┼──────────────┼──────────────┤
│ company_matching │ 10.0GB        │     262144 │ … │        1365 │ 0 bytes  │ 5.6GB        │ 26.4GB       │
├──────────────────┴───────────────┴────────────┴───┴─────────────┴──────────┴──────────────┴──────────────┤
│ 1 rows                                                                               9 columns (7 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────┘

In [27]:
# List every table in the database, including the __splink__* intermediates.
con.query("""
    select table_name
    from information_schema.tables;
""")

┌────────────────────────────────────────────────┐
│                   table_name                   │
│                    varchar                     │
├────────────────────────────────────────────────┤
│ __splink__df_representatives_ea153478b         │
│ __splink__df_representatives_5_d028e6a58       │
│ __splink__df_neighbours_0591198d8              │
│ __splink__df_connected_components_df_f10684aa6 │
│ table_alias_lookup                             │
│ __splink__df_connected_components_df_c16c235f8 │
│ hmrc_trade__exporters                          │
│ __splink__df_predict_d21134292                 │
│ companieshouse_companies                       │
│ dit_data_hub__companies                        │
│ dit_export_wins__wins_dataset                  │
│ __splink__df_representatives_4_7613b3b70       │
│ __splink__df_representatives_5_7003110e3       │
│ __splink__df_concat_b79089d01                  │
│ __splink__df_concat_with_tf_4034084f6          │
│ __splink__m_u_counts_e96a5fb7

## Export

In [42]:
# Export the clustered pairs to CLUSTER_PATH as parquet.
#
# COPY ... TO fails if the destination directory is missing (e.g. on a fresh
# checkout), so create it first.
CLUSTER_PATH.parent.mkdir(parents=True, exist_ok=True)

# The inner query emits every pair in both directions (l -> r and r -> l), so each
# record appears on the "source" side; `union` also removes duplicate rows.
# match_probability is selected in the inner query but is not in the exported columns.
# The joins translate the ids stored in the clusters table back through
# table_alias_lookup and unique_id_lookup.
# NOTE(review): presumably these lookups map surrogate ids to the original table
# names and record ids; confirm against the code that builds them.
con.query(f"""
    copy (
        select
            src_tbl.table_name as source,
            src_id.unique_id as source_id,
            cl.source_cluster,
            tgt_tbl.table_name as target,
            tgt_id.unique_id as target_id,
            cl.target_cluster
        from (
            select
                source_dataset_l as source,
                unique_id_l as source_id,
                cluster_id_l as source_cluster,
                source_dataset_r as target,
                unique_id_r as target_id,
                cluster_id_r as target_cluster,
                match_probability
            from
                { clusters.physical_name }
            union
            select
                source_dataset_r as source,
                unique_id_r as source_id,
                cluster_id_r as source_cluster,
                source_dataset_l as target,
                unique_id_l as target_id,
                cluster_id_l as target_cluster,
                match_probability
            from
                { clusters.physical_name }
        ) cl
        join table_alias_lookup src_tbl on
            (cl.source = src_tbl.id)
        join unique_id_lookup src_id on
            (cl.source_id = src_id.id)
        join table_alias_lookup tgt_tbl on
            (cl.target = tgt_tbl.id)
        join unique_id_lookup tgt_id on
            (cl.target_id = tgt_id.id)
    )
    to '{CLUSTER_PATH.as_posix()}'
    (format parquet);
""")