In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
import mlflow
import duckdb
import json
from pathlib import Path
import pandas as pd
import time
import networkx as nx
import sys
# import dask.dataframe as dd

from splink.duckdb.linker import DuckDBLinker
from splink.postgres.linker import PostgresLinker
from splink.connected_components import (
    _cc_create_unique_id_cols,
    solve_connected_components,
    _cc_create_nodes_table,
    _cc_generate_neighbours_representation,
    _cc_generate_initial_representatives_table,
    _cc_update_neighbours_first_iter,
    _cc_update_representatives_first_iter,
    _cc_generate_representatives_loop_cond,
    _cc_update_representatives_loop_cond
)

from src.data import utils as du
import src.locations as loc
from src.config import settings

DATA_FULL = du.build_alias_path_dict(Path(loc.DATA_SUBDIR['processed']) / 'company-matching__full')
del DATA_FULL['predictions']
PRED_PATH = Path(loc.DATA_SUBDIR['processed']) / 'company-matching__full' / 'predictions.parquet'

Questions:

1. Can we predict in batches?
2. If we predict in batches, do we get the same answer as when not?
3. Does this alleviate memory issues?
4. Does this work with clustering?

29/6 update: didn't even need to do batch stuff. CLUSTERING is the problem -- predict is fine. Opens up new avenues.

Let's see where clustering fails specifically.

30/6: [This could be an option](https://github.com/moj-analytical-services/splink/discussions/1218). Predict in a glob, cluster in batches. Robin's assumptions all hold in our use case. The model is fixed because this is batching one run, records are only added because we're batching one set of predictions, and the records don't change because this is batching one run.

## Repartition into multiple files

In [47]:
data_partitioned = {}

for data in data_full.keys():
    df = dd.read_parquet(data_full[data])
    df = df.repartition(partition_size="100MB")
    new_dir = Path(loc.DATA_SUBDIR['processed']) / 'company-matching__partitioned' / data
    df.to_parquet(new_dir)
    data_partitioned[data] = f"'{new_dir.as_posix()}'"
    
data_partitioned

{'hmrc_trade__exporters': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/hmrc_trade__exporters'",
 'dit_export_wins__wins_dataset': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/dit_export_wins__wins_dataset'",
 'dit_data_hub__companies': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/dit_data_hub__companies'",
 'companieshouse_companies': "'/home/jovyan/company_matching/data/processed/company-matching__partitioned/companieshouse_companies'"}

## Generate predictions and stash

In [3]:
json_raw = mlflow.artifacts.load_text(
    artifact_uri="runs:/22ce217706c54650ac34f59cb6a45960/model/companies_matching_model.json"
)
json_settings = json.loads(json_raw)

In [29]:
connection = duckdb.connect()

In [30]:
linker = DuckDBLinker(
    list(DATA_FULL.values()),
    settings_dict=settings,
    connection=connection,
    input_table_aliases=list(DATA_FULL.keys()),
)
linker.load_model(json_settings)

In [9]:
predictions = linker.predict(threshold_match_probability=0.7)


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'comp_num_clean':
    u values not fully trained


In [10]:
connection.query("""
    pragma database_size;
    call pragma_database_size();
""")

┌───────────────┬───────────────┬────────────┬──────────────┬───┬─────────────┬──────────┬──────────────┬──────────────┐
│ database_name │ database_size │ block_size │ total_blocks │ … │ free_blocks │ wal_size │ memory_usage │ memory_limit │
│    varchar    │    varchar    │   int64    │    int64     │   │    int64    │ varchar  │   varchar    │   varchar    │
├───────────────┼───────────────┼────────────┼──────────────┼───┼─────────────┼──────────┼──────────────┼──────────────┤
│ memory        │ 0 bytes       │          0 │            0 │ … │           0 │ 0 bytes  │ 11.0GB       │ 26.4GB       │
├───────────────┴───────────────┴────────────┴──────────────┴───┴─────────────┴──────────┴──────────────┴──────────────┤
│ 1 rows                                                                                           9 columns (8 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘

In [18]:
connection.query(f"""
    copy {predictions.physical_name}
    to '{PRED_PATH.as_posix()}'
    (format parquet);
""")

In [10]:
connection.query(f"""
    select *
    from '{PRED_PATH.as_posix()}'
""")

┌────────────────────┬────────────────────┬───┬──────────────────────┬──────────────────────┬───────────┐
│    match_weight    │ match_probability  │ … │     unique_id_l      │     unique_id_r      │ match_key │
│       double       │       double       │   │       varchar        │       varchar        │  varchar  │
├────────────────────┼────────────────────┼───┼──────────────────────┼──────────────────────┼───────────┤
│ 1.4157836817616756 │ 0.7273753031699655 │ … │ b18c76fc-a30d-e411…  │ 11f864f6-09bc-4cf8…  │ 0         │
│ 20.075070679022463 │ 0.9999990946819749 │ … │ 7a7fd6b2-4f0e-e411…  │ 19a1c784-0e8c-4b9b…  │ 0         │
│ 18.852678257686016 │ 0.9999978875938246 │ … │ e7275be8-7e11-e411…  │ 3d895056-4ffd-4a15…  │ 0         │
│ 21.338105084856256 │  0.999999622783957 │ … │ 6afa126b-a911-e411…  │ 03ebd8a5-f065-423a…  │ 0         │
│  18.20060156110632 │ 0.9999966805085885 │ … │ f7f4ddb5-4d12-e411…  │ 600dcf7f-7087-46f3…  │ 0         │
│ 22.075070679022463 │ 0.9999997736703401 │ … 

In [None]:
du.data_workspace_write(
    schema = "_user_eaf4fd9a",
    table = "lge_all_predictions",
    df = pd.read_parquet(PRED_PATH),
    if_exists = "replace",
    chunksize = int(1e6)
)

## Fix clustering

In [4]:
df_predict = pd.read_parquet(PRED_PATH)

In [4]:
linker.query_sql(f"select * from {db_predict.physical_name}")

NameError: name 'linker' is not defined

In [18]:
df_predict.source_dataset_l.value_counts()
df_predict.source_dataset_r.value_counts()

hmrc_trade__exporters            76194847
companieshouse_companies          2155706
dit_data_hub__companies           1078847
dit_export_wins__wins_dataset      147565
Name: source_dataset_l, dtype: int64

hmrc_trade__exporters            79091872
companieshouse_companies           196660
dit_export_wins__wins_dataset      186607
dit_data_hub__companies            101826
Name: source_dataset_r, dtype: int64

### NetworkX

DuckDB is performing a graph operation on a relational database -- no wonder it's running out of memory. This approach makes sense when you've got a cluster to play with and want to keep stuff SQL first, but we don't and don't.

Splink checks its connected components clustering using `networkx`. Let's try promoting it to our preferred method.

A problem I can see emerging in this method is when the unique ID of one table is (possibly by chance) the same as a unique ID in another. I believe this is either quite likely, when company ID has been used, or almost impossible, when it's a UUID. But it needs checking and I haven't done it yet.

In [6]:
df_predict.head(3)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,match_key
0,1.415784,0.727375,dit_data_hub__companies,dit_export_wins__wins_dataset,b18c76fc-a30d-e411-8a2b-e4115bead28a,11f864f6-09bc-4cf8-969e-ae790c28aec7,0
1,20.075071,0.999999,dit_data_hub__companies,dit_export_wins__wins_dataset,7a7fd6b2-4f0e-e411-8a2b-e4115bead28a,19a1c784-0e8c-4b9b-b40f-f4daa5d9bd01,0
2,18.852678,0.999998,dit_data_hub__companies,dit_export_wins__wins_dataset,e7275be8-7e11-e411-8a2b-e4115bead28a,3d895056-4ffd-4a15-91d3-05e6def6e606,0


#### First try

In [5]:
G = nx.from_pandas_edgelist(
    df = df_predict.sample(1_000_000),
    source = 'unique_id_l',
    target = 'unique_id_r',
    edge_attr = 'match_probability'
)

In [13]:
rows = []

for cluster in nx.connected_components(G):
    m = min(list(cluster))
    for n in cluster:
        row = {"node_id": n, "representative": m}
        rows.append(row)
        
clusters = pd.DataFrame(rows)

clusters.head(3)

Unnamed: 0,node_id,representative
0,6147b00b-0b1f-4941-ad03-8589005229ba,018de358-52d9-4df8-aa53-7e9b068c7237
1,7cd96a12-4ac8-4a58-bd4c-5649edea565c,018de358-52d9-4df8-aa53-7e9b068c7237
2,018de358-52d9-4df8-aa53-7e9b068c7237,018de358-52d9-4df8-aa53-7e9b068c7237


#### More advanced -- add attributes

In [14]:
df_sample = df_predict.sample(1_000_000)

In [53]:
G = nx.from_pandas_edgelist(
    df = df_sample,
    source = 'unique_id_l',
    target = 'unique_id_r',
    edge_attr = 'match_probability'
)

In [52]:
attr_l = (
    df_sample[['unique_id_l', 'source_dataset_l']]
    .rename(columns={
        'unique_id_l': 'unique_id',
        'source_dataset_l': 'source_dataset'
    })
)
attr_r = (
    df_sample[['unique_id_r', 'source_dataset_r']]
    .rename(columns={
        'unique_id_r': 'unique_id',
        'source_dataset_r': 'source_dataset'
    })
)
attr_all = (
    pd.concat([attr_r, attr_r])
    .drop_duplicates()
    # .groupby('unique_id')
    # .agg(lambda x: x.tolist())
    # .to_dict('index')
)
attr_dict = (
    pd.crosstab(
        attr_all.unique_id, 
        attr_all.source_dataset
    )
    .astype(bool)
    .to_dict('index')
)

In [54]:
nx.set_node_attributes(G, attr_dict)

In [81]:
nodes = (
    node
    for node, data
    in G.nodes(data=True)
    if data.get("dit_data_hub__companies") or data.get("companieshouse_companies")
)
subgraph = G.subgraph(nodes)

In [84]:
dict(subgraph.nodes(data=True))['CS002474']

{'companieshouse_companies': True,
 'dit_data_hub__companies': False,
 'dit_export_wins__wins_dataset': False,
 'hmrc_trade__exporters': False}

In [39]:
cc = next(nx.connected_components(G))

In [40]:
cc

{'018de358-52d9-4df8-aa53-7e9b068c7237',
 '6147b00b-0b1f-4941-ad03-8589005229ba',
 '7cd96a12-4ac8-4a58-bd4c-5649edea565c'}

In [43]:
nx.node_connected_component(G,'6147b00b-0b1f-4941-ad03-8589005229ba')

{'018de358-52d9-4df8-aa53-7e9b068c7237',
 '6147b00b-0b1f-4941-ad03-8589005229ba',
 '7cd96a12-4ac8-4a58-bd4c-5649edea565c'}

In [41]:
rows = []

for cluster in nx.connected_components(G):
    m = min(list(cluster))
    for n in cluster:
        row = {"node_id": n, "representative": m}
        rows.append(row)
        
clusters = pd.DataFrame(rows)

clusters.head(3)

Unnamed: 0,node_id,representative
0,6147b00b-0b1f-4941-ad03-8589005229ba,018de358-52d9-4df8-aa53-7e9b068c7237
1,7cd96a12-4ac8-4a58-bd4c-5649edea565c,018de358-52d9-4df8-aa53-7e9b068c7237
2,018de358-52d9-4df8-aa53-7e9b068c7237,018de358-52d9-4df8-aa53-7e9b068c7237


### Iterative clustering

See top of file. We're going to cluster this iteratively, then combine them at the end, because [these assumptions hold](https://github.com/moj-analytical-services/splink/discussions/1218).

In [33]:
df_predict_to_sample = df_predict

In [None]:
clusters = []

for i in range(2):
    print(f"Cluster {i}")
    print(f"df_predict_to_sample shape: {df_predict_to_sample.shape}")
          
    predict_sample = df_predict_to_sample.sample(100_000)
    df_predict_to_sample = df_predict_to_sample.drop(predict_sample.index)
          
    print("Sampling complete")
    print(f"predict_sample shape: {predict_sample.shape}")
    print(f"df_predict_to_sample shape: {df_predict_to_sample.shape}")
    
    linker = DuckDBLinker(
        list(DATA_FULL.values()),
        settings_dict=settings,
        connection=':memory:',
        input_table_aliases=list(DATA_FULL.keys()),
    )
    
    linker.load_model(json_settings)

    db_predict = linker.register_table(
        predict_sample, 
        "__splink__df_predict",
        overwrite=True
    )

    clusters_sample = linker.cluster_pairwise_predictions_at_threshold(
        db_predict,
        threshold_match_probability=0.7,
        pairwise_formatting=True,
        filter_pairwise_format_for_clusters=False,
    )
    
    clusters.append(clusters_sample.as_pandas_dataframe())
    
clusters

Cluster 0
df_predict_to_sample shape: (79576965, 7)
Sampling complete
predict_sample shape: (100000, 7)
df_predict_to_sample shape: (79476965, 7)


In [17]:
clusters[0].head(3)
clusters[1].head(3)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,match_key,cluster_id_l,cluster_id_r
0,1.83273,0.780803,companieshouse_companies,companieshouse_companies,38751,FC038751,0,companieshouse_companies-__-00038751,companieshouse_companies-__-00038751
1,7.753164,0.995386,companieshouse_companies,hmrc_trade__exporters,43694,1830964,1,companieshouse_companies-__-00043694,companieshouse_companies-__-00043694
2,7.737223,0.995335,companieshouse_companies,hmrc_trade__exporters,45916,2410810,1,companieshouse_companies-__-00045916,companieshouse_companies-__-00045916


Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,match_key,cluster_id_l,cluster_id_r
0,1.83273,0.780803,companieshouse_companies,companieshouse_companies,38751,FC038751,0,companieshouse_companies-__-00038751,companieshouse_companies-__-00038751
1,7.753164,0.995386,companieshouse_companies,hmrc_trade__exporters,43694,1830964,1,companieshouse_companies-__-00043694,companieshouse_companies-__-00043694
2,7.737223,0.995335,companieshouse_companies,hmrc_trade__exporters,45916,2410810,1,companieshouse_companies-__-00045916,companieshouse_companies-__-00045916


In [11]:
linker.query_sql(f"""
    select * from {clusters[0].physical_name} limit 5
""")
linker.query_sql(f"""
    select * from {clusters[1].physical_name} limit 5
""")

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,match_key,cluster_id_l,cluster_id_r
0,1.83273,0.780803,companieshouse_companies,companieshouse_companies,38751,FC038751,0,companieshouse_companies-__-00038751,companieshouse_companies-__-00038751
1,7.753164,0.995386,companieshouse_companies,hmrc_trade__exporters,43694,1830964,1,companieshouse_companies-__-00043694,companieshouse_companies-__-00043694
2,7.737223,0.995335,companieshouse_companies,hmrc_trade__exporters,45916,2410810,1,companieshouse_companies-__-00045916,companieshouse_companies-__-00045916
3,8.517097,0.997278,companieshouse_companies,hmrc_trade__exporters,48860,2505504,1,companieshouse_companies-__-00048860,companieshouse_companies-__-00048860
4,8.660055,0.997534,companieshouse_companies,hmrc_trade__exporters,49371,1039169,1,companieshouse_companies-__-00049371,companieshouse_companies-__-00049371


Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,match_key,cluster_id_l,cluster_id_r
0,1.83273,0.780803,companieshouse_companies,companieshouse_companies,38751,FC038751,0,companieshouse_companies-__-00038751,companieshouse_companies-__-00038751
1,7.753164,0.995386,companieshouse_companies,hmrc_trade__exporters,43694,1830964,1,companieshouse_companies-__-00043694,companieshouse_companies-__-00043694
2,7.737223,0.995335,companieshouse_companies,hmrc_trade__exporters,45916,2410810,1,companieshouse_companies-__-00045916,companieshouse_companies-__-00045916
3,8.517097,0.997278,companieshouse_companies,hmrc_trade__exporters,48860,2505504,1,companieshouse_companies-__-00048860,companieshouse_companies-__-00048860
4,8.660055,0.997534,companieshouse_companies,hmrc_trade__exporters,49371,1039169,1,companieshouse_companies-__-00049371,companieshouse_companies-__-00049371


### Clustering fail diagnosis

We're stepping through `linker.cluster_pairwise_predictions_at_threshold` to see what crashes the kernel. [Source](https://github.com/moj-analytical-services/splink/blob/56833b6fe6692de72530083f51dfdbad29c0fd33/splink/linker.py#L1953).

In [None]:
# linker.cluster_pairwise_predictions_at_threshold(
#     f"'{pred_path.as_posix()}'",
#     threshold_match_probability=0.7,
#     pairwise_formatting=True,
#     filter_pairwise_format_for_clusters=False,
# )

In [10]:
concat_with_tf = linker._initialise_df_concat_with_tf(predictions)

In [11]:
edges_table = _cc_create_unique_id_cols(
    linker,
    concat_with_tf.physical_name,
    predictions.physical_name,
    0.7,
)

Crashed in the connected components function. Let's break it down.

In [12]:
# cc = solve_connected_components(
#     linker,
#     edges_table,
#     predictions,
#     concat_with_tf,
#     pairwise_output = True,
#     filter_pairwise_format_for_clusters = False,
# )

This is within `solve_connected_components`.

In [13]:
input_dfs = [edges_table]
input_dfs.append(concat_with_tf)

In [14]:
# Create our initial node and neighbours tables
sql = _cc_create_nodes_table(linker, False)
linker._enqueue_sql(sql, "nodes")
sql = _cc_generate_neighbours_representation()
linker._enqueue_sql(sql, "__splink__df_neighbours")
neighbours = linker._execute_sql_pipeline(input_dfs)

In [12]:
# # Create our initial representatives table
# sql = _cc_generate_initial_representatives_table()
# linker._enqueue_sql(sql, "representatives")
# sql = _cc_update_neighbours_first_iter()
# linker._enqueue_sql(sql, "neighbours_first_iter")
# sql = _cc_update_representatives_first_iter()
# # Execute if we have no batching, otherwise add it to our batched process
# linker._enqueue_sql(sql, "__splink__df_representatives")

And here's our crash. Let's try running it sequentially, which I hope will work?

In [None]:
# representatives = linker._execute_sql_pipeline([neighbours])

Let's try.

In [15]:
sql = _cc_generate_initial_representatives_table()
linker._enqueue_sql(sql, "representatives")
step_1 = linker._execute_sql_pipeline([neighbours])

In [21]:
sql = _cc_update_neighbours_first_iter()
linker._enqueue_sql(sql, "neighbours_first_iter")
step_2 = linker._execute_sql_pipeline([step_1, neighbours])

In [24]:
sql = _cc_update_representatives_first_iter()
# Execute if we have no batching, otherwise add it to our batched process
linker._enqueue_sql(sql, "__splink__df_representatives")
representatives = linker._execute_sql_pipeline([step_2, step_1, neighbours])

In [34]:
linker.query_sql(f"""
    select * from {representatives.physical_name} limit 5
""")

Unnamed: 0,node_id,representative,rep_match
0,companieshouse_companies-__-07258900,companieshouse_companies-__-07258900,False
1,companieshouse_companies-__-07259140,companieshouse_companies-__-07259140,False
2,companieshouse_companies-__-07252840,companieshouse_companies-__-07252840,False
3,companieshouse_companies-__-07253048,companieshouse_companies-__-07253048,False
4,companieshouse_companies-__-07258011,companieshouse_companies-__-07258011,False


In [None]:
sql = _cc_generate_initial_representatives_table()
linker._enqueue_sql(sql, "representatives")
sql = _cc_update_neighbours_first_iter()
linker._enqueue_sql(sql, "neighbours_first_iter")
step_2 = linker._execute_sql_pipeline([step_1, neighbours])

In [31]:
prev_representatives_table = representatives

In [35]:
representatives.drop_table_from_database_and_remove_from_cache()

AttributeError: 'DuckDBLinkerDataFrame' object has no attribute 'drop_table_from_database_and_remove_from_cache'

In [32]:
prev_representatives_table.drop_table_from_database_and_remove_from_cache()

AttributeError: 'DuckDBLinkerDataFrame' object has no attribute 'drop_table_from_database_and_remove_from_cache'

In [28]:
# Loop while our representative table still has unsettled nodes
iteration, root_rows = 0, 1
while root_rows > 0:
    start_time = time.time()
    iteration += 1

    # Loop summary:

    # 1. Update our neighbours table.
    # 2. Join on the representatives table from the previous iteration
    #    to create the "rep_match" column.
    # 3. Assess if our exit condition has been met.

    # Generates our representatives table for the next iteration
    # by joining our previous tables onto our neighbours table.
    sql = _cc_generate_representatives_loop_cond(
        prev_representatives_table.physical_name,
    )
    linker._enqueue_sql(sql, "r")
    # Update our rep_match column in the representatives table.
    sql = _cc_update_representatives_loop_cond(
        prev_representatives_table.physical_name
    )

    repr_name = f"__splink__df_representatives_{iteration}"

    representatives = linker._enqueue_sql(
        sql,
        repr_name,
    )

    representatives = linker._execute_sql_pipeline([neighbours])
    # Update table reference
    prev_representatives_table.drop_table_from_database_and_remove_from_cache()
    prev_representatives_table = representatives

    # Check if our exit condition has been met...
    sql = _cc_assess_exit_condition(representatives.physical_name)

    linker._enqueue_sql(sql, "__splink__df_root_rows")

    root_rows_df = linker._execute_sql_pipeline(use_cache=False)

    root_rows = root_rows_df.as_record_dict()
    root_rows_df.drop_table_from_database_and_remove_from_cache()
    root_rows = root_rows[0]["count"]
    logger.info(f"Completed iteration {iteration}, root rows count {root_rows}")
    end_time = time.time()
    logger.log(15, f"    Iteration time: {end_time - start_time} seconds")

AttributeError: 'DuckDBLinkerDataFrame' object has no attribute 'drop_table_from_database_and_remove_from_cache'

In [29]:
linker.drop_table_from_database_and_remove_from_cache

AttributeError: 'DuckDBLinker' object has no attribute 'drop_table_from_database_and_remove_from_cache'

### Postgres fails

In [6]:
pg_con = du.sql_engine.connect()

In [15]:
pg_linker = PostgresLinker(
    input_table_or_tables=du.generate_dummy_df(),
    engine=du.sql_engine,
)
pg_linker.load_model(json_settings)

  res = con.execute(text(final_sql))


ProgrammingError: (psycopg2.errors.InvalidFunctionDefinition) return type mismatch in function declared to return double precision
DETAIL:  Actual return type is numeric.
CONTEXT:  SQL function "ave_months_between"

[SQL: 
        CREATE OR REPLACE FUNCTION ave_months_between(x date, y date)
        RETURNS float8 AS $$
        SELECT datediff(x, y)/30.4375;
        $$ LANGUAGE SQL IMMUTABLE;
        ]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [None]:
df_clusters = pg_linker.cluster_pairwise_predictions_at_threshold(
    df_predict, 
    0.7
)

In [20]:
clusters = linker.cluster_pairwise_predictions_at_threshold(
    f"'{pred_path.as_posix()}'",
    threshold_match_probability=0.7,
    pairwise_formatting=True,
    filter_pairwise_format_for_clusters=False,
)

type: 'str' object has no attribute 'physical_name'

In [None]:
lookup = linker.query_sql(
    f"""
    select
        source_dataset_l as source,
        unique_id_l as source_id,
        cluster_id_l as source_cluster,
        source_dataset_r as target,
        unique_id_r as target_id,
        cluster_id_r as target_cluster,
        match_probability
    from
        { clusters.physical_name }
    union
    select
        source_dataset_r as source,
        unique_id_r as source_id,
        cluster_id_r as source_cluster,
        source_dataset_l as target,
        unique_id_l as target_id,
        cluster_id_l as target_cluster,
        match_probability
    from
        { clusters.physical_name }
    """,
    output_type="splink_df",
)