In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame
InteractiveShell.ast_node_interactivity = "all"

# Companies House x HMRC exporters

I want to build this in a way that one can improve a link pair in a notebook, then deploy those changes to the link easily. I'm going to play with this idea here.

In [29]:
from src.data import utils as du
from src.models import utils as mu
from src.config import tables, stopwords
from src.features.clean_complex import clean_comp_names
from src.link.make_link import LinkDatasets

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

# import os
import logging
import mlflow
from functools import partial
from dotenv import load_dotenv, find_dotenv
import json

In [30]:
settings = {
    "link_type": "link_only",
    "unique_id_column_name": "id",
    "retain_matching_columns": False,
    "retain_intermediate_calculation_columns": False,
    "blocking_rules_to_generate_predictions": [
        """
            (l.name_unusual_tokens = r.name_unusual_tokens)
            and (
                l.name_unusual_tokens <> ''
                and r.name_unusual_tokens <> ''
            )
        """,
        """
            (l.postcode = r.postcode)
            and (
                l.postcode <> ''
                and r.postcode <> ''
            )
        """
    ],
    "comparisons": [
        cl.jaro_winkler_at_thresholds(
            "name_unusual_tokens", [0.9, 0.6], term_frequency_adjustments=True
        ),
        ctl.postcode_comparison("postcode")
    ]
}

In [31]:
pipeline = {
    "estimate_probability_two_random_records_match": {
        "function": "estimate_probability_two_random_records_match",
        "arguments": {
            "deterministic_matching_rules": """
                l.name_unusual_tokens = r.name_unusual_tokens
            """,
            "recall": 0.7  
        }
    },
    "estimate_u_using_random_sampling": {
        "function": "estimate_u_using_random_sampling",
        "arguments": {
            "max_pairs": 1e6
        }
    },
    "estimate_parameters_using_expectation_maximisation": {
        "function": "estimate_parameters_using_expectation_maximisation",
        "arguments": {
            "blocking_rule": """
                l.name_unusual_tokens = r.name_unusual_tokens
            """
        }
    }
}

In [32]:
ch_settings = {
    "name": '"companieshouse"."companies"',
    "select": [
        "id::text",
        "company_name",
        "postcode"
    ],
    "preproc": {
        "clean_comp_names": {
            "function": clean_comp_names,
            "arguments": {
                "primary_col": "company_name",
                "secondary_col": None,
                "stopwords": stopwords
            }
        }
    }
}

In [33]:
exp_settings = {
    "name": '"hmrc"."trade__exporters"',
    "select": [
        "id::text",
        "company_name",
        "postcode"
    ],
    "preproc": {
        "clean_comp_names": {
            "function": clean_comp_names,
            "arguments": {
                "primary_col": "company_name",
                "secondary_col": None,
                "stopwords": stopwords
            }
        }
    }
}

## Running this as an MLflow experiment

In [41]:
ch_x_exp = LinkDatasets(
    table_l = ch_settings,
    table_r = exp_settings,
    settings = settings,
    pipeline = pipeline
)

In [42]:
ch_x_exp.run_mlflow_experiment(
    run_name="Basic linkage",
    description="""
        - Unusual tokens in name
        - Preset postcode distances
        - Eval vs existing service
    """,
    threshold_match_probability=0.7
)

Probability two random records match is estimated to be  2.33e-07.
This means that amongst all possible pairwise record comparisons, one in 4,294,837.17 are expected to match.  With 1,368,138,787,675 total possible comparisons, we expect a total of around 318,554.29 matching pairs
----- Estimating u probabilities using random sampling -----
u probability not trained for name_unusual_tokens - Exact match (comparison vector value: 3). This usually means the comparison level was never observed in the training data.

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - name_unusual_tokens (some u values are not trained, no m values are trained).
    - postcode (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:

                l.name_unusual_tokens = r.name_unusual_tokens
            

Parameter estimates will be made for the following comparison(s):


## Playing with the pipeline bit by bit

In [34]:
ch_x_exp = LinkDatasets(
    table_l = ch_settings,
    table_r = exp_settings,
    settings = settings,
    pipeline = pipeline
)

In [35]:
ch_x_exp.get_data()

In [36]:
ch_x_exp.preprocess_data()

In [37]:
ch_x_exp.create_linker()

In [38]:
ch_x_exp.train_linker()

Probability two random records match is estimated to be  2.33e-07.
This means that amongst all possible pairwise record comparisons, one in 4,294,837.17 are expected to match.  With 1,368,138,787,675 total possible comparisons, we expect a total of around 318,554.29 matching pairs
----- Estimating u probabilities using random sampling -----
u probability not trained for name_unusual_tokens - Exact match (comparison vector value: 3). This usually means the comparison level was never observed in the training data.

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - name_unusual_tokens (some u values are not trained, no m values are trained).
    - postcode (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:

                l.name_unusual_tokens = r.name_unusual_tokens
            

Parameter estimates will be made for the following comparison(s):


In [39]:
ch_x_exp.predict(threshold_match_probability=0.7) 


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'name_unusual_tokens':
    m values not fully trained
Comparison: 'name_unusual_tokens':
    u values not fully trained


In [40]:
ch_x_exp.generate_report(sample=10)

{'eval_matches': 175842,
 'pred_matches': 152491,
 'both_eval_and_pred': 104636,
 'eval_only': 71206,
 'pred_only': 47855,
 'both_eval_and_pred_sample': [{'id_l': '10286497',
   'id_r': '1038391',
   'match_probability': 0.9998875592772737,
   'score': 4,
   'company_name_l_pred': 'KIKKA MARCO LTD',
   'postcode_l_pred': 'EN6 5AS',
   'company_name_r_pred': 'KIKKA MARCO LTD',
   'postcode_r_pred': 'EN6 5AS',
   'company_name_l_exist': 'KIKKA MARCO LTD',
   'postcode_l_exist': 'EN6 5AS',
   'company_name_r_exist': 'KIKKA MARCO LTD',
   'postcode_r_exist': 'EN6 5AS'},
  {'id_l': '10615340',
   'id_r': '2498206',
   'match_probability': 0.9959721759628796,
   'score': 4,
   'company_name_l_pred': 'A SPEC ENVIRONMENTAL LTD',
   'postcode_l_pred': 'GU15 3AJ',
   'company_name_r_pred': 'A SPEC ENVIRONMENTAL LTD',
   'postcode_r_pred': 'GU15 3AQ',
   'company_name_l_exist': 'A SPEC ENVIRONMENTAL LTD',
   'postcode_l_exist': 'GU15 3AJ',
   'company_name_r_exist': 'A SPEC ENVIRONMENTAL LTD',
  