In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame
InteractiveShell.ast_node_interactivity = "all"

# Companies House x HMRC exporters

I want to build this in a way that one can improve a link pair in a notebook, then deploy those changes to the link easily. I'm going to play with this idea here.

In [2]:
from src.data import utils as du
from src.models import utils as mu
from src.config import tables, stopwords
from src.features.clean_complex import clean_comp_names
from src.link.make_link import LinkDatasets

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

# import os
import logging
import mlflow
from functools import partial
from dotenv import load_dotenv, find_dotenv

In [3]:
settings = {
    "link_type": "link_only",
    "unique_id_column_name": "id",
    "retain_matching_columns": False,
    "retain_intermediate_calculation_columns": False,
    "blocking_rules_to_generate_predictions": [
        """
            (l.name_unusual_tokens = r.name_unusual_tokens)
            and (
                l.name_unusual_tokens <> ''
                and r.name_unusual_tokens <> ''
            )
        """,
        """
            (l.postcode = r.postcode)
            and (
                l.postcode <> ''
                and r.postcode <> ''
            )
        """
    ],
    "comparisons": [
        cl.jaro_winkler_at_thresholds(
            "name_unusual_tokens", [0.9, 0.6], term_frequency_adjustments=True
        ),
        ctl.postcode_comparison("postcode")
    ],
}

In [4]:
pipeline = {
    "estimate_probability_two_random_records_match": {
        "deterministic_matching_rules": """
            l.name_unusual_tokens = r.name_unusual_tokens
        """,
        "recall": 0.7
    },
    "estimate_u_using_random_sampling": {
        "max_pairs": 1e6
    },
    "estimate_parameters_using_expectation_maximisation": {
        "blocking_rule": """
            l.name_unusual_tokens = r.name_unusual_tokens
        """
    }
}

In [5]:
ch_settings = {
    "name": '"companieshouse"."companies"',
    "select": [
        "id::text",
        "company_name",
        "postcode"
    ],
    "preproc": {
        clean_comp_names: {
            "primary_col": "company_name",
            "secondary_col": None,
            "stopwords": stopwords
        }
    }
}

In [6]:
exp_settings = {
    "name": '"hmrc"."trade__exporters"',
    "select": [
        "id::text",
        "company_name",
        "postcode"
    ],
    "preproc": {
        clean_comp_names: {
            "primary_col": "company_name",
            "secondary_col": None,
            "stopwords": stopwords
        }
    }
}

In [7]:
ch_x_exp = LinkDatasets(
    table_l = ch_settings,
    table_r = exp_settings,
    settings = settings,
    pipeline = pipeline
)

In [8]:
ch_x_exp.get_data()

  meta = MetaData(self.connectable, schema=schema)


In [9]:
ch_x_exp.preprocess_data()

In [10]:
ch_x_exp.create_linker()

In [11]:
ch_x_exp.train_linker()

Probability two random records match is estimated to be  2.33e-07.
This means that amongst all possible pairwise record comparisons, one in 4,294,837.17 are expected to match.  With 1,368,138,787,675 total possible comparisons, we expect a total of around 318,554.29 matching pairs
----- Estimating u probabilities using random sampling -----
u probability not trained for name_unusual_tokens - Exact match (comparison vector value: 3). This usually means the comparison level was never observed in the training data.

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - name_unusual_tokens (some u values are not trained, no m values are trained).
    - postcode (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:

            l.name_unusual_tokens = r.name_unusual_tokens
        

Parameter estimates will be made for the following comparison(s):
    - po

In [12]:
ch_x_exp.predict(threshold_match_probability=0.7) 


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'name_unusual_tokens':
    m values not fully trained
Comparison: 'name_unusual_tokens':
    u values not fully trained


In [13]:
ch_x_exp.generate_report(sample=10)

{'eval_matches': 175842,
 'pred_matches': 152541,
 'both_eval_and_pred': 104647,
 'eval_only': 71195,
 'pred_only': 47894,
 'both_eval_and_pred_sample': [{'id_l': '02900615',
   'id_r': '1448544',
   'match_probability': 0.9999811108482505,
   'score': 4,
   'company_name_l_pred': 'RESCROFT LIMITED',
   'postcode_l_pred': 'B98 0RE',
   'company_name_r_pred': 'RESCROFT LIMITED',
   'postcode_r_pred': 'B98 0RE',
   'company_name_l_exist': 'RESCROFT LIMITED',
   'postcode_l_exist': 'B98 0RE',
   'company_name_r_exist': 'RESCROFT LIMITED',
   'postcode_r_exist': 'B98 0RE'},
  {'id_l': '03777854',
   'id_r': '348072',
   'match_probability': 0.9999716665399733,
   'score': 5,
   'company_name_l_pred': 'ARBOR VITAE LIMITED',
   'postcode_l_pred': 'WF10 1BY',
   'company_name_r_pred': 'ARBOR VITAE LTD',
   'postcode_r_pred': 'WF10 1BY',
   'company_name_l_exist': 'ARBOR VITAE LIMITED',
   'postcode_l_exist': 'WF10 1BY',
   'company_name_r_exist': 'ARBOR VITAE LTD',
   'postcode_r_exist': 'WF1

In [60]:
(
    agree
    .sample(10)
    .to_dict(orient='records')
)

[{'id_l': '04191274',
  'id_r': '2442232',
  'match_probability': 0.9997962447240717,
  'score': 4,
  'company_name_l_pred': 'LINDSTROM LIMITED',
  'postcode_l_pred': 'MK41 0TY',
  'company_name_r_pred': 'LINDSTROM LIMITED',
  'postcode_r_pred': 'MK41 0TY',
  'company_name_l_exist': 'LINDSTROM LIMITED',
  'postcode_l_exist': 'MK41 0TY',
  'company_name_r_exist': 'LINDSTROM LIMITED',
  'postcode_r_exist': 'MK41 0TY'},
 {'id_l': '11172250',
  'id_r': '2934795',
  'match_probability': 0.9947507544206163,
  'score': 4,
  'company_name_l_pred': 'LEAF DESIGN UK LIMITED',
  'postcode_l_pred': 'LN11 0WA',
  'company_name_r_pred': 'LEAF DESIGN UK LTD',
  'postcode_r_pred': 'LN11 0BA',
  'company_name_l_exist': 'LEAF DESIGN UK LIMITED',
  'postcode_l_exist': 'LN11 0WA',
  'company_name_r_exist': 'LEAF DESIGN UK LTD',
  'postcode_r_exist': 'LN11 0BA'},
 {'id_l': 'SC167094',
  'id_r': '2338513',
  'match_probability': 0.9995925724636534,
  'score': 4,
  'company_name_l_pred': 'ORB INTERNATIONAL KM