In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame
InteractiveShell.ast_node_interactivity = "all"

# Companies House x HMRC exporters

I want to build this in a way that one can improve a link pair in a notebook, then deploy those changes to the link easily. I'm going to play with this idea here.

In [2]:
from src.data import utils as du
from src.models import utils as mu
from src.config import tables, stopwords
from src.features.clean_complex import clean_comp_names
from src.link.make_link import LinkDatasets

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

# import os
import logging
import mlflow
from functools import partial
from dotenv import load_dotenv, find_dotenv

In [3]:
settings = {
    "link_type": "link_only",
    "unique_id_column_name": "id",
    "retain_matching_columns": False,
    "retain_intermediate_calculation_columns": False,
    "blocking_rules_to_generate_predictions": [
        """
            (l.name_unusual_tokens = r.name_unusual_tokens)
            and (
                l.name_unusual_tokens <> ''
                and r.name_unusual_tokens <> ''
            )
        """,
        """
            (l.postcode = r.postcode)
            and (
                l.postcode <> ''
                and r.postcode <> ''
            )
        """
    ],
    "comparisons": [
        cl.jaro_winkler_at_thresholds(
            "name_unusual_tokens", [0.9, 0.6], term_frequency_adjustments=True
        ),
        ctl.postcode_comparison("postcode")
    ],
}

In [4]:
pipeline = {
    "estimate_probability_two_random_records_match": {
        "deterministic_matching_rules": """
            l.name_unusual_tokens = r.name_unusual_tokens
        """,
        "recall": 0.7
    },
    "estimate_u_using_random_sampling": {
        "max_pairs": 1e6
    },
    "estimate_parameters_using_expectation_maximisation": {
        "blocking_rule": """
            l.name_unusual_tokens = r.name_unusual_tokens
        """
    }
}

In [5]:
ch_settings = {
    "name": '"companieshouse"."companies"',
    "select": [
        "id::text",
        "company_name",
        "postcode"
    ],
    "preproc": {
        clean_comp_names: {
            "primary_col": "company_name",
            "secondary_col": None,
            "stopwords": stopwords
        }
    }
}

In [6]:
exp_settings = {
    "name": '"hmrc"."trade__exporters"',
    "select": [
        "id::text",
        "company_name",
        "postcode"
    ],
    "preproc": {
        clean_comp_names: {
            "primary_col": "company_name",
            "secondary_col": None,
            "stopwords": stopwords
        }
    }
}

In [7]:
ch_x_exp = LinkDatasets(
    table_l = ch_settings,
    table_r = exp_settings,
    settings = settings,
    pipeline = pipeline
)

In [8]:
ch_x_exp.get_data()

  meta = MetaData(self.connectable, schema=schema)


In [9]:
ch_x_exp.preprocess_data()

In [10]:
ch_x_exp.create_linker()

In [11]:
ch_x_exp.train_linker()

Probability two random records match is estimated to be  2.33e-07.
This means that amongst all possible pairwise record comparisons, one in 4,294,837.17 are expected to match.  With 1,368,138,787,675 total possible comparisons, we expect a total of around 318,554.29 matching pairs
----- Estimating u probabilities using random sampling -----
u probability not trained for name_unusual_tokens - Exact match (comparison vector value: 3). This usually means the comparison level was never observed in the training data.

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - name_unusual_tokens (some u values are not trained, no m values are trained).
    - postcode (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:

            l.name_unusual_tokens = r.name_unusual_tokens
        

Parameter estimates will be made for the following comparison(s):
    - po

In [12]:
ch_x_exp.predict(threshold_match_probability=0.7) 


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'name_unusual_tokens':
    m values not fully trained
Comparison: 'name_unusual_tokens':
    u values not fully trained


In [27]:
existing = (
    du.dataset(ch_x_exp.pair['eval'])
    .merge(
        ch_x_exp.table_l_raw.add_suffix('_l'),
        how='left',
        left_on=['id_l'],
        right_on=['id_l'],
        suffixes=('', '_remove')
    )
    .merge(
        ch_x_exp.table_r_raw.add_suffix('_r'),
        how='left',
        left_on=['id_r'],
        right_on=['id_r'],
        suffixes=('', '_remove')
    )
    .filter(regex='^((?!remove).)*$')
)

In [28]:
predictions = (
    ch_x_exp.predictions
    .as_pandas_dataframe()
    .sort_values(
        by=['match_probability'],
        ascending=False
    )
    .drop_duplicates(
        subset=['id_l', 'id_r'],
        keep='first')
    .merge(
        ch_x_exp.table_l_raw.add_suffix('_l'),
        how='left',
        left_on=['id_l'],
        right_on=['id_l'],
        suffixes=('', '_remove')
    )
    .merge(
        ch_x_exp.table_r_raw.add_suffix('_r'),
        how='left',
        left_on=['id_r'],
        right_on=['id_r'],
        suffixes=('', '_remove')
    )
    .filter(regex='^((?!remove).)*$')
)

In [29]:
agree = (
    predictions
    .merge(
        existing,
        how='inner',
        on=['id_l', 'id_r']
    )
)

In [30]:
disagree = (
    predictions
    .merge(
        existing, 
        how='outer', 
        on=['id_l', 'id_r'],
        indicator=True
    )
)

prediction_only = disagree[(disagree._merge=='left_only')].drop('_merge', axis=1)
existing_only = disagree[(disagree._merge=='right_only')].drop('_merge', axis=1)

In [31]:
agree.count()
agree.head(5)

match_weight         102131
match_probability    102131
source_dataset_l     102131
source_dataset_r     102131
id_l                 102131
id_r                 102131
match_key            102131
company_name_l_x     102131
postcode_l_x         102131
company_name_r_x     102131
postcode_r_x         102131
cluster              102131
score                102131
company_name_l_y     102131
postcode_l_y         102131
company_name_r_y     102131
postcode_r_y         102131
dtype: int64

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,id_l,id_r,match_key,company_name_l_x,postcode_l_x,company_name_r_x,postcode_r_x,cluster,score,company_name_l_y,postcode_l_y,company_name_r_y,postcode_r_y
0,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,2748422,3365816,0,ANGLIA CROWN LIMITED,CO4 9WN,ANGLIA CROWN LTD,CO4 9WN,3360823,5,ANGLIA CROWN LIMITED,CO4 9WN,ANGLIA CROWN LTD,CO4 9WN
1,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,1124430,66991,0,ANGLIA CNC ENGINEERING LTD,PE27 3LE,ANGLIA CNC ENGINEERING LTD,PE27 3LE,2353345,4,ANGLIA CNC ENGINEERING LTD,PE27 3LE,ANGLIA CNC ENGINEERING LTD,PE27 3LE
2,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,3221366,2286190,0,ANGLIA CARGO INTERNATIONAL LIMITED,NR6 6HP,ANGLIA CARGO INTERNATIONAL LTD,NR6 6HP,435128,5,ANGLIA CARGO INTERNATIONAL LIMITED,NR6 6HP,ANGLIA CARGO INTERNATIONAL LTD,NR6 6HP
3,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,5797585,2988060,0,ANGLE EUROPE LIMITED,GU2 7AF,ANGLE EUROPE LIMITED,GU2 7AF,15692,3,ANGLE EUROPE LIMITED,GU2 7AF,ANGLE EUROPE LIMITED,GU2 7AF
4,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,1215741,315516,0,WORLD NUCLEAR ASSOCIATION,WC2E 7HA,WORLD NUCLEAR ASSOCIATION,WC2E 7HA,25407,4,WORLD NUCLEAR ASSOCIATION,WC2E 7HA,WORLD NUCLEAR ASSOCIATION,WC2E 7HA


In [32]:
prediction_only.count()
prediction_only.head(5)

match_weight         32595
match_probability    32595
source_dataset_l     32595
source_dataset_r     32595
id_l                 32595
id_r                 32595
match_key            32595
company_name_l_x     32595
postcode_l_x         32595
company_name_r_x     32595
postcode_r_x         32595
cluster                  0
score                    0
company_name_l_y         0
postcode_l_y             0
company_name_r_y         0
postcode_r_y             0
dtype: int64

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,id_l,id_r,match_key,company_name_l_x,postcode_l_x,company_name_r_x,postcode_r_x,cluster,score,company_name_l_y,postcode_l_y,company_name_r_y,postcode_r_y
45,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,07251600,2609658,0,ANGLO AMERICAN WOODSMITH LIMITED,EC1N 6RA,ANGLO AMERICAN WOODSMITH LIMITED,EC1N 6RA,,,,,,
109,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,05743586,231418,0,AMBI-RAD GROUP LIMITED,DY5 1QA,AMBI-RAD LIMITED,DY5 1QA,,,,,,
119,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,SC521235,2590909,0,AMERICAN HOT TUBS LIMITED,FK3 8WX,AMERICAN HOT TUBS LIMITED,FK3 8WX,,,,,,
151,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,04688688,2782549,0,AMETHYST GLOBAL FREIGHT LIMITED,CR0 1ED,AMETHYST GLOBAL FREIGHT LIMITED,CR0 1ED,,,,,,
207,12.260581,0.999796,companieshouse_companies,hmrc_trade__exporters,04022603,2909708,0,PANSERVE LIMITED,BN3 1RE,PANSERVE LIMITED,BN3 1RE,,,,,,


In [33]:
existing_only.count()
existing_only.head(5)

match_weight             0
match_probability        0
source_dataset_l         0
source_dataset_r         0
id_l                 73711
id_r                 73711
match_key                0
company_name_l_x         0
postcode_l_x             0
company_name_r_x         0
postcode_r_x             0
cluster              73711
score                73711
company_name_l_y     73711
postcode_l_y         73711
company_name_r_y     73711
postcode_r_y         73711
dtype: int64

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,id_l,id_r,match_key,company_name_l_x,postcode_l_x,company_name_r_x,postcode_r_x,cluster,score,company_name_l_y,postcode_l_y,company_name_r_y,postcode_r_y
134726,,,,,SO307775,1926564,,,,,,6913209.0,4.0,DVNE LLP,EH6 7JG,OWEN NORMAND,EH6 7JG
134727,,,,,SC767783,2339494,,,,,,6913049.0,4.0,XENOLITH VENDING LTD,G42 7AF,HABIB AHMED,G42 7AF
134728,,,,,SC767657,211654,,,,,,6912923.0,4.0,DATA NET TELECOM CIVILS LIMITED,PA23 8PB,CORYDON MACRAE,PA23 8PB
134729,,,,,SC767560,481876,,,,,,6912826.0,4.0,AAT PROJECTS AND REPAIRS LIMITED,KA8 9DJ,DOUBLE GLAZING PARTS & REPAIRS,KA8 9DJ
134730,,,,,SC767481,408833,,,,,,6912747.0,4.0,TINY BRICKS LTD,EH11 2PP,SATWINDER SINGH LANDA,EH11 2PP


In [31]:
agree.shape[0]

104647

In [33]:
(
    agree
    [['id_l', 'id_r', 'score', 'match_probability']]
    .sample(10)
)

Unnamed: 0,id_l,id_r,score,match_probability
1101,8577605,2555723,5,0.999969
23931,5365424,162685,4,0.999969
93129,3154801,1752333,4,0.916819
3766,4253364,3226319,5,0.999969
20710,11434953,2221788,4,0.999969
35099,1071951,2147567,4,0.999969
63584,8434968,2892540,4,0.999953
103925,8599466,913101,4,0.846414
102573,9078567,127292,4,0.880211
56804,4740333,871333,5,0.999953
