In [1]:
# Enable autoreload in mode 2: imported modules are reloaded before each cell
# executes, so edits to the project package are picked up without a restart.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame
# Display the result of every top-level expression in a cell, not just the last.
InteractiveShell.ast_node_interactivity = "all"

# Companies House x HMRC exporters

I want to build this so that a link pair can be improved in a notebook and those changes can then be deployed to the link easily. This notebook explores that idea.

In [2]:
from src.data import utils as du
from src.models import utils as mu
from src.config import tables, stopwords
from src.features.clean_complex import clean_comp_names
from src.link.make_link import LinkDatasets

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

# import os
import logging
import mlflow
from functools import partial
from dotenv import load_dotenv, find_dotenv

In [12]:
# Splink settings for linking the two tables (link_only: no within-table dedupe).
settings = dict(
    link_type="link_only",
    unique_id_column_name="id",
    retain_matching_columns=False,
    retain_intermediate_calculation_columns=False,
    # Two blocking rules: exact equality on the cleaned name tokens, or exact
    # equality on postcode. Each rule also requires both sides to be non-empty.
    blocking_rules_to_generate_predictions=[
        """
            (l.name_unusual_tokens = r.name_unusual_tokens)
            and (
                l.name_unusual_tokens <> ''
                and r.name_unusual_tokens <> ''
            )
        """,
        """
            (l.postcode = r.postcode)
            and (
                l.postcode <> ''
                and r.postcode <> ''
            )
        """,
    ],
    comparisons=[
        # Jaro-Winkler levels at 0.9 and 0.6, with term-frequency adjustments.
        cl.jaro_winkler_at_thresholds(
            "name_unusual_tokens",
            [0.9, 0.6],
            term_frequency_adjustments=True,
        ),
        # Template comparison for the postcode column.
        ctl.postcode_comparison("postcode"),
    ],
)

In [4]:
# Training pipeline handed to LinkDatasets. Each key reads like a Splink linker
# training method and each value like its keyword arguments (presumably
# dispatched by LinkDatasets.train_linker; confirm in src.link.make_link).
pipeline = dict(
    estimate_probability_two_random_records_match=dict(
        deterministic_matching_rules="""
            l.name_unusual_tokens = r.name_unusual_tokens
        """,
        recall=0.7,
    ),
    estimate_u_using_random_sampling=dict(max_pairs=1e6),
    estimate_parameters_using_expectation_maximisation=dict(
        blocking_rule="""
            l.name_unusual_tokens = r.name_unusual_tokens
        """
    ),
)

In [5]:
# Table definition for Companies House companies (passed as table_l below).
ch_settings = dict(
    name='"companieshouse"."companies"',
    select=["id::text", "company_name", "postcode"],
    preproc={
        # Keyed by the cleaning callable; the value holds its keyword arguments
        # (presumably applied by LinkDatasets.preprocess_data; confirm).
        clean_comp_names: dict(
            primary_col="company_name",
            secondary_col=None,
            stopwords=stopwords,
        )
    },
)

In [6]:
# Table definition for HMRC trade exporters (passed as table_r below).
exp_settings = dict(
    name='"hmrc"."trade__exporters"',
    select=["id::text", "company_name", "postcode"],
    preproc={
        # Keyed by the cleaning callable; the value holds its keyword arguments
        # (presumably applied by LinkDatasets.preprocess_data; confirm).
        clean_comp_names: dict(
            primary_col="company_name",
            secondary_col=None,
            stopwords=stopwords,
        )
    },
)

In [13]:
# Bundle both table definitions with the Splink settings and training pipeline.
ch_x_exp = LinkDatasets(
    table_l=ch_settings,
    table_r=exp_settings,
    settings=settings,
    pipeline=pipeline,
)

In [14]:
# Run the data-loading step (presumably reads the tables named in ch_settings
# and exp_settings; confirm in src.link.make_link).
ch_x_exp.get_data()

In [None]:
# Run the preprocessing step (presumably applies each table's "preproc"
# callables, here clean_comp_names; confirm in src.link.make_link).
ch_x_exp.preprocess_data()

In [None]:
# Build the linker; later cells access it as ch_x_exp.linker.
ch_x_exp.create_linker()

In [None]:
# Train the linker (presumably by running the steps configured in `pipeline`;
# confirm in src.link.make_link).
ch_x_exp.train_linker()

In [None]:
# Score candidate pairs with the trained linker, keeping only comparisons whose
# match probability clears the 0.7 threshold.
predictions = ch_x_exp.linker.predict(threshold_match_probability=0.7) 

In [None]:
# Preview ten predicted pairs. Passing `limit` makes Splink fetch only those
# rows, instead of converting the whole prediction table to pandas and then
# discarding all but the first ten.
predictions.as_pandas_dataframe(limit=10)

In [None]:
# Load whatever the link pair registers under its "eval" key via du.dataset
# (presumably an existing evaluation set for this pair; confirm).
existing = du.dataset(ch_x_exp.pair["eval"])

In [None]:
# Show the loaded object as the cell output.
existing