# Companies House x HMRC exporters

I want to build this so that a link pair can be improved in a notebook and those changes then deployed to the link easily. This notebook explores that idea: first by driving Splink directly, then by wrapping the same steps in the `LinkDatasets` class.

In [1]:
from src.data import utils as du
from src.models import utils as mu
from src.config import tables, stopwords
from src.features.clean_complex import clean_comp_names
from src.link.link import LinkDatasets

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

# import os
import logging
import mlflow
from functools import partial
from dotenv import load_dotenv, find_dotenv

In [23]:
# Load the Companies House dimension table. `id` is cast to text and exposed
# as `dim_uuid`, the unique-id column name used in the Splink settings.
companieshouse_dim = tables['"companieshouse"."companies"']["dim"]
companieshouse_raw = du.query(
    f"""
        select
            id::text as dim_uuid,
            company_name,
            postcode
        from
            {companieshouse_dim};
    """
)

# Clean the company names; only a primary name column is supplied.
companieshouse_name_options = dict(
    primary_col="company_name",
    secondary_col=None,
    stopwords=stopwords,
)
companieshouse_proc = clean_comp_names(
    companieshouse_raw, **companieshouse_name_options
)

In [14]:
# Preview the first five rows of the cleaned Companies House frame.
companieshouse_proc.head(5)

Unnamed: 0,company_name_arr,dim_uuid,company_name,postcode,stopwords,name_unusual_tokens,names_tokens_stopwords,name_unusual_tokens_first5,name_unusual_tokens_last5
0,"[a, r, motors, limited]",12586403,A&R MOTORS LIMITED,NW10 2QU,"[limited, uk, company, international, group, o...",a motors r,limited,a mot,ors r
1,"[a, r, munteanu, limited]",14119280,A&R MUNTEANU LTD,E6 3LA,"[limited, uk, company, international, group, o...",a munteanu r,limited,a mun,anu r
2,"[a, r, nielsen, limited]",10238259,A&R NIELSEN LIMITED,EN5 5LG,"[limited, uk, company, international, group, o...",a nielsen r,limited,a nie,sen r
3,"[a, r, online, education, limited]",12722020,A&R ONLINE EDUCATION LTD,IG1 3FA,"[limited, uk, company, international, group, o...",a education online r,limited,a edu,ine r
4,"[a, r, painting, services, limited]",8441975,A&R PAINTING SERVICES LTD,CH64 5SJ,"[limited, uk, company, international, group, o...",a painting r services,limited,a pai,vices


In [24]:
# Load the HMRC exporters dimension table, casting `dim_uuid` to text.
hmrcexporters_dim = tables['"hmrc"."trade__exporters"']["dim"]
hmrcexporters_raw = du.query(
    f"""
        select
            dim_uuid::text,
            company_name,
            postcode
        from
            {hmrcexporters_dim};
    """
)

# Clean the company names; only a primary name column is supplied.
hmrcexporters_name_options = dict(
    primary_col="company_name",
    secondary_col=None,
    stopwords=stopwords,
)
hmrcexporters_proc = clean_comp_names(
    hmrcexporters_raw, **hmrcexporters_name_options
)

In [13]:
# Preview the first five rows of the cleaned HMRC exporters frame.
hmrcexporters_proc.head(5)

Unnamed: 0,company_name_arr,dim_uuid,company_name,postcode,stopwords,name_unusual_tokens,names_tokens_stopwords,name_unusual_tokens_first5,name_unusual_tokens_last5
0,"[ska, organisation]",df91e625-6ea6-437f-80db-a7fb3d7084ff,SKA ORGANISATION,SK11 9DL,"[limited, uk, company, international, group, o...",organisation ska,,organ,n ska
1,"[skapandi, limited]",2bf89964-bbe5-4b2a-b892-6eafd79975d2,SKAPANDI LTD,YO11 3UD,"[limited, uk, company, international, group, o...",skapandi,limited,skapa,pandi
2,"[skape, limited]",f17627f8-f06f-4256-85e2-0cc7efea0d4a,SKAPE LTD,BA5 1AF,"[limited, uk, company, international, group, o...",skape,limited,skape,skape
3,"[skardin, industrial, uk, limited]",4333cc4f-a41f-4a5c-9646-2f218709937e,SKARDIN INDUSTRIAL (UK) LIMITED,CM6 2JR,"[limited, uk, company, international, group, o...",industrial skardin,uk limited,indus,ardin
4,"[skardin, industrial, uk, limited]",324f409d-3e6f-456a-9f7e-90b09075fb56,SKARDIN INDUSTRIAL (UK) LIMITED,CM19 5QB,"[limited, uk, company, international, group, o...",industrial skardin,uk limited,indus,ardin


In [25]:
# Splink settings for linking the two cleaned tables on `dim_uuid`.
# Matching and intermediate-calculation columns are not retained in the output.
settings = {
    "link_type": "link_only",
    "unique_id_column_name": "dim_uuid",
    "retain_matching_columns": False,
    "retain_intermediate_calculation_columns": False,
    # Candidate pairs are generated by any of the four blocking rules below.
    "blocking_rules_to_generate_predictions": [
        # Rule 1: identical cleaned name tokens, excluding empty strings.
        """
            (l.name_unusual_tokens = r.name_unusual_tokens)
            and (
                l.name_unusual_tokens <> ''
                and r.name_unusual_tokens <> ''
            )
        """,
        # Rule 2: identical postcode, excluding empty strings.
        """
            (l.postcode = r.postcode)
            and (
                l.postcode <> ''
                and r.postcode <> ''
            )
        """,
        # Rule 3: identical `first5` value, only when both sides are exactly
        # five characters long.
        """
            (l.name_unusual_tokens_first5 = r.name_unusual_tokens_first5)
            and (
                length(l.name_unusual_tokens_first5) = 5
                and length(r.name_unusual_tokens_first5) = 5
            )
        """,
        # Rule 4: identical `last5` value, only when both sides are exactly
        # five characters long.
        """
            (l.name_unusual_tokens_last5 = r.name_unusual_tokens_last5)
            and (
                length(l.name_unusual_tokens_last5) = 5
                and length(r.name_unusual_tokens_last5) = 5
            )
        """
    ],
    "comparisons": [
        # Jaro-Winkler on the cleaned name tokens at thresholds 0.9 and 0.6,
        # with term-frequency adjustments enabled.
        cl.jaro_winkler_at_thresholds(
            "name_unusual_tokens", [0.9, 0.6], term_frequency_adjustments=True
        ),
        # Splink's postcode comparison template applied to `postcode`.
        ctl.postcode_comparison("postcode")
    ],
}

In [26]:
# Build the DuckDB-backed Splink linker over the two cleaned frames.
# The aliases are listed in the same order as the input frames.
linker_inputs = [companieshouse_proc, hmrcexporters_proc]
linker_aliases = ["companieshouse_companies", "hmrc_trade__exporters"]

linker = DuckDBLinker(
    input_table_or_tables=linker_inputs,
    settings_dict=settings,
    input_table_aliases=linker_aliases,
)

In [27]:
# Estimate the probability that two random records match, using one
# deterministic rule (identical cleaned name tokens) and a guessed recall of
# 0.7 for that rule.
# Splink documents `deterministic_matching_rules` as a list of rules, so the
# single rule is passed as a one-element list rather than a bare string.
linker.estimate_probability_two_random_records_match(
    ["l.name_unusual_tokens = r.name_unusual_tokens"],
    recall=0.7,
)

In [2]:
# Estimate the u parameters by random sampling, capped at 1e6 pairs.
# NOTE(review): the saved NameError output carries execution count 2, so this
# cell was last run before `linker` existed in that kernel session; re-run the
# cells in order.
linker.estimate_u_using_random_sampling(max_pairs=1e6)

NameError: name 'linker' is not defined

In [None]:
# Estimate model parameters with expectation maximisation, blocking on pairs
# that agree on both the cleaned name tokens and the postcode.
m_by_name_and_postcode = """
    l.name_unusual_tokens = r.name_unusual_tokens
    and l.postcode = r.postcode
"""
# Pass the rule defined above. The cell previously referenced the undefined
# name `m_by_name_and_postcode_area`, which raises NameError.
linker.estimate_parameters_using_expectation_maximisation(
    m_by_name_and_postcode
)

In [None]:
# Score the candidate pairs with a match-probability threshold of 0.7.
predictions = linker.predict(threshold_match_probability=0.7) 

In [None]:
# Display the object returned by `linker.predict`.
predictions

## As a class

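Let's try running the same link through the `LinkDatasets` class, passing the Splink settings and the training steps as plain dictionaries.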

In [2]:
# Splink settings for linking the two cleaned tables on `dim_uuid`.
# NOTE(review): this dict repeats the `settings` cell from the first section;
# presumably it is duplicated so this section can run on a fresh kernel.
settings = {
    "link_type": "link_only",
    "unique_id_column_name": "dim_uuid",
    "retain_matching_columns": False,
    "retain_intermediate_calculation_columns": False,
    # Candidate pairs are generated by any of the four blocking rules below.
    "blocking_rules_to_generate_predictions": [
        # Rule 1: identical cleaned name tokens, excluding empty strings.
        """
            (l.name_unusual_tokens = r.name_unusual_tokens)
            and (
                l.name_unusual_tokens <> ''
                and r.name_unusual_tokens <> ''
            )
        """,
        # Rule 2: identical postcode, excluding empty strings.
        """
            (l.postcode = r.postcode)
            and (
                l.postcode <> ''
                and r.postcode <> ''
            )
        """,
        # Rule 3: identical `first5` value, only when both sides are exactly
        # five characters long.
        """
            (l.name_unusual_tokens_first5 = r.name_unusual_tokens_first5)
            and (
                length(l.name_unusual_tokens_first5) = 5
                and length(r.name_unusual_tokens_first5) = 5
            )
        """,
        # Rule 4: identical `last5` value, only when both sides are exactly
        # five characters long.
        """
            (l.name_unusual_tokens_last5 = r.name_unusual_tokens_last5)
            and (
                length(l.name_unusual_tokens_last5) = 5
                and length(r.name_unusual_tokens_last5) = 5
            )
        """
    ],
    "comparisons": [
        # Jaro-Winkler on the cleaned name tokens at thresholds 0.9 and 0.6,
        # with term-frequency adjustments enabled.
        cl.jaro_winkler_at_thresholds(
            "name_unusual_tokens", [0.9, 0.6], term_frequency_adjustments=True
        ),
        # Splink's postcode comparison template applied to `postcode`.
        ctl.postcode_comparison("postcode")
    ],
}

In [4]:
# Training steps for the linker, expressed as data. Each key is the name of a
# Splink linker method and each value holds keyword arguments for it.
# NOTE(review): presumably `LinkDatasets` calls these methods in dict order
# with the given kwargs; verify against src/link/link.py.
pipeline = {
    # Deterministic rule (identical cleaned name tokens) with a guessed recall
    # of 0.7.
    "estimate_probability_two_random_records_match": {
        "deterministic_matching_rules": """
            l.name_unusual_tokens = r.name_unusual_tokens
        """,
        "recall": 0.7
    },
    # Random sampling capped at 1e6 pairs.
    "estimate_u_using_random_sampling": {
        "max_pairs": 1e6
    },
    # Expectation maximisation, blocking on identical name tokens and postcode.
    "estimate_parameters_using_expectation_maximisation": {
        "blocking_rule": """
            l.name_unusual_tokens = r.name_unusual_tokens
            and l.postcode = r.postcode
        """
    }
}

In [None]:
# Draft per-dataset configuration: a name, a select clause (both still empty)
# and the preprocessing to apply. The preprocessing function object is the
# dict key; its value presumably holds the keyword arguments to call it with.
# NOTE(review): nothing visible in this notebook reads `ch_settings` yet.
ch_settings = {
    "name": "",
    # The comma after this entry was missing, which made the cell a SyntaxError.
    "select": "",
    "preproc": {
        clean_comp_names: {
            "primary_col": "company_name",
            "secondary_col": None,
            "stopwords": stopwords
        }
    }
}

In [7]:
# Both tables get the same name cleaning; each side still receives its own
# `partial` object built from these shared keyword arguments.
name_cleaning_options = {
    "primary_col": "company_name",
    "secondary_col": None,
    "stopwords": stopwords,
}

# Columns selected from each source table; Companies House exposes `id` as
# `dim_uuid` to match the unique-id column name in `settings`.
companieshouse_select = ["id::text as dim_uuid", "company_name", "postcode"]
hmrcexporters_select = ["dim_uuid::text", "company_name", "postcode"]

# Class-based link of Companies House (left) to HMRC exporters (right), reusing
# the `settings` and `pipeline` dictionaries defined in the cells above.
ch_x_exp = LinkDatasets(
    table_l='"companieshouse"."companies"',
    table_l_select=companieshouse_select,
    table_l_preproc=[partial(clean_comp_names, **name_cleaning_options)],
    table_r='"hmrc"."trade__exporters"',
    table_r_select=hmrcexporters_select,
    table_r_preproc=[partial(clean_comp_names, **name_cleaning_options)],
    settings=settings,
    pipeline=pipeline,
)

In [8]:
# Fetch the raw data for both tables.
# NOTE(review): presumably runs the selects configured on `ch_x_exp`; verify in LinkDatasets.
ch_x_exp.get_data()

  meta = MetaData(self.connectable, schema=schema)


In [9]:
# Run the preprocessing step.
# NOTE(review): presumably applies the `table_*_preproc` callables; verify in LinkDatasets.
ch_x_exp.preprocess_data()

In [10]:
# Preview the first five rows of the processed left-hand table.
ch_x_exp.table_l_proc.head(5)

Unnamed: 0,company_name_arr,dim_uuid,company_name,postcode,stopwords,name_unusual_tokens,names_tokens_stopwords,name_unusual_tokens_first5,name_unusual_tokens_last5
0,"[a, r, motors, limited]",12586403,A&R MOTORS LIMITED,NW10 2QU,"[limited, uk, company, international, group, o...",a motors r,limited,a mot,ors r
1,"[a, r, munteanu, limited]",14119280,A&R MUNTEANU LTD,E6 3LA,"[limited, uk, company, international, group, o...",a munteanu r,limited,a mun,anu r
2,"[a, r, nielsen, limited]",10238259,A&R NIELSEN LIMITED,EN5 5LG,"[limited, uk, company, international, group, o...",a nielsen r,limited,a nie,sen r
3,"[a, r, online, education, limited]",12722020,A&R ONLINE EDUCATION LTD,IG1 3FA,"[limited, uk, company, international, group, o...",a education online r,limited,a edu,ine r
4,"[a, r, painting, services, limited]",8441975,A&R PAINTING SERVICES LTD,CH64 5SJ,"[limited, uk, company, international, group, o...",a painting r services,limited,a pai,vices


In [11]:
# Build the linker, then run the training steps.
# NOTE(review): the saved output shows AttributeError: 'DuckDBLinker' object has
# no attribute 'k'. Presumably `LinkDatasets` accesses `linker.k` literally while
# iterating the pipeline instead of `getattr(linker, k)`; confirm in
# src/link/link.py.
ch_x_exp.create_linker()
ch_x_exp.train_linker()

AttributeError: 'DuckDBLinker' object has no attribute 'k'