In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to the local package are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [1]:
from IPython.core.interactiveshell import InteractiveShell

from cmf import make_deduper, process, query
from cmf.clean import company_name, company_number
from cmf.dedupers import Naive
from cmf.helpers import cleaner, cleaners

# Echo the value of every top-level expression in a cell rather than only the
# last one; cells below that list several bare expressions rely on this.
InteractiveShell.ast_node_interactivity = "all"

In [12]:
# Select: fetch id, name and company number from the Data Hub companies table
# as a pandas dataframe.
dh = query(
    selector={"dit.data_hub__companies": ["id", "name", "company_number"]},
    model=None,
    return_type="pandas",
)

# Clean: the returned columns carry the source table name as a prefix.
col_prefix = "dit_data_hub__companies_"

# One cleaner per column, then both combined into a single pipeline.
cleaner_name = cleaner(
    function=company_name,
    arguments={"column": col_prefix + "name"},
)
cleaner_crn = cleaner(
    function=company_number,
    arguments={"column": col_prefix + "company_number"},
)
cleaner_name_dh = cleaners(cleaner_name, cleaner_crn)

# Run the pipeline over the raw frame; `dh` itself is left bound to the raw data.
dh_cleaned = process(data=dh, pipeline=cleaner_name_dh)

In [13]:
# Compare (rows, columns) before and after cleaning. Both bare expressions are
# displayed because ast_node_interactivity is set to "all" above.
dh.shape
dh_cleaned.shape

(503449, 4)

(503449, 4)

In [14]:
# Preview the first three rows of the cleaned frame.
dh_cleaned.head(3)

Unnamed: 0,data_sha1,dit_data_hub__companies_id,dit_data_hub__companies_name,dit_data_hub__companies_company_number
0,"[196, 247, 190, 128, 184, 190, 103, 122, 20, 4...",00002c8e-591a-e711-88ee-e4115bead28a,arensis corp,
1,"[24, 61, 93, 182, 46, 163, 186, 32, 56, 37, 47...",000042c1-a098-e211-a939-e4115bead28a,macrogen korea,
2,"[88, 139, 37, 72, 135, 153, 140, 176, 249, 217...",00008a29-e155-e411-985c-e4115bead28a,pixsan digital software,


In [16]:
# Number of distinct (cleaned name, company number) combinations.
len(
    dh_cleaned.drop_duplicates(
        subset=[
            "dit_data_hub__companies_name",
            "dit_data_hub__companies_company_number",
        ]
    )
)

482602

In [18]:
# Dedupe: configure the Naive deduper with the id column and the two cleaned
# columns listed as its unique fields.
naive_settings = {
    "id": col_prefix + "id",
    "unique_fields": [col_prefix + "name", col_prefix + "company_number"],
}

dh_naive_deduper = make_deduper(
    dedupe_run_name="basic_dh",
    description="""
        Clean company name, company number
    """,
    deduper=Naive,
    deduper_settings=naive_settings,
    data_source="dit.data_hub__companies",
    data=dh_cleaned,
)

# Execute the configured deduper and convert its result to a dataframe.
dh_deduped = dh_naive_deduper()
dh_deduped_df = dh_deduped.to_df()

In [20]:
# Preview the first five rows of the dedupe result.
dh_deduped_df.head(5)

Unnamed: 0,model,left,left_id,right,right_id,probability
0,basic_dh,dit.data_hub__companies,9c5407b2-e557-430a-9c43-c45acb063fdc,dit.data_hub__companies,0cf1186e-a451-41c4-ae6b-d798716f7e99,1
1,basic_dh,dit.data_hub__companies,9c83b823-4af7-e611-bca1-e4115bead28a,dit.data_hub__companies,9b17c36f-20cc-4328-8bea-c7b5b2123fde,1
2,basic_dh,dit.data_hub__companies,9d06dbd0-06b8-42d0-8bef-8090a95af09c,dit.data_hub__companies,2e376ff5-9252-457b-984c-5f66930bd396,1
3,basic_dh,dit.data_hub__companies,9d3964e4-f94e-4a35-8dd1-9d3c480396d3,dit.data_hub__companies,f9802972-2881-4e3a-8d33-9a53d201361c,1
4,basic_dh,dit.data_hub__companies,9d44ecd6-51a1-4fa0-aba9-91ff13bca132,dit.data_hub__companies,e3b5424f-29de-48a8-a332-ecf490fe54e4,1
