In [4]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
from IPython.core.interactiveshell import InteractiveShell

from cmf import make_deduper, process, query
from cmf.clean import company_name, company_number
from cmf.dedupers import Naive
from cmf.helpers import cleaner, cleaners

# Display the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"

In [6]:
# Select: pull id, name and company number from the Data Hub companies table
# into a pandas DataFrame (model=None is passed through to query unchanged).
dh_selector = {"dit.data_hub__companies": ["id", "name", "company_number"]}
dh = query(selector=dh_selector, model=None, return_type="pandas")

# Clean: the selected columns come back prefixed with the source table name.
col_prefix = "dit_data_hub__companies_"

cleaner_name = cleaner(
    function=company_name,
    arguments={"column": f"{col_prefix}name"},
)
cleaner_crn = cleaner(
    function=company_number,
    arguments={"column": f"{col_prefix}company_number"},
)

# Combine both cleaners into one pipeline and run it over the raw frame.
cleaner_name_dh = cleaners(cleaner_name, cleaner_crn)
dh_cleaned = process(data=dh, pipeline=cleaner_name_dh)

In [7]:
# Shapes of the raw and cleaned frames, shown side by side. Both lines render
# because ast_node_interactivity is set to "all" in the imports cell.
dh.shape
dh_cleaned.shape

(503449, 4)

(503449, 4)

In [8]:
# Preview the first three rows of the cleaned frame.
dh_cleaned.head(3)

Unnamed: 0,data_sha1,dit_data_hub__companies_id,dit_data_hub__companies_name,dit_data_hub__companies_company_number
0,"[196, 247, 190, 128, 184, 190, 103, 122, 20, 4...",00002c8e-591a-e711-88ee-e4115bead28a,arensis corp,
1,"[24, 61, 93, 182, 46, 163, 186, 32, 56, 37, 47...",000042c1-a098-e211-a939-e4115bead28a,macrogen korea,
2,"[88, 139, 37, 72, 135, 153, 140, 176, 249, 217...",00008a29-e155-e411-985c-e4115bead28a,pixsan digital software,


In [9]:
# Count the distinct (cleaned name, company number) pairs; comparing this with
# the total row count shows how many rows repeat an existing pair.
dedupe_key_columns = [
    "dit_data_hub__companies_name",
    "dit_data_hub__companies_company_number",
]
len(dh_cleaned[dedupe_key_columns].drop_duplicates())

482602

In [10]:
# Dedupe: configure a Naive deduper over the cleaned Data Hub frame, using the
# id column as identifier and the cleaned name and company number as its
# unique_fields.
naive_settings = {
    "id": f"{col_prefix}id",
    "unique_fields": [f"{col_prefix}name", f"{col_prefix}company_number"],
}
dh_naive_deduper = make_deduper(
    dedupe_run_name="basic_dh",
    description="""
        Clean company name, company number
    """,
    deduper=Naive,
    deduper_settings=naive_settings,
    data_source="dit.data_hub__companies",
    data=dh_cleaned,
)

# Run the configured deduper, then convert its result to a DataFrame so it can
# be inspected in the cells below.
dh_deduped = dh_naive_deduper()
dh_deduped_df = dh_deduped.to_df()

In [11]:
# Preview the first five rows of the deduper's output frame.
dh_deduped_df.head(5)

Unnamed: 0,model,left,left_id,right,right_id,probability
0,basic_dh,dit.data_hub__companies,34f3e1b5-f612-e611-9bdc-e4115bead28a,dit.data_hub__companies,6bd85b41-ebd2-43ad-99b9-399fca511176,1
1,basic_dh,dit.data_hub__companies,34779711-2a85-4fea-b4e1-07226cc10425,dit.data_hub__companies,6f3201cf-d483-4ce2-8c2c-c20e74a11f97,1
2,basic_dh,dit.data_hub__companies,35519dfa-3c1a-4389-a452-141e7e84a289,dit.data_hub__companies,0a83eefa-68b2-4852-b0fa-edf08828debf,1
3,basic_dh,dit.data_hub__companies,35cb9542-1a51-4f32-b614-c5f77878a3f2,dit.data_hub__companies,c3247c4f-4ee1-4500-a43c-61843964bc9e,1
4,basic_dh,dit.data_hub__companies,3606e768-538b-e611-be23-e4115bead28a,dit.data_hub__companies,7cf553b5-a098-e211-a939-e4115bead28a,1


In [12]:
# Summarise the deduped frame: row count, per-column non-null counts, dtypes
# and memory usage.
dh_deduped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166 entries, 0 to 2165
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   model        2166 non-null   object
 1   left         2166 non-null   object
 2   left_id      2166 non-null   object
 3   right        2166 non-null   object
 4   right_id     2166 non-null   object
 5   probability  2166 non-null   int32 
dtypes: int32(1), object(5)
memory usage: 93.2+ KB


In [16]:
# Inspect the first five records returned by the result object's
# _prep_to_cmf helper for the deduped frame.
# NOTE(review): _prep_to_cmf is underscore-prefixed (private), so it may change
# without notice — confirm whether a public equivalent exists.
dh_deduped._prep_to_cmf(dh_deduped_df)[:5]

  df.assign(


[{'model': 'basic_dh',
  'left': UUID('34f3e1b5-f612-e611-9bdc-e4115bead28a'),
  'right': UUID('6bd85b41-ebd2-43ad-99b9-399fca511176'),
  'probability': 1,
  'sha1': b']\xff\x1c``\xad\t:[\x80\x83\xa6\xc43x\x0f!\n\xc7\x8d'},
 {'model': 'basic_dh',
  'left': UUID('34779711-2a85-4fea-b4e1-07226cc10425'),
  'right': UUID('6f3201cf-d483-4ce2-8c2c-c20e74a11f97'),
  'probability': 1,
  'sha1': b'5\x1c*m&\x96Y\xda\x0c\xfd5\xde\xf9\xf4\x83\t2N@)'},
 {'model': 'basic_dh',
  'left': UUID('35519dfa-3c1a-4389-a452-141e7e84a289'),
  'right': UUID('0a83eefa-68b2-4852-b0fa-edf08828debf'),
  'probability': 1,
  'sha1': b'\xc0\xaf\xe1\x03\xec\xc9\x1a\x98\x1d\xba\xaaV\x88JIw\xfbo\x03\xde'},
 {'model': 'basic_dh',
  'left': UUID('35cb9542-1a51-4f32-b614-c5f77878a3f2'),
  'right': UUID('c3247c4f-4ee1-4500-a43c-61843964bc9e'),
  'probability': 1,
  'sha1': b'\x05\xf8\xba\xad\xd7,\xcaT\xbdVY\x04C\x88a\x9a\xd83x\x93'},
 {'model': 'basic_dh',
  'left': UUID('3606e768-538b-e611-be23-e4115bead28a'),
  'right': U