In [1]:
%load_ext autoreload
%autoreload 2

In [87]:
import uuid
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

import cmf.locations as loc
from cmf import make_deduper, process, query
from cmf.clean import company_name, company_number
from cmf.dedupers import Naive
from cmf.helpers import cleaner, cleaners

InteractiveShell.ast_node_interactivity = "all"

In [68]:
df = pd.read_csv(Path(loc.TEST, "data", "all_companies.csv")).reset_index(names="id")
df["id"] = df["id"].apply(lambda x: uuid.UUID(int=x))

In [119]:
# df["company_name"] + " Corp"
df.assign(company_name=lambda df: df["company_name"] + " Corp")

Unnamed: 0,id,company_name,crn,address,duns,cdms
0,00000000-0000-0000-0000-000000000000,Dooley Inc Corp,01HHGX9BHARZT77WHVWCYJSWSF,Room 577,111111111,ORG-55555555
1,00000000-0000-0000-0000-000000000001,Stiedemann and Sons Corp,01HHGX9BHF9HS4Z9E3FYGY7R92,Room 1143,111111112,ORG-55555556
2,00000000-0000-0000-0000-000000000002,Pfeffer Inc Corp,01HHGX9BHG70V8V6ZXVTJPJ7PX,Apt 186,111111113,ORG-55555557
3,00000000-0000-0000-0000-000000000003,Connelly-Goyette Corp,01HHGX9BHH87FFA2CPCJRXNJJ7,9th Floor,111111114,ORG-55555558
4,00000000-0000-0000-0000-000000000004,Altenwerth-Bechtelar Corp,01HHGX9BHJWQ1X6Y1591X5Y51Q,Room 1877,111111115,ORG-55555559
...,...,...,...,...,...,...
995,00000000-0000-0000-0000-0000000003e3,"Upton, Ruecker and Hayes Corp",01HHGX9CW3WV178BRCJEXPMK2D,PO Box 45512,111112106,ORG-55556550
996,00000000-0000-0000-0000-0000000003e4,Keebler and Sons Corp,01HHGX9CW4PB75HRA1Z6HQ0Q4A,PO Box 96538,111112107,ORG-55556551
997,00000000-0000-0000-0000-0000000003e5,"Goldner, Moen and Gorczany Corp",01HHGX9CW5528RVGEKMP0QFHTQ,Room 719,111112108,ORG-55556552
998,00000000-0000-0000-0000-0000000003e6,Brekke-Christiansen Corp,01HHGX9CW6KT628M831VKKPN7E,Suite 72,111112109,ORG-55556553


In [97]:
df_crn = pd.DataFrame(np.repeat(df.filter(["company_name", "crn"]).values, 3, axis=0))
df_crn.columns = ["company_name", "crn"]
df_crn.reset_index(names="id", inplace=True)
df_crn["id"] = df_crn["id"].apply(lambda x: uuid.UUID(int=x))
df_crn.head(7)

Unnamed: 0,id,company_name,crn
0,00000000-0000-0000-0000-000000000000,Dooley Inc,01HHGX9BHARZT77WHVWCYJSWSF
1,00000000-0000-0000-0000-000000000001,Dooley Inc,01HHGX9BHARZT77WHVWCYJSWSF
2,00000000-0000-0000-0000-000000000002,Dooley Inc,01HHGX9BHARZT77WHVWCYJSWSF
3,00000000-0000-0000-0000-000000000003,Stiedemann and Sons,01HHGX9BHF9HS4Z9E3FYGY7R92
4,00000000-0000-0000-0000-000000000004,Stiedemann and Sons,01HHGX9BHF9HS4Z9E3FYGY7R92
5,00000000-0000-0000-0000-000000000005,Stiedemann and Sons,01HHGX9BHF9HS4Z9E3FYGY7R92
6,00000000-0000-0000-0000-000000000006,Pfeffer Inc,01HHGX9BHG70V8V6ZXVTJPJ7PX


In [111]:
# Dedupe
df_naive_deduper = make_deduper(
    dedupe_run_name="basic_crn",
    description="Clean company name, company number",
    deduper=Naive,
    deduper_settings={
        "id": "id",
        "unique_fields": ["company_name", "crn"],
    },
    data_source="foo",
    data=df_crn,
)

df_deduped = df_naive_deduper()

df_deduped_df = df_deduped.to_df()


When deduplicating to write back to the Company Matching Framework database, the ID must be data_sha1, generated by retrieving data with cmf.query().


In [112]:
df_deduped_df.shape[0]
df_deduped_df.head(3)

3000

Unnamed: 0,model,left,left_id,right,right_id,probability
0,basic_crn,foo,00000000-0000-0000-0000-000000000003,foo,00000000-0000-0000-0000-000000000005,1
1,basic_crn,foo,00000000-0000-0000-0000-00000000000d,foo,00000000-0000-0000-0000-00000000000e,1
2,basic_crn,foo,00000000-0000-0000-0000-000000000019,foo,00000000-0000-0000-0000-00000000001a,1


In [106]:
dh_deduped.inspect_with_source(
    left_data=df_crn, left_key="id", right_data=df_crn, right_key="id"
).sample(3)

Unnamed: 0,left_id,right_id,company_name_x,crn_x,company_name_y,crn_y
825,b7a4eb92-35b4-4833-82fc-adbc644bcdbe,98353f41-a9bd-43e0-91e8-16e0964e2b91,,,,
1738,1dea1f3b-4c00-43c8-8837-611124d2b32e,c7278da8-a048-4b5e-b2be-435f86a2f98a,,,,
676,23c0afb3-c32b-4fd3-9773-09ccbd2b0398,e0f7664f-22f9-4aba-b9ed-5766a41422ba,,,,


In [85]:
import duckdb

join_clause = []
for field in ["company_name", "crn"]:
    join_clause.append(f"l.{field} = r.{field}")
join_clause_compiled = " and ".join(join_clause)

duckdb.sql(
    f"""
    select distinct on (list_sort([raw.left_id, raw.right_id]))
        raw.left_id,
        raw.right_id,
        1 as probability
    from (
        select
            l.id as left_id,
            r.id as right_id
        from
            df_cleaned l
        inner join df_cleaned r on
            (
                {join_clause_compiled}
            )
    ) raw;
"""
).df()

Unnamed: 0,left_id,right_id,probability
0,00000000-0000-0000-0000-000000000000,00000000-0000-0000-0000-000000000000,1
1,00000000-0000-0000-0000-000000000003,00000000-0000-0000-0000-000000000003,1
2,00000000-0000-0000-0000-000000000011,00000000-0000-0000-0000-000000000011,1
3,00000000-0000-0000-0000-00000000001c,00000000-0000-0000-0000-00000000001c,1
4,00000000-0000-0000-0000-00000000001e,00000000-0000-0000-0000-00000000001e,1
...,...,...,...
995,00000000-0000-0000-0000-000000000288,00000000-0000-0000-0000-000000000288,1
996,00000000-0000-0000-0000-00000000028e,00000000-0000-0000-0000-00000000028e,1
997,00000000-0000-0000-0000-0000000002bc,00000000-0000-0000-0000-0000000002bc,1
998,00000000-0000-0000-0000-00000000037b,00000000-0000-0000-0000-00000000037b,1


In [69]:
# Select
dh = query(
    selector={
        "dit.data_hub__companies": [
            "id",
            "name",
            "company_number",
        ]
    },
    model=None,
    return_type="pandas",
)

# Clean
col_prefix = "dit_data_hub__companies_"

cleaner_name = cleaner(function=company_name, arguments={"column": f"{col_prefix}name"})
cleaner_crn = cleaner(
    function=company_number, arguments={"column": f"{col_prefix}company_number"}
)
cleaner_name_dh = cleaners(cleaner_name, cleaner_crn)

dh_cleaned = process(data=dh, pipeline=cleaner_name_dh)

In [26]:
dh.shape
dh_cleaned.shape

(503449, 4)

(503449, 4)

In [27]:
dh_cleaned.head(3)

Unnamed: 0,data_sha1,dit_data_hub__companies_id,dit_data_hub__companies_name,dit_data_hub__companies_company_number
0,"[196, 247, 190, 128, 184, 190, 103, 122, 20, 4...",00002c8e-591a-e711-88ee-e4115bead28a,arensis corp,
1,"[24, 61, 93, 182, 46, 163, 186, 32, 56, 37, 47...",000042c1-a098-e211-a939-e4115bead28a,macrogen korea,
2,"[88, 139, 37, 72, 135, 153, 140, 176, 249, 217...",00008a29-e155-e411-985c-e4115bead28a,pixsan digital software,


In [9]:
dh_cleaned[
    ["dit_data_hub__companies_name", "dit_data_hub__companies_company_number"]
].drop_duplicates().shape[0]

482602

In [28]:
# Dedupe
dh_naive_deduper = make_deduper(
    dedupe_run_name="basic_dh",
    description="""
        Clean company name, company number
    """,
    deduper=Naive,
    deduper_settings={
        "id": f"{col_prefix}id",
        "unique_fields": [f"{col_prefix}name", f"{col_prefix}company_number"],
    },
    data_source="dit.data_hub__companies",
    data=dh_cleaned,
)

dh_deduped = dh_naive_deduper()

dh_deduped_df = dh_deduped.to_df()

In [29]:
dh_deduped_df.head(5)

Unnamed: 0,model,left,left_id,right,right_id,probability
0,basic_dh,dit.data_hub__companies,0085222b-68be-e311-a3d5-e4115bead28a,dit.data_hub__companies,10c93ac7-a098-e211-a939-e4115bead28a,1
1,basic_dh,dit.data_hub__companies,0115403a-ea34-e411-985c-e4115bead28a,dit.data_hub__companies,0d3a2831-da11-4158-b2be-da9a991139bc,1
2,basic_dh,dit.data_hub__companies,018649bb-a098-e211-a939-e4115bead28a,dit.data_hub__companies,d41b30ca-fae9-e611-bca1-e4115bead28a,1
3,basic_dh,dit.data_hub__companies,01b049bb-a098-e211-a939-e4115bead28a,dit.data_hub__companies,d439a050-10b1-e411-a839-e4115bead28a,1
4,basic_dh,dit.data_hub__companies,02987d74-681f-4558-b248-24023bcab4cf,dit.data_hub__companies,2f524795-c9cd-4b89-aa38-2e4d16ef0b7a,1


In [42]:
# Dedupe
dh_naive_deduper2 = make_deduper(
    dedupe_run_name="basic_dh",
    description="""
        Clean company name, company number
    """,
    deduper=Naive,
    deduper_settings={
        "id": "data_sha1",
        "unique_fields": [f"{col_prefix}name", f"{col_prefix}company_number"],
    },
    data_source="dit.data_hub__companies",
    data=dh_cleaned,
)

dh_deduped2 = dh_naive_deduper2()

dh_deduped_df2 = dh_deduped2.to_df()

In [66]:
from sqlalchemy.orm import Session

from cmf.data import ENGINE, SourceData

with Session(ENGINE) as session:
    data_inner_join = session.query(SourceData).limit(10).all()

In [67]:
data_inner_join

[<cmf.data.data.SourceData at 0x7f8acbd4af40>,
 <cmf.data.data.SourceData at 0x7f8acbd4afa0>,
 <cmf.data.data.SourceData at 0x7f8acbd4a130>,
 <cmf.data.data.SourceData at 0x7f8acbd54070>,
 <cmf.data.data.SourceData at 0x7f8acbd540d0>,
 <cmf.data.data.SourceData at 0x7f8acbd54130>,
 <cmf.data.data.SourceData at 0x7f8acbd54190>,
 <cmf.data.data.SourceData at 0x7f8acbd541f0>,
 <cmf.data.data.SourceData at 0x7f8acbd54250>,
 <cmf.data.data.SourceData at 0x7f8acbd542b0>]

In [48]:
bytearray(dh_deduped2.dataframe["left_id"][0].encode())
dh_deduped2.dataframe["left_id"][0]
dh_deduped2.dataframe.info()

bytearray(b"bytearray(b\'\\x0c\\xa6*\\x8e\\x00:\\xd7\\xd9^\\x0fF\\x82\\xa7\\x89}\\xe6Fb\\x93\\x87\')")

"bytearray(b'\\x0c\\xa6*\\x8e\\x00:\\xd7\\xd9^\\x0fF\\x82\\xa7\\x89}\\xe6Fb\\x93\\x87')"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166 entries, 0 to 2165
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   left_id      2166 non-null   object
 1   right_id     2166 non-null   object
 2   probability  2166 non-null   int32 
dtypes: int32(1), object(2)
memory usage: 42.4+ KB


In [40]:
dh_deduped2.dataframe["left_id"].apply(type)

0       <class 'str'>
1       <class 'str'>
2       <class 'str'>
3       <class 'str'>
4       <class 'str'>
            ...      
2161    <class 'str'>
2162    <class 'str'>
2163    <class 'str'>
2164    <class 'str'>
2165    <class 'str'>
Name: left_id, Length: 2166, dtype: object

In [12]:
dh_deduped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166 entries, 0 to 2165
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   model        2166 non-null   object
 1   left         2166 non-null   object
 2   left_id      2166 non-null   object
 3   right        2166 non-null   object
 4   right_id     2166 non-null   object
 5   probability  2166 non-null   int32 
dtypes: int32(1), object(5)
memory usage: 93.2+ KB


In [16]:
dh_deduped._prep_to_cmf(dh_deduped_df)[:5]

  df.assign(


[{'model': 'basic_dh',
  'left': UUID('34f3e1b5-f612-e611-9bdc-e4115bead28a'),
  'right': UUID('6bd85b41-ebd2-43ad-99b9-399fca511176'),
  'probability': 1,
  'sha1': b']\xff\x1c``\xad\t:[\x80\x83\xa6\xc43x\x0f!\n\xc7\x8d'},
 {'model': 'basic_dh',
  'left': UUID('34779711-2a85-4fea-b4e1-07226cc10425'),
  'right': UUID('6f3201cf-d483-4ce2-8c2c-c20e74a11f97'),
  'probability': 1,
  'sha1': b'5\x1c*m&\x96Y\xda\x0c\xfd5\xde\xf9\xf4\x83\t2N@)'},
 {'model': 'basic_dh',
  'left': UUID('35519dfa-3c1a-4389-a452-141e7e84a289'),
  'right': UUID('0a83eefa-68b2-4852-b0fa-edf08828debf'),
  'probability': 1,
  'sha1': b'\xc0\xaf\xe1\x03\xec\xc9\x1a\x98\x1d\xba\xaaV\x88JIw\xfbo\x03\xde'},
 {'model': 'basic_dh',
  'left': UUID('35cb9542-1a51-4f32-b614-c5f77878a3f2'),
  'right': UUID('c3247c4f-4ee1-4500-a43c-61843964bc9e'),
  'probability': 1,
  'sha1': b'\x05\xf8\xba\xad\xd7,\xcaT\xbdVY\x04C\x88a\x9a\xd83x\x93'},
 {'model': 'basic_dh',
  'left': UUID('3606e768-538b-e611-be23-e4115bead28a'),
  'right': U