In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import uuid
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

import cmf.locations as loc
from cmf import make_deduper, process, query
from cmf.clean import company_name, company_number
from cmf.dedupers import Naive
from cmf.helpers import cleaner, cleaners

InteractiveShell.ast_node_interactivity = "all"

In [39]:
from jinja2 import Template
import json

fields = ["a", "b"]

template = Template("""
    {
        "id": "data_sha1",
        "unique_fields": [
            {% for field in fields %}
                "{{ field }}",
            {% endfor %}
        ]
    }
""")

template.render(fields=fields).strip().replace("\n", "")

json.loads(template.render(fields=fields))

'{        "id": "data_sha1",        "unique_fields": [                            "a",                            "b",                    ]    }'

JSONDecodeError: Expecting value: line 10 column 9 (char 150)

In [38]:
df = pd.read_csv(Path(loc.TEST, "data", "all_companies.csv")).reset_index(names="id")
df["id"] = df["id"].apply(lambda x: uuid.UUID(int=x))

In [77]:
df_raw = df.filter(["company_name", "crn"])
df_crn = pd.concat(
    [
        df_raw.assign(company_name=lambda df: df["company_name"] + " Limited"),
        df_raw.assign(company_name=lambda df: df["company_name"] + " UK"),
        df_raw.assign(company_name=lambda df: df["company_name"] + " Company"),
    ]
)

df_crn["id"] = range(df_crn.shape[0])
df_crn = df_crn.filter(["id", "company_name", "crn"])
df_crn["id"] = df_crn["id"].apply(lambda x: uuid.UUID(int=x))
df_crn.query("company_name.str.lower().str.contains('people')")

Unnamed: 0,id,company_name,crn
0,00000000-0000-0000-0000-000000000000,People Limited,01HHGX9BHARZT77WHVWCYJSWSF
0,00000000-0000-0000-0000-0000000003e8,People UK,01HHGX9BHARZT77WHVWCYJSWSF
0,00000000-0000-0000-0000-0000000007d0,People Company,01HHGX9BHARZT77WHVWCYJSWSF


In [78]:
# Clean
cleaner_name = cleaner(
    function=company_name, arguments={"column": "company_name"}
)
cleaner_crn = cleaners(cleaner_name)

df_cleaned = process(data=df_crn, pipeline=cleaner_crn)

In [79]:
df_cleaned[["company_name", "crn"]].drop_duplicates().shape[0]

1000

In [80]:
# Dedupe
df_naive_deduper = make_deduper(
    dedupe_run_name="basic_crn",
    description="Clean company name, company number",
    deduper=Naive,
    deduper_settings={
        "id": "id",
        "unique_fields": ["company_name", "crn"],
    },
    data_source="foo",
    data=df_cleaned,
)

df_deduped = df_naive_deduper()

df_deduped_df = df_deduped.to_df()


When deduplicating to write back to the Company Matching Framework database, the ID must be data_sha1, generated by retrieving data with cmf.query().


In [81]:
df_deduped_df.shape[0]
df_deduped_df.head(3)

3000

Unnamed: 0,model,left,left_id,right,right_id,probability
0,basic_crn,foo,00000000-0000-0000-0000-000000000001,foo,00000000-0000-0000-0000-0000000007d1,1
1,basic_crn,foo,00000000-0000-0000-0000-000000000002,foo,00000000-0000-0000-0000-0000000007d2,1
2,basic_crn,foo,00000000-0000-0000-0000-000000000005,foo,00000000-0000-0000-0000-0000000007d5,1


In [82]:
df_deduped.dataframe

Unnamed: 0,left_id,right_id,probability
0,00000000-0000-0000-0000-000000000001,00000000-0000-0000-0000-0000000007d1,1
1,00000000-0000-0000-0000-000000000002,00000000-0000-0000-0000-0000000007d2,1
2,00000000-0000-0000-0000-000000000005,00000000-0000-0000-0000-0000000007d5,1
3,00000000-0000-0000-0000-00000000000b,00000000-0000-0000-0000-0000000007db,1
4,00000000-0000-0000-0000-00000000000c,00000000-0000-0000-0000-0000000007dc,1
...,...,...,...
2995,00000000-0000-0000-0000-0000000003a0,00000000-0000-0000-0000-000000000788,1
2996,00000000-0000-0000-0000-0000000003d1,00000000-0000-0000-0000-0000000007b9,1
2997,00000000-0000-0000-0000-00000000039a,00000000-0000-0000-0000-000000000b6a,1
2998,00000000-0000-0000-0000-000000000782,00000000-0000-0000-0000-000000000b6a,1


In [85]:
df_enriched = df_deduped.inspect_with_source(
    left_data=df_cleaned, left_key="id", right_data=df_cleaned, right_key="id"
)

In [86]:
df_enriched.query("company_name_x == 'people'")

Unnamed: 0,left_id,right_id,company_name_x,crn_x,company_name_y,crn_y
745,00000000-0000-0000-0000-000000000000,00000000-0000-0000-0000-0000000007d0,people,01HHGX9BHARZT77WHVWCYJSWSF,people,01HHGX9BHARZT77WHVWCYJSWSF
1989,00000000-0000-0000-0000-000000000000,00000000-0000-0000-0000-0000000003e8,people,01HHGX9BHARZT77WHVWCYJSWSF,people,01HHGX9BHARZT77WHVWCYJSWSF
2477,00000000-0000-0000-0000-0000000003e8,00000000-0000-0000-0000-0000000007d0,people,01HHGX9BHARZT77WHVWCYJSWSF,people,01HHGX9BHARZT77WHVWCYJSWSF


In [71]:
import duckdb

df_cleaned_2 = df_cleaned.copy()

join_clause = []
for field in ["company_name", "crn"]:
    join_clause.append(f"l.{field} = r.{field}")
join_clause_compiled = " and ".join(join_clause)

df_cleaned_2["_unique_e4003b"] = range(df_cleaned_2.shape[0])

duckdb.sql(
    f"""
    select distinct on (list_sort([raw.left_id, raw.right_id]))
        raw.left_id,
        raw.right_id,
        raw.left_unique,
        raw.right_unique,
        1 as probability
    from (
        select
            l.id as left_id,
            r.id as right_id,
            l._unique_e4003b as left_unique,
            r._unique_e4003b as right_unique
        from
            df_cleaned_2 l
        inner join df_cleaned_2 r on
            (
                {join_clause_compiled}
            )
    ) raw;
"""
).df()

Unnamed: 0,left_id,right_id,left_unique,right_unique,probability
0,00000000-0000-0000-0000-000000000000,00000000-0000-0000-0000-000000000000,0,2000,1
1,00000000-0000-0000-0000-000000000003,00000000-0000-0000-0000-000000000003,3,2003,1
2,00000000-0000-0000-0000-000000000006,00000000-0000-0000-0000-000000000006,6,2006,1
3,00000000-0000-0000-0000-000000000008,00000000-0000-0000-0000-000000000008,8,2008,1
4,00000000-0000-0000-0000-00000000000f,00000000-0000-0000-0000-00000000000f,15,2015,1
...,...,...,...,...,...
995,00000000-0000-0000-0000-00000000030f,00000000-0000-0000-0000-00000000030f,783,2783,1
996,00000000-0000-0000-0000-000000000342,00000000-0000-0000-0000-000000000342,834,2834,1
997,00000000-0000-0000-0000-000000000367,00000000-0000-0000-0000-000000000367,871,2871,1
998,00000000-0000-0000-0000-000000000372,00000000-0000-0000-0000-000000000372,882,2882,1


In [3]:
# Select
dh = query(
    selector={
        "dit.data_hub__companies": [
            "id",
            "name",
            "company_number",
        ]
    },
    model=None,
    return_type="pandas",
)

# Clean
col_prefix = "dit_data_hub__companies_"

cleaner_name = cleaner(function=company_name, arguments={"column": f"{col_prefix}name"})
cleaner_crn = cleaner(
    function=company_number, arguments={"column": f"{col_prefix}company_number"}
)
cleaner_name_dh = cleaners(cleaner_name, cleaner_crn)

dh_cleaned = process(data=dh, pipeline=cleaner_name_dh)

In [4]:
dh.shape
dh_cleaned.shape

(503449, 4)

(503449, 4)

In [5]:
dh_cleaned.head(3)

Unnamed: 0,data_sha1,dit_data_hub__companies_id,dit_data_hub__companies_name,dit_data_hub__companies_company_number
0,"[196, 247, 190, 128, 184, 190, 103, 122, 20, 4...",00002c8e-591a-e711-88ee-e4115bead28a,arensis corp,
1,"[24, 61, 93, 182, 46, 163, 186, 32, 56, 37, 47...",000042c1-a098-e211-a939-e4115bead28a,macrogen korea,
2,"[88, 139, 37, 72, 135, 153, 140, 176, 249, 217...",00008a29-e155-e411-985c-e4115bead28a,pixsan digital software,


In [9]:
dh_cleaned[
    ["dit_data_hub__companies_name", "dit_data_hub__companies_company_number"]
].drop_duplicates().shape[0]

482602

In [6]:
# Dedupe
dh_naive_deduper = make_deduper(
    dedupe_run_name="basic_dh",
    description="""
        Clean company name, company number
    """,
    deduper=Naive,
    deduper_settings={
        "id": f"data_sha1",
        "unique_fields": [f"{col_prefix}name", f"{col_prefix}company_number"],
    },
    data_source="dit.data_hub__companies",
    data=dh_cleaned,
)

dh_deduped = dh_naive_deduper()

dh_deduped_df = dh_deduped.to_df()

In [7]:
dh_deduped_df.head(5)

Unnamed: 0,model,left,left_id,right,right_id,probability
0,basic_dh,dit.data_hub__companies,"[159, 88, 93, 114, 229, 226, 159, 80, 204, 168...",dit.data_hub__companies,"[134, 155, 152, 206, 81, 64, 135, 99, 204, 197...",1
1,basic_dh,dit.data_hub__companies,"[5, 206, 209, 57, 155, 53, 4, 205, 147, 11, 13...",dit.data_hub__companies,"[9, 74, 19, 18, 34, 88, 59, 107, 19, 210, 37, ...",1
2,basic_dh,dit.data_hub__companies,"[85, 19, 108, 225, 134, 92, 105, 217, 244, 86,...",dit.data_hub__companies,"[231, 248, 107, 96, 178, 253, 194, 43, 216, 8,...",1
3,basic_dh,dit.data_hub__companies,"[72, 218, 48, 38, 233, 143, 9, 226, 204, 151, ...",dit.data_hub__companies,"[25, 88, 248, 207, 122, 85, 22, 187, 41, 99, 4...",1
4,basic_dh,dit.data_hub__companies,"[95, 15, 232, 74, 123, 239, 149, 20, 69, 201, ...",dit.data_hub__companies,"[236, 26, 4, 121, 247, 204, 59, 212, 162, 32, ...",1


In [11]:
dh_deduped.dataframe["left_id"]

0       [159, 88, 93, 114, 229, 226, 159, 80, 204, 168...
1       [5, 206, 209, 57, 155, 53, 4, 205, 147, 11, 13...
2       [85, 19, 108, 225, 134, 92, 105, 217, 244, 86,...
3       [72, 218, 48, 38, 233, 143, 9, 226, 204, 151, ...
4       [95, 15, 232, 74, 123, 239, 149, 20, 69, 201, ...
                              ...                        
2161    [170, 14, 152, 42, 218, 117, 226, 101, 119, 18...
2162    [117, 142, 93, 47, 102, 98, 70, 24, 135, 242, ...
2163    [26, 121, 21, 138, 127, 213, 138, 94, 227, 191...
2164    [101, 86, 133, 145, 94, 225, 224, 86, 213, 43,...
2165    [223, 178, 145, 11, 190, 234, 71, 40, 27, 80, ...
Name: left_id, Length: 2166, dtype: object

In [8]:
dh_deduped.dataframe["left_id"].astype(bytes)

ValueError: setting an array element with a sequence

In [12]:
dh_deduped.dataframe.filter(["left_id", "right_id"]).map(bytes)

Unnamed: 0,left_id,right_id
0,b'\x9fX]r\xe5\xe2\x9fP\xcc\xa8\xaaL~\xa1\\\xfc...,b'\x86\x9b\x98\xceQ@\x87c\xcc\xc5\xcb]\xfeA\xf...
1,b'\x05\xce\xd19\x9b5\x04\xcd\x93\x0b\x89\xe7^\...,"b'\tJ\x13\x12""X;k\x13\xd2%\x0cj\x18\xe6\x9e\x1..."
2,b'U\x13l\xe1\x86\\i\xd9\xf4V\x95\x8d\x8aB\x1d\...,b'\xe7\xf8k`\xb2\xfd\xc2+\xd8\x08\xa0\xb4\xd2\...
3,b'H\xda0&\xe9\x8f\t\xe2\xcc\x97\x03C|bv\x9b\x0...,b'\x19X\xf8\xcfzU\x16\xbb)c(F\x85\x0e\xf0AJ\xf...
4,b'_\x0f\xe8J{\xef\x95\x14E\xc9\xa2\x1e5;*>\xd2...,b'\xec\x1a\x04y\xf7\xcc;\xd4\xa2 \xdfH\xa4\xe3...
...,...,...
2161,b'\xaa\x0e\x98*\xdau\xe2ew\xb4\x85S[\xdfb\xb1\...,b'\xf1\x8e@\x86\xc1\xab\xd1\xda\xe6\x8c\x80v\x...
2162,b'u\x8e]/fbF\x18\x87\xf2\r\x86\xf8\x95\xdd\xb8...,b'\x92\x8c(\xbd\xbf\x06\xc4\xcbJCu\x17\xe9\x89...
2163,b'\x1ay\x15\x8a\x7f\xd5\x8a^\xe3\xbf\x1b\x1d(\...,b'\xb1\x03\x01\x86\x16\x85\x8dT/\xe7}j\xc4~q\x...
2164,b'eV\x85\x91^\xe1\xe0V\xd5+\xba\xb0\xd0L&\xc4=...,b'\x0eEp\x89\x1d;\xa2\x97\xd3} CRN\xa6\xed\x8f...


In [42]:
# Dedupe
dh_naive_deduper2 = make_deduper(
    dedupe_run_name="basic_dh",
    description="""
        Clean company name, company number
    """,
    deduper=Naive,
    deduper_settings={
        "id": "data_sha1",
        "unique_fields": [f"{col_prefix}name", f"{col_prefix}company_number"],
    },
    data_source="dit.data_hub__companies",
    data=dh_cleaned,
)

dh_deduped2 = dh_naive_deduper2()

dh_deduped_df2 = dh_deduped2.to_df()

In [66]:
from sqlalchemy.orm import Session

from cmf.data import ENGINE, SourceData

with Session(ENGINE) as session:
    data_inner_join = session.query(SourceData).limit(10).all()

In [67]:
data_inner_join

[<cmf.data.data.SourceData at 0x7f8acbd4af40>,
 <cmf.data.data.SourceData at 0x7f8acbd4afa0>,
 <cmf.data.data.SourceData at 0x7f8acbd4a130>,
 <cmf.data.data.SourceData at 0x7f8acbd54070>,
 <cmf.data.data.SourceData at 0x7f8acbd540d0>,
 <cmf.data.data.SourceData at 0x7f8acbd54130>,
 <cmf.data.data.SourceData at 0x7f8acbd54190>,
 <cmf.data.data.SourceData at 0x7f8acbd541f0>,
 <cmf.data.data.SourceData at 0x7f8acbd54250>,
 <cmf.data.data.SourceData at 0x7f8acbd542b0>]

In [48]:
bytearray(dh_deduped2.dataframe["left_id"][0].encode())
dh_deduped2.dataframe["left_id"][0]
dh_deduped2.dataframe.info()

bytearray(b"bytearray(b\'\\x0c\\xa6*\\x8e\\x00:\\xd7\\xd9^\\x0fF\\x82\\xa7\\x89}\\xe6Fb\\x93\\x87\')")

"bytearray(b'\\x0c\\xa6*\\x8e\\x00:\\xd7\\xd9^\\x0fF\\x82\\xa7\\x89}\\xe6Fb\\x93\\x87')"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166 entries, 0 to 2165
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   left_id      2166 non-null   object
 1   right_id     2166 non-null   object
 2   probability  2166 non-null   int32 
dtypes: int32(1), object(2)
memory usage: 42.4+ KB


In [40]:
dh_deduped2.dataframe["left_id"].apply(type)

0       <class 'str'>
1       <class 'str'>
2       <class 'str'>
3       <class 'str'>
4       <class 'str'>
            ...      
2161    <class 'str'>
2162    <class 'str'>
2163    <class 'str'>
2164    <class 'str'>
2165    <class 'str'>
Name: left_id, Length: 2166, dtype: object

In [12]:
dh_deduped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166 entries, 0 to 2165
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   model        2166 non-null   object
 1   left         2166 non-null   object
 2   left_id      2166 non-null   object
 3   right        2166 non-null   object
 4   right_id     2166 non-null   object
 5   probability  2166 non-null   int32 
dtypes: int32(1), object(5)
memory usage: 93.2+ KB


In [16]:
dh_deduped._prep_to_cmf(dh_deduped_df)[:5]

  df.assign(


[{'model': 'basic_dh',
  'left': UUID('34f3e1b5-f612-e611-9bdc-e4115bead28a'),
  'right': UUID('6bd85b41-ebd2-43ad-99b9-399fca511176'),
  'probability': 1,
  'sha1': b']\xff\x1c``\xad\t:[\x80\x83\xa6\xc43x\x0f!\n\xc7\x8d'},
 {'model': 'basic_dh',
  'left': UUID('34779711-2a85-4fea-b4e1-07226cc10425'),
  'right': UUID('6f3201cf-d483-4ce2-8c2c-c20e74a11f97'),
  'probability': 1,
  'sha1': b'5\x1c*m&\x96Y\xda\x0c\xfd5\xde\xf9\xf4\x83\t2N@)'},
 {'model': 'basic_dh',
  'left': UUID('35519dfa-3c1a-4389-a452-141e7e84a289'),
  'right': UUID('0a83eefa-68b2-4852-b0fa-edf08828debf'),
  'probability': 1,
  'sha1': b'\xc0\xaf\xe1\x03\xec\xc9\x1a\x98\x1d\xba\xaaV\x88JIw\xfbo\x03\xde'},
 {'model': 'basic_dh',
  'left': UUID('35cb9542-1a51-4f32-b614-c5f77878a3f2'),
  'right': UUID('c3247c4f-4ee1-4500-a43c-61843964bc9e'),
  'probability': 1,
  'sha1': b'\x05\xf8\xba\xad\xd7,\xcaT\xbdVY\x04C\x88a\x9a\xd83x\x93'},
 {'model': 'basic_dh',
  'left': UUID('3606e768-538b-e611-be23-e4115bead28a'),
  'right': U