In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import cProfile
import hashlib
import uuid
from pathlib import Path

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

import cmf.locations as loc
from cmf import make_deduper, process, query
from cmf.clean import company_name, company_number
from cmf.data import utils as du
from cmf.dedupers import Naive
from cmf.helpers import cleaner, cleaners

# Echo every top-level expression in a cell, not only the last one, so a single
# cell can show several results (e.g. a frame preview followed by a digest).
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Load the test companies file and give every row a deterministic UUID derived
# from its positional index (row 0 -> 00000000-0000-0000-0000-000000000000).
df = (
    pd.read_csv(Path(loc.TEST) / "data" / "all_companies.csv")
    .reset_index(names="id")
    .assign(id=lambda frame: frame["id"].apply(lambda i: uuid.UUID(int=i)))
)

In [47]:
# Hash the first ten rows over the identifying columns and display the digests.
ten_row_columns = ["id", "company_name", "crn", "duns", "cdms"]
first_ten_rows = df.head(10)
sha1_series_1 = du.columns_to_value_ordered_sha1(
    data=first_ten_rows,
    columns=ten_row_columns,
)
sha1_series_1

0    b"\x86{\x8fL\xffDQ?.('\x9d\xeb\xd9j'\x1b\x85]\...
1    b'\xad\xff\x98\xd2\xa2\xa0\x0c\x06D\x84\xe9\xc...
2    b'\xc6\x04/\x89\x89\xca\xb3\x95q\x87\xa4\x9a?A...
3    b'\xa2\xd2\xc7\x99\xd3\x08t0\x84\x81\xed\x1f\x...
4    b'\x8d=}\x16\r\xfa\xf7\x1d/M\xfd\xeaG\x8dM1\x8...
5      b"D\x03\x123\x7f9\x1f'W\xeb&Ud\xb9]])*\x80\xdd"
6    b'\xa3\x99\xf6x2K\xc4\x131\x05qW\xabW\xc5r\x13...
7    b'7\xb6h1[\xfet\xcb\xe2Hk4V\x9c\xb4\x7f\xf7\xd...
8    b'\xd5\x83\xd3\xc3\xc6\xb8\x07\xff\xe6<Qcc)\xa...
9    b'\xf4N\xe0\xadA(\x9ew\xaf+\xc2\x8c\x89\xca0o\...
dtype: object

In [32]:
# Run the value-ordered hashing helper over the whole companies frame.
record_columns = ["id", "company_name", "crn", "duns", "cdms"]
records = du.columns_to_value_ordered_sha1(data=df, columns=record_columns)

In [39]:
# List the column labels of the loaded companies frame.
df.columns

Index(['id', 'company_name', 'crn', 'address', 'duns', 'cdms'], dtype='object')

In [4]:
# Build a second frame whose first 500 rows carry the same values under swapped
# column labels (name <-> address, crn <-> duns). It is used to check that the
# value-ordered hash does not depend on which column a value sits in.
swapped_labels = {
    "company_name": "address",
    "address": "company_name",
    "duns": "crn",
    "crn": "duns",
}
original_column_order = ["id", "company_name", "crn", "address", "duns", "cdms"]

# `filter` restores the original left-to-right column order after the rename.
top = df.head(500).rename(columns=swapped_labels).filter(items=original_column_order)
df2 = pd.concat([top, df.tail(500)])

In [23]:
# Show the first row of df, hash it over every column, and display its digest.
first_row = df.head(1)
first_row
sha1_series_1 = du.columns_to_value_ordered_sha1(
    data=first_row,
    columns=["id", "company_name", "address", "crn", "duns", "cdms"],
)
sha1_series_1[0]

Unnamed: 0,id,company_name,crn,address,duns,cdms
0,00000000-0000-0000-0000-000000000000,Dooley Inc,01HHGX9BHARZT77WHVWCYJSWSF,Room 577,111111111,ORG-55555555


[b'00000000-0000-0000-0000-000000000000', b'01HHGX9BHARZT77WHVWCYJSWSF', b'111111111', b'Dooley Inc', b'ORG-55555555', b'Room 577']


b'\xdc`\xe6E\xa9R\x16\x98\xd04\xcfO\x85\xf2\x1a\xfc\xe9\xffT\x87'

In [24]:
# Show the first row of the column-swapped frame, hash it over every column,
# and display its digest.
swapped_first_row = df2.head(1)
swapped_first_row
sha1_series_2 = du.columns_to_value_ordered_sha1(
    data=swapped_first_row,
    columns=["id", "company_name", "address", "crn", "duns", "cdms"],
)
sha1_series_2[0]

Unnamed: 0,id,company_name,crn,address,duns,cdms
0,00000000-0000-0000-0000-000000000000,Room 577,111111111,Dooley Inc,01HHGX9BHARZT77WHVWCYJSWSF,ORG-55555555


[b'00000000-0000-0000-0000-000000000000', b'01HHGX9BHARZT77WHVWCYJSWSF', b'111111111', b'Dooley Inc', b'ORG-55555555', b'Room 577']


b'\xdc`\xe6E\xa9R\x16\x98\xd04\xcfO\x85\xf2\x1a\xfc\xe9\xffT\x87'

In [9]:
# Hand-rolled value-ordered SHA-1 of the first row of df.
bytes_records = df.head(1).astype(bytes).to_dict("records")
hashed_records = []

for record in bytes_records:
    # Sorting the values makes the digest independent of the column they came from.
    sorted_vals = sorted(record.values())

    # Seed with sha1(first sorted value), then fold in the *digest* of each
    # remaining value (not the raw value itself).
    hashed_vals = hashlib.sha1(sorted_vals[0])
    for val in sorted_vals[1:]:
        hashed_vals.update(hashlib.sha1(val).digest())

    hashed_records.append(hashed_vals.digest())

# Row, its sorted byte values, and the resulting digest.
df.head(1)
sorted_vals
pd.Series(hashed_records)[0]

Unnamed: 0,id,company_name,crn,address,duns,cdms
0,00000000-0000-0000-0000-000000000000,Dooley Inc,01HHGX9BHARZT77WHVWCYJSWSF,Room 577,111111111,ORG-55555555


[b'00000000-0000-0000-0000-000000000000',
 b'01HHGX9BHARZT77WHVWCYJSWSF',
 b'111111111',
 b'Dooley Inc',
 b'ORG-55555555',
 b'Room 577']

b'\xdc`\xe6E\xa9R\x16\x98\xd04\xcfO\x85\xf2\x1a\xfc\xe9\xffT\x87'

In [10]:
# Hand-rolled value-ordered SHA-1 of the first row of the column-swapped frame.
bytes_records = df2.head(1).astype(bytes).to_dict("records")
hashed_records = []

for record in bytes_records:
    # Sorting the values makes the digest independent of the column they came from.
    sorted_vals = sorted(record.values())

    # Seed with sha1(first sorted value), then fold in the *digest* of each
    # remaining value (not the raw value itself).
    hashed_vals = hashlib.sha1(sorted_vals[0])
    for val in sorted_vals[1:]:
        hashed_vals.update(hashlib.sha1(val).digest())

    hashed_records.append(hashed_vals.digest())

# Row, its sorted byte values, and the resulting digest.
df2.head(1)
sorted_vals
pd.Series(hashed_records)[0]

Unnamed: 0,id,company_name,crn,address,duns,cdms
0,00000000-0000-0000-0000-000000000000,Room 577,111111111,Dooley Inc,01HHGX9BHARZT77WHVWCYJSWSF,ORG-55555555


[b'00000000-0000-0000-0000-000000000000',
 b'01HHGX9BHARZT77WHVWCYJSWSF',
 b'111111111',
 b'Dooley Inc',
 b'ORG-55555555',
 b'Room 577']

b'\xdc`\xe6E\xa9R\x16\x98\xd04\xcfO\x85\xf2\x1a\xfc\xe9\xffT\x87'

In [33]:
# For each record, sort its values and feed them one by one into a single
# SHA-1 object (raw values, not per-value digests).
# NOTE(review): `record.values()` needs dict-like records — confirm what
# `records` holds at the time this cell runs.
# NOTE(review): `out_hash` is rebuilt on every iteration and never read in this
# cell; presumably this loop exists only to time the approach — confirm.
for record in records:
    sorted_i = sorted(list(record.values()))
    out_hash = hashlib.sha1()
    for val in sorted_i:
        out_hash.update(val)

In [None]:
# Profile the hashing helper on ten rows, sorting the report by time spent
# inside each function itself ("tottime").
profiled_statement = """\
du.columns_to_value_ordered_sha1(
    data=df.head(10),
    columns=["id", "company_name", "crn", "duns", "cdms"]
)"""
cProfile.run(profiled_statement, sort="tottime")

In [25]:
# Select: pull id, name and company number from the Data Hub companies table.
dh_selector = {"dit.data_hub__companies": ["id", "name", "company_number"]}
dh = query(selector=dh_selector, model=None, return_type="pandas")

# Clean: the queried columns are addressed through a prefix derived from the
# source table name.
col_prefix = "dit_data_hub__companies_"
name_column = f"{col_prefix}name"
crn_column = f"{col_prefix}company_number"

cleaner_name = cleaner(function=company_name, arguments={"column": name_column})
cleaner_crn = cleaner(function=company_number, arguments={"column": crn_column})
cleaner_name_dh = cleaners(cleaner_name, cleaner_crn)

# Apply both cleaners to the selected data in one pipeline.
dh_cleaned = process(data=dh, pipeline=cleaner_name_dh)

  class Dedupes(SHA1Mixin, CMFBase):
[autoreload of cmf.data.dedupe failed: Traceback (most recent call last):
  File "/opt/conda/envs/company_matching/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/opt/conda/envs/company_matching/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
  File "/opt/conda/envs/company_matching/lib/python3.9/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 613, in _exec
  File "<frozen importlib._bootstrap_external>", line 850, in exec_module
  File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
  File "/home/jovyan/company-matching/cmf/data/dedupe.py", line 15, in <module>
    class Dedupes(SHA1Mixin, CMFBase):
  File "/opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/orm/decl_api.py", l

In [26]:
# Show the (rows, columns) shape before and after cleaning so the two can be
# compared.
dh.shape
dh_cleaned.shape

(503449, 4)

(503449, 4)

In [27]:
# Preview the first three cleaned rows.
dh_cleaned.head(3)

Unnamed: 0,data_sha1,dit_data_hub__companies_id,dit_data_hub__companies_name,dit_data_hub__companies_company_number
0,"[196, 247, 190, 128, 184, 190, 103, 122, 20, 4...",00002c8e-591a-e711-88ee-e4115bead28a,arensis corp,
1,"[24, 61, 93, 182, 46, 163, 186, 32, 56, 37, 47...",000042c1-a098-e211-a939-e4115bead28a,macrogen korea,
2,"[88, 139, 37, 72, 135, 153, 140, 176, 249, 217...",00008a29-e155-e411-985c-e4115bead28a,pixsan digital software,


In [9]:
# Count the distinct (cleaned name, company number) combinations.
dedupe_key_columns = [
    "dit_data_hub__companies_name",
    "dit_data_hub__companies_company_number",
]
len(dh_cleaned[dedupe_key_columns].drop_duplicates())

482602

In [28]:
# Dedupe: configure a Naive deduper keyed on the cleaned name and company number.
naive_settings = {
    "id": f"{col_prefix}id",
    "unique_fields": [f"{col_prefix}name", f"{col_prefix}company_number"],
}

dh_naive_deduper = make_deduper(
    dedupe_run_name="basic_dh",
    description="""
        Clean company name, company number
    """,
    deduper=Naive,
    deduper_settings=naive_settings,
    data_source="dit.data_hub__companies",
    data=dh_cleaned,
)

# Calling the configured deduper produces the results object; `to_df` turns it
# into a DataFrame for inspection.
dh_deduped = dh_naive_deduper()
dh_deduped_df = dh_deduped.to_df()

In [29]:
# Preview the first five rows of the dedupe results.
dh_deduped_df.head(5)

Unnamed: 0,model,left,left_id,right,right_id,probability
0,basic_dh,dit.data_hub__companies,0085222b-68be-e311-a3d5-e4115bead28a,dit.data_hub__companies,10c93ac7-a098-e211-a939-e4115bead28a,1
1,basic_dh,dit.data_hub__companies,0115403a-ea34-e411-985c-e4115bead28a,dit.data_hub__companies,0d3a2831-da11-4158-b2be-da9a991139bc,1
2,basic_dh,dit.data_hub__companies,018649bb-a098-e211-a939-e4115bead28a,dit.data_hub__companies,d41b30ca-fae9-e611-bca1-e4115bead28a,1
3,basic_dh,dit.data_hub__companies,01b049bb-a098-e211-a939-e4115bead28a,dit.data_hub__companies,d439a050-10b1-e411-a839-e4115bead28a,1
4,basic_dh,dit.data_hub__companies,02987d74-681f-4558-b248-24023bcab4cf,dit.data_hub__companies,2f524795-c9cd-4b89-aa38-2e4d16ef0b7a,1


In [12]:
# Print the index range, per-column non-null counts, dtypes and memory usage
# of the dedupe results.
dh_deduped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166 entries, 0 to 2165
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   model        2166 non-null   object
 1   left         2166 non-null   object
 2   left_id      2166 non-null   object
 3   right        2166 non-null   object
 4   right_id     2166 non-null   object
 5   probability  2166 non-null   int32 
dtypes: int32(1), object(5)
memory usage: 93.2+ KB


In [16]:
# Show the first five items the results object's `_prep_to_cmf` returns for
# the dedupe frame.
# NOTE(review): this calls an underscore-prefixed (private) method directly,
# presumably for debugging — confirm it is not meant to stay.
dh_deduped._prep_to_cmf(dh_deduped_df)[:5]

  df.assign(


[{'model': 'basic_dh',
  'left': UUID('34f3e1b5-f612-e611-9bdc-e4115bead28a'),
  'right': UUID('6bd85b41-ebd2-43ad-99b9-399fca511176'),
  'probability': 1,
  'sha1': b']\xff\x1c``\xad\t:[\x80\x83\xa6\xc43x\x0f!\n\xc7\x8d'},
 {'model': 'basic_dh',
  'left': UUID('34779711-2a85-4fea-b4e1-07226cc10425'),
  'right': UUID('6f3201cf-d483-4ce2-8c2c-c20e74a11f97'),
  'probability': 1,
  'sha1': b'5\x1c*m&\x96Y\xda\x0c\xfd5\xde\xf9\xf4\x83\t2N@)'},
 {'model': 'basic_dh',
  'left': UUID('35519dfa-3c1a-4389-a452-141e7e84a289'),
  'right': UUID('0a83eefa-68b2-4852-b0fa-edf08828debf'),
  'probability': 1,
  'sha1': b'\xc0\xaf\xe1\x03\xec\xc9\x1a\x98\x1d\xba\xaaV\x88JIw\xfbo\x03\xde'},
 {'model': 'basic_dh',
  'left': UUID('35cb9542-1a51-4f32-b614-c5f77878a3f2'),
  'right': UUID('c3247c4f-4ee1-4500-a43c-61843964bc9e'),
  'probability': 1,
  'sha1': b'\x05\xf8\xba\xad\xd7,\xcaT\xbdVY\x04C\x88a\x9a\xd83x\x93'},
 {'model': 'basic_dh',
  'left': UUID('3606e768-538b-e611-be23-e4115bead28a'),
  'right': U