In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"

# Deterministic linker

This notebook is a place to fix and test the deterministic linker.

In [35]:
from src.data import utils as du
from src.data.star import Star
from src.data.datasets import Dataset
from src.data.probabilities import Probabilities
from src.data.clusters import Clusters
from src.data.validation import Validation
from src.link.deterministic_linker import DeterministicLinker
from src.features.clean_complex import duckdb_cleaning_factory
from src.features.clean_basic_original import (
    cms_original_clean_company_name_general,
    cms_original_clean_company_name_ch,
    cms_original_clean_postcode,
    cms_original_clean_email,
    cms_original_clean_ch_id,
    cms_original_clean_cdms_id
)

from dotenv import load_dotenv, find_dotenv
import os

# Locate a .env file and load it into the process environment. The `True` shown
# as this cell's output is load_dotenv's return value.
# NOTE(review): presumably this file supplies SCHEMA and the *_TABLE variables
# read via os.getenv in the following cell — confirm.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [12]:
# Every object below is configured with the same SCHEMA value; only the table
# name differs, so read the schema from the environment once.
db_schema = os.getenv("SCHEMA")

star = Star(schema=db_schema, table=os.getenv("STAR_TABLE"))

# Probabilities and Clusters are additionally given the star object.
probabilities = Probabilities(
    schema=db_schema,
    table=os.getenv("PROBABILITIES_TABLE"),
    star=star,
)
clusters = Clusters(
    schema=db_schema,
    table=os.getenv("CLUSTERS_TABLE"),
    star=star,
)

validation = Validation(schema=db_schema, table=os.getenv("VALIDATE_TABLE"))

## Prepare data

In [13]:
# Dataset handed to the linker.
# NOTE(review): star_id=54717 is a hard-coded id, presumably a row in the star
# table — confirm which source it refers to.
linker_dataset = Dataset(star_id=54717, star=star)

cl_x_exp = DeterministicLinker(
    name="n1_deterministic_basic",
    dataset=linker_dataset,
    probabilities=probabilities,
    clusters=clusters,
    n=1,
    overwrite=True,
)

In [14]:
# Column expressions requested from the cluster side, keyed by quoted table name.
cluster_fields = {
    '"companieshouse"."companies"': [
        "company_name as company_name",
        "postcode as postcode",
    ],
}

# Columns requested from the dimension side.
dim_fields = ["id", "company_name", "postcode"]

cl_x_exp.get_data(
    sample=5,
    cluster_select=cluster_fields,
    dim_select=dim_fields,
)

In [16]:
# Preview the first rows of the linker's raw dimension data
# (presumably populated by the get_data() call above — confirm).
cl_x_exp.dim_raw.head(5)

Unnamed: 0,id,company_name,postcode
0,3142495,427 HARDTOPS LIMITED,CM16 6AP
1,2098624,42 INFORMATION TECHNOLOGY SOLUTIONSLTD,NG33 5EF
2,282341,42 SQUARE (LONDON) LIMITED,SE1 3PA
3,547213,42 TECHNOLOGY HOLDINGS LIMITED,PE27 4LG
4,2153165,42 TECHNOLOGY LIMITED,PE27 4LG


In [40]:
# Single-step cleaners: each wraps one basic cleaning function.
clean_postcode = duckdb_cleaning_factory(cms_original_clean_postcode)
clean_ch_name = duckdb_cleaning_factory(cms_original_clean_company_name_ch)
clean_gen_name = duckdb_cleaning_factory(cms_original_clean_company_name_general)

# Composite cleaner: the factory is given both name-cleaning functions as a list.
composite_name_steps = [
    cms_original_clean_company_name_ch,
    cms_original_clean_company_name_general,
]
clean_composite_name = duckdb_cleaning_factory(composite_name_steps)

In [37]:
# Try the postcode cleaner on the raw dimension data. The stored output shows
# postcodes lower-cased with the space removed. Judging by the outputs of the
# later cells (which still show the original postcodes), a new frame is
# returned and dim_raw itself is not modified.
clean_postcode(
    df=cl_x_exp.dim_raw,
    column='postcode'
)

Unnamed: 0,id,company_name,postcode
0,3142495,427 HARDTOPS LIMITED,cm166ap
1,2098624,42 INFORMATION TECHNOLOGY SOLUTIONSLTD,ng335ef
2,282341,42 SQUARE (LONDON) LIMITED,se13pa
3,547213,42 TECHNOLOGY HOLDINGS LIMITED,pe274lg
4,2153165,42 TECHNOLOGY LIMITED,pe274lg
...,...,...,...
13725,2299247,ZESPOKE DESIGN LTD,bt808uq
13726,2390001,ZEST4.TV LIMITED,tw167dx
13727,2243490,ZEST AND ZING LIMITED,n134bs
13728,1065513,ZESTBEAUTY.COM,ng27pl


In [38]:
# Try the "ch" company-name cleaner on the raw dimension data. In the stored
# output, names come back lower-cased with spaces and punctuation removed and
# with words such as LIMITED / LTD / AND dropped.
clean_ch_name(
    df=cl_x_exp.dim_raw,
    column='company_name'
)

Unnamed: 0,id,company_name,postcode
0,3142495,427hardtops,CM16 6AP
1,2098624,42informationtechnologysolutions,NG33 5EF
2,282341,42squarelondon,SE1 3PA
3,547213,42technologyholdings,PE27 4LG
4,2153165,42technology,PE27 4LG
...,...,...,...
13725,2299247,zespokedesign,BT80 8UQ
13726,2390001,zest4tv,TW16 7DX
13727,2243490,zestzing,N13 4BS
13728,1065513,zestbeautycom,NG2 7PL


In [39]:
# Try the general company-name cleaner on the raw dimension data.
# NOTE(review): on the rows displayed, the stored output is identical to the
# clean_ch_name output — confirm whether the two cleaners are meant to differ.
clean_gen_name(
    df=cl_x_exp.dim_raw,
    column='company_name'
)

Unnamed: 0,id,company_name,postcode
0,3142495,427hardtops,CM16 6AP
1,2098624,42informationtechnologysolutions,NG33 5EF
2,282341,42squarelondon,SE1 3PA
3,547213,42technologyholdings,PE27 4LG
4,2153165,42technology,PE27 4LG
...,...,...,...
13725,2299247,zespokedesign,BT80 8UQ
13726,2390001,zest4tv,TW16 7DX
13727,2243490,zestzing,N13 4BS
13728,1065513,zestbeautycom,NG2 7PL


In [41]:
# Try the composite cleaner (built from both name-cleaning functions) on the
# raw dimension data. On the rows displayed, the stored output matches the
# outputs of the single-step name cleaners.
clean_composite_name(
    df=cl_x_exp.dim_raw,
    column='company_name'
)

Unnamed: 0,id,company_name,postcode
0,3142495,427hardtops,CM16 6AP
1,2098624,42informationtechnologysolutions,NG33 5EF
2,282341,42squarelondon,SE1 3PA
3,547213,42technologyholdings,PE27 4LG
4,2153165,42technology,PE27 4LG
...,...,...,...
13725,2299247,zespokedesign,BT80 8UQ
13726,2390001,zest4tv,TW16 7DX
13727,2243490,zestzing,N13 4BS
13728,1065513,zestbeautycom,NG2 7PL


In [None]:
# NOTE(review): `clean_comp_names` and `stopwords` are not imported or defined
# in any cell visible in this notebook; unless they come from elsewhere, this
# cell raises NameError on a fresh kernel — confirm before running.
def _company_name_pipeline():
    """Return a new pipeline dict containing the single `clean_comp_names` step."""
    return {
        "clean_comp_names": {
            "function": clean_comp_names,
            "arguments": {
                "primary_col": "company_name",
                "secondary_col": None,
                "stopwords": stopwords,
            },
        }
    }


# Both sides use the same step definition, but each gets its own dict object.
cluster_pipeline = _company_name_pipeline()
dim_pipeline = _company_name_pipeline()

## Link data