In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Deterministic linker

A scratch notebook for fixing and testing the deterministic linker (`DeterministicLinker` in `src.link.deterministic_linker`).

In [35]:
from src.data import utils as du
from src.data.star import Star
from src.data.datasets import Dataset
from src.data.probabilities import Probabilities
from src.data.clusters import Clusters
from src.data.validation import Validation
from src.link.deterministic_linker import DeterministicLinker
from src.features.clean_complex import duckdb_cleaning_factory
from src.features.clean_basic_original import (
    cms_original_clean_company_name_general,
    cms_original_clean_company_name_ch,
    cms_original_clean_postcode,
    cms_original_clean_email,
    cms_original_clean_ch_id,
    cms_original_clean_cdms_id
)

from dotenv import load_dotenv, find_dotenv
import os

# Locate a .env file and load its variables into the process environment;
# the table handles below read their schema/table names through os.getenv.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [12]:
# Every handle below uses the same schema, so read it from the environment
# once; each table name comes from its own environment variable.
_schema = os.getenv("SCHEMA")

star = Star(schema=_schema, table=os.getenv("STAR_TABLE"))

# Probabilities and Clusters additionally receive the Star handle.
probabilities = Probabilities(
    schema=_schema,
    table=os.getenv("PROBABILITIES_TABLE"),
    star=star,
)
clusters = Clusters(
    schema=_schema,
    table=os.getenv("CLUSTERS_TABLE"),
    star=star,
)

# Validation is built from schema and table only.
validation = Validation(schema=_schema, table=os.getenv("VALIDATE_TABLE"))

## Prepare data

In [13]:
# Build the dataset handle first, then pass it to the linker together with the
# shared probabilities and clusters handles.
# NOTE(review): 54717 is a hard-coded star_id -- confirm which source table it
# identifies before reusing this notebook against another environment.
linker_dataset = Dataset(star_id=54717, star=star)

cl_x_exp = DeterministicLinker(
    name="n1_deterministic_basic",
    dataset=linker_dataset,
    probabilities=probabilities,
    clusters=clusters,
    n=1,
    overwrite=True,
)

In [14]:
# Cluster-side selection, keyed by the quoted "schema"."table" name; each entry
# is a "<column> as <alias>" expression.
cluster_columns = {
    '"companieshouse"."companies"': [
        "company_name as company_name",
        "postcode as postcode",
    ]
}
# Dimension-side selection: plain column names.
dim_columns = ["id", "company_name", "postcode"]

# NOTE(review): the unit of `sample` (rows vs. percentage) is not visible
# here -- confirm against DeterministicLinker.get_data.
cl_x_exp.get_data(
    sample=5,
    cluster_select=cluster_columns,
    dim_select=dim_columns,
)

In [16]:
# Preview the first rows of the linker's raw dimension data (presumably
# populated by the get_data call above -- confirm in DeterministicLinker).
cl_x_exp.dim_raw.head(5)

Unnamed: 0,id,company_name,postcode
0,3142495,427 HARDTOPS LIMITED,CM16 6AP
1,2098624,42 INFORMATION TECHNOLOGY SOLUTIONSLTD,NG33 5EF
2,282341,42 SQUARE (LONDON) LIMITED,SE1 3PA
3,547213,42 TECHNOLOGY HOLDINGS LIMITED,PE27 4LG
4,2153165,42 TECHNOLOGY LIMITED,PE27 4LG


In [44]:
def _cleaning_step(function, column):
    """Return one pipeline step: a cleaning callable plus the column it is applied to."""
    return {
        "function": function,
        "arguments": {"column": column},
    }


# Wrap each imported cms_original_* cleaner with duckdb_cleaning_factory.
clean_postcode = duckdb_cleaning_factory(cms_original_clean_postcode)
clean_ch_name = duckdb_cleaning_factory(cms_original_clean_company_name_ch)
clean_gen_name = duckdb_cleaning_factory(cms_original_clean_company_name_general)

# Cluster side: CH-specific company-name cleaner, then the postcode cleaner.
cluster_pipeline = {
    "clean_ch_comp_names": _cleaning_step(clean_ch_name, "company_name"),
    "clean_postcode": _cleaning_step(clean_postcode, "postcode"),
}

# Dimension side: general company-name cleaner, then the postcode cleaner.
# NOTE(review): the key "clean__comp_names" has a double underscore, unlike
# "clean_ch_comp_names" above; left untouched in case the step name is used
# elsewhere -- confirm whether it is a typo.
dim_pipeline = {
    "clean__comp_names": _cleaning_step(clean_gen_name, "company_name"),
    "clean_postcode": _cleaning_step(clean_postcode, "postcode"),
}

In [45]:
# Apply the two cleaning pipelines defined in the previous cell.
# NOTE(review): _clean_data is underscore-prefixed (private by convention);
# it is called directly here, presumably to exercise the cleaning step in
# isolation while debugging the linker.
cl_x_exp._clean_data(
    cluster_pipeline=cluster_pipeline,
    dim_pipeline=dim_pipeline
)

In [46]:
# Preview the cleaned dimension data, then the cleaned cluster data. Both
# expressions are rendered because ast_node_interactivity is set to "all" in
# the first cell.
cl_x_exp.dim_processed.head(5)
cl_x_exp.cluster_processed.head(5)

Unnamed: 0,id,company_name,postcode
0,3142495,427hardtops,cm166ap
1,2098624,42informationtechnologysolutions,ng335ef
2,282341,42squarelondon,se13pa
3,547213,42technologyholdings,pe274lg
4,2153165,42technology,pe274lg


Unnamed: 0,id,company_name,postcode
0,9e26ce95-93dd-444d-8f76-a170b7765a05,80808,sp27fu
1,88ce2dec-56c8-4da4-9a5e-e2149b8affc7,8141019,en55yl
2,9dab483f-847c-4752-bfd6-af4ec523dc55,8241526,e148gs
3,e3aba908-96d8-41eb-9d73-cb79639ed2d4,82925,rh101ht
4,d046ba7d-0bda-441d-b141-8b9493d624b2,8461200,sp12as


## Link data