In [None]:
# Reload edited project modules automatically before each cell runs, so code
# changes are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
# IPython display option: with "all", the value of every top-level expression
# in a cell is shown, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Deterministic linker

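A scratch notebook for fixing and testing the deterministic linker (`DeterministicLinker`): it sets up the table handles, prepares the data and cleaning pipelines, and then links the data.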

In [None]:
from src.data import utils as du
from src.data.star import Star
from src.data.datasets import Dataset
from src.data.probabilities import Probabilities
from src.data.clusters import Clusters
from src.data.validation import Validation
from src.link.deterministic_linker import DeterministicLinker
from src.config import link_pipeline, stopwords
from src.features.clean_complex import clean_comp_names

from dotenv import load_dotenv, find_dotenv
import os

# Find the .env file and load it, so that the os.getenv(...) lookups in the
# following cells (SCHEMA, STAR_TABLE, ...) can be resolved.
# NOTE(review): presumably the settings are expected to come from .env —
# confirm what happens when no .env file is found.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

In [None]:
# All four table handles are created with the same SCHEMA setting, so read it
# once; the individual table names each come from their own variable.
schema_name = os.getenv("SCHEMA")

star = Star(schema=schema_name, table=os.getenv("STAR_TABLE"))

# Probabilities and clusters are both given the star object.
probabilities = Probabilities(
    schema=schema_name,
    table=os.getenv("PROBABILITIES_TABLE"),
    star=star,
)
clusters = Clusters(
    schema=schema_name,
    table=os.getenv("CLUSTERS_TABLE"),
    star=star,
)

# Validation takes only a schema and table (no star reference).
validation = Validation(schema=schema_name, table=os.getenv("VALIDATE_TABLE"))

## Prepare data

In [None]:
# Dataset built from star_id 54717 and the shared `star` object.
# NOTE(review): 54717 is a hard-coded id — confirm it is the intended dataset.
linker_dataset = Dataset(star_id=54717, star=star)

# Deterministic linker under test.
# NOTE(review): presumably overwrite=True replaces earlier results stored
# under this name, and n is the linker's step number — confirm against
# DeterministicLinker.
cl_x_exp = DeterministicLinker(
    name="n1_deterministic_basic",
    dataset=linker_dataset,
    probabilities=probabilities,
    clusters=clusters,
    n=1,
    overwrite=True,
)

In [None]:
# Columns requested from the Companies House table on the cluster side,
# written as "<column> as <alias>" strings.
companies_house_columns = [
    "company_name as company_name",
    "postcode as postcode",
]
# Columns requested from the dimension side.
dimension_columns = ["id", "company_name", "postcode"]

# The original cell carried a commented-out `sample=5` argument; add it back
# to the call below to work on a small sample while debugging.
cl_x_exp.get_data(
    cluster_select={'"companieshouse"."companies"': companies_house_columns},
    dim_select=dimension_columns,
)

In [None]:
# Cleaning steps for the cluster side: a single step that calls
# clean_comp_names on the company_name column with the configured stopwords.
cluster_pipeline = dict(
    clean_comp_names=dict(
        function=clean_comp_names,
        arguments=dict(
            primary_col="company_name",
            secondary_col=None,
            stopwords=stopwords,
        ),
    ),
)

# Cleaning steps for the dimension side. Currently the same step as above, but
# built as a separate object so the two configurations stay independent.
dim_pipeline = dict(
    clean_comp_names=dict(
        function=clean_comp_names,
        arguments=dict(
            primary_col="company_name",
            secondary_col=None,
            stopwords=stopwords,
        ),
    ),
)

## Link data