In [1]:
# Notebook configuration: reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [17]:
from cmf.clean import cleaning_function
from cmf.clean import steps
from cmf.data import utils as du
from cmf.data import Table
from cmf import make_cmf_connection

from dotenv import load_dotenv, find_dotenv
import os

# Locate a .env file and load its variables into the process environment;
# later cells read settings such as SCHEMA through os.getenv.
load_dotenv(find_dotenv())

# Connection object that is later passed to `to_cmf(cmf_conn=...)`.
# NOTE(review): presumably configured from the environment loaded above — confirm against cmf.
cmf_conn = make_cmf_connection()

# Small dummy frame used for experimentation.
df = du.generate_dummy_df()

df

True

Unnamed: 0,irrational,rounded
0,pi,3.14
1,e,2.72
2,phi,1.62


In [9]:
def to_upper(column: str) -> str:
    """
    Build the SQL expression that uppercases a column.

    Args:
        column: Column name or expression to wrap.

    Returns:
        The string ``upper(<column>)``. Nothing is executed here; only the
        SQL text is produced.
    """
    return "upper({})".format(column)

# `logging` is not imported by any earlier cell shown in this notebook, so
# import it here to keep the cell runnable on a fresh kernel.
import logging

logger = logging.getLogger(__name__)

# The two adjacent literals concatenate to "Creating probabilities table test".
logger.info(
    "Creating probabilities table "
    "test"
)

In [31]:
# NOTE(review): `clusters` is not defined in any cell shown here — confirm where it is created.
clusters._db_expected_fields

['uuid', 'id', 'cluster', 'source', 'n']

In [7]:
# Subset experiments: is every element of the left-hand set also in `x`?
x = {"id", "uuid", "name"}
y = {"id", "uuid", "name"}
z = {"id"}
a = {"id", "foo"}

y.issubset(x)
z.issubset(x)
a.issubset(x)

True

True

False

In [5]:
# Dict experiments: sizes, then each key followed by its value on separate lines.
x = dict(foo="bar", baz="quz")
y = dict(foo="bar")
len(x)
len(y)
for i, j in x.items():
    print(i, j, sep="\n")

2

1

foo
bar
baz
quz


In [8]:
from pydantic import BaseModel

class Test(BaseModel):
    """Two-field model holding the first and last character of a string."""

    x: str
    y: str

    @classmethod
    def make_test(cls, foo: str) -> "Test":
        """
        Alternative constructor built from a single string.

        Args:
            foo: Source string; indexing an empty string raises IndexError.

        Returns:
            A ``Test`` whose ``x`` is the first character of ``foo`` and
            whose ``y`` is the last.
        """
        first, last = foo[0], foo[-1]
        return cls(x=first, y=last)

# Build a Test from "lorem": x takes the first character, y the last.
test = Test.make_test(foo="lorem")
test.x
test.y

'l'

'm'

In [15]:
# Unpack the first (key, value) pair of the dict `y` defined in an earlier cell.
p, q = list(y.items())[0]
p
q

'foo'

'bar'

In [26]:
from cmf.helpers import comparison, comparisons
from icecream import ic

# Each `comparison` wraps one SQL condition; `comparisons` bundles them into a
# single dict of the form {'comparisons': [{'comparison': ...}, ...]}.
comparison_name = comparison(sql_condition="company_name = company_name")
comparison_id = comparison(sql_condition="data_hub_id = data_hub_id")
comparison_name_id = comparisons(comparison_name, comparison_id)

comparison_name_id

{'comparisons': [{'comparison': 'company_name = company_name'},
  {'comparison': 'data_hub_id = data_hub_id'}]}

In [19]:
from cmf import query, process
from cmf.helpers import selector, cleaner, cleaners
from cmf.clean import company_name, company_number

# Select two fields from the Companies House table and query a sample.
# NOTE(review): sample=0.05 presumably means a 5% sample — confirm against cmf.query.
select_ch = selector(
    table="companieshouse.companies", fields=["company_number", "company_name"]
)
ch_sample = query(select=select_ch, sample=0.05)

# One cleaner per column, combined into a single pipeline.
cleaner_name = cleaner(function=company_name, arguments={"column": "company_name"})
cleaner_number = cleaner(
    function=company_number, arguments={"column": "company_number"}
)
cleaner_name_number = cleaners(cleaner_name, cleaner_number)

# Run the combined pipeline over the sampled data.
ch_sample_cleaned = process(data=ch_sample, pipeline=cleaner_name_number)

In [5]:
from cmf import query, process, make_deduper
from cmf.dedupers import Naive
from cmf.helpers import selector, cleaner, cleaners
from cmf.clean import company_name, postcode_to_area

# Select: id, company name and postcode from the HMRC exporters table.
# NOTE(review): sample=0.05 presumably means a 5% sample — confirm against cmf.query.
select_exp = selector(
    table="hmrc.trade__exporters", 
    fields=["id", "company_name", "postcode"]
)
exp_sample = query(select=select_exp, sample=0.05)

# Clean: one cleaner per column, combined into a single pipeline.
# Note that `cleaner_name` is also assigned by the Companies House cell.
cleaner_name = cleaner(function=company_name, arguments={"column": "company_name"})
cleaner_pc = cleaner(
    function=postcode_to_area, arguments={"column": "postcode"}
)
cleaner_name_pc = cleaners(cleaner_name, cleaner_pc)

# Run the combined pipeline over the sampled data.
exp_sample_cleaned = process(data=exp_sample, pipeline=cleaner_name_pc)

In [6]:
# Inspect the cleaned exporter sample (id, company_name, postcode).
exp_sample_cleaned

Unnamed: 0,id,company_name,postcode
0,47248,burgess furniture,TW
1,47249,hapton valley commercials,BB
2,47250,discovery yachts,SO
3,47251,holroyd precision,OL
4,47252,h g hannant,NR
...,...,...,...
2235,3563933,delapena honing equipment,GL
2236,3563934,i koncepts,BH
2237,3563935,fairfax saddles,WS
2238,3563936,triark pumps,CM


In [71]:
from cmf.data import utils as du

# Drop the probabilities table if it exists.
# NOTE(review): the schema name is hard-coded here, while another cell reads the
# schema from os.getenv("SCHEMA") — confirm both point at the same place.
du.query_nonreturn("""
    drop table if exists _user_eaf4fd9a.cm_probabilities;
""")

In [68]:
import duckdb

# Naive duplicate search: self-join `exp_sample_cleaned` on every field in
# `unique_fields`, keeping only pairs with different ids. Because the join is
# symmetric, each matching pair is returned in both directions.
# NOTE(review): `id` shadows the builtin of the same name; another cell
# interpolates this variable too, so it is kept as-is.
id = "id"
unique_fields = ["company_name", "postcode"]
join_clause = [f"l.{field} = r.{field}" for field in unique_fields]
join_clause_compiled = " and ".join(join_clause)

duckdb.sql(
    f"""
    select 
        l.{id}::text as target_id,
        r.{id}::text as source_id,
        1 as probability
    from
        exp_sample_cleaned l
    inner join exp_sample_cleaned r on
        (
            {join_clause_compiled}
        ) and
            l.{id} != r.{id}
"""
)

┌───────────┬───────────┬─────────────┐
│ target_id │ source_id │ probability │
│  varchar  │  varchar  │    int32    │
├───────────┼───────────┼─────────────┤
│ 400720    │ 3215046   │           1 │
│ 439303    │ 3481079   │           1 │
│ 439306    │ 3481080   │           1 │
│ 439310    │ 3481082   │           1 │
│ 439316    │ 3481103   │           1 │
│ 1276365   │ 3509884   │           1 │
│ 1276366   │ 3509887   │           1 │
│ 1276371   │ 3509896   │           1 │
│ 1276374   │ 3509902   │           1 │
│ 1276375   │ 3509903   │           1 │
│ 1382468   │ 1635734   │           1 │
│ 1635734   │ 1382468   │           1 │
│ 3215046   │ 400720    │           1 │
│ 3481079   │ 439303    │           1 │
│ 3481080   │ 439306    │           1 │
│ 3481082   │ 439310    │           1 │
│ 3481103   │ 439316    │           1 │
│ 3509884   │ 1276365   │           1 │
│ 3509887   │ 1276366   │           1 │
│ 3509896   │ 1276371   │           1 │
│ 3509902   │ 1276374   │           1 │


In [46]:
# NOTE(review): `matched` is not defined in any cell shown here — confirm where it is created.
matched

┌──────────────────────────────────────┬─────────┬─────────┬─────────────┐
│               cluster                │  l_id   │  r_id   │ probability │
│                 uuid                 │  int64  │  int64  │    int32    │
├──────────────────────────────────────┼─────────┼─────────┼─────────────┤
│ 228ae7c0-ac37-41dd-bcdd-d57523ad753b │  400720 │ 3215046 │           1 │
│ b97d14ba-4290-4126-8f4a-10b7b9b753e0 │  439303 │ 3481079 │           1 │
│ 0c70853a-4899-44ac-b38d-a2bf7e7e6083 │  439306 │ 3481080 │           1 │
│ b675452f-4caf-4dbd-9692-09d0a13aaf97 │  439310 │ 3481082 │           1 │
│ 320b09f4-80a6-4b85-9005-e10c05894e71 │  439316 │ 3481103 │           1 │
│ b58ba5cc-71f5-4bf8-a1cb-24eeef1a3c9d │ 1276365 │ 3509884 │           1 │
│ 2cddfb7f-27dd-47f9-8d39-c4fc47917d39 │ 1276366 │ 3509887 │           1 │
│ d4d2d180-c73d-4d79-acb2-b196bc58f9db │ 1276371 │ 3509896 │           1 │
│ ac66276d-a6de-4b80-b37e-193d02b91b9a │ 1276374 │ 3509902 │           1 │
│ e4ae51d2-4adb-4cc5-b03c

In [57]:
# Look up rows of `stacked` for one cluster uuid or for either id of one
# pair (both values appear together in the first row of the `matched` output).
# NOTE(review): `stacked` is not defined in any cell shown here — confirm where it is created.
duckdb.sql("""
    select 
        * 
    from 
        stacked 
    where 
        cluster = '228ae7c0-ac37-41dd-bcdd-d57523ad753b'::uuid
        or id in (400720, 3215046)
""")

┌──────────────────────────────────────┬─────────┬─────────────┐
│               cluster                │   id    │ probability │
│                 uuid                 │  int64  │    int32    │
├──────────────────────────────────────┼─────────┼─────────────┤
│ bd317632-4db6-40d7-9aff-6534a52a9781 │ 3215046 │           1 │
│ 8d7055f5-c76b-41bc-8570-33a5b445f56e │  400720 │           1 │
│ 2a6ce040-2860-4893-9f36-6bffcbaecf28 │ 3215046 │           1 │
│ a1a026d2-c698-45ec-ad6e-893c6d811328 │  400720 │           1 │
└──────────────────────────────────────┴─────────┴─────────────┘

In [10]:
# Keep one row per distinct combination of `unique_fields`.
#
# DISTINCT ON / ORDER BY need a comma-separated column list. Interpolating the
# Python list directly would embed its repr ("['company_name', 'postcode']"),
# which SQL reads as a list literal of strings rather than column names, so
# the names are joined first. A pre-joined string is passed through unchanged.
unique_fields_sql = (
    unique_fields if isinstance(unique_fields, str) else ", ".join(unique_fields)
)

duckdb.sql(
    f"""
    select distinct on ({unique_fields_sql})
        {id},
        {unique_fields_sql}
    from
        exp_sample_cleaned
    order by
        {unique_fields_sql};
"""
)

┌─────────┬──────────────────────────────┬──────────┐
│   id    │         company_name         │ postcode │
│  int64  │           varchar            │ varchar  │
├─────────┼──────────────────────────────┼──────────┤
│ 2230549 │ 01direct                     │ NW       │
│  600690 │ 2028 w                       │ S        │
│ 3348100 │ 222 sports                   │ BA       │
│ 2585664 │ 3 arrows recycling solutions │ DA       │
│ 3481086 │ 313 pharma                   │ B        │
│  795796 │ 365 itms                     │ RG       │
│  330630 │ 4d modelshop                 │ E        │
│   71869 │ 4px fulfillment              │ UB       │
│  860104 │ 4sight imaging               │ BL       │
│  757287 │ a a k butterworth            │ BD       │
│     ·   │  ·                           │ ·        │
│     ·   │  ·                           │ ·        │
│     ·   │  ·                           │ ·        │
│ 2069120 │ yuzu                         │ SE       │
│ 2300862 │ zellis holdings 

In [25]:
# Configure a Naive deduper over the cleaned exporter sample.
# NOTE(review): presumably rows sharing every `unique_fields` value are treated
# as duplicates, keyed by the `id` column — confirm against cmf.dedupers.Naive.
exp_naive_deduper = make_deduper(
    dedupe_run_name="basic_hmrc_exp",
    description="""
        Clean company name, extract postcode area
    """,
    deduper=Naive,
    data=exp_sample_cleaned,
    data_source='hmrc.trade__exporters',
    dedupe_settings={
        "id": "id",
        "unique_fields": [
            "company_name",
            "postcode"
        ]
    }
)

# Calling the configured deduper runs it; the result exposes `to_cmf`.
exp_deduped = exp_naive_deduper()

In [29]:
# Write the dedupe results through the cmf connection, with overwrite enabled.
# NOTE(review): the recorded run of this cell ended in
# "NameError: name 'pd' is not defined"; pandas is only imported in a later
# cell — confirm whether the missing import is in this notebook or inside cmf.
exp_deduped.to_cmf(
    cmf_conn=cmf_conn,
    overwrite=True
)

NameError: name 'pd' is not defined

In [3]:
from cmf.data.probabilities import ProbabilityResults
import pandas as pd
import uuid

# Three dummy probability rows, each given a fresh random id and cluster UUID
# and a constant source of 1.
df_prob = pd.DataFrame({"probability": [0.1, 0.8, 0.99]}).assign(
    id=lambda frame: [uuid.uuid4() for _ in frame.index],
    cluster=lambda frame: [uuid.uuid4() for _ in frame.index],
    source=1,
)

# Wrap the dummy frame in a ProbabilityResults with a run name and description.
results = ProbabilityResults(
    dataframe=df_prob,
    run_name="foo",
    description="bar"
)

# Show the wrapped data as a DataFrame.
results.to_df()

Unnamed: 0,probability,id,cluster,source
0,0.1,012d5a1e-d47f-49cb-a1e1-4c71b72b5fbb,bbdd40eb-898e-438e-b964-53c798cdbf5e,1
1,0.8,97bc93c4-03c5-4f7c-8fc3-7e3fbaae53e2,a921959f-28c9-4380-a7b0-c82636655579,1
2,0.99,17faf79e-f9d4-439a-b2c2-6fb7da6c5be5,a7dad00c-89b3-471b-8e15-9d77e82f0da1,1


In [2]:
from cmf.data import Probabilities, Table

# Point a Probabilities object at the table named by the SCHEMA and
# PROBABILITIES_TABLE environment variables. `os` is imported in an earlier cell.
# NOTE(review): the recorded run of this cell ended in
# "NameError: name 'CMFDB' is not defined" — confirm where that name should come from.
prob = Probabilities(
    db_table=Table(
        db_schema=os.getenv("SCHEMA"), 
        db_table=os.getenv("PROBABILITIES_TABLE")
    )
)

NameError: name 'CMFDB' is not defined

In [38]:
# Write the dummy probability results via `to_cmf` with its default arguments.
results.to_cmf()

In [39]:
# Read the probabilities table back to inspect what it contains.
prob.db_table.read()

Unnamed: 0,uuid,link_type,model,source,cluster,id,probability
0,623adca9-1dd2-4db7-8b99-26d842b8a27d,link,foo,1,8229d1b6-5486-4dd5-ae92-d9ae3be9d7fc,aec1148c-39ee-4d6e-a7a5-ebfe36b7104f,0.1
1,93b7ffe9-01a8-4321-b0f0-e25fcd46ad37,link,foo,1,3e5a0820-6736-4da5-afa7-488f86500759,f93f530b-cdee-47d7-bb6b-a906aef18c6b,0.8
2,6e5ba1b4-d44b-483e-a433-a14aa2e06482,link,foo,1,b7b72197-c2ec-43e3-aa15-232eafb45d83,3eda53ac-f501-4f47-afa2-79b7762effff,0.99
