In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [13]:
import cmf
from cmf import clean
from cmf.clean import steps
from cmf.data.results import ClusterResults, ProbabilityResults
from cmf.data.utils import sqa_profiled
from cmf.dedupers import NaiveDeduper
from cmf.helpers import cleaner, cleaners, selector

from pandas import DataFrame
import logging

db_logger = logging.getLogger("sqlalchemy.engine")
db_logger.setLevel(logging.INFO)
db_logger_fh = logging.FileHandler("logging/sqlalchemy.log")
db_logger_fh.setLevel(logging.INFO)
db_logger.addHandler(db_logger_fh)

logic_logger = logging.getLogger("cmf_logic")
logic_logger.setLevel(logging.INFO)
logic_logger_fh = logging.FileHandler("logging/cmf.log")
logic_logger_fh.setLevel(logging.INFO)
logic_logger.addHandler(logic_logger_fh)

## First model

Data already in DB.

In [3]:
_NAME = "naive_export_wins_v1"
_SOURCE = "dbt.export_wins__wins_dataset"

In [4]:
ew_selector = selector(
    table=_SOURCE,
    fields=["company_name", "cdms_reference"],
)

ew_raw = cmf.query(selector=ew_selector, return_type="pandas", limit=1_000)

In [5]:
clean_generic_id = clean.cleaning_function(
    steps.punctuation_to_spaces, steps.to_upper, steps.remove_whitespace
)

clean_ew = cleaners(
    cleaner(
        clean.company_name, {"column": "dbt_export_wins__wins_dataset_company_name"}
    ),
    cleaner(
        clean_generic_id, {"column": "dbt_export_wins__wins_dataset_cdms_reference"}
    ),
)

ew_clean = cmf.process(ew_raw, clean_ew)

In [6]:
ew_naive_deduper = cmf.make_deduper(
    dedupe_run_name=_NAME,
    description="Basic cleaning of name and CDMS column.",
    deduper=NaiveDeduper,
    deduper_settings={
        "id": "data_sha1",
        "unique_fields": [
            "dbt_export_wins__wins_dataset_company_name",
            "dbt_export_wins__wins_dataset_cdms_reference",
        ],
    },
    data=ew_clean,
    data_source=_SOURCE,
)

ew_deduped = ew_naive_deduper()

In [7]:
ew_deduped.to_df().head(3)
ew_deduped.to_df().info()

Unnamed: 0,model,left,left_id,right,right_id,probability
0,naive_export_wins_v1,dbt.export_wins__wins_dataset,b'\x04\xa3}_\xe6\xdb\xa0mK\x98\xf6\x8b\xba\xaa...,dbt.export_wins__wins_dataset,b'\xb0\xfc\x01\x9b \xc0tx\xcd\xe4g\xc9\x82\x86...,1
1,naive_export_wins_v1,dbt.export_wins__wins_dataset,b'}\xcd\xb5\xbbt\xb2d\xae>D\xe8\x12\x02@i\xd2\...,dbt.export_wins__wins_dataset,b'\xee\xfd\xe9\xb3\xad^XA\xf3\xd3\xd6l\xcfb6{\...,1
2,naive_export_wins_v1,dbt.export_wins__wins_dataset,b'\xb4k:#\\@\x7f~v\xac\xdds\xec\xb3/\xcd\xd4.\...,dbt.export_wins__wins_dataset,b'M6\x12+H\x808\xc7O*\xec{\xa1o\xb1#\x19=\x16:',1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype          
---  ------       --------------  -----          
 0   model        1721 non-null   string[pyarrow]
 1   left         1721 non-null   string[pyarrow]
 2   left_id      1721 non-null   object         
 3   right        1721 non-null   string[pyarrow]
 4   right_id     1721 non-null   object         
 5   probability  1721 non-null   int32[pyarrow] 
dtypes: int32[pyarrow](1), object(2), string[pyarrow](3)
memory usage: 185.2+ KB


In [8]:
ew_clusters = cmf.to_clusters(
    ew_clean, 
    results=ew_deduped, 
    key="data_sha1", 
    threshold=1
)

In [9]:
ew_clusters.to_df().head(3)
ew_clusters.to_df().info()

Unnamed: 0,parent,child
0,b'\r!\xe9\xe4q\xc8\x11\xe6\x96!\xe1O\x1b\xf0\x...,b'\x04\xa3}_\xe6\xdb\xa0mK\x98\xf6\x8b\xba\xaa...
1,b'\r!\xe9\xe4q\xc8\x11\xe6\x96!\xe1O\x1b\xf0\x...,b'\xb0\xfc\x01\x9b \xc0tx\xcd\xe4g\xc9\x82\x86...
2,b'\xde\xd9>\xf4!\x1e\xe7t\xa1\x90\x05\x9fS\x91...,b'}\xcd\xb5\xbbt\xb2d\xae>D\xe8\x12\x02@i\xd2\...


<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 495
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   parent  1000 non-null   object
 1   child   1000 non-null   object
dtypes: object(2)
memory usage: 23.4+ KB


In [18]:
with sqa_profiled():
    ew_deduped.to_cmf()

KeyboardInterrupt: 

In [34]:
with sqa_profiled():
    ew_clusters.to_cmf()

         175072 function calls (172334 primitive calls) in 0.464 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.464    0.464 /home/jovyan/company-matching/cmf/data/results.py:121(to_cmf)
        1    0.000    0.000    0.455    0.455 /home/jovyan/company-matching/cmf/data/results.py:508(_deduper_to_cmf)
        1    0.000    0.000    0.454    0.454 /home/jovyan/company-matching/cmf/data/results.py:439(_to_cmf_logic)
    63/15    0.000    0.000    0.244    0.016 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/orm/state_changes.py:95(_go)
        4    0.000    0.000    0.242    0.061 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/orm/session.py:1933(commit)
      6/4    0.000    0.000    0.242    0.061 <string>:1(commit)
      6/4    0.000    0.000    0.242    0.061 /opt/conda/envs/company_matching/lib/python3.9/site-packages/sqlalchemy/orm/se

## Second model

Brand new data.

In [4]:
_NAME = "naive_export_wins_v2"
_SOURCE = "dbt.export_wins__wins_dataset"

In [5]:
ew_selector = selector(
    table=_SOURCE,
    fields=["company_name", "cdms_reference"],
)

ew_raw = cmf.query(selector=ew_selector, return_type="pandas", limit=1_000)

In [6]:
clean_generic_id = clean.cleaning_function(
    steps.punctuation_to_spaces, steps.to_upper, steps.remove_whitespace
)

clean_ew = cleaners(
    cleaner(
        clean.company_name, {"column": "dbt_export_wins__wins_dataset_company_name"}
    ),
    cleaner(
        clean_generic_id, {"column": "dbt_export_wins__wins_dataset_cdms_reference"}
    ),
)

ew_clean = cmf.process(ew_raw, clean_ew)

In [7]:
ew_naive_deduper = cmf.make_deduper(
    dedupe_run_name=_NAME,
    description="Basic cleaning of name and CDMS column.",
    deduper=NaiveDeduper,
    deduper_settings={
        "id": "data_sha1",
        "unique_fields": [
            "dbt_export_wins__wins_dataset_company_name",
            "dbt_export_wins__wins_dataset_cdms_reference",
        ],
    },
    data=ew_clean,
    data_source=_SOURCE,
)

ew_deduped = ew_naive_deduper()

In [8]:
ew_clusters = cmf.to_clusters(
    ew_clean, 
    results=ew_deduped, 
    key="data_sha1", 
    threshold=1
)

In [9]:
ew_clusters.to_df().head(3)
ew_clusters.to_df().info()

Unnamed: 0,parent,child
0,b'r\x15\xa1\xb5G\x8f)\xce\xc4\x90\x99\xcb\x98i...,b'\x1d\x80\xb3\xbd\x8ar\xf57QE\xc6\x9a}\xd0\xc...
1,b'r\x15\xa1\xb5G\x8f)\xce\xc4\x90\x99\xcb\x98i...,b'\xa7\x04:\xc1\xbd\xf74h\\\x93G>)\x81\xd69\xe...
2,b' ^\x9f\xe9~+*l\xc2\xe2C\x1c\xdb!ENx\xb5\xb8\...,b'+\xf6\xca\x88\xac\x83JQ\x8c\xb85\x837\x13\x0...


<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 495
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   parent  1000 non-null   object
 1   child   1000 non-null   object
dtypes: object(2)
memory usage: 23.4+ KB


In [14]:
ew_deduped.to_cmf()

AssertionError: Dependency rule on column 'cmf__ddupes.sha1' tried to blank-out primary key column 'cmf__ddupe_probabilities.ddupe' on instance '<DDupeProbabilities at 0x7f2485590ca0>'

In [None]:
ew_clusters.to_cmf()

In [1]:
from cmf.data import Models, ENGINE, DDupeProbabilities
from sqlalchemy.orm import Session
from sqlalchemy import delete

In [2]:
with Session(ENGINE) as session:
    model = session.query(Models).first()
    subq = (
        model
        .proposes_dedupes
        .select()
        .with_only_columns(DDupeProbabilities.model)
    )
    session.execute(
        delete(DDupeProbabilities)
        .where(DDupeProbabilities.model.in_(subq))
    )

AttributeError: '_AssociationDict' object has no attribute 'select'

In [None]:
with Session(ENGINE) as session: