# Imports

In [2]:
from postal.parser import parse_address
import json
from duckdb.sqltypes import VARCHAR
import pandas as pd
import re
from unidecode import unidecode
import sys
import os
sys.path.append(os.path.abspath(".."))
sys.path.append(os.path.abspath("../merge_tables"))
from merge_tables.db.connection import connect_to_postgres_via_duckdb
from merge_tables.db.tables import create_clean_account_name_macro

# DuckDB initialisation

In [3]:
# Open the DuckDB connection; later cells query Postgres through it via the `pg` catalog.
duck = connect_to_postgres_via_duckdb()
# Register the `clean_account_name` SQL macro on the connection (used by the name-matching queries).
create_clean_account_name_macro(duck)

✓ Successfully connected DuckDB to PostgreSQL database 'medisoft'
✓ Created clean_account_name macro


In [4]:
def udf_parse_address(address: str) -> str:
    """Parse a free-text address and return its components as a JSON object string.

    `parse_address` yields (value, label) pairs; the result maps each label to
    its value. When a label occurs more than once, the last value wins.
    """
    by_label = {}
    for value, label in parse_address(address):
        by_label[label] = value
    return json.dumps(by_label)

In [5]:
def udf_clean_german_road(text):
    """Reduce a German street name to a compact ASCII key used for matching.

    Returns None for missing or empty input. Otherwise the name is lower-cased
    and trimmed, 'ß' becomes 'ss', street-suffix tokens are removed, the rest is
    transliterated with `unidecode`, and every character outside [a-z0-9] is
    dropped.
    """
    is_missing = pd.isna(text) or text == ''
    if is_missing:
        return None

    lowered = text.lower().strip().replace('ß', 'ss')
    # Drop the street-suffix spellings before transliteration.
    without_suffix = re.sub(r'(str\.|str$|str\s|straße|strasse)', '', lowered)
    ascii_only = unidecode(without_suffix)
    # Keep letters and digits only, so spacing and punctuation never affect the key.
    return re.sub(r'[^a-z0-9]', '', ascii_only)


In [None]:
# Expose the Python helpers to SQL; both take one VARCHAR and return a VARCHAR.
sql_udfs = {
    "udf_parse_address": udf_parse_address,
    "udf_clean_german_road": udf_clean_german_road,
}
for sql_name, py_func in sql_udfs.items():
    duck.create_function(sql_name, py_func, [VARCHAR], VARCHAR)

<_duckdb.DuckDBPyConnection at 0x10ddde5f0>

# name matching

## easybill rows

In [None]:
# Preview the easybill export with one row per customer number
# ("Kontakt: Kundennummer"). `distinct on` has no ORDER BY here, so which row is
# kept for a duplicated customer number is not specified.
duck.sql("""
         select distinct on("Kontakt: Kundennummer") * from read_csv('/Users/adrienblanquer/Downloads/easybill_contacts_inv_rec_activ.csv') 
         """)

┌─────────────────────┬───────────────────────┬────────────────────────────┬────────────────────┬─────────────────┬─────────────────────────────────┬────────────────┬──────────────────┬───────────────┬───────────────────────┬─────────────────────────────────────────────────────────────────┬────────────────────────────┬───────────────────────┬───────────────────┬─────────────────────┬───────────────┬──────────────────────┬───────────────────┬────────────────────────────────┬───────────────────────┬──────────────────────────────┬────────────────────────┬───────────────────────────────┬────────────────────┬────────────────────┬──────────────┬───────────────────────┬──────────────────────────────────────────┬───────────────────────────────────────────┬───────────────────┬─────────────────────────────────────────────────────────────────┬─────────────────┬─────────────────────┬──────────────────────────────┬──────────────────────────┬─────────────────────────┬───────────────────────┬────────

## easybill / Zoho rows matched by name

In [10]:
# Name-only matching: pair each easybill contact with its closest Zoho account.
#
# * Names containing '/' are split, so every part is cleaned and compared on its own.
# * A pair qualifies when the cleaned names are near-identical (sim > 0.95), or when
#   one cleaned name contains the other AND they are at least loosely similar
#   (sim > 0.6).
# * Only the best-scoring Zoho candidate per easybill customer number is kept.
#
# Fix: the containment clause is now explicitly parenthesised as
#   (a in b or b in a) and sim > 0.6
# AND binds tighter than OR, so the previous unparenthesised form applied the 0.6
# floor to the second containment test only; any easybill name contained in a Zoho
# name matched regardless of similarity.
#
# `easybill.clean_name` is reused instead of re-running clean_account_name on
# "Kontakt: Firma": inside clean_easybill both come from the same split part.
os.makedirs('output', exist_ok=True)  # make sure the export folder exists before writing
duck.sql(
    """
    with clean_zoho as (
        select
            * replace(
                trim(unnest(split(Account_Name, '/'))) as Account_Name
            ),
            clean_account_name(trim(unnest(split(Account_Name, '/')))) as clean_name
        from pg.zoho.Accounts
    ), clean_easybill as (
        select
            * replace(
                trim(unnest(split("Kontakt: Firma", '/'))) as "Kontakt: Firma"
            ),
            clean_account_name(trim(unnest(split("Kontakt: Firma", '/')))) as clean_name
        from read_csv('/Users/adrienblanquer/Downloads/easybill_contacts_inv_rec_activ.csv')
    )
    select
        z.Id as zoho_id,
        easybill."Kontakt: Kundennummer" as easybill_customer_number,
        easybill."Kontakt: Firma" as easybill_name,
        easybill.clean_name as clean_easybill_name,
        z.Account_Name as zoho_name,
        z.clean_name as clean_zoho_name,
        jaro_winkler_similarity(easybill.clean_name, z.clean_name) as sim

    from clean_easybill as easybill

    join (
        -- ignore empty, very short and obvious test accounts on the Zoho side
        select *
        from clean_zoho
        where Account_Name <> ''
            and length(Account_Name) > 1
            and length(clean_name) > 3
            and clean_name <> 'test'
    ) z
        on jaro_winkler_similarity(easybill.clean_name, z.clean_name) > 0.95
        or (
            ((easybill.clean_name in z.clean_name) or (z.clean_name in easybill.clean_name))
            and jaro_winkler_similarity(easybill.clean_name, z.clean_name) > 0.6
        )
    QUALIFY row_number() OVER (PARTITION BY easybill."Kontakt: Kundennummer" ORDER BY sim DESC) = 1
    """
).to_csv('output/easybill_zoho_name_matched.csv')

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

## not matching rows

# address and name matching

## easybill / Zoho rows matched by address and name

In [168]:
# Address matching: join easybill contacts to Zoho accounts whose parsed address
# agrees on cleaned road (similarity > 0.95), postcode (exact) and house number
# (similarity > 0.95). When several Zoho rows share the address, the one with the
# most similar cleaned account name is kept per easybill customer.
#
# Fixes:
# * The country suffix appended to the Zoho addresses was misspelled
#   ('Deutshcland'); it now matches the 'Deutschland' used on the easybill side, so
#   both sides hand the parser the same kind of string.
# * The road is passed to udf_clean_german_road with `->>` (text) instead of `->`
#   (JSON). The JSON form keeps the surrounding double quotes, so the UDF's `str$`
#   suffix rule could never fire on a road ending in "str".
# * Commented-out debugging filters were removed from the query.
#
# NOTE(review): `offset 1` in zoho_raw_add skips one Zoho row, and with no ORDER BY
# it is unspecified which one. Confirm this is intended.
os.makedirs('output', exist_ok=True)  # make sure the export folder exists before writing
duck.sql(
    """
    with zoho_raw_add as (
        select
            Id,
            Account_Name,
            Billing_Street || ' ' || Billing_Code || ', Deutschland' as full_billing_address,
            Shipping_Street || ' ' || Shipping_Code || ', Deutschland' as full_shipping_address
        from pg.zoho.Accounts
        offset 1
    ), zoho_dist_add as (
        -- one list per account holding its distinct billing/shipping address strings
        select
            Id,
            Account_Name,
            list_distinct([full_billing_address, full_shipping_address]) as address
        from zoho_raw_add
        group by all
        order by Id
    ), zoho_all_raw_add as (
        -- one row per (account, address); accounts without any address keep one row
        select
            z.Id,
            z.Account_Name,
            t.raw_address,
        from zoho_dist_add z
        left join lateral unnest(z.address) as t(raw_address) on true
    ), zoho_all_parsed_add as (
        select
            z.*,
            parsed_address->'house_number' as house_number,
            parsed_address->'road' as road,
            udf_clean_german_road(parsed_address->>'road') as clean_road,
            parsed_address->'postcode' as postcode,
            parsed_address->'country' as country,
        from (select Id, Account_Name as zoho_name, udf_parse_address(raw_address)::json as parsed_address from zoho_all_raw_add) as z
    ), easybill_raw_add as (
        select distinct on("Kontakt: Kundennummer")
            "Kontakt: Kundennummer" as eb_id,
            "Kontakt: Firma" as eb_name,
            "Kontakt: Straße/Hausnummer" as eb_nb_street,
            "Kontakt: Postleitzahl" as eb_postcode,
            *
        from read_csv('/Users/adrienblanquer/Downloads/easybill_contacts_inv_rec_activ.csv')
    ), easybill_all_parsed_add as (
        select
            eb_id,
            eb_name,
            parsed_address->'house_number' as house_number,
            parsed_address->'road' as road,
            udf_clean_german_road(parsed_address->>'road') as clean_road,
            parsed_address->'postcode' as postcode,
            parsed_address->'country' as country,
        from (select *, udf_parse_address(eb_nb_street || ' ' || eb_postcode || ', Deutschland')::json as parsed_address from easybill_raw_add) as e
    )
    select *,
        jaro_winkler_similarity(clean_account_name(zoho_name), clean_account_name(eb_name)) as name_sim,
    from easybill_all_parsed_add
    join zoho_all_parsed_add
        on (jaro_winkler_similarity(easybill_all_parsed_add.clean_road, zoho_all_parsed_add.clean_road) > 0.95
        and easybill_all_parsed_add.postcode = zoho_all_parsed_add.postcode
        and jaro_winkler_similarity(easybill_all_parsed_add.house_number, zoho_all_parsed_add.house_number) > 0.95)
    qualify row_number() over (partition by eb_id order by name_sim desc) = 1
    order by eb_id
    """).to_csv('output/easybill_zoho_address_and_name_matched.csv')

## easybill rows without an address match in Zoho

In [177]:
# Inverse of the address-matching query: LEFT JOIN the easybill contacts to the
# parsed Zoho addresses and keep only the contacts for which no Zoho row satisfied
# the road / postcode / house-number condition (Id IS NULL). The result is only
# displayed; nothing is written to disk.
#
# Fixes:
# * The country suffix appended to the Zoho addresses was misspelled
#   ('Deutshcland'); it now matches the 'Deutschland' used on the easybill side.
# * The road is passed to udf_clean_german_road with `->>` (text) instead of `->`
#   (JSON). The JSON form keeps the surrounding double quotes, so the UDF's `str$`
#   suffix rule could never fire on a road ending in "str".
# * Removed the commented-out debugging filter and the commented-out `.to_csv`
#   call, which pointed at the *matched* rows file.
#
# NOTE(review): `offset 1` in zoho_raw_add skips one Zoho row, and with no ORDER BY
# it is unspecified which one. Confirm this is intended.
duck.sql(
    """
    with zoho_raw_add as (
        select
            Id,
            Account_Name,
            Billing_Street || ' ' || Billing_Code || ', Deutschland' as full_billing_address,
            Shipping_Street || ' ' || Shipping_Code || ', Deutschland' as full_shipping_address
        from pg.zoho.Accounts
        offset 1
    ), zoho_dist_add as (
        -- one list per account holding its distinct billing/shipping address strings
        select
            Id,
            Account_Name,
            list_distinct([full_billing_address, full_shipping_address]) as address
        from zoho_raw_add
        group by all
        order by Id
    ), zoho_all_raw_add as (
        -- one row per (account, address); accounts without any address keep one row
        select
            z.Id,
            z.Account_Name,
            t.raw_address,
        from zoho_dist_add z
        left join lateral unnest(z.address) as t(raw_address) on true
    ), zoho_all_parsed_add as (
        select
            z.*,
            parsed_address->'house_number' as house_number,
            parsed_address->'road' as road,
            udf_clean_german_road(parsed_address->>'road') as clean_road,
            parsed_address->'postcode' as postcode,
            parsed_address->'country' as country,
        from (select Id, Account_Name as zoho_name, udf_parse_address(raw_address)::json as parsed_address from zoho_all_raw_add) as z
    ), easybill_raw_add as (
        select distinct on("Kontakt: Kundennummer")
            "Kontakt: Kundennummer" as eb_id,
            "Kontakt: Firma" as eb_name,
            "Kontakt: Straße/Hausnummer" as eb_nb_street,
            "Kontakt: Postleitzahl" as eb_postcode,
            *
        from read_csv('/Users/adrienblanquer/Downloads/easybill_contacts_inv_rec_activ.csv')
    ), easybill_all_parsed_add as (
        select
            eb_id,
            eb_name,
            parsed_address->'house_number' as house_number,
            parsed_address->'road' as road,
            udf_clean_german_road(parsed_address->>'road') as clean_road,
            parsed_address->'postcode' as postcode,
            parsed_address->'country' as country,
        from (select *, udf_parse_address(eb_nb_street || ' ' || eb_postcode || ', Deutschland')::json as parsed_address from easybill_raw_add) as e
    )
    select *,
        jaro_winkler_similarity(clean_account_name(zoho_name), clean_account_name(eb_name)) as name_sim,
    from easybill_all_parsed_add
    left join zoho_all_parsed_add
        on (jaro_winkler_similarity(easybill_all_parsed_add.clean_road, zoho_all_parsed_add.clean_road) > 0.95
        and easybill_all_parsed_add.postcode = zoho_all_parsed_add.postcode
        and jaro_winkler_similarity(easybill_all_parsed_add.house_number, zoho_all_parsed_add.house_number) > 0.95)
    where Id IS NULL
    qualify row_number() over (partition by eb_id order by name_sim desc) = 1
    order by eb_id
    """)

┌───────────┬─────────────────────────────────────────────────────────────────────────────────────┬──────────────┬─────────────────────────────────┬─────────────────────────────┬──────────┬───────────────┬─────────┬───────────┬────────────────┬──────────────┬──────┬────────────┬──────────┬─────────┬──────────┐
│   eb_id   │                                       eb_name                                       │ house_number │              road               │         clean_road          │ postcode │    country    │   Id    │ zoho_name │ parsed_address │ house_number │ road │ clean_road │ postcode │ country │ name_sim │
│   int64   │                                       varchar                                       │     json     │              json               │           varchar           │   json   │     json      │ varchar │  varchar  │      json      │     json     │ json │  varchar   │   json   │  json   │  double  │
├───────────┼───────────────────────────────────────────────────

# merge both matchings

In [40]:
# Combine the two match exports into one file with a single Zoho account per
# easybill customer: the candidate with the highest `sim` wins.
#
# `row_num` tags each row with its origin ('addr_name_<n>' for the address+name
# export, 'name_<n>' for the name-only export).
#
# Fix: the final ranking had no tie-breaker, so when both exports proposed a
# candidate with the same `sim` the surviving row was arbitrary and could change
# between runs. `row_num` is now the secondary sort key. Each export contributes at
# most one row per customer, so a tie is always between the two origins and the
# address+name candidate wins ('addr_name_' sorts before 'name_').
duck.sql(
    """
    with name_addr_matches as (
        select
            'addr_name_'||row_number() over (order by eb_id) as row_num,
            eb_id as easybill_id,
            eb_name as easybill_name,
            Id as zoho_id,
            zoho_name,
            name_sim as sim
        from read_csv('output/easybill_zoho_address_and_name_matched.csv')
    ), name_only_matches as (
        select
            'name_'||row_number() over (order by easybill_customer_number) as row_num,
            easybill_customer_number as easybill_id,
            easybill_name,
            zoho_id,
            zoho_name,
            sim
        from read_csv('output/easybill_zoho_name_matched.csv')
    ), all_matches as (
        select * from name_addr_matches
        union all
        select * from name_only_matches
    )
    select *
    from all_matches
    qualify row_number() over (partition by easybill_id order by sim desc, row_num) = 1
    """
).to_csv('output/easybill_zoho_final_matches.csv')

## easybill rows still unmatched

In [59]:
# Export the easybill contacts (one row per customer number) that do not appear in
# the final matches file.
#
# Fix: NULLs are filtered out of the NOT IN subquery. `x NOT IN (..., NULL)` is
# never true in SQL, so a single NULL easybill_id in the matches file would have
# produced an empty "unmatched" export.
duck.sql(
    """
    select
        distinct on ("Kontakt: Kundennummer") *
    from read_csv('/Users/adrienblanquer/Downloads/easybill_contacts_inv_rec_activ.csv')
    where "Kontakt: Kundennummer" not in (
        select easybill_id
        from read_csv('output/easybill_zoho_final_matches.csv')
        where easybill_id is not null
        )
    """
).to_csv('output/easybill_zoho_final_unmatched.csv')

In [51]:
# Sanity check: list the distinct customer numbers present in the easybill export.
duck.sql("select distinct \"Kontakt: Kundennummer\" from read_csv('/Users/adrienblanquer/Downloads/easybill_contacts_inv_rec_activ.csv')")

┌───────────────────────┐
│ Kontakt: Kundennummer │
│         int64         │
├───────────────────────┤
│             130000425 │
│             130000185 │
│             105010055 │
│             130000858 │
│             130000877 │
│             130000882 │
│             130000886 │
│             130000888 │
│             130000920 │
│             130000925 │
│                 ·     │
│                 ·     │
│                 ·     │
│             130001824 │
│             130001844 │
│             130001845 │
│             130001892 │
│             130002045 │
│             130002102 │
│             130002105 │
│             130002141 │
│             130002176 │
│             130002320 │
├───────────────────────┤
│  797 rows (20 shown)  │
└───────────────────────┘

In [56]:
# Sanity check: list the easybill ids that ended up in the final matches file,
# for comparison with the distinct customer numbers in the export.
duck.sql("""
        select easybill_id from read_csv('output/easybill_zoho_final_matches.csv')

         """)

┌─────────────┐
│ easybill_id │
│    int64    │
├─────────────┤
│   101000015 │
│   102000005 │
│   102000006 │
│   108040029 │
│   108040050 │
│   110010000 │
│   111000023 │
│   111000050 │
│   111001011 │
│   113000000 │
│       ·     │
│       ·     │
│       ·     │
│   130001760 │
│   130001819 │
│   130001971 │
│   130001978 │
│   130002145 │
│   130002306 │
│   130002307 │
│   130002314 │
│   130002329 │
│   130002333 │
├─────────────┤
│  777 rows   │
│ (20 shown)  │
└─────────────┘