# Initialization

In [1]:
print("Hello, world!")

Hello, world!


In [2]:
import duckdb

In [3]:
duck = duckdb.connect()

In [4]:
duck.execute("load postgres")

<_duckdb.DuckDBPyConnection at 0x10d9f8cf0>

In [5]:
import os
from dotenv import load_dotenv

# Load environment variables from ../.env
load_dotenv("../.env")

duck.execute(
    f"""
    create or replace secret (
        type postgres,
        host '{os.environ.get('DB_HOST', 'localhost')}',
        port {os.environ.get('DB_PORT', 5432)},
        database '{os.environ.get('DB_NAME', 'postgres')}',
        user '{os.environ.get('DB_USER', 'postgres')}',
        password '{os.environ.get('DB_PASSWORD', 'postgres')}'
        );
    
    attach '' as pg (type postgres);
    """
)

<_duckdb.DuckDBPyConnection at 0x10d9f8cf0>

In [6]:
duck.sql(
    """
    select count(*) from pg.medisoft.table_firmenstruktur

    union all

    select count(*) from pg.zoho.Accounts 
    """
)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         2594 │
│        32389 │
└──────────────┘

In [7]:
duck.execute(
    """
    begin transaction;

    create or replace temp table medisoft_firms as
    select * replace (trim(name) as name, trim(kuerzel) as kuerzel) from pg.medisoft.table_firmenstruktur;

    create or replace temp table zoho_accounts as
    select Id, trim(Account_Name) as Account_Name from pg.zoho.Accounts;

    commit;
    """
)

<_duckdb.DuckDBPyConnection at 0x10d9f8cf0>

# Firm name match

In [8]:
duck.execute("""
CREATE OR REPLACE MACRO clean_account_name(str) AS (
  regexp_replace(
    regexp_replace(
      lower(strip_accents(replace(str, '&#38;', ''))), 
      '\b(llc|inc|ltd|corp|corporation|plc|gmbh)\b', 
      '', 'g'
    ), 
    '[^a-z0-9]', 
    '', 'g'
  )
);

""")

<_duckdb.DuckDBPyConnection at 0x10d9f8cf0>

In [9]:
duck.execute("""
create or replace temp table text_matched as
with medisoft_cleaned as (
  select 
    rec_id,
    clean_account_name(coalesce(name, kuerzel)) as clean_name
  from medisoft_firms
), zoho_cleaned as (
  select
    Id,
    clean_account_name(Account_Name) as clean_name
  from pg.zoho.Accounts
)
  select m.clean_name as mc, z.clean_name as zc, jaro_winkler_similarity(m.clean_name, z.clean_name) as sim, rec_id, Id
  from medisoft_cleaned as m
    inner join zoho_cleaned as z
    on (m.clean_name = z.clean_name or jaro_winkler_similarity(m.clean_name, z.clean_name) > 0.95)
  QUALIFY row_number() OVER (PARTITION BY m.rec_id ORDER BY sim DESC) = 1
order by sim

""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x10d9f8cf0>

In [10]:
duck.sql("select * from text_matched")

┌────────────────────────────────────────────┬──────────────────────────────────────────────┬────────────────────┬──────────────────────────────────────┬────────────────────┐
│                     mc                     │                      zc                      │        sim         │                rec_id                │         Id         │
│                  varchar                   │                   varchar                    │       double       │               varchar                │      varchar       │
├────────────────────────────────────────────┼──────────────────────────────────────────────┼────────────────────┼──────────────────────────────────────┼────────────────────┤
│ sbhschrottundbaustoffhandelsgesmbh         │ sbhschrottubaustoffhandelsgesmbh             │ 0.9507352941176471 │ 00_9GE00LWYBY                        │ 386758000032311049 │
│ teamenergiegmbhcokguelzen                  │ teamenergiegmbhcokg                          │              0.952 │ 00_8S9010B

In [None]:
duck.execute(
    """
    insert into pg.medisoft.table_firms_zoho (rec_id, id_zoho) (
select rec_id, Id 
from medisoft_firms
left join text_matched
using(rec_id)
)"""
)

In [None]:
print("Hello, world!")

# Summurization

In [None]:
duck.sql(
    """
    summarize select * from zoho_accounts
    """
).show(max_rows=1000)

In [None]:
duck.sql(
    """
    summarize select * from medisoft_firms
    """
).show(max_rows=1000)

# Sandbox

In [None]:
duck.execute("call start_ui();")

In [None]:
duck.sql("select * from pg.medisoft.table_firms_zoho")

In [11]:
from postal.parser import parse_address

In [12]:
med_df = duck.sql(
    """
    select rec_id, name, plz, strasse from medisoft_firms where rec_id not in (select rec_id from pg.medisoft.table_firms_zoho where id_zoho is not null)
    """
).df()

In [13]:
med_df.shape

(1449, 4)

In [14]:
med_clean_df = med_df.dropna(ignore_index=True)

In [15]:
med_clean_df['raw_address'] = med_clean_df['strasse'] + ", " + med_clean_df['plz'] + ", Deutschland"
med_clean_df['parsed_address'] = med_clean_df['raw_address'].dropna().apply(parse_address)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  med_clean_df['raw_address'] = med_clean_df['strasse'] + ", " + med_clean_df['plz'] + ", Deutschland"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  med_clean_df['parsed_address'] = med_clean_df['raw_address'].dropna().apply(parse_address)


In [16]:
parsed_addresses = med_clean_df['parsed_address']

In [17]:
parsed_addresses[0]

[('rudolf-breidtscheidstrasse', 'road'),
 ('185', 'house_number'),
 ('14482', 'postcode'),
 ('deutschland', 'country')]

In [18]:
import pandas as pd
import re
from unidecode import unidecode

In [19]:
med_clean_df['parsed_address'].apply(lambda x: {k: v for v, k in x}).apply(pd.Series)[['road', 'house_number', 'postcode']]

Unnamed: 0,road,house_number,postcode
0,rudolf-breidtscheidstrasse,185,14482
1,karl-marx-str.,255,12057
2,frankfurter allee,111,10247
3,schudomarstr.,16,12055
4,friedenfelser str.,11,12279
...,...,...,...
1297,schiesstraße,8-10,40549
1298,bützower str.,103,18236
1299,,,40229
1300,funkenburgstraße,1,04105


In [20]:
med_clean_df = pd.concat([
    med_clean_df, 
    med_clean_df['parsed_address'].apply(lambda x: {k: v for v, k in x}).apply(pd.Series)[['road', 'house_number', 'postcode']]
], axis=1)


In [21]:
med_clean_df['road']

0       rudolf-breidtscheidstrasse
1                   karl-marx-str.
2                frankfurter allee
3                    schudomarstr.
4               friedenfelser str.
                   ...            
1297                  schiesstraße
1298                 bützower str.
1299                           NaN
1300              funkenburgstraße
1301           brücklesäckerstraße
Name: road, Length: 1302, dtype: object

In [22]:
def clean_german_road(text):
    if pd.isna(text) or text == '':
        return None
    text = text.lower().strip()
    text = text.replace('ß', 'ss')
    text = re.sub(r'(str\.|str$|str\s|straße|strasse)', '', text)
    text = unidecode(text)
    text = re.sub(r'[^a-z0-9]', '', text)
    
    return text


In [23]:
med_clean_df['road_cleaned'] = med_clean_df['road'].apply(clean_german_road)

In [24]:
med_clean_df['road_cleaned'] 

0       rudolfbreidtscheid
1                 karlmarx
2         frankfurterallee
3                schudomar
4            friedenfelser
               ...        
1297                schies
1298              butzower
1299                  None
1300            funkenburg
1301         brucklesacker
Name: road_cleaned, Length: 1302, dtype: object

In [25]:
zoho_df = duck.sql(
    """
    select Id, Account_Name, Billing_Code, Billing_Street from pg.zoho.Accounts
    """
).df()

In [26]:
zoho_df.shape

(32389, 4)

In [27]:
zoho_df.dropna(ignore_index=True, inplace=True)
zoho_df['raw_address'] = zoho_df['Billing_Street'] + ", " + zoho_df['Billing_Code'] + ", Deutschland"
zoho_df['parsed_address'] = zoho_df['raw_address'].apply(parse_address)
parsed_addresses = zoho_df['parsed_address']

In [28]:
zoho_df = pd.concat([
    zoho_df, 
    zoho_df['parsed_address'].apply(lambda x: {k: v for v, k in x}).apply(pd.Series)[['road', 'house_number', 'postcode']]
], axis=1)

In [29]:
zoho_df['road_cleaned'] = zoho_df['road'].apply(clean_german_road)

In [30]:
med_clean_df

Unnamed: 0,rec_id,name,plz,strasse,raw_address,parsed_address,road,house_number,postcode,road_cleaned
0,838259F9-22F9-4D10-A6B4-1E2A7BD53670,Hasso Plattner Gesundheitscloud,14482,Rudolf-Breidtscheidstrasse 185,"Rudolf-Breidtscheidstrasse 185, 14482, Deutsch...","[(rudolf-breidtscheidstrasse, road), (185, hou...",rudolf-breidtscheidstrasse,185,14482,rudolfbreidtscheid
1,EA85044E-DDF0-4786-A8AC-565972D9FA4E,MoVe Bildungsakademie GmbH &#38; CoKG,12057,Karl-Marx-Str. 255,"Karl-Marx-Str. 255, 12057, Deutschland","[(karl-marx-str., road), (255, house_number), ...",karl-marx-str.,255,12057,karlmarx
2,3B4E9938-5AB6-4DA2-951D-59E30962CCB1,Lasermed Augenklinik im Ringcenter,10247,Frankfurter Allee 111,"Frankfurter Allee 111, 10247, Deutschland","[(frankfurter allee, road), (111, house_number...",frankfurter allee,111,10247,frankfurterallee
3,30EA28A1-75A6-4D57-AB3A-5F5C2A37914D,Seniorenheim St Richardt z.H Hr Stöppler,12055,Schudomarstr. 16,"Schudomarstr. 16, 12055, Deutschland","[(schudomarstr., road), (16, house_number), (1...",schudomarstr.,16,12055,schudomar
4,84A565FF-FAE8-4A07-8D53-63CFCF640135,Kinder in Marienfelde,12279,Friedenfelser Str. 11,"Friedenfelser Str. 11, 12279, Deutschland","[(friedenfelser str., road), (11, house_number...",friedenfelser str.,11,12279,friedenfelser
...,...,...,...,...,...,...,...,...,...,...
1297,00_A1O00PMTTP,Arztpraxis Dr. med. Lysson,40549,Schiesstraße 8-10,"Schiesstraße 8-10, 40549, Deutschland","[(schiesstraße, road), (8-10, house_number), (...",schiesstraße,8-10,40549,schies
1298,00_A1Q00Q3MF5,EGN Baustoffmarkt,18236,Bützower Str. 103,"Bützower Str. 103 , 18236, Deutschland","[(bützower str., road), (103, house_number), (...",bützower str.,103,18236,butzower
1299,00_A1Q00Q4XCG,Radiologie von Werder,40229,Gumbertstr.90-92,"Gumbertstr.90-92, 40229, Deutschland","[(gumbertstr.90-92, house), (40229, postcode),...",,,40229,
1300,00_A1R00J04JU,Zahnarztpraxis Thomas Wendt,04105,Funkenburgstraße 1,"Funkenburgstraße 1, 04105 , Deutschland","[(funkenburgstraße, road), (1, house_number), ...",funkenburgstraße,1,04105,funkenburg


In [31]:
med_clean_df.house_number

0        185
1        255
2        111
3         16
4         11
        ... 
1297    8-10
1298     103
1299     NaN
1300       1
1301      14
Name: house_number, Length: 1302, dtype: object

In [32]:
def split_and_clean_house_number(val):
    if pd.isna(val) or val == '':
        return pd.Series([None, None])
    
    # 1. Split the string if it contains '-' or '/'
    # This creates a list, e.g., "62-64" -> ["62", "64"]
    parts = re.split(r'[-/]', str(val))
    
    # 2. Define a helper to remove all non-numeric characters
    def keep_only_digits(s):
        cleaned = re.sub(r'\D', '', s) # \D matches any non-digit
        return cleaned if cleaned != '' else None

    # 3. Process the parts
    num_1 = keep_only_digits(parts[0])
    num_2 = None
    
    # If there was a separator, process the second part
    if len(parts) > 1:
        num_2 = keep_only_digits(parts[1])
        
    return pd.Series([num_1, num_2])

# Apply the function to create two new columns
med_clean_df[['house_num_1', 'house_num_2']] = med_clean_df['house_number'].apply(split_and_clean_house_number)


In [33]:
zoho_df[['house_num_1', 'house_num_2']] = zoho_df['house_number'].apply(split_and_clean_house_number)

In [33]:
duck.sql("select * from med_clean_df where house_num_2 is not null")

┌───────────────┬──────────────────────────────────────────────────┬─────────┬─────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────┬───────────────────┬──────────┬─────────────────────────────────┬─────────────┬─────────────┐
│    rec_id     │                       name                       │   plz   │                   strasse                   │                            raw_address                            │                                                  parsed_address                                                   │                road                 │   house_number    │ postcode │          road_cleaned           │ house_num_1 │ house_num_2 │
│    varchar    │                     varchar                      │ varchar │                   varchar                

In [48]:
duck.sql("select * from pg.medisoft.table_firmenstruktur where rec_id = '406E85E9-3D74-41D6-8346-651CA08A3150'")

┌──────────────────────────────────────┬──────────────────────────────────────┬──────────────────────────────────────────┬───────────────────────────────────────────────────┬─────────┬────────────────┬───────────────┬───────────────┬─────────────────┬───────────────┬───────────────┬───────────────┬────────────────┬──────────────────────────────────────────┬────────────┬─────────────────┬─────────┬─────────┬───────────┬────────────┬─────────────────┬──────────────────────────┬─────────┬─────────┬─────────┬───────────────┬─────────┬───────────────┬─────────┬──────────────────┬─────────┬────────────────────────┬─────────┬──────────┬─────────┬───────────────────┬────────────┬─────────────────────────┬────────────┬─────────────┬─────────────┬────────────────────┐
│                rec_id                │               kuerzel                │                   name                   │                       pfad                        │ passiv  │ abrechnung_art │ laptop_update │ laptop_delete

In [47]:
med_clean_df[med_clean_df['rec_id'] == '406E85E9-3D74-41D6-8346-651CA08A3150']

Unnamed: 0,rec_id,name,plz,strasse,raw_address,parsed_address,road,house_number,postcode,road_cleaned,house_num_1,house_num_2
5,406E85E9-3D74-41D6-8346-651CA08A3150,Herr Hendrik Krawinkel HitFox Group GmbH,10178,Rosa-Luxemburg-Strasse 2,"Rosa-Luxemburg-Strasse 2, 10178, Deutschland","[(rosa-luxemburg-strasse, road), (2, house_num...",rosa-luxemburg-strasse,2,10178,rosaluxemburg,2,


In [36]:
duck.sql("select clean_account_name(name) from med_clean_df where rec_id = '406E85E9-3D74-41D6-8346-651CA08A3150'")

┌─────────────────────────────────────┐
│     clean_account_name("name")      │
│               varchar               │
├─────────────────────────────────────┤
│ herrhendrikkrawinkelhitfoxgroupgmbh │
└─────────────────────────────────────┘

In [37]:
zoho_df[zoho_df['Id'] == '386758000009959846']

Unnamed: 0,Id,Account_Name,Billing_Code,Billing_Street,raw_address,parsed_address,road,house_number,postcode,road_cleaned,house_num_1,house_num_2
31,386758000009959846,Hitfox,10178,Rosa-Luxemburg-Straße 2,"Rosa-Luxemburg-Straße 2, 10178, Deutschland","[(rosa-luxemburg-straße, road), (2, house_numb...",rosa-luxemburg-straße,2,10178,rosaluxemburg,2,


In [38]:
duck.sql("select clean_account_name(Account_Name) from zoho_df where Id = '386758000009959846'")

┌──────────────────────────────────┐
│ clean_account_name(Account_Name) │
│             varchar              │
├──────────────────────────────────┤
│ hitfox                           │
└──────────────────────────────────┘

In [51]:
zoho_df.road_cleaned

0          billingaddress
1             schonholzer
2                    None
3          karlliebknecht
4        landsbergerallee
               ...       
25946              4stock
25947              marien
25948          gotzkowsky
25949                 ost
25950            holzdamm
Name: road_cleaned, Length: 25951, dtype: object

In [54]:
address_match_df = duck.sql("""
select 
    name,
    Account_Name,
    jaro_winkler_similarity(clean_account_name(name), clean_account_name(Account_Name)) as similarity, 
    med_clean_df.* exclude (name),
    zoho_df.* exclude (Account_Name),
from med_clean_df
join zoho_df
on med_clean_df.road_cleaned = zoho_df.road_cleaned
and med_clean_df.plz = zoho_df.postcode
and (med_clean_df.house_number = zoho_df.house_number
    or med_clean_df.house_num_1 = zoho_df.house_num_1
    or med_clean_df.house_num_2 = zoho_df.house_num_2)
where (similarity > 0.7 or (clean_account_name(name) in clean_account_name(Account_Name) or clean_account_name(Account_Name) in clean_account_name(name)))
QUALIFY row_number() OVER (
    PARTITION BY med_clean_df.rec_id  -- On groupe par ligne Medisoft
    ORDER BY similarity DESC          -- On trie par la meilleure similarité
) = 1
order by similarity desc
""").df()

In [55]:
address_match_df

Unnamed: 0,name,Account_Name,similarity,rec_id,plz,strasse,raw_address,parsed_address,road,house_number,...,Billing_Code,Billing_Street,raw_address_1,parsed_address_1,road_1,house_number_1,postcode_1,road_cleaned_1,house_num_1_1,house_num_2_1
0,ESMT,ESMT European School of Management and Technol...,0.695035,00_98500R7OO2,10178,Schloßplatz 1,"Schloßplatz 1, 10178, Deutschland","[[schloßplatz, road], [1, house_number], [1017...",schloßplatz,1,...,10178,Schloßplatz 1,"Schloßplatz 1, 10178, Deutschland","[[schloßplatz, road], [1, house_number], [1017...",schloßplatz,1,10178,schlossplatz,1,
1,Notübernachtung für Wohnungslose,EJF gemeinnützige GmbH Notübernachtung für Woh...,0.694636,00_8R400J6MEK,13503,Am Bärensprung 52,"Am Bärensprung 52, 13503, Deutschland","[[am bärensprung, road], [52, house_number], [...",am bärensprung,52,...,13503,Am Bärensprung 52,"Am Bärensprung 52, 13503, Deutschland","[[am bärensprung, road], [52, house_number], [...",am bärensprung,52,13503,ambarensprung,52,
2,EJF,EJF gemeinnützige AG - Kindergarten Waldhaus,0.692982,00_8QP00MB160,14469,Amundsenstraße 24a,"Amundsenstraße 24a, 14469, Deutschland","[[amundsenstraße, road], [24a, house_number], ...",amundsenstraße,24a,...,14469,Amundsenstr. 24 a,"Amundsenstr. 24 a, 14469, Deutschland","[[amundsenstr., road], [24 a, house_number], [...",amundsenstr.,24 a,14469,amundsen,24,
3,EJF,EJF gemeinnützige AG - Integrations-Kindertage...,0.684211,00_8QP00L4N7V,14471,Knobelsdorffstraße 6-8,"Knobelsdorffstraße 6-8, 14471, Deutschland","[[knobelsdorffstraße, road], [6-8, house_numbe...",knobelsdorffstraße,6-8,...,14471,Knobelsdorffstr 6-8,"Knobelsdorffstr 6-8, 14471, Deutschland","[[knobelsdorffstr, road], [6-8, house_number],...",knobelsdorffstr,6-8,14471,knobelsdorff,6,8.0
4,Tagespflege Tegeler See,EJF Diakonie-Pflege gGmbH - Tagespflege Tegele...,0.66599,60_8MZ00SBF74,13507,Eisenhammerweg 10,"Eisenhammerweg 10, 13507, Deutschland","[[eisenhammerweg, road], [10, house_number], [...",eisenhammerweg,10,...,13507,Eisenhammerweg 10,"Eisenhammerweg 10, 13507, Deutschland","[[eisenhammerweg, road], [10, house_number], [...",eisenhammerweg,10,13507,eisenhammerweg,10,
5,Janusz Korczak Haus,EJF gemeinnützige AG - Dr. Janusz Korczak-Haus...,0.649979,00_8KE00RENQD,10319,Erich-Kurz-Str. 4a,"Erich-Kurz-Str. 4a, 10319, Deutschland","[[erich-kurz-str., road], [4a, house_number], ...",erich-kurz-str.,4a,...,10319,Erich-Kurz-Straße 4a,"Erich-Kurz-Straße 4a, 10319, Deutschland","[[erich-kurz-straße, road], [4a, house_number]...",erich-kurz-straße,4a,10319,erichkurz,4,
6,Dr. Dieter Grüttner,"Gemeinschaftspraxis Dr. Dieter Grüttner, Dr. B...",0.622475,00_8TQ00JOXQF,42799,Am Wallgraben 1,"Am Wallgraben 1, 42799, Deutschland","[[am wallgraben, road], [1, house_number], [42...",am wallgraben,1,...,42799,Am Wallgraben 1,"Am Wallgraben 1, 42799, Deutschland","[[am wallgraben, road], [1, house_number], [42...",am wallgraben,1,42799,amwallgraben,1,
7,MKM-Ihr Friseur,Maria Kristin Müller MKM- Ihr Friseur,0.612452,00_9X800SR1JU,51143,Schmittgasse 62,"Schmittgasse 62, 51143, Deutschland","[[schmittgasse, road], [62, house_number], [51...",schmittgasse,62,...,51143,Schmittgasse 62,"Schmittgasse 62, 51143, Deutschland","[[schmittgasse, road], [62, house_number], [51...",schmittgasse,62,51143,schmittgasse,62,
8,Einsteinkids,"EJF gemeinnützige AG - Kinderclub ""Einsteinkids""",0.594444,00_8UC00LLIXE,14471,Knobelsdorffstr. 7,"Knobelsdorffstr. 7, 14471, Deutschland","[[knobelsdorffstr., road], [7, house_number], ...",knobelsdorffstr.,7,...,14471,Knobelsdorffstraße 7,"Knobelsdorffstraße 7, 14471, Deutschland","[[knobelsdorffstraße, road], [7, house_number]...",knobelsdorffstraße,7,14471,knobelsdorff,7,
9,Junges Ensemble Stuttgart,Verein Kinder- und Jugendkultur / Junges Ensem...,0.584573,00_9JI00U03OU,70173,Eberhardstr. 61a,"Eberhardstr. 61a, 70173, Deutschland","[[eberhardstr., road], [61a, house_number], [7...",eberhardstr.,61a,...,70173,Eberhardstraße 61a,"Eberhardstraße 61a, 70173, Deutschland","[[eberhardstraße, road], [61a, house_number], ...",eberhardstraße,61a,70173,eberhard,61,


In [56]:
address_match_df[['Account_Name', 'name', 'similarity']]

Unnamed: 0,Account_Name,name,similarity
0,ESMT European School of Management and Technol...,ESMT,0.695035
1,EJF gemeinnützige GmbH Notübernachtung für Woh...,Notübernachtung für Wohnungslose,0.694636
2,EJF gemeinnützige AG - Kindergarten Waldhaus,EJF,0.692982
3,EJF gemeinnützige AG - Integrations-Kindertage...,EJF,0.684211
4,EJF Diakonie-Pflege gGmbH - Tagespflege Tegele...,Tagespflege Tegeler See,0.66599
5,EJF gemeinnützige AG - Dr. Janusz Korczak-Haus...,Janusz Korczak Haus,0.649979
6,"Gemeinschaftspraxis Dr. Dieter Grüttner, Dr. B...",Dr. Dieter Grüttner,0.622475
7,Maria Kristin Müller MKM- Ihr Friseur,MKM-Ihr Friseur,0.612452
8,"EJF gemeinnützige AG - Kinderclub ""Einsteinkids""",Einsteinkids,0.594444
9,Verein Kinder- und Jugendkultur / Junges Ensem...,Junges Ensemble Stuttgart,0.584573


In [60]:
duck.sql(
    """
    update pg.medisoft.table_firms_zoho
    set id_zoho = address_match_df.Id
    from address_match_df
    where pg.medisoft.table_firms_zoho.rec_id = address_match_df.rec_id
    """)

In [None]:
duck.sql("select * from address_match_df")

In [59]:
duck.sql("""
select * 
from pg.medisoft.table_firms_zoho   
where id_zoho is null
and rec_id in (select rec_id from address_match_df)
""")

┌───────┬──────────────────────────────────────┬─────────┐
│  id   │                rec_id                │ id_zoho │
│ int32 │               varchar                │ varchar │
├───────┼──────────────────────────────────────┼─────────┤
│ 11136 │ 406E85E9-3D74-41D6-8346-651CA08A3150 │ NULL    │
│ 11170 │ 00_8R400J6MEK                        │ NULL    │
│ 11178 │ 00_8QV00LF84G                        │ NULL    │
│ 11185 │ 00_8QP00MB160                        │ NULL    │
│ 11186 │ 00_8QP00L4N7V                        │ NULL    │
│ 11259 │ 00_8N500J8UYC                        │ NULL    │
│ 11262 │ 60_8MZ00SBF74                        │ NULL    │
│ 11265 │ 00_8MY00JSO9Q                        │ NULL    │
│ 11266 │ 00_8MT00LFIAA                        │ NULL    │
│ 11273 │ 00_8LV00R7IF0                        │ NULL    │
│ 11306 │ 00_8KE00RENQD                        │ NULL    │
│ 11379 │ 00_8TQ00JOXQF                        │ NULL    │
│ 11386 │ 00_8UC00LLIXE                        │ NULL   

In [59]:
duck.sql("select * from medisoft_firms where rec_id not in (select rec_id from pg.medisoft.table_firms_zoho where id_zoho is not null)")

┌──────────────────────────────────────┬────────────────────────────────────────────────────┬────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────────┬─────────┬────────────────┬───────────────┬───────────────┬─────────────────┬───────────────┬───────────────┬───────────────┬────────────────┬──────────────────────────────────────────┬────────────┬─────────────────┬──────────┬─────────┬───────────┬────────────┬─────────────────┬────────────────────────────────┬─────────┬───────────────────────┬────────────┬───────────────┬─────────┬───────────────┬─────────────────┬──────────────────┬──────────────┬────────────────────────────┬─────────┬──────────┬─────────┬───────────────────┬────────────┬─────────────────────────┬────────────┬─────────────┬─────────────┬────────────────────┐
│                rec_id                │                      kuerzel                       │                        name                        │   

In [None]:
duck.sql("select * from medisoft_firms where rec_id in (select rec_id from pg.medisoft.table_firms_zoho where id_zoho is null)").show(max_rows=1000)

In [None]:
from deepparse.parser import AddressParser

In [None]:
duck.sql("""
create or replace temp table medisoft_zoho_name_exact_match as 
    with name_matches as (
        select m.rec_id, z.Id, m.name, m.kuerzel, z.Account_Name, z.Account_Status, * 
        from medisoft_firms m
        join zoho_accounts z
            on m.name = z.Account_Name
    ),
    kuerzel_matches as (
        select m.rec_id, z.Id, m.name, m.kuerzel, z.Account_Name, z.Account_Status, * 
        from medisoft_firms m
        join zoho_accounts z
            on m.kuerzel = z.Account_Name
        where m.rec_id not in (select rec_id from name_matches)
        and z.Id not in (select Id from name_matches)
    )
    select * from name_matches
    union all
    select * from kuerzel_matches
    """
)

In [None]:
duck.sql("select * from medisoft_zoho_name_exact_match")

In [None]:
duck.sql("select Website from zoho_accounts where Website is not null and Website like '%tms-viersen.de%'")

In [None]:
duck.sql("select email from medisoft_firms where email is not null")

In [None]:
duck.sql(
    """
    create or replace temp table medisoft_firms_clean as (
    select rec_id, TRIM(REGEXP_REPLACE(
            REGEXP_REPLACE(
                REGEXP_REPLACE(replace(LOWER(name), '&#38;', ' and '), '[^\w\s]', '', 'g'), 
                '\b(inc|llc|ltd|corp|corporation|limited|gmbh|co|company|sa|sas)\b', '', 'g'
            ), 
            '\s+', ' ', 'g'
        )) AS clean_name
        , TRIM(REGEXP_REPLACE(
            REGEXP_REPLACE(
                REGEXP_REPLACE(replace(LOWER(kuerzel), '&#38;', ' and '), '[^\w\s]', '', 'g'), 
                '\b(inc|llc|ltd|corp|corporation|limited|gmbh|co|company|sa|sas)\b', '', 'g'
            ), 
            '\s+', ' ', 'g'
        )) AS clean_kuerzel,
        
    from medisoft_firms
    )
    """
)

In [None]:
duck.sql(
    """
    create or replace temp table zoho_accounts_clean as (
    select Id, TRIM(REGEXP_REPLACE(
            REGEXP_REPLACE(
                REGEXP_REPLACE(replace(LOWER(Account_Name), '&#38;', ' and '), '[^\w\s]', '', 'g'), 
                '\b(inc|llc|ltd|corp|corporation|limited|gmbh|co|company|sa|sas)\b', '', 'g'
            ), 
            '\s+', ' ', 'g'
        )) AS clean_name from zoho_accounts
    )   
    """
)

In [None]:
duck.sql(
    """
    select * 
    from medisoft_firms_clean m
    join zoho_accounts_clean z
        on m.clean_name = z.clean_name
    """
)