In [2]:
%load_ext dotenv
%dotenv
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [16]:
from pathlib import Path
import ast
from functools import partial

from cmf import clean, process
from cmf import locations as loc
from cmf.clean import steps
from cmf.clean import utils as cu

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

import pandas as pd
import duckdb

engine = create_engine("postgresql://", echo=False)
engine.dispose()

# Cleaning tests

Just playing with unit tests.

In [12]:
def load_test_data(path):
    dirty = pd.read_csv(Path(path, "dirty.csv"), converters={"list": ast.literal_eval})
    clean = pd.read_csv(Path(path, "clean.csv"), converters={"list": ast.literal_eval})
    dirty.columns = ["col"]
    clean.columns = ["col"]

    return dirty, clean

In [19]:
expand_abbreviations_partial = partial(
    steps.expand_abbreviations, 
    replacements={"co": "company", "ltd": "limited"}
)

In [None]:
steps.expand_abbreviations(

In [49]:
dirty, cleaned = load_test_data(
    Path(loc.PROJECT_DIR, "test", "cleaning", "unnest_renest", "expand_abbreviations")
)

In [14]:
dirty

Unnamed: 0,col
0,"[foo, foo co]"
1,"[bar ltd, ltd bar]"
2,"[bar ltd, ltd bar]"
3,[baz]
4,[co qux]


In [50]:
cleaned

Unnamed: 0,col
0,"[foo, foo company]"
1,"[bar limited, limited bar]"
2,"[bar limited, limited bar]"
3,[baz]
4,[company qux]


In [51]:
test_cleaning_function_arrayed = cu.unnest_renest(
    cu.cleaning_function(
        expand_abbreviations_partial
    )
)

clean_out = test_cleaning_function_arrayed(dirty, column="col")

In [53]:
clean_out.equals(cleaned)

False

In [61]:
clean_out.sort_values(by="col").reset_index(drop=True).equals(
    cleaned.sort_values(by="col").reset_index(drop=True)
)

True

In [66]:
clean_out.col.sort_values().eq(cleaned.col.sort_values())

0    False
1     True
2    False
3    False
4    False
Name: col, dtype: bool

In [67]:
clean_out.col.sort_values()
cleaned.col.sort_values()

1    [bar limited, limited bar]
4    [bar limited, limited bar]
2                         [baz]
0                 [company qux]
3            [foo, foo company]
Name: col, dtype: object

1    [bar limited, limited bar]
2    [bar limited, limited bar]
3                         [baz]
4                 [company qux]
0            [foo, foo company]
Name: col, dtype: object

In [59]:
clean_out.sort_values(by="col").reset_index(drop=True)

Unnamed: 0,col
0,"[bar limited, limited bar]"
1,"[bar limited, limited bar]"
2,[baz]
3,[company qux]
4,"[foo, foo company]"


In [60]:
cleaned.sort_values(by="col").reset_index(drop=True)

Unnamed: 0,col
0,"[bar limited, limited bar]"
1,"[bar limited, limited bar]"
2,[baz]
3,[company qux]
4,"[foo, foo company]"


In [27]:
steps.expand_abbreviations(dirty)

"\n                regexp_replace(\n                    \n                regexp_replace(\n                    lower(                  col\n0       [foo, foo co]\n1  [bar ltd, ltd bar]\n2  [bar ltd, ltd bar]\n3               [baz]\n4            [co qux]),\n                    '\\b(co)\\b',\n                    'company',\n                    'g'\n                )\n            ,\n                    '\\b(ltd)\\b',\n                    'limited',\n                    'g'\n                )\n            "

In [29]:
test_clean = cu.cleaning_function(expand_abbreviations_partial)

test_clean(dirty, column="col")

Unnamed: 0,col
0,"[foo, foo company]"
1,"[bar limited, limited bar]"
2,"[bar limited, limited bar]"
3,[baz]
4,[company qux]


In [33]:
df = duckdb.sql("""
    select
        unnest(col) as col
    from
        dirty
""").df()

In [35]:
df

Unnamed: 0,col
0,foo
1,foo co
2,bar ltd
3,ltd bar
4,bar ltd
5,ltd bar
6,baz
7,co qux


In [34]:
test_clean(df, column="col")

Unnamed: 0,col
0,foo
1,foo company
2,bar limited
3,limited bar
4,bar limited
5,limited bar
6,baz
7,company qux


In [39]:
cleaned

Unnamed: 0,col
0,"[foo, foo company]"
1,"[bar limited, limited bar]"
2,"[bar limited, limited bar]"
3,[baz]
4,[company qux]


In [45]:
test_func_2 = cu.unnest_renest(
    cu.cleaning_function(steps.to_upper)
)

In [46]:
test_func_2(dirty, column="col")

Unnamed: 0,col
0,[CO QUX]
1,"[FOO, FOO CO]"
2,"[BAR LTD, LTD BAR]"
3,[BAZ]
4,"[BAR LTD, LTD BAR]"
