In [1]:
pip install monadsquishy -U

Note: you may need to restart the kernel to use updated packages.


# Load input

In [2]:
import pandas as pd

# Location of the sample input, relative to the notebook's working directory.
INPUT_PATH = './simple.parquet'

# Read the raw table, then show its schema followed by the first rows.
df = pd.read_parquet(INPUT_PATH)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   country     5 non-null      object
 1   name        5 non-null      object
 2   order_date  5 non-null      object
 3   quantity    5 non-null      object
 4   price       5 non-null      object
dtypes: object(5)
memory usage: 328.0+ bytes


Unnamed: 0,country,name,order_date,quantity,price
0,TH,มานี,9/25/2024,5 ชิ้น,$25.99
1,United States,John Smith,20240926,7 units,30.00 USD
2,ไทย,ปิติ,27-09-2024,pcs: 4,"1,025.99 THB"
3,อังกฤษ,"Smith, J.","Sep 28, 2024",10,"1,000,025.99 บาท"
4,invalid name,J.S.,29 ก.ย. 2567,5pc,๒๗.๙๙ บาท


# Create a Squishy transformation

In [3]:
from monadsquishy import Squishy, sf

# One entry per output column: the source column it reads ('input') and the
# ordered list of functions applied to it ('funcs').
# The insertion order is kept on purpose: the run output lists the columns in
# this order (1/6 ... 6/6).
column_specs = {
    'country_output': {
        'input': 'country',
        'funcs': [sf.country1, sf.country2, sf.country3],
    },
    'name': {
        'input': 'name',
        # Identity function: the value is passed through unchanged.
        'funcs': [lambda x: x],
    },
    'order_date': {
        'input': 'order_date',
        'funcs': [sf.date1, sf.date2],
    },
    'quantity': {
        'input': 'quantity',
        'funcs': [sf.quantity1],
    },
    # 'price' feeds two outputs: one for the number, one for the currency.
    'price_number': {
        'input': 'price',
        'funcs': [sf.price1],
    },
    'price_currency': {
        'input': 'price',
        'funcs': [sf.currency1],
    },
}

sq_config = {
    'transformations': [
        {
            'input_table': df,
            # NOTE(review): both paths point at the same directory - confirm
            # this is intended and that the two outputs cannot clash.
            'transformed_path': './staging/test1',
            'exploded_path': './staging/test1',
            'out_columns': column_specs,
        },
    ],
}

# Build the transformation from the config and execute it.
sq = Squishy(sq_config)
sq.run()

1/6 Output: country_output
Input: country             
Process: ['country1', 'country2', 'country3']


100%|██████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 14393.63it/s]


2/6 Output: name
Input: name                
Process: ['<lambda>']


100%|██████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 13374.69it/s]


3/6 Output: order_date
Input: order_date          
Process: ['date1', 'date2']


100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 606.38it/s]


4/6 Output: quantity
Input: quantity            
Process: ['quantity1']


100%|██████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 15465.72it/s]


5/6 Output: price_number
Input: price               
Process: ['price1']


100%|██████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 16657.28it/s]


6/6 Output: price_currency
Input: price               
Process: ['currency1']


100%|██████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 10661.68it/s]

>> Finished transformations!





In [4]:
# Show the clean report. In the recorded output it has one row per
# (input_column, output_column, "Passed: ..." message) with a clean_count.
sq.clean_report()

Unnamed: 0,input_column,output_column,message,clean_count
0,country,country_output,Passed: country1(),1
1,country,country_output,Passed: country2(),1
2,country,country_output,Passed: country3(),2
3,name,name,Passed: <lambda>(),5
4,order_date,order_date,Passed: date1(),4
5,order_date,order_date,Passed: date2(),1
8,quantity,quantity,Passed: quantity1(),5
7,price,price_number,Passed: price1(),5
6,price,price_currency,Passed: currency1(),5


In [5]:
# Show the dirty report. In the recorded output it has one row per
# (input_column, output_column, input_value) with a dirty_count.
# NOTE(review): presumably these are values no configured function accepted -
# confirm against the library documentation.
sq.dirty_report()

Unnamed: 0,input_column,output_column,input_value,dirty_count
0,country,country_output,invalid name,1


In [6]:
# Fetch the transformation log as a DataFrame; the later report cells are
# built from it. In the recorded run it has 38 rows and 7 columns, and it
# appears to hold one row per function attempt on a value (TODO confirm).
df_log = sq.log()
# Print the schema first, then display the log itself.
df_log.info()
df_log

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   input_row      38 non-null     int64 
 1   input_column   38 non-null     object
 2   output_column  38 non-null     object
 3   input_value    38 non-null     object
 4   output_value   38 non-null     object
 5   is_passed      38 non-null     bool  
 6   message        38 non-null     object
dtypes: bool(1), int64(1), object(5)
memory usage: 1.9+ KB


Unnamed: 0,input_row,input_column,output_column,input_value,output_value,is_passed,message
0,0,country,country_output,TH,TH,True,Passed: country1()
1,1,country,country_output,United States,USA,False,Failed: country1(): Invalid code
2,1,country,country_output,United States,USA,True,Passed: country2()
3,2,country,country_output,ไทย,TH,False,Failed: country1(): Invalid code
4,2,country,country_output,ไทย,TH,False,Failed: country2(): 'ไทย'
5,2,country,country_output,ไทย,TH,True,Passed: country3()
6,3,country,country_output,อังกฤษ,UK,False,Failed: country1(): Invalid code
7,3,country,country_output,อังกฤษ,UK,False,Failed: country2(): 'อังกฤษ'
8,3,country,country_output,อังกฤษ,UK,True,Passed: country3()
9,4,country,country_output,invalid name,,False,Failed: country1(): Invalid code


# For development

In [7]:
## dirty report
# Rebuild the dirty report from the raw log.
# For every (input row, output column, input value) keep only the LAST logged
# attempt; a value counts as dirty when that final attempt still failed.
df_last = df_log.drop_duplicates(['input_row', 'output_column', 'input_value'], keep='last')

# `is_passed` is a bool column, so it can be negated directly.
df_not_passed = df_last[~df_last['is_passed']]

# Count the failing rows per (input column, output column, input value).
# groupby(...).size() replaces pivot_table(..., dropna=False): with a
# multi-level index, pivot_table's dropna=False reindexes the result to the
# cartesian product of the index levels, which adds combinations that never
# occurred (with a NaN count) as soon as more than one dirty value exists.
# dropna=False on groupby only keeps groups whose key is missing.
df_pivot_report = (
    df_not_passed
    .groupby(['input_column', 'output_column', 'input_value'], dropna=False)
    .size()
    .reset_index(name='dirty_count')
    # Rename by label rather than by position so a column-order change cannot
    # silently mislabel the report.
    .rename(columns={'output_column': 'out_column'})
    .sort_values(['out_column', 'dirty_count'], ascending=False)
)
df_pivot_report

Unnamed: 0,input_column,out_column,input_value,dirty_count
0,country,country_output,invalid name,1


In [8]:
## clean_report
# Rebuild the clean report from the raw log: keep every attempt that passed.
# The frame is named df_passed so it no longer overwrites the failed-rows
# frame (`df_not_passed`) with passed rows.
df_passed = df_log[df_log['is_passed']]

# Count the passing rows per (input column, output column, message).
df_pivot_report = pd.pivot_table(
    df_passed,
    values='is_passed',  # any non-null column works; it is only counted
    index=['input_column', 'output_column', 'message'],
    aggfunc='count',
)

# Flatten the index, rename by label (not by position), and sort so the
# largest counts come first within each output column.
df_pivot_report = (
    df_pivot_report
    .reset_index()
    .rename(columns={'output_column': 'out_column', 'is_passed': 'clean_count'})
    .sort_values(['out_column', 'clean_count'], ascending=False)
)
df_pivot_report

Unnamed: 0,input_column,out_column,message,clean_count
8,quantity,quantity,Passed: quantity1(),5
7,price,price_number,Passed: price1(),5
6,price,price_currency,Passed: currency1(),5
4,order_date,order_date,Passed: date1(),4
5,order_date,order_date,Passed: date2(),1
3,name,name,Passed: <lambda>(),5
2,country,country_output,Passed: country3(),2
0,country,country_output,Passed: country1(),1
1,country,country_output,Passed: country2(),1
