In [None]:
# =============================================================================
# MONADSQUISHY DRAFT 14 - ENGINEER'S COMPANION (ORDER-AWARE)
# =============================================================================

try:
    import pandarallel
    import pyarrow
    import faker
    import ipywidgets as widgets
except ImportError:
    # In a real notebook environment, you might uncomment the line below
    # !pip install pandas numpy pyarrow pandarallel tqdm faker ipywidgets
    pass

import pandas as pd
import numpy as np
import pyarrow as pa
import ipywidgets as widgets
import re
from IPython.display import display, clear_output, Markdown

# --- 1. SETTINGS ---
# pandarallel is optional. When it imports, it is initialised here and
# HAS_PARALLEL tells the engine to use DataFrame.parallel_apply; otherwise the
# engine falls back to the regular DataFrame.apply.
try:
    from pandarallel import pandarallel
    pandarallel.initialize(progress_bar=True, verbose=0)
    HAS_PARALLEL = True
except ImportError:
    HAS_PARALLEL = False

# Per-type settings looked up by the engine for each configured output column:
#   default - value written for a row whose pipeline did not end in 'success'
#   dtype   - pyarrow type paired with the column (not read by the engine code
#             visible in this file)
#   coerce  - final cast applied to the whole output Series
TYPE_SPECS = {
    "string": {
        "default": None,
        "dtype": pa.string(),
        # Cast only the non-null cells: a plain astype(str) would turn the
        # None default into the literal text 'None', hiding failed rows.
        "coerce": lambda x: x.where(x.isna(), x.astype(str)),
    },
    "float": {
        "default": np.nan,
        "dtype": pa.float64(),
        "coerce": lambda x: pd.to_numeric(x, errors='coerce'),
    },
    "datetime": {
        "default": pd.NaT,
        "dtype": pa.timestamp('ns'),
        "coerce": lambda x: pd.to_datetime(x, errors='coerce'),
    },
}

# --- 2. CORE ENGINE ---
class Monad:
    """Carry one cell value through a pipeline of validators and transformers.

    Each pipeline step is a callable whose ``_role`` attribute is either
    ``'validator'`` (the default when the attribute is absent) or
    ``'transformer'``.  Every step, including skipped ones, appends a record
    to ``logs``.

    * A validator that returns replaces ``value``; one that raises marks the
      cell ``'dirty'`` and stops the chain.
    * A transformer that returns marks the cell ``'success'``, stores the
      result in ``final_value`` and stops the chain; one that raises is only
      logged, so the next step still runs.
    """

    __slots__ = ['value', 'input_row', 'output_column', 'status', 'final_value', 'logs', 'stopped', 'step']

    def __init__(self, value, row, col):
        self.value = value
        self.input_row = row
        self.output_column = col
        self.status = 'pending'
        self.final_value = None
        self.logs = []
        self.stopped = False
        self.step = 0

    def _log(self, func, status, details=None):
        """Append one log record describing how ``func`` fared at the current step."""
        entry = {
            'row': self.input_row,
            'col': self.output_column,
            'step': self.step,
            'role_type': getattr(func, '_role', 'validator'),
            'role': getattr(func, '__name__', 'unknown'),
            'status': status,
            'details': details,
        }
        self.logs.append(entry)

    def __or__(self, func):
        """Run ``func`` as the next step (``monad | func``) and return ``self``."""
        # Every step consumes a step number, whether it runs or is skipped.
        self.step += 1
        if self.stopped:
            self._log(func, 'skipped')
            return self

        role = getattr(func, '_role', 'validator')
        is_validator = role == 'validator'
        is_transformer = role == 'transformer'
        try:
            outcome = func(self.value)
            if is_validator:
                self.value = outcome
                self.status = 'valid'
                self._log(func, 'passed')
            elif is_transformer:
                # First successful transformer wins and ends the chain.
                self.value = outcome
                self.status = 'success'
                self.final_value = outcome
                self.stopped = True
                self._log(func, 'passed')
        except Exception as exc:
            if is_validator:
                # A failed validator poisons the cell; nothing else may run.
                self.status = 'dirty'
                self.final_value = None
                self.stopped = True
                self._log(func, 'failed', str(exc))
            elif is_transformer:
                # A failed transformer is not fatal: the next step gets a try.
                self._log(func, 'failed', str(exc))
        return self

    def apply(self, pipeline):
        """Run every step of ``pipeline`` in order, settle the final status, return ``self``."""
        saw_transformer = False
        for stage in pipeline:
            saw_transformer = saw_transformer or getattr(stage, '_role', 'validator') == 'transformer'
            self.__or__(stage)

        if self.status == 'dirty':
            self.final_value = None
        elif self.status in ('valid', 'pending'):
            if saw_transformer:
                # Transformers were configured but none of them succeeded.
                self.status = 'dirty'
                self.final_value = None
                self.logs.append({
                    'row': self.input_row,
                    'col': self.output_column,
                    'step': self.step + 1,
                    'role_type': 'system',
                    'role': 'chain_exhausted',
                    'status': 'failed',
                    'details': 'All transformers failed',
                })
            else:
                # Validator-only pipeline: the validated value is the result.
                self.status = 'success'
                self.final_value = self.value
        return self

class SquishyEngine:
    """Run every configured column pipeline over a source DataFrame.

    Each entry of ``config_list`` is a dict with:
        target   - name of the output column to write
        source   - input column to read (defaults to ``target``)
        pipeline - iterable of validator / transformer callables
        type     - key into TYPE_SPECS (defaults to ``'string'``)

    After ``run()``, ``final_df`` holds the source frame plus the cleaned
    target columns and ``logs`` is a DataFrame of every step log record.
    """

    def __init__(self, config_list, source_df):
        self.config = config_list
        self.df = source_df
        self.logs = []
        self.final_df = None

    def run(self):
        """Process all configured columns and return the resulting DataFrame.

        Raises:
            ValueError: if a column definition names a type that is not in
                TYPE_SPECS.
        """
        print(f"Processing {len(self.df)} rows...")
        final_df = self.df.copy()
        all_logs = []
        for col_def in self.config:
            target = col_def['target']
            source = col_def.get('source', target)
            pipeline = col_def['pipeline']

            if source not in self.df.columns:
                # Still best-effort, but no longer silent.
                print(f"Skipping '{target}': source column '{source}' not found.")
                continue

            type_name = col_def.get('type', 'string')
            spec = TYPE_SPECS.get(type_name)
            if spec is None:
                # Fail with a clear message instead of a "'NoneType' object is
                # not subscriptable" raised from inside the row worker.
                raise ValueError(
                    f"Unknown type '{type_name}' for column '{target}'; "
                    f"expected one of {sorted(TYPE_SPECS)}"
                )

            def process(row):
                m = Monad(row[source], row.name, target).apply(pipeline)
                return (m.final_value if m.status == 'success' else spec['default'], m.logs)

            if len(self.df) == 0:
                # DataFrame.apply on a zero-row frame does not yield a Series
                # of (value, logs) tuples, so build the empty result directly.
                results = pd.Series([], index=self.df.index, dtype=object)
            elif HAS_PARALLEL:
                results = self.df.parallel_apply(process, axis=1)
            else:
                results = self.df.apply(process, axis=1)

            # Each element of `results` is a (final value, log records) pair.
            final_df[target] = results.apply(lambda x: x[0])
            for _, row_logs in results:
                all_logs.extend(row_logs)
            final_df[target] = spec['coerce'](final_df[target])

        self.final_df = final_df
        self.logs = pd.DataFrame(all_logs)
        return final_df

# --- 3. TOOLKIT ---
def validator(f):
    """Decorator: tag ``f`` with the 'validator' role and hand it back unchanged."""
    setattr(f, '_role', 'validator')
    return f
def transformer(f):
    """Decorator: tag ``f`` with the 'transformer' role and hand it back unchanged."""
    setattr(f, '_role', 'transformer')
    return f

@validator
def must_exist(v):
    """Return ``v`` unchanged, or raise if it is null, blank or a null placeholder.

    Placeholder text ('n/a', 'null', 'nan') is matched case-insensitively and
    after trimming surrounding whitespace, so ' N/A ' is treated like 'n/a'.

    Raises:
        ValueError: with the message "Missing" when the value is absent.
    """
    if pd.isna(v):
        raise ValueError("Missing")
    # Normalise once so the blank check and the placeholder check agree.
    text = str(v).strip().lower()
    if text in ('', 'n/a', 'null', 'nan'):
        raise ValueError("Missing")
    return v

# Accepted country spellings -> emitted code.  Built once instead of per call.
# Every emitted code is also an accepted input, so re-running the transformer
# on its own output succeeds.
# NOTE(review): ISO 3166-1 alpha-2 assigns 'GB' to the United Kingdom; the
# existing 'UK' output is kept for compatibility - confirm which is wanted.
_ISO_CODE_MAP = {'TH': 'TH', 'USA': 'US', 'US': 'US', 'UK': 'UK', 'JP': 'JP', 'CN': 'CN'}


@transformer
def to_iso_code(v):
    """Map a country spelling to its two-letter code.

    The lookup ignores case and surrounding whitespace.

    Raises:
        ValueError: with the message "Mapping failed" for unknown spellings.
    """
    code = _ISO_CODE_MAP.get(str(v).strip().upper())
    if code is None:
        raise ValueError("Mapping failed")
    return code

@transformer
def to_float(v):
    """Convert a price-like value to ``float``.

    Thousands separators and the '$' / '฿' currency signs are removed before
    conversion.  The baht sign was previously stored as mis-decoded text and
    therefore never matched.

    Raises:
        ValueError: if the remaining text is not a valid float literal.
    """
    text = str(v)
    for symbol in (',', '$', '฿'):
        text = text.replace(symbol, '')
    return float(text)

# Date Parsers
@transformer
def parse_iso(v):
    """Parse ``v`` as a year-month-day date (``%Y-%m-%d``)."""
    iso_layout = '%Y-%m-%d'
    return pd.to_datetime(v, format=iso_layout)
@transformer
def parse_us(v):
    """Parse ``v`` as a month/day/year date (``%m/%d/%Y``)."""
    us_layout = '%m/%d/%Y'
    return pd.to_datetime(v, format=us_layout)
@transformer
def parse_uk(v):
    """Parse ``v`` as a day/month/year date (``%d/%m/%Y``)."""
    uk_layout = '%d/%m/%Y'
    return pd.to_datetime(v, format=uk_layout)
@transformer
def parse_thai(v):
    """Parse a dash-separated date whose year is above 2400 by subtracting 543 from it.

    Raises "Not Thai" when the text does not have exactly three dash-separated
    parts or the year is 2400 or lower.
    """
    pieces = str(v).split('-')
    if len(pieces) != 3 or int(pieces[0]) <= 2400:
        raise Exception("Not Thai")
    year, month, day = pieces
    gregorian_year = int(year) - 543
    return pd.to_datetime(f"{gregorian_year}-{month}-{day}")

# --- 4. DATA GENERATOR ---
def generate_complex_data(n=3000, seed=None):
    """Build a DataFrame of ``n`` rows of messy date, country and price text.

    Args:
        n: number of rows to generate.
        seed: optional seed for a private random generator, giving the same
            data on every call.  With the default ``None`` the global
            ``np.random`` state is used, exactly as before.

    Returns:
        DataFrame with the columns ``date_col``, ``country_col`` and
        ``price_col``.
    """
    print(f"Generating {n} rows of complex data...")
    rng = np.random if seed is None else np.random.RandomState(seed)
    # ISO dates are weighted as the most common format (60%), although the
    # demo config tries parse_iso last; the recommender should therefore
    # suggest moving it to the front.
    dates = rng.choice(['2024-01-01', '01/31/2024', '31/01/2024', '2567-01-01', 'NotDate'], n, p=[0.6, 0.1, 0.1, 0.1, 0.1])
    countries = rng.choice(['TH', 'USA', 'UK', 'JP', 'BadCode', None], n, p=[0.6, 0.2, 0.1, 0.05, 0.025, 0.025])
    prices = rng.choice(['100', '$50.00', '1,000', 'Free', None], n, p=[0.5, 0.3, 0.1, 0.05, 0.05])
    return pd.DataFrame({'date_col': dates, 'country_col': countries, 'price_col': prices})

# --- 5. DASHBOARD ---
def show_dashboard(engine, df_orig, config):
    """Display an interactive dashboard summarising a finished engine run.

    Six panels (input head, output head, quality summary, dirty backlog,
    transformer-order recommender, per-step chain analysis) are toggled by
    checkboxes; every toggle re-renders all panels.

    Args:
        engine: object exposing ``logs`` (a DataFrame of step log records with
            the columns row / col / step / role_type / role / status) and
            ``final_df`` (the processed DataFrame).
        df_orig: the source DataFrame that was processed.
        config: list of column definitions with ``target``, optional
            ``source`` and ``pipeline`` keys.
    """
    if engine.logs.empty:
        print("No logs generated.")
        return

    chk_input = widgets.Checkbox(value=True, description='1. Input Head')
    chk_output = widgets.Checkbox(value=True, description='2. Output Head')
    chk_quality = widgets.Checkbox(value=True, description='3. Quality Summary')
    chk_backlog = widgets.Checkbox(value=False, description='4. Dirty Backlog')
    chk_recommender = widgets.Checkbox(value=False, description='5. AI Recommender')
    chk_chain = widgets.Checkbox(value=False, description='6. Operation Chain Analysis')
    checkboxes = [chk_input, chk_output, chk_quality, chk_backlog, chk_recommender, chk_chain]

    out_input = widgets.Output()
    out_output = widgets.Output()
    out_quality = widgets.Output()
    out_backlog = widgets.Output()
    out_recommender = widgets.Output()
    out_chain = widgets.Output()
    outputs = [out_input, out_output, out_quality, out_backlog, out_recommender, out_chain]

    def terminal_failures(logs):
        """Log records that made a cell dirty: a failed validator or an exhausted transformer chain."""
        is_failed = logs['status'] == 'failed'
        is_terminal = (logs['role_type'] == 'validator') | (logs['role'] == 'chain_exhausted')
        return logs[is_failed & is_terminal]

    def render(change=None):
        for out in outputs:
            out.clear_output()

        if chk_input.value:
            with out_input:
                display(Markdown("#### 1. Input Data"))
                display(df_orig.head())

        if chk_output.value:
            with out_output:
                display(Markdown("#### 2. Output Data"))
                target_cols = [c['target'] for c in config]
                cols_to_show = [c for c in target_cols if c in engine.final_df.columns]
                display(engine.final_df[cols_to_show].head())

        if chk_quality.value:
            with out_quality:
                display(Markdown("#### 3. Quality Summary"))
                summary = []
                for col in engine.logs['col'].unique():
                    col_logs = engine.logs[engine.logs['col'] == col]
                    missing = col_logs[(col_logs['role'] == 'must_exist') & (col_logs['status'] == 'failed')]['row'].nunique()
                    # "Invalid" = dirty for any reason other than must_exist.
                    dirty = terminal_failures(col_logs)
                    dirty = dirty[dirty['role'] != 'must_exist']
                    invalid = dirty['row'].nunique()
                    summary.append({'Column': col, 'Passed': len(df_orig) - missing - invalid, 'Missing': missing, 'Invalid': invalid})
                display(pd.DataFrame(summary).style.background_gradient(cmap='RdYlGn', subset=['Passed']))

        if chk_backlog.value:
            with out_backlog:
                display(Markdown("#### 4. Dirty Data Backlog"))
                fails = terminal_failures(engine.logs)
                if fails.empty:
                    print("No backlog.")
                else:
                    for col_def in config:
                        target = col_def['target']
                        source = col_def.get('source', target)
                        col_fails = fails[fails['col'] == target]
                        if col_fails.empty:
                            continue
                        fail_idxs = col_fails['row'].unique()
                        bad_vals = df_orig.loc[fail_idxs, source].fillna("NULL")
                        top = bad_vals.value_counts(dropna=False).head(5).reset_index()
                        top.columns = [f"Original ('{source}')", "Count"]
                        display(Markdown(f"**{target}**"))
                        display(top.style.bar(subset=['Count'], color='#ffcccc'))

        if chk_recommender.value:
            with out_recommender:
                display(Markdown("#### 5. AI Recommender (Correct Order Logic)"))

                stats = engine.logs.groupby(['col', 'role', 'role_type'])['status'].value_counts().unstack(fill_value=0)
                if 'passed' not in stats.columns:
                    stats['passed'] = 0
                # E-Score: share of all input rows that this step converted.
                stats['E-Score'] = stats['passed'] / len(df_orig)
                trans_stats = stats[stats.index.get_level_values('role_type') == 'transformer'].reset_index()

                display_frames = []

                for col_def in config:
                    target = col_def['target']
                    # Same defaults as the engine's logging, so undecorated or
                    # unnamed callables do not raise AttributeError here.
                    # dict.fromkeys drops repeated names (Categorical needs
                    # unique categories) while keeping pipeline order.
                    config_order = list(dict.fromkeys(
                        getattr(f, '__name__', 'unknown')
                        for f in col_def['pipeline']
                        if getattr(f, '_role', 'validator') == 'transformer'
                    ))
                    if not config_order:
                        continue

                    col_df = trans_stats[trans_stats['col'] == target].copy()
                    col_df['role'] = pd.Categorical(col_df['role'], categories=config_order, ordered=True)
                    col_df = col_df.sort_values('role').dropna()
                    col_df['Ideal Rank'] = col_df['E-Score'].rank(ascending=False, method='first').astype(int)
                    col_df['Current Step'] = range(1, len(col_df) + 1)

                    def suggest(row):
                        diff = row['Current Step'] - row['Ideal Rank']
                        if diff == 0:
                            return "✅ Optimal"
                        if diff > 0:
                            return f"🔼 Move UP to #{row['Ideal Rank']}"
                        return f"🔽 Move DOWN to #{row['Ideal Rank']}"

                    col_df['Suggestion'] = col_df.apply(suggest, axis=1)
                    display_frames.append(col_df[['col', 'role', 'E-Score', 'Current Step', 'Ideal Rank', 'Suggestion']])

                if display_frames:
                    final_rec = pd.concat(display_frames)
                    display(final_rec.style.bar(subset=['E-Score'], color='#d4edda', vmin=0, vmax=1))
                else:
                    print("No transformers to analyze.")

        if chk_chain.value:
            with out_chain:
                display(Markdown("#### 6. Operation Chain Analysis"))
                display(Markdown("> **Info:** 'Active Input' = Rows that actually ran this function (not skipped)."))

                for col in engine.logs['col'].unique():
                    display(Markdown(f"**{col}**"))

                    # Group by step and role so rows follow pipeline order.
                    chain_df = engine.logs[engine.logs['col'] == col].groupby(['step', 'role', 'role_type'])['status'].value_counts().unstack(fill_value=0)

                    # Make sure every status column exists even if unused.
                    for c in ['passed', 'failed', 'skipped']:
                        if c not in chain_df.columns:
                            chain_df[c] = 0

                    chain_df['Active Input'] = chain_df['passed'] + chain_df['failed']
                    # 0 / 0 (step never ran) yields NaN, shown as 0%.
                    chain_df['Success % (of Active)'] = (chain_df['passed'] / chain_df['Active Input']).fillna(0).mul(100).round(1).astype(str) + '%'

                    display(chain_df[['passed', 'failed', 'skipped', 'Active Input', 'Success % (of Active)']].style.background_gradient(subset=['passed'], cmap='Greens'))

    for w in checkboxes:
        w.observe(render, names='value')
    render()

    ui = widgets.VBox([
        widgets.HBox([chk_input, chk_output, chk_quality]),
        widgets.HBox([chk_backlog, chk_recommender, chk_chain]),
        widgets.HTML('<hr>'),
        out_input, out_output, out_quality, out_backlog, out_recommender, out_chain
    ])
    display(ui)

# --- 6. MAIN ---
def main():
    """Generate demo data, clean it with a three-column config and show the dashboard."""
    df = generate_complex_data(3000)

    # INTENTIONALLY BAD ORDER: ISO (Most common) is LAST
    date_rule = {
        "target": "clean_date",
        "source": "date_col",
        "type": "datetime",
        "pipeline": [must_exist, parse_thai, parse_uk, parse_us, parse_iso],
    }
    country_rule = {
        "target": "country_code",
        "source": "country_col",
        "type": "string",
        "pipeline": [must_exist, to_iso_code],
    }
    price_rule = {
        "target": "price",
        "source": "price_col",
        "type": "float",
        "pipeline": [must_exist, to_float],
    }
    config = [date_rule, country_rule, price_rule]

    engine = SquishyEngine(config, df)
    engine.run()
    show_dashboard(engine, df, config)


if __name__ == "__main__":
    main()

Generating 3000 rows of complex data...
Processing 3000 rows...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3000), Label(value='0 / 3000'))),)…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3000), Label(value='0 / 3000'))),)…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3000), Label(value='0 / 3000'))),)…

VBox(children=(HBox(children=(Checkbox(value=True, description='1. Input Head'), Checkbox(value=True, descript…