# In [None]:
# !pip install pandas numpy pyarrow pandarallel altair

# In [None]:
# =============================================================================
# Draft15 MONADSQUISHY VISUAL - WITH VEGA-LITE REPORTS
# =============================================================================

# Required dependencies. These are imported unguarded on purpose: the module
# body below uses `pa`, `np` and `pd` immediately, so swallowing an
# ImportError here would only postpone the failure to a confusing NameError.
# Install with: pip install pandas numpy pyarrow pandarallel altair
import altair as alt
import numpy as np
import pandas as pd
import pyarrow as pa

# pandarallel is the only optional dependency: the engine has a serial
# `DataFrame.apply` path when it is unavailable, so its absence is not fatal.
try:
    import pandarallel
except ImportError:
    pass

# --- 1. SETTINGS ---
# pandarallel is an optional accelerator. HAS_PARALLEL records whether it could
# be imported and initialised; the engine uses plain `apply` when it is False.
try:
    from pandarallel import pandarallel
    pandarallel.initialize(progress_bar=True, verbose=0)
    HAS_PARALLEL = True
except ImportError:
    HAS_PARALLEL = False

# Ensure Altair renders in the notebook. This is best effort: a failure here
# must not prevent the data pipeline itself from being usable. `Exception`
# (rather than a bare `except`) keeps KeyboardInterrupt/SystemExit working.
try:
    alt.renderers.enable('default')
except Exception:
    pass

# Per logical column type:
#   default - value stored when a row fails its pipeline
#   dtype   - matching pyarrow type (not read by the code in this section;
#             presumably for Arrow/Parquet export - TODO confirm)
#   coerce  - Series -> Series normaliser applied to the finished column
TYPE_SPECS = {
    # Only stringify present values: a blanket `astype(str)` would turn the
    # `None` default into the literal text 'None' and hide missing data.
    "string":   {"default": None,   "dtype": pa.string(),           "coerce": lambda x: x.where(x.isna(), x.astype(str))},
    "float":    {"default": np.nan, "dtype": pa.float64(),         "coerce": lambda x: pd.to_numeric(x, errors='coerce')},
    "datetime": {"default": pd.NaT, "dtype": pa.timestamp('ns'), "coerce": lambda x: pd.to_datetime(x, errors='coerce')}
}

# --- 2. CORE ENGINE ---
class Monad:
    """Carry one cell value through a pipeline of validators and transformers.

    Each pipeline step is a callable; its role is read from an optional
    ``_role`` attribute and defaults to ``'validator'``.

    * A validator that returns replaces ``value`` and marks the cell ``'valid'``;
      one that raises marks the cell ``'dirty'`` and stops the chain.
    * A transformer that returns fixes ``final_value``, marks the cell
      ``'success'`` and stops the chain; one that raises is only logged, so the
      next step gets a chance.

    Every step (including skipped ones) appends a dict to ``logs``.
    """

    __slots__ = ['value', 'input_row', 'output_column', 'status', 'final_value', 'logs', 'stopped', 'step']

    def __init__(self, value, row, col):
        """Wrap ``value`` taken from input row ``row`` for output column ``col``."""
        self.value = value
        self.input_row = row
        self.output_column = col
        self.status = 'pending'
        self.final_value = None
        self.logs = []
        self.stopped = False
        self.step = 0

    def _log(self, func, status, details=None):
        """Append one log entry describing how ``func`` fared at the current step."""
        role_type = getattr(func, '_role', 'validator')
        role_name = getattr(func, '__name__', 'unknown')
        entry = {
            'row': self.input_row,
            'col': self.output_column,
            'step': self.step,
            'role_type': role_type,
            'role': role_name,
            'status': status,
            'details': details,
        }
        self.logs.append(entry)

    def __or__(self, func):
        """Run ``func`` as the next pipeline step (``monad | func``); returns ``self``."""
        # The step counter advances whether or not the step actually runs.
        self.step += 1
        if self.stopped:
            self._log(func, 'skipped')
            return self

        role = getattr(func, '_role', 'validator')
        is_validator = role == 'validator'
        is_transformer = role == 'transformer'
        try:
            outcome = func(self.value)
        except Exception as exc:
            if is_validator:
                # A failed validation is terminal for this cell.
                self.status = 'dirty'
                self.final_value = None
                self.stopped = True
                self._log(func, 'failed', str(exc))
            elif is_transformer:
                # A failed transformer just hands over to the next candidate.
                self._log(func, 'failed', str(exc))
        else:
            if is_validator:
                self.value = outcome
                self.status = 'valid'
                self._log(func, 'passed')
            elif is_transformer:
                # First transformer that succeeds wins; later steps are skipped.
                self.value = outcome
                self.status = 'success'
                self.final_value = outcome
                self.stopped = True
                self._log(func, 'passed')
        return self

    def apply(self, pipeline):
        """Feed the value through every step of ``pipeline`` and settle the status.

        Returns ``self`` with ``status`` equal to ``'success'`` or ``'dirty'``.
        """
        saw_transformer = False
        for stage in pipeline:
            if getattr(stage, '_role', 'validator') == 'transformer':
                saw_transformer = True
            self.__or__(stage)

        if self.status == 'dirty':
            self.final_value = None
        elif self.status in ('valid', 'pending'):
            if not saw_transformer:
                # Validator-only pipeline: the validated value is the result.
                self.status = 'success'
                self.final_value = self.value
            else:
                # Transformers existed but none succeeded: record the
                # "end of line" failure as a synthetic system entry.
                self.status = 'dirty'
                self.final_value = None
                self.logs.append({
                    'row': self.input_row,
                    'col': self.output_column,
                    'step': self.step + 1,
                    'role_type': 'system',
                    'role': 'chain_exhausted',
                    'status': 'failed',
                    'details': 'All transformers failed',
                })
        return self

class SquishyEngine:
    """Apply a list of column definitions to a DataFrame, cell by cell.

    Each entry of ``config_list`` is a dict read as follows:

    * ``target``   - name of the output column (required)
    * ``source``   - name of the input column (defaults to ``target``)
    * ``pipeline`` - iterable of validator/transformer callables (required)
    * ``type``     - key into ``TYPE_SPECS`` (defaults to ``'string'``)

    After ``run()``, ``final_df`` holds the cleaned frame and ``logs`` holds a
    DataFrame with one row per executed (or skipped) pipeline step.
    """

    def __init__(self, config_list, source_df):
        """Store the configuration and the source frame; nothing is processed yet."""
        self.config = config_list
        self.df = source_df
        self.logs = []
        self.final_df = None

    def run(self):
        """Process every configured column and return the cleaned DataFrame.

        Column definitions whose ``source`` is not a column of the input frame
        are skipped silently.

        Raises:
            ValueError: if a definition names a ``type`` missing from
                ``TYPE_SPECS`` (previously an opaque ``TypeError`` raised from
                inside the row-wise apply).
        """
        print(f"Processing {len(self.df)} rows...")
        final_df = self.df.copy()
        all_logs = []
        for col_def in self.config:
            target = col_def['target']
            source = col_def.get('source', target)
            pipeline = col_def['pipeline']
            if source not in self.df.columns:
                continue

            type_name = col_def.get('type', 'string')
            spec = TYPE_SPECS.get(type_name)
            if spec is None:
                raise ValueError(
                    f"Unknown type {type_name!r} for target column {target!r}; "
                    f"expected one of {sorted(TYPE_SPECS)}"
                )

            if len(self.df) == 0:
                # DataFrame.apply(axis=1) on an empty frame returns a DataFrame
                # rather than a Series of tuples, so build the empty column
                # directly instead of unpacking results that do not exist.
                values = pd.Series(index=self.df.index, dtype=object)
            else:
                def process(row):
                    # row.name is the index label of the row being processed.
                    m = Monad(row[source], row.name, target).apply(pipeline)
                    value = m.final_value if m.status == 'success' else spec['default']
                    return (value, m.logs)

                if HAS_PARALLEL:
                    results = self.df.parallel_apply(process, axis=1)
                else:
                    results = self.df.apply(process, axis=1)

                values = results.apply(lambda pair: pair[0])
                for _, row_logs in results:
                    all_logs.extend(row_logs)

            # Normalise the finished column to its declared type.
            final_df[target] = spec['coerce'](values)

        self.final_df = final_df
        self.logs = pd.DataFrame(all_logs)
        return final_df

# --- 3. TOOLKIT ---
def validator(f):
    """Decorator: tag ``f`` as a validator step and return the same callable."""
    setattr(f, '_role', 'validator')
    return f
def transformer(f):
    """Decorator: tag ``f`` as a transformer step and return the same callable."""
    setattr(f, '_role', 'transformer')
    return f

@validator
def must_exist(v):
    """Validator: reject missing values, pass everything else through unchanged.

    A value counts as missing when ``pd.isna`` says so, or when its text form,
    after trimming whitespace and lower-casing, is empty or one of the
    placeholders ``n/a``, ``null``, ``nan``.

    Raises:
        Exception: with message ``"Missing"`` when the value is missing.
    """
    if pd.isna(v):
        raise Exception("Missing")
    # Normalise once so padded placeholders such as ' N/A ' are caught too;
    # previously only the empty-string check ignored surrounding whitespace.
    text = str(v).strip().lower()
    if text in ('', 'n/a', 'null', 'nan'):
        raise Exception("Missing")
    return v

@transformer
def to_iso_code(v):
    """Transformer: map a country label to its two-letter code.

    The lookup is done on the trimmed, upper-cased text form of ``v``.

    Raises:
        Exception: with message ``"Mapping failed"`` for unknown labels.
    """
    # NOTE(review): ISO 3166-1 alpha-2 lists the United Kingdom as 'GB';
    # 'UK' is kept here as in the existing table - confirm what consumers expect.
    known_codes = {'TH': 'TH', 'USA': 'US', 'UK': 'UK', 'JP': 'JP', 'CN': 'CN'}
    label = str(v).strip().upper()
    if label not in known_codes:
        raise Exception("Mapping failed")
    return known_codes[label]

@transformer
def to_float(v):
    """Transformer: parse a price-like value into a ``float``.

    Thousands separators (``,``) and the currency symbols ``$`` and the Thai
    baht sign are removed from the text form of ``v`` before conversion.

    Raises:
        ValueError: if the remaining text is not a valid float literal.
    """
    text = str(v)
    removable = (
        ',',
        '$',
        '\u0e3f',              # Thai baht sign
        '\u00e0\u00b8\u00bf',  # the baht sign's UTF-8 bytes mis-decoded as cp1252;
                               # this is what the code stripped before, kept so
                               # already-garbled inputs still parse
    )
    for symbol in removable:
        text = text.replace(symbol, '')
    return float(text)

@transformer
def parse_iso(v):
    """Transformer: parse ``v`` as a ``YYYY-MM-DD`` date via ``pd.to_datetime``.

    Any error raised by ``pd.to_datetime`` for a non-matching value propagates.
    """
    # NOTE(review): Buddhist-era inputs such as '2567-01-01' also fit this
    # layout; the pipeline appears to rely on pandas rejecting such years as
    # out of range - confirm on the pandas version in use.
    iso_layout = '%Y-%m-%d'
    return pd.to_datetime(v, format=iso_layout)
@transformer
def parse_us(v):
    """Transformer: parse ``v`` as a month-first ``MM/DD/YYYY`` date.

    Any error raised by ``pd.to_datetime`` for a non-matching value propagates.
    """
    us_layout = '%m/%d/%Y'
    return pd.to_datetime(v, format=us_layout)
@transformer
def parse_uk(v):
    """Transformer: parse ``v`` as a day-first ``DD/MM/YYYY`` date.

    Any error raised by ``pd.to_datetime`` for a non-matching value propagates.
    """
    uk_layout = '%d/%m/%Y'
    return pd.to_datetime(v, format=uk_layout)
@transformer
def parse_thai(v):
    """Transformer: parse a dash-separated date whose year is in the Buddhist era.

    The text form of ``v`` must split on ``-`` into exactly three parts and the
    first part must be an integer above 2400; 543 is subtracted from it before
    the date is handed to ``pd.to_datetime``.

    Raises:
        Exception: with message ``"Not Thai"`` when the shape or year check fails.
        ValueError: if the first part is not an integer.
    """
    pieces = str(v).split('-')
    # Short-circuit keeps int() from running on inputs with the wrong shape.
    if len(pieces) != 3 or int(pieces[0]) <= 2400:
        raise Exception("Not Thai")
    gregorian_year = int(pieces[0]) - 543
    return pd.to_datetime(f"{gregorian_year}-{pieces[1]}-{pieces[2]}")

# --- 4. DATA GENERATOR ---
def generate_complex_data(n=3000, seed=None):
    """Generate a messy demo DataFrame with ``n`` rows.

    Args:
        n: number of rows to generate.
        seed: optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used so the output is
            reproducible and the global NumPy random state is left untouched.
            When ``None`` (the default) the global ``np.random`` state is used,
            exactly as before.

    Returns:
        DataFrame with string/``None`` columns ``date_col`` (mixed date
        formats plus junk), ``country_col`` and ``price_col``.
    """
    print(f"Generating {n} rows of complex data...")
    # Both the module and a RandomState instance expose the same `choice` API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    dates = rng.choice(
        ['2024-01-01', '01/31/2024', '31/01/2024', '2567-01-01', 'NotDate'],
        n, p=[0.6, 0.1, 0.1, 0.1, 0.1],
    )
    countries = rng.choice(
        ['TH', 'USA', 'UK', 'JP', 'BadCode', None],
        n, p=[0.6, 0.2, 0.1, 0.05, 0.025, 0.025],
    )
    prices = rng.choice(
        ['100', '$50.00', '1,000', 'Free', None],
        n, p=[0.5, 0.3, 0.1, 0.05, 0.05],
    )
    return pd.DataFrame({'date_col': dates, 'country_col': countries, 'price_col': prices})

# --- 5. VISUAL DASHBOARD (VEGA-LITE) ---
def _build_quality_chart(logs, total_rows):
    """Chart 1: per output column, share of rows that passed, were missing, or were invalid."""
    summary_data = []
    for col in logs['col'].unique():
        col_logs = logs[logs['col'] == col]
        failed = col_logs['status'] == 'failed'
        # "Missing" = rows rejected by the must_exist validator.
        missing = col_logs[(col_logs['role'] == 'must_exist') & failed]['row'].nunique()
        # "Invalid" = rows rejected by any other validator, or whose
        # transformers were all exhausted.
        dirty = col_logs[failed & ((col_logs['role_type'] == 'validator') | (col_logs['role'] == 'chain_exhausted'))]
        dirty = dirty[dirty['role'] != 'must_exist']['row'].nunique()
        passed = total_rows - missing - dirty

        summary_data.append({'Column': col, 'Status': 'Passed', 'Count': passed})
        summary_data.append({'Column': col, 'Status': 'Missing', 'Count': missing})
        summary_data.append({'Column': col, 'Status': 'Invalid', 'Count': dirty})

    df_summary = pd.DataFrame(summary_data)

    # Explicit sorting for the stacked bar: Passed (Left) -> Invalid (Middle) -> Missing (Right)
    # We use a numeric rank to force this order in Altair
    status_order = {'Passed': 0, 'Invalid': 1, 'Missing': 2}
    df_summary['rank'] = df_summary['Status'].map(status_order)

    return alt.Chart(df_summary, title="1. Data Quality Summary").mark_bar().encode(
        x=alt.X('Count:Q', stack='normalize', axis=alt.Axis(format='%')),
        y=alt.Y('Column:N'),
        color=alt.Color('Status:N', scale=alt.Scale(domain=['Passed', 'Missing', 'Invalid'], range=['#2ca02c', '#ff7f0e', '#d62728'])),
        order=alt.Order('rank', sort='ascending'),
        tooltip=['Column', 'Status', 'Count']
    ).properties(width=600, height=150)


def _build_chain_chart(logs):
    """Chart 2: per output column, pass/fail/skip share of every pipeline step."""
    chain_data = []
    for col in logs['col'].unique():
        # Group by step, role AND role_type to differentiate validators
        grp = logs[logs['col'] == col].groupby(['step', 'role', 'role_type'])['status'].value_counts().reset_index(name='Count')
        grp['Column'] = col

        # Separate Validator Pass (Blue) from Transformer Pass (Green)
        validator_pass = (grp['status'] == 'passed') & (grp['role_type'] == 'validator')
        grp['display_status'] = grp['status'].mask(validator_pass, 'validation_pass')

        # Explicit Ranking for Y-Axis Sorting (Pipeline Step);
        # the synthetic chain_exhausted entry always sorts last.
        grp['rank'] = grp['step']
        grp.loc[grp['role'] == 'chain_exhausted', 'rank'] = 999

        # Explicit Ranking for Stack Order
        # 0. Skipped (Left/Bottom)
        # 1. Validation Pass (Blue)
        # 2. Transformer Pass (Green)
        # 3. Failed (Right/Top)
        status_map = {'skipped': 0, 'validation_pass': 1, 'passed': 2, 'failed': 3}
        grp['status_rank'] = grp['display_status'].map(status_map)

        chain_data.append(grp)

    df_chain = pd.concat(chain_data)

    return alt.Chart(df_chain, title="2. Operation Chain Flow (Pass/Fail/Skip)").mark_bar().encode(
        x=alt.X('Count:Q', stack='normalize'),
        y=alt.Y('role:N', sort=alt.EncodingSortField(field="rank", op="min", order="ascending"), title="Pipeline Step"),
        # Color scale includes Blue for Validation Pass
        color=alt.Color('display_status:N',
                        scale=alt.Scale(
                            domain=['passed', 'validation_pass', 'failed', 'skipped'],
                            range=['#2ca02c', '#1f77b4', '#d62728', '#c7c7c7']
                        ),
                        legend=alt.Legend(title="Status")
        ),
        # Apply the custom sort order for the stack
        order=alt.Order('status_rank', sort='ascending'),
        row=alt.Row('Column:N', header=alt.Header(titleOrient="top", labelOrient="top")),
        tooltip=['Column', 'step', 'role', 'display_status', 'Count']
    ).properties(width=600, height=100).resolve_scale(y='independent')


def _build_recommender_chart(logs, total_rows):
    """Chart 3: per transformer, fraction of all input rows it successfully converted."""
    stats = logs.groupby(['col', 'role', 'role_type'])['status'].value_counts().unstack(fill_value=0)
    if 'passed' not in stats.columns:
        stats['passed'] = 0
    stats['E-Score'] = stats['passed'] / total_rows
    trans_stats = stats[stats.index.get_level_values('role_type') == 'transformer'].reset_index()

    # Create a base chart to share encoding between bars and text
    base = alt.Chart(trans_stats).encode(
        x=alt.X('E-Score:Q', scale=alt.Scale(domain=[0, 1])),
        y=alt.Y('role:N', sort='-x'),
        tooltip=['col', 'role', 'E-Score']
    )

    # 1. Green bars
    bars = base.mark_bar().encode(
        color=alt.value('#2ca02c')
    )

    # 2. Text labels showing the output column (Context)
    text = base.mark_text(
        align='left',
        baseline='middle',
        dx=3,  # Shift text slightly to the right of the bar
        color='black'
    ).encode(
        text='col:N'
    )

    # Combine bars and text
    return (bars + text).properties(
        title="3. AI Recommender (Efficiency Scores)",
        width=600,
        height=200
    )


# --- 5. VISUAL DASHBOARD (VEGA-LITE) ---
def show_visual_dashboard(engine, df_orig, config):
    """Render the three-chart Vega-Lite dashboard for a finished engine run.

    Args:
        engine: object whose ``logs`` attribute is the step-log DataFrame
            (columns ``row``, ``col``, ``step``, ``role_type``, ``role``,
            ``status`` are read).
        df_orig: the input frame; only its length is used, as the row total.
        config: accepted for interface compatibility; not used.

    Returns:
        None. Prints a message and returns early when there are no logs, or
        when no notebook ``display`` function is available.
    """
    if engine.logs.empty:
        print("No logs generated.")
        return

    print("\nGenerating Vega-Lite Reports...")
    total_rows = len(df_orig)
    charts = [
        _build_quality_chart(engine.logs, total_rows),
        _build_chain_chart(engine.logs),
        _build_recommender_chart(engine.logs, total_rows),
    ]

    # --- DISPLAY ---
    final_dashboard = alt.vconcat(*charts).resolve_scale(color='independent')
    try:
        # `display` is injected by IPython/Jupyter; it does not exist when
        # this file runs as a plain script.
        render = display
    except NameError:
        print("Dashboard built, but no notebook `display` is available; run inside Jupyter/IPython to render it.")
        return
    render(final_dashboard)

# --- 6. MAIN ---
def main():
    """Generate demo data, clean it with the engine, and show the dashboard."""
    df = generate_complex_data(3000)

    # OPTIMIZED ORDER: ISO (Most common) is FIRST
    date_rule = {
        "target": "clean_date",
        "source": "date_col",
        "type": "datetime",
        "pipeline": [must_exist, parse_iso, parse_us, parse_uk, parse_thai],
    }
    country_rule = {
        "target": "country_code",
        "source": "country_col",
        "type": "string",
        "pipeline": [must_exist, to_iso_code],
    }
    price_rule = {
        "target": "price",
        "source": "price_col",
        "type": "float",
        "pipeline": [must_exist, to_float],
    }
    config = [date_rule, country_rule, price_rule]

    engine = SquishyEngine(config, df)
    engine.run()
    show_visual_dashboard(engine, df, config)

# Run the demo end to end only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()