In [10]:
import pandas as pd
import pandera as pa
from pandera import Column, DataFrameSchema, Check

df = pd.read_csv("../data/raw/churn.csv")

# --- Step 2 rules: schema + checks ---
schema = DataFrameSchema(
    {
        # identifiers
        "customerID": Column(str, nullable=False, unique=True),

        # label
        "Churn": Column(str, nullable=False, checks=Check.isin(["Yes", "No"])),

        # numeric / ordinal
        "tenure": Column(int, nullable=False, checks=Check.between(0, 100)),

        # charges
        "MonthlyCharges": Column(float, nullable=False, checks=Check.between(0, 500)),

        # TotalCharges in raw is string; validate convertible-to-float
        "TotalCharges": Column(
            object,
            nullable=True,
            checks=Check.str_matches(r"^\s*\d+(\.\d+)?\s*$|^\s*$"),
        ),

        # service columns (types stay as object for now)
        "InternetService": Column(object, nullable=False),
        "OnlineSecurity": Column(object, nullable=False),
        "OnlineBackup": Column(object, nullable=False),

        "PhoneService": Column(object, nullable=False),
        "MultipleLines": Column(object, nullable=False),
    },
    strict=False  # 不要求你列出全部欄位，避免你現在被 schema 綁死
)

# --- Run schema validation (fail-fast) ---
schema.validate(df, lazy=False)
print("Schema validation passed.")


Schema validation passed.


top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



In [15]:
import pandas as pd
import numpy as np

class DataValidationError(Exception):
    pass

def fail(msg: str):
    raise DataValidationError(msg)

def assert_true(cond: bool, msg: str):
    if not cond:
        fail(msg)

def assert_series(cond_series: pd.Series, msg: str, df: pd.DataFrame, show_cols=None, n=5):
    """
    cond_series: True means PASS for that row. False means FAIL.
    """
    if not cond_series.all():
        bad = df.loc[~cond_series]
        if show_cols is None:
            show_cols = bad.columns.tolist()[:6]
        sample = bad[show_cols].head(n).to_string(index=False)
        fail(f"{msg}\nFailed rows: {len(bad)} / {len(df)}\nSample:\n{sample}")

def validate_churn_df(df: pd.DataFrame) -> None:
    # ---- Required columns ----
    required = [
        "customerID", "Churn", "tenure",
        "MonthlyCharges", "TotalCharges",
        "InternetService", "OnlineSecurity", "OnlineBackup",
        "PhoneService", "MultipleLines"
    ]
    missing_cols = [c for c in required if c not in df.columns]
    assert_true(len(missing_cols) == 0, f"Missing required columns: {missing_cols}")

    # ---- Table not empty ----
    assert_true(len(df) > 0, "Dataset is empty (0 rows).")

    # ---- customerID: not null + unique ----
    assert_series(df["customerID"].notna(), "customerID contains nulls.", df, ["customerID"])
    assert_true(df["customerID"].is_unique, "customerID is not unique (duplicate IDs exist).")

    # ---- Label: not null + in set ----
    assert_series(df["Churn"].notna(), "Churn contains nulls.", df, ["Churn"])
    assert_series(df["Churn"].isin(["Yes", "No"]), "Churn contains invalid values.", df, ["Churn"])

    # ---- tenure: numeric + range ----
    # coerce to numeric to catch bad strings
    tenure_num = pd.to_numeric(df["tenure"], errors="coerce")
    assert_series(tenure_num.notna(), "tenure has non-numeric values.", df, ["tenure"])
    assert_series(tenure_num.between(0, 100), "tenure out of expected range [0, 100].", df, ["tenure"])

    # ---- MonthlyCharges: numeric + range ----
    mc = pd.to_numeric(df["MonthlyCharges"], errors="coerce")
    assert_series(mc.notna(), "MonthlyCharges has non-numeric values.", df, ["MonthlyCharges"])
    assert_series(mc.between(0, 500), "MonthlyCharges out of expected range [0, 500].", df, ["MonthlyCharges"])

    # ---- TotalCharges: allow blank -> treat as missing, otherwise must be numeric ----
    tc_raw = df["TotalCharges"].astype(str)
    tc_stripped = tc_raw.str.strip()

    # blank is allowed at validation stage (we'll decide imputation later)
    non_blank = tc_stripped != ""
    tc_num = pd.to_numeric(tc_stripped.where(non_blank, np.nan), errors="coerce")
    assert_series(
        (~non_blank) | tc_num.notna(),
        "TotalCharges contains non-numeric non-blank values (schema drift).",
        df, ["TotalCharges", "tenure", "MonthlyCharges"]
    )

    # ---- Semantic consistency: InternetService == 'No' ----
    mask_no_internet = df["InternetService"] == "No"
    assert_series(
        (~mask_no_internet) | (df["OnlineSecurity"] == "No internet service"),
        "InternetService == 'No' but OnlineSecurity is not 'No internet service'.",
        df, ["InternetService", "OnlineSecurity"]
    )
    assert_series(
        (~mask_no_internet) | (df["OnlineBackup"] == "No internet service"),
        "InternetService == 'No' but OnlineBackup is not 'No internet service'.",
        df, ["InternetService", "OnlineBackup"]
    )

    # ---- Semantic consistency: PhoneService == 'No' ----
    mask_no_phone = df["PhoneService"] == "No"
    assert_series(
        (~mask_no_phone) | (df["MultipleLines"] == "No phone service"),
        "PhoneService == 'No' but MultipleLines is not 'No phone service'.",
        df, ["PhoneService", "MultipleLines"]
    )

    # Optional: ensure expected categorical values haven't exploded wildly
    # (basic drift guardrail; keep it simple)
    assert_true(df["Churn"].nunique() == 2, "Churn unique value count is not 2 (unexpected).")

def run_validation():
    df = pd.read_csv("../data/raw/churn.csv")
    validate_churn_df(df)
    print("✅ Validation passed. (fail-fast)")

run_validation()


✅ Validation passed. (fail-fast)


## Fail policy
If any validation rule fails, the pipeline stops and no downstream feature engineering or model training is triggered.

## Why these rules exist
- Protect against empty/partial upstream ingestion.
- Prevent silent label corruption.
- Enforce numeric ranges based on domain assumptions.
- Detect schema drift in TotalCharges.
- Enforce semantic consistency for service-dependent features.