# 50_categorical_nominal — 범주형-명목형(메타/eligibility 반영)

In [None]:

%run 00_config.ipynb
import pandas as pd, numpy as np

# Load the survey table and let the shared config helper pick the weights.
# NOTE(review): `w` is later indexed with `.loc` using df's row labels, so it
# is presumably a Series aligned with `df` — confirm in 00_config.
df = pd.read_excel(DATA_PATH)
w = choose_weight(df)

# Keep only the nominal questions that are registered in the variable-type
# table, have a detail entry, and actually exist as a column in the data.
nom_list = var_types["categorical-nominal"]["questions"]
detail_keys = list(categorical_nominal_detail.keys())
effective_nominals = [
    name
    for name in nom_list
    if name in detail_keys and name in df.columns
]

def wprop(series, w):
    """Return the weighted percentage share of each level in *series*.

    Missing values in *series* are dropped first, and *w* is restricted to
    the remaining row labels. The result is indexed by level, sorted by
    level, and expressed in percent of the total weight.

    An empty float Series is returned when there are no non-missing values
    or when the total weight of the remaining rows is not positive.
    """
    observed = series.dropna()
    if observed.empty:
        return pd.Series(dtype=float)

    weights = w.loc[observed.index]
    total = weights.sum()
    if total <= 0:
        return pd.Series(dtype=float)

    weight_by_level = weights.groupby(observed).sum()
    shares = weight_by_level / total * 100
    return shares.sort_index()

# Accumulators: one record per (variable, level, metric) and one one-hot
# frame per variable whose metadata requests one-hot encoding.
rows = []
dummies_list = []

for col in effective_nominals:
    meta = categorical_nominal_detail[col]
    elig_cfg = meta.get("eligibility")
    # Without an eligibility rule every respondent counts as eligible.
    eligible = apply_eligibility(df, elig_cfg["by"], elig_cfg) if elig_cfg else pd.Series(True, index=df.index)
    df[f"{col}_eligible"] = eligible
    s = df.loc[eligible, col]

    # Distribution among eligible respondents: weighted and unweighted shares
    # are recorded as separate rows and merged per (variable, level) below.
    w_sub = w.loc[s.index]
    ws = wprop(s, w_sub)
    rows.extend(
        {"variable": col, "level": lvl, "weighted_%": round(float(pct), 2)}
        for lvl, pct in ws.items()
    )
    us = (s.value_counts(normalize=True) * 100).round(2)
    rows.extend(
        {"variable": col, "level": lvl, "unweighted_%": float(pct)}
        for lvl, pct in us.items()
    )

    # One-hot encoding (eligible respondents only)
    if meta.get("encode") == "onehot":
        dmy = pd.get_dummies(s, prefix=col, dummy_na=True)
        dummies_list.append(dmy)

# Fix the column set and dtypes up front so that an empty `rows` list (no
# effective nominal variables) yields an empty summary instead of a KeyError
# in groupby, and so that both percentage columns are always present.
summary_columns = ["variable", "level", "weighted_%", "unweighted_%"]
nom_summary = (
    pd.DataFrame(rows, columns=summary_columns)
    .astype({"weighted_%": float, "unweighted_%": float})
    .groupby(["variable", "level"], as_index=False)
    # min_count=1 keeps a share that could not be computed (e.g. total weight
    # <= 0) as missing rather than reporting it as 0 %.
    .sum(numeric_only=True, min_count=1)
    .sort_values(["variable", "level"])
)

# The dummies are written without an index, so rows can only be matched to
# respondents by position. Reindex to the full data index so that row i of the
# file is row i of the input; respondents not eligible for a variable are
# missing in that variable's columns.
if dummies_list:
    nom_dummies = pd.concat(dummies_list, axis=1).reindex(df.index)
else:
    nom_dummies = pd.DataFrame(index=df.index)

nom_summary.to_csv(OUT_DIR/"categorical_nominal_summary_meta.csv", index=False, encoding="utf-8-sig")
nom_dummies.to_csv(OUT_DIR/"categorical_nominal_dummies_meta.csv", index=False, encoding="utf-8-sig")

# 예시 교차표: GU×q3, GU×q7
def cross_and_v(df, a, b):
    """Cross-tabulate columns *a* and *b* of *df* and compute Cramér's V.

    Returns a ``(crosstab, v)`` tuple. ``v`` is NaN when the table holds no
    observations or when it has only one row or only one column.
    """
    table = pd.crosstab(df[a], df[b])
    observed = table.values
    total = observed.sum()
    if total == 0:
        return table, np.nan

    # Expected counts under independence: outer product of the margins.
    row_totals = observed.sum(1)
    col_totals = observed.sum(0)
    expected = np.outer(row_totals, col_totals) / total

    # Cells with a zero expected count produce NaN/inf; they are silenced
    # here and skipped by nansum.
    with np.errstate(divide='ignore', invalid='ignore'):
        chi2 = np.nansum((observed - expected) ** 2 / expected)

    n_rows, n_cols = observed.shape
    smaller_dim = min(n_rows - 1, n_cols - 1)
    if smaller_dim > 0:
        v = np.sqrt(chi2 / (total * smaller_dim))
    else:
        v = np.nan
    return table, v

# Export the example cross-tabulations of GU against q3 and q7. Each pair is
# written only when GU is an effective nominal variable and the other column
# exists; rows missing either value are excluded before tabulating.
for other in ("q3", "q7"):
    if "GU" not in effective_nominals or other not in df.columns:
        continue
    m = df["GU"].notna() & df[other].notna()
    ct, v = cross_and_v(df.loc[m, :], "GU", other)
    ct.to_csv(OUT_DIR/f"GU_x_{other}_crosstab.csv", encoding="utf-8-sig")
    with open(OUT_DIR/f"GU_x_{other}_cramers_v.txt", "w", encoding="utf-8") as f:
        f.write(str(v))

# Last expression of the cell: show the first rows of the summary.
nom_summary.head(10)
