# 수원서베이 2024 라벨링 데이터 자동 EDA

- 변수 유형 분류, 결측 분석, 기본 시각화, 결과 저장을 자동화합니다.

In [9]:

from pathlib import Path
import re
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

# Plot text in this notebook is Korean, so a Hangul-capable font is required.
# The original hard-coded 'Malgun Gothic' (the Windows font); pick the first
# candidate that matplotlib actually knows about so the notebook also renders
# on machines where a different Korean font (e.g. NanumGothic) is installed.
from matplotlib import font_manager

_KOREAN_FONT_CANDIDATES = ("Malgun Gothic", "NanumGothic", "AppleGothic")
_installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
plt.rcParams['font.family'] = next(
    (name for name in _KOREAN_FONT_CANDIDATES if name in _installed_fonts),
    "Malgun Gothic",  # nothing found: keep the original setting unchanged
)
# Draw the minus sign as an ASCII hyphen (set once; the duplicate assignment
# that used to follow the directory setup was redundant).
plt.rcParams['axes.unicode_minus'] = False

# All paths are resolved relative to the parent of the current working
# directory, so the notebook must be started from a sub-folder of the project.
BASE = Path.cwd().parent
DATA_CSV = BASE / "output" / "1. 수원서베이" / "suwon_2024_labeled.csv"
DATA_XLSX = BASE / "output" / "1. 수원서베이" / "suwon_2024_labeled.xlsx"
CODEBOOK = BASE / "data" / "internal" / "1. 수원서베이" / "(HRC250604) 2024년 수원서베이 용역_공개용 데이터" / "(HRC250604) 2024년 수원서베이 용역_공개용 데이터_코드북.xlsx"

# Output layout: <BASE>/eda/{charts,tables,logs}
OUTDIR = BASE / "eda"
CHARTS = OUTDIR / "charts"
TABLES = OUTDIR / "tables"
LOGS = OUTDIR / "logs"
for d in (OUTDIR, CHARTS, TABLES, LOGS):
    d.mkdir(parents=True, exist_ok=True)

def now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return format(datetime.now(), "%Y-%m-%d %H:%M:%S")


In [10]:

# Candidate sources in priority order: the Excel workbook (sheet
# "data_labeled") is preferred, the CSV export is the fallback.
_loaders = (
    (DATA_XLSX, lambda path: pd.read_excel(path, sheet_name="data_labeled")),
    (DATA_CSV, pd.read_csv),
)
for _source, _load in _loaders:
    if _source.exists():
        df = _load(_source)
        break
else:
    # Neither file exists: stop here rather than continue without data.
    raise FileNotFoundError("라벨링 데이터 파일이 없습니다. suwon_2024_labeled.xlsx 또는 CSV를 확인하세요.")

print("Loaded shape:", df.shape)
df.head(3)


Loaded shape: (3057, 481)


Unnamed: 0,PID,H0a1,H0a2,H0a3,gender,birth,H0a4,H0a5n1,H0a5opn1,H0a9n1,...,SCORE1,SCORE2,MQ1,MQ2,MQ4,MHQ1,MHQ2,MHQ4,wg,ws
0,29715,아파트,1인 가구,해당 없음,1,1992,1,본인,,남,...,보통,불만족,4.0,5.6,4.6,50.0,56,60.0,241.0,0.729643
1,51295,아파트,다인 가구,맞벌이,2,1981,4,본인의 배우자,,남,...,보통,만족,4.857143,8.0,5.3,64.285714,80,71.666667,202.384615,0.612732
2,52982,아파트,다인 가구,외벌이,1,1977,5,본인,,남,...,보통,만족,4.571429,5.6,5.1,59.52381,56,68.333333,212.0,0.641843


In [11]:

# Substrings that typically appear in Likert-scale answer labels.
LIKERT_KEYWORDS = [
    "전혀", "그렇지 않다", "보통", "그렇다", "매우",
    "만족", "불만족", "동의", "비동의",
    "낮다", "높다", "나쁘다", "좋다",
    "의견", "정도", "점수", "만큼",
]


def looks_like_likert(series, sample_k=30):
    """Heuristically decide whether ``series`` holds Likert-scale labels.

    A column qualifies when it has between 4 and 7 distinct non-missing
    values and at least 20% (minimum one) of its first ``sample_k`` distinct
    string labels contain one of ``LIKERT_KEYWORDS``.

    Args:
        series: pandas Series to inspect.
        sample_k: maximum number of distinct labels to scan for keywords.

    Returns:
        bool: True when the column looks like a Likert item.
    """
    level_count = series.nunique(dropna=True)
    if not 4 <= level_count <= 7:
        return False

    labels = series.dropna().astype(str).unique()[:sample_k]
    matches = sum(
        1 for label in labels
        if any(keyword in label for keyword in LIKERT_KEYWORDS)
    )
    required = max(1, int(np.ceil(len(labels) * 0.2)))
    return matches >= required

def infer_var_type(s, cat_threshold=0.05, max_cat_unique=30):
    """Classify a column as ``"numeric"``, ``"ordinal_likert"`` or ``"categorical"``.

    The column is numeric when more than 98% of its *non-missing* values can
    be parsed as numbers. Missing values are excluded from that ratio: they
    are neither numeric nor non-numeric, and counting them against the column
    (as was done previously) turned every numeric variable with more than 2%
    missing answers into a categorical one.

    Args:
        s: pandas Series holding one variable.
        cat_threshold: kept for backward compatibility; it has no effect
            because every non-numeric, non-Likert column is "categorical".
        max_cat_unique: kept for backward compatibility; no effect (see above).

    Returns:
        str: one of ``"numeric"``, ``"ordinal_likert"``, ``"categorical"``.
        A column without any non-missing value is reported as "categorical".
    """
    s_num = pd.to_numeric(s, errors="coerce")
    n_present = int(s.notna().sum())
    # Share of parseable numbers among the values that are actually present.
    numeric_ratio = (s_num.notna().sum() / n_present) if n_present else 0.0

    if numeric_ratio > 0.98:
        return "numeric"
    if looks_like_likert(s):
        return "ordinal_likert"
    # The former cardinality check returned "categorical" on both of its
    # paths, so it is collapsed into a single fall-through.
    return "categorical"

def summarize_numeric(s):
    """Return descriptive statistics for a column interpreted as numbers.

    Values that cannot be parsed as numbers are treated as missing for the
    statistics. ``na_rate`` is computed on the original column, so it only
    reflects values that were missing before the numeric conversion.

    Args:
        s: pandas Series holding one variable.

    Returns:
        dict with keys count, mean, std, min, q25, median, q75, max, nunique
        and na_rate. All statistics are NaN when no numeric value exists.
    """
    values = pd.to_numeric(s, errors="coerce")
    n_valid = int(values.count())

    def _stat(compute):
        # Only evaluate the statistic when there is at least one number.
        return float(compute()) if n_valid else np.nan

    return {
        "count": n_valid,
        "mean": _stat(values.mean),
        "std": _stat(values.std),
        "min": _stat(values.min),
        "q25": _stat(lambda: values.quantile(0.25)),
        "median": _stat(values.median),
        "q75": _stat(lambda: values.quantile(0.75)),
        "max": _stat(values.max),
        "nunique": int(values.nunique(dropna=True)),
        "na_rate": float(s.isna().mean()),
    }

def summarize_categorical(s, topk=30):
    """Build a frequency table for a column treated as categorical.

    Missing values are counted under the label "(결측)" and every value is
    compared as a string.

    Args:
        s: pandas Series holding one variable.
        topk: maximum number of rows (most frequent first) to return.

    Returns:
        DataFrame with columns ``value``, ``ratio`` (count divided by the
        length of ``s``) and ``count``.
    """
    labels = s.fillna("(결측)").astype(str)
    counts = labels.value_counts(dropna=False)
    table = pd.DataFrame(
        {
            "value": counts.index,
            "ratio": counts.values / len(s),
            "count": counts.values,
        }
    )
    return table.head(topk)

def plot_numeric_hist(s, title, outpath, bins=30):
    """Save a histogram of the numeric values in ``s`` to ``outpath``.

    Values that cannot be parsed as numbers are dropped before plotting.

    Args:
        s: pandas Series holding one variable.
        title: figure title.
        outpath: destination passed to ``Figure.savefig``.
        bins: number of histogram bins (defaults to the former fixed 30).

    Raises:
        Whatever matplotlib raises while drawing or saving; the figure is
        closed in every case.
    """
    # Convert first so a conversion failure cannot leave a figure behind.
    s_num = pd.to_numeric(s, errors="coerce").dropna()
    fig = plt.figure()
    try:
        plt.hist(s_num, bins=bins)
        plt.title(title)
        plt.xlabel("value")
        plt.ylabel("count")
        plt.tight_layout()
        fig.savefig(outpath)
    finally:
        # Callers catch plotting errors and keep looping; without this the
        # failed figures would stay registered with pyplot and pile up.
        plt.close(fig)

def plot_numeric_box(s, title, outpath):
    """Save a vertical box plot of the numeric values in ``s`` to ``outpath``.

    Values that cannot be parsed as numbers are dropped before plotting.

    Args:
        s: pandas Series holding one variable.
        title: figure title.
        outpath: destination passed to ``Figure.savefig``.

    Raises:
        Whatever matplotlib raises while drawing or saving; the figure is
        closed in every case.
    """
    # Convert first so a conversion failure cannot leave a figure behind.
    s_num = pd.to_numeric(s, errors="coerce").dropna()
    fig = plt.figure()
    try:
        # Vertical is matplotlib's default orientation, so no explicit
        # orientation argument is passed.
        plt.boxplot(s_num)
        plt.title(title)
        plt.ylabel("value")
        plt.tight_layout()
        fig.savefig(outpath)
    finally:
        # Callers catch plotting errors and keep looping; without this the
        # failed figures would stay registered with pyplot and pile up.
        plt.close(fig)

def plot_categorical_bar(s, title, outpath, topk=30):
    """Save a bar chart of the most frequent values in ``s`` to ``outpath``.

    Missing values are shown under the label "(결측)" and every value is
    compared as a string.

    Args:
        s: pandas Series holding one variable.
        title: figure title.
        outpath: destination passed to ``Figure.savefig``.
        topk: number of most frequent categories to draw (defaults to the
            former fixed 30).

    Raises:
        Whatever matplotlib raises while drawing or saving; the figure is
        closed in every case.
    """
    # Count first so a failure here cannot leave a figure behind.
    vc = s.fillna("(결측)").astype(str).value_counts()
    fig = plt.figure()
    try:
        plt.bar(vc.index[:topk], vc.values[:topk])
        plt.title(title)
        plt.xticks(rotation=75, ha="right")
        plt.ylabel("count")
        plt.tight_layout()
        fig.savefig(outpath)
    finally:
        # Callers catch plotting errors and keep looping; without this the
        # failed figures would stay registered with pyplot and pile up.
        plt.close(fig)


In [12]:

# Maximum number of charts drawn per variable type in the chart cell.
N_CHARTS_PER_TYPE = 30
summary_rows = []
cat_targets, num_targets, likert_targets = [], [], []
freq_dir = OUTDIR / "tables"

# Profile every column: numeric ones get descriptive statistics merged into
# their summary row, all others get a per-column frequency table on disk.
for col in df.columns:
    s = df[col]
    vtype = infer_var_type(s)
    info = dict(
        column=col,
        type=vtype,
        nunique=int(s.nunique(dropna=True)),
        na_rate=float(s.isna().mean()),
    )

    if vtype == "numeric":
        # The numeric summary also carries nunique/na_rate and overrides
        # the generic values set above.
        info.update(summarize_numeric(s))
        num_targets.append(col)
    else:
        freq = summarize_categorical(s, topk=50)
        freq.to_csv(freq_dir / f"{col}_freq.csv", index=False, encoding="utf-8-sig")
        bucket = likert_targets if vtype == "ordinal_likert" else cat_targets
        bucket.append(col)

    summary_rows.append(info)

# One row per variable, grouped by type, most complete columns first.
profile_df = pd.DataFrame(summary_rows).sort_values(
    by=["type", "na_rate", "nunique"],
    ascending=[True, True, False],
)
profile_path = freq_dir / "variables_profile.csv"
profile_df.to_csv(profile_path, index=False, encoding="utf-8-sig")

print("Profiling complete. Saved to:", profile_path)
profile_df.head(10)


Profiling complete. Saved to: d:\workspace\dacon_sri\eda\tables\variables_profile.csv


Unnamed: 0,column,type,nunique,na_rate,count,mean,std,min,q25,median,q75,max
449,DM32,categorical,44,0.0,,,,,,,,
270,Q25m1,categorical,18,0.0,,,,,,,,
117,H10r1,categorical,13,0.0,,,,,,,,
118,H10r2,categorical,13,0.0,,,,,,,,
119,H10r3,categorical,13,0.0,,,,,,,,
112,H8,categorical,12,0.0,,,,,,,,
461,DM12,categorical,12,0.0,,,,,,,,
464,DM13,categorical,12,0.0,,,,,,,,
132,Q2a1,categorical,11,0.0,,,,,,,,
133,Q2a2,categorical,11,0.0,,,,,,,,


In [13]:
# Re-derive the output folders and make sure one chart sub-folder exists
# per variable type.
CHARTS = OUTDIR / "charts"
LOGS = OUTDIR / "logs"
for _subfolder in ("numeric", "categorical", "likert"):
    (CHARTS / _subfolder).mkdir(parents=True, exist_ok=True)

def pick_top(df_profile, kind, n=30):
    """Return up to ``n`` column names of the given variable type.

    Columns are ranked by ascending ``na_rate`` and, within equal missing
    rates, by ascending ``nunique``.

    Args:
        df_profile: profile table with ``column``, ``type``, ``na_rate`` and
            ``nunique`` columns.
        kind: value of ``type`` to select.
        n: maximum number of names to return.

    Returns:
        list of column names; empty when no variable has that type.
    """
    matching = df_profile.loc[df_profile["type"] == kind]
    if len(matching) == 0:
        return []
    ranked = matching.sort_values(by=["na_rate", "nunique"], ascending=True)
    return list(ranked["column"].iloc[:n])

def _log_chart_error(tag, column, error):
    """Append one failed-chart line to ``chart_errors.txt`` in the log folder."""
    with open(LOGS / "chart_errors.txt", "a", encoding="utf-8") as f:
        f.write(f"{tag} {column}: {error}\n")


top_nums, top_cats, top_likerts = (
    pick_top(profile_df, kind, N_CHARTS_PER_TYPE)
    for kind in ("numeric", "categorical", "ordinal_likert")
)

# Numeric variables: histogram plus box plot. Both share one try block, so a
# failing histogram also skips the box plot for that column.
numeric_dir = CHARTS / "numeric"
for c in top_nums:
    try:
        plot_numeric_hist(df[c], f"{c} (numeric)", numeric_dir / f"{c}_hist.png")
        plot_numeric_box(df[c], f"{c} (numeric box)", numeric_dir / f"{c}_box.png")
    except Exception as e:
        # Best effort: record the failure and continue with the next column.
        _log_chart_error("NUMERIC", c, e)

# Categorical and Likert variables share the same bar-chart routine; only
# the log tag and the folder/title label differ.
for tag, label, columns in (
    ("CATEG", "categorical", top_cats),
    ("LIKERT", "likert", top_likerts),
):
    for c in columns:
        try:
            plot_categorical_bar(df[c], f"{c} ({label})", CHARTS / label / f"{c}_bar.png")
        except Exception as e:
            _log_chart_error(tag, c, e)

len(top_nums), len(top_cats), len(top_likerts)


(25, 30, 30)

In [14]:

# Share of missing values per column, highest first, rounded to 4 decimals
# and written next to the other tables.
na_share = df.isna().mean().sort_values(ascending=False)
missing = na_share.round(4).rename_axis("column").reset_index(name="na_rate")
missing_path = OUTDIR / "tables" / "missing_rates.csv"
missing.to_csv(missing_path, index=False, encoding="utf-8-sig")
missing.head(20)


Unnamed: 0,column,na_rate
0,H0a5opn7,1.0
1,H0a5opn6,1.0
2,H0a5opn5,1.0
3,H0a5opn4,1.0
4,H0a5opn3,1.0
5,H1_1a6n1,1.0
6,H1_1a6n6,1.0
7,H1_1a5n6,1.0
8,H1_1a6n2,1.0
9,H1_1a5n1,1.0
