In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input: raw auction records (CSV).
AUCTION_PATH = "/content/auction_df.csv"
# Output: filtered rows with the computed alpha column.
OUT_CLEAN = "/content/auction_alpha_clean.csv"
# Output: one-row table of alpha summary statistics.
OUT_STATS = "/content/auction_alpha_stats.csv"

In [3]:
# 1. 숫자형 변환 함수

def to_number(x):
    """
    Safely convert a price-like scalar (possibly containing commas,
    currency symbols or other text) into a float.

    Parameters
    ----------
    x : scalar
        A single cell value: a number, a string, or a missing value.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when ``x`` is missing or when no
        valid number can be recovered from it.

    Notes
    -----
    Only digits, ``.`` and ``-`` are kept from string input, so exponent
    notation is not understood (``"1e5"`` is read as ``15``).
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float, np.integer, np.floating)):
        return float(x)
    s = str(x).strip().replace(",", "")
    s = "".join(ch for ch in s if ch.isdigit() or ch in ".-")
    # The filtered text can still be malformed ("2023-01-01", "1.2.3",
    # "100-", "", "-", ...). Treat every such case as missing instead of
    # letting float() raise and abort a whole-column .apply().
    try:
        return float(s)
    except ValueError:
        return np.nan

In [4]:
# 2. Load the raw auction table
auc = pd.read_csv(AUCTION_PATH)

# 3. Coerce the price columns to float (columns absent from the file are skipped)
PRICE_COLS = ["감정가", "매각가", "최저가"]

for col in [name for name in PRICE_COLS if name in auc.columns]:
    auc[col] = auc[col].apply(to_number)

# 4. Keep only rows whose status text contains "낙찰"
sold_mask = auc["진행상태"].astype(str).str.contains("낙찰", na=False)
auc_sold = auc.loc[sold_mask]

# 5. Of those, keep only rows where both prices used for alpha are positive
#    (NaN compares as False, so unparseable prices are dropped here too)
has_valid_prices = auc_sold["감정가"].gt(0) & auc_sold["매각가"].gt(0)
auc_sold = auc_sold.loc[has_valid_prices].copy()

In [5]:

# 6. Recovery ratio: alpha = 매각가 / 감정가
auc_sold["alpha"] = auc_sold["매각가"].div(auc_sold["감정가"])

# 7. Outlier removal: keep rows whose alpha lies within the [1%, 99%]
#    quantile band (both ends inclusive).
# NOTE(review): this cell rebinds `auc_sold`, so running it a second time
# without re-running the previous cell trims the tails again.
low, high = auc_sold["alpha"].quantile([0.01, 0.99])
auc_sold = auc_sold.loc[auc_sold["alpha"].between(low, high)].copy()

# 8. Summary statistics of the alpha distribution
alpha = auc_sold["alpha"].to_numpy()

stats = {
    "n_total_rows": int(len(auc)),
    # Counted after the filters and the outlier trim above.
    "n_sold_rows": int(len(auc_sold)),
    "alpha_mean": float(np.mean(alpha)),
    "alpha_median": float(np.median(alpha)),
    "alpha_std": float(np.std(alpha, ddof=1)),  # sample standard deviation
    # Percentiles; p10 serves as the conservative recovery ratio.
    **{
        f"alpha_{label}": float(np.quantile(alpha, q))
        for label, q in (("p10", 0.10), ("p25", 0.25), ("p75", 0.75), ("p90", 0.90))
    },
    "alpha_min": float(np.min(alpha)),
    "alpha_max": float(np.max(alpha)),
}

# One-row frame so the statistics can be written out as a CSV.
stats_df = pd.DataFrame([stats])

In [6]:
# 9. Save results: both files are written without the index, as UTF-8 with BOM
csv_opts = {"index": False, "encoding": "utf-8-sig"}
auc_sold.to_csv(OUT_CLEAN, **csv_opts)
stats_df.to_csv(OUT_STATS, **csv_opts)

# Echo the summary table under its heading.
print("α 요약 통계량", stats_df, sep="\n")

α 요약 통계량
   n_total_rows  n_sold_rows  alpha_mean  alpha_median  alpha_std  alpha_p10  \
0          1017          995    0.754475      0.805248    0.18495   0.587731   

   alpha_p25  alpha_p75  alpha_p90  alpha_min  alpha_max  
0   0.700815   0.851393   0.910079   0.042032   1.048105  
