In [1]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register a Nanum TTF file with Matplotlib and make it the default font
# family used for plots.
# NOTE(review): the file on disk is NanumGothic.ttf but it is registered under
# the family name 'NanumBarunGothic'. The alias is kept unchanged in case other
# code refers to it — confirm whether the mismatch is intentional.
FONT_PATH = r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf'  # path of the TTF file
FONT_NAME = 'NanumBarunGothic'                                  # family name to register it under

fe = fm.FontEntry(fname=FONT_PATH, name=FONT_NAME)
fm.fontManager.ttflist.insert(0, fe)  # add the entry at the front of Matplotlib's font list

# A single rcParams update sets both the default size and the family; no
# additional plt.rc('font', family=...) call is needed for the same family.
plt.rcParams.update({'font.size': 10, 'font.family': FONT_NAME})
import seaborn as sns
from math import radians, sin, cos, sqrt, atan2

# utils
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from lightgbm import LGBMRegressor
import lightgbm as lgb
from pandas.api.types import CategoricalDtype

import eli5
from eli5.sklearn import PermutationImportance
import json

#### Data Load

In [2]:
# Pickled train / test feature tables stored under ./Data/02_feature_end.
TRAIN_FE_PATH = "./Data/02_feature_end/train_fe.pkl"
TEST_FE_PATH  = "./Data/02_feature_end/test_fe.pkl"

# Read both tables in one pass, train first, then test.
train_df, test_df = (
    pd.read_pickle(pkl_path) for pkl_path in (TRAIN_FE_PATH, TEST_FE_PATH)
)

# Quick look at what was loaded: shapes, the first 20 column names, and the
# first rows of the is_test / target columns.
print("train_df shape:", train_df.shape)
print("test_df  shape:", test_df.shape)
print("train_df columns 예시:", train_df.columns.tolist()[:20])
print(train_df.loc[:, ["is_test", "target"]].head())

train_df shape: (1118822, 127)
test_df  shape: (9272, 127)
train_df columns 예시: ['시군구', '번지', '본번', '부번', '아파트명', '전용면적', '계약년월', '계약일', '층', '건축년도', '도로명', '해제사유발생일', '등기신청일자', '거래유형', '중개사소재지', 'k-단지분류(아파트,주상복합등등)', '단지소개기존clob', 'k-세대타입(분양형태)', 'k-관리방식', 'k-복도유형']
   is_test   target
0        0  28000.0
1        0  21100.0
2        0  30000.0
3        0  15800.0
4        0  22000.0


In [None]:
TARGET_COL = "target"
# Columns that are never used as model inputs.
NON_FEATURE_COLS = [
    TARGET_COL,
    "is_test",
    # add id / index-like columns here if needed
]

orig_feature_cols = [c for c in train_df.columns if c not in NON_FEATURE_COLS]
print("전체 사용가능 피처 개수:", len(orig_feature_cols))

# Load the Top-60 feature list that was saved to the 03_model_end directory.
TOP_N = 60
TOP_FEATURES_PATH = "./Data/03_model_end/top60_features.json"

with open(TOP_FEATURES_PATH, "r", encoding="utf-8") as f:
    top_features = json.load(f)

print(f"Top{TOP_N} (from 03):", len(top_features))

# Keep only the listed features that still exist in train_df (the column set
# may have changed between stages), preserving the order of the saved list.
available_features = set(orig_feature_cols)
selected_features = [feat for feat in top_features if feat in available_features]

# Name every feature that had to be dropped instead of only letting the count
# below hint at it — a shrunken feature set silently changes the model.
missing_features = [feat for feat in top_features if feat not in available_features]
if missing_features:
    print(f"⚠ train_df에 없어서 제외된 feature {len(missing_features)}개:", missing_features)

print("실제로 사용 가능한 feature 수:", len(selected_features))
print("사용 feature 예시:", selected_features[:20])

# Final feature matrices and target.
X_full = train_df[selected_features].copy()
y_full = train_df[TARGET_COL].copy()

X_test_full = test_df[selected_features].copy()

print("X_full shape:", X_full.shape)
print("X_test_full shape:", X_test_full.shape)

전체 사용가능 피처 개수: 125
Top60 (from 03): 60
실제로 사용 가능한 feature 수: 60
사용 feature 예시: ['동_평균가격', '전용면적', '계약년월', 'dong_avg_pyp', 'log_area', 'contract_year', '구_평균가격', 'area_pyeong', '건축년도', 'max_floor_in_complex', 'mortgage_ma6', 'same_apt_area_total_trades', 'building_age', 'base_rate', '구_중위가격', '거래유형', 'mortgage_rate', '도로명', 'spread', 'avg_age_by_dong']
X_full shape: (1118822, 60)
X_test_full shape: (9272, 60)


In [4]:
# Encode object-dtype columns as integer codes. Train and test rows are stacked
# first so that both share one code mapping per column.
combined = pd.concat([X_full, X_test_full], axis=0, ignore_index=True)

cat_cols = [col for col in selected_features if combined[col].dtype == "object"]
print("범주형 컬럼 수:", len(cat_cols))
print("범주형 컬럼 예시:", cat_cols[:20])

for col in cat_cols:
    # factorize returns (codes, uniques); only the integer codes are kept.
    codes, _ = pd.factorize(combined[col], sort=True)
    combined[col] = codes

# Split the stacked frame back into its train and test parts: the first
# len(X_full) rows are train, the remainder is test.
n_train_rows = len(X_full)
X_full_encoded = combined.iloc[:n_train_rows].reset_index(drop=True)
X_test_encoded = combined.iloc[n_train_rows:].reset_index(drop=True)

print("X_full_encoded shape:", X_full_encoded.shape)
print("X_test_encoded shape:", X_test_encoded.shape)

범주형 컬럼 수: 12
범주형 컬럼 예시: ['거래유형', '도로명', 'building_age_bucket', '부번', '번지', '시군구', '아파트명', '중개사소재지', '구', '동', '본번', 'area_bucket']
X_full_encoded shape: (1118822, 60)
X_test_encoded shape: (9272, 60)


In [5]:
def make_safe_feature_name(name):
    """Return ``name`` with non-word characters turned into single underscores.

    Steps, in order:
      1. every character that is neither a regex word character nor in the
         Hangul range 가-힣 is replaced by "_";
      2. each run of consecutive "_" is collapsed to one "_";
      3. leading and trailing "_" are stripped.
    """
    underscored = re.sub(r"[^\w가-힣]", "_", name)
    collapsed = re.sub(r"_+", "_", underscored)
    return collapsed.strip("_")

# Apply the same sanitized column names to both train and test.
rename_map = {col: make_safe_feature_name(col) for col in train_df.columns}

train_df = train_df.rename(columns=rename_map)
test_df  = test_df.rename(columns=rename_map)

# X_full_encoded / X_test_encoded were copied out of train_df / test_df before
# this rename, so renaming only the source frames leaves the matrices that are
# actually passed to LightGBM with their raw column names. Rename them as well.
feature_rename_map = {
    col: make_safe_feature_name(col) for col in X_full_encoded.columns
}

# Two different raw names can sanitize to the same string; duplicated column
# names would make the feature matrix ambiguous, so fail early and say which.
safe_names = list(feature_rename_map.values())
colliding_names = sorted({n for n in safe_names if safe_names.count(n) > 1})
if colliding_names:
    raise ValueError(
        f"Sanitized feature names are not unique: {colliding_names}"
    )

X_full_encoded = X_full_encoded.rename(columns=feature_rename_map)
X_test_encoded = X_test_encoded.rename(columns=feature_rename_map)

In [None]:
# Local validation (optional): hold out 20% of the encoded training rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full_encoded, y_full, test_size=0.2, random_state=42
)
print("X_train:", X_train.shape, "X_valid:", X_valid.shape)

# Fixed LightGBM settings.
base_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "random_state": 42,
    "num_threads": -1,
}

# best_params carried over unchanged (where the values come from is not shown
# in this notebook).
best_params = {
    "learning_rate": 0.030710573677773714,
    "num_leaves": 244,
    "max_depth": 12,
    "min_data_in_leaf": 188,
    "feature_fraction": 0.6624074561769746,
    "bagging_fraction": 0.662397808134481,
    "bagging_freq": 0,
    "lambda_l1": 4.330880728874676,
    "lambda_l2": 3.005575058716044,
}

# Start from the base settings and overlay best_params; on a key clash the
# best_params value wins.
final_params = dict(base_params)
final_params.update(best_params)

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

print("\n[Local Validation용 LGBM 학습 시작]")
# Stop once the validation metric has not improved for 100 rounds, and log the
# metric every 50 rounds.
validation_callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(50),
]
model_cv = lgb.train(
    final_params,
    train_data,
    num_boost_round=2000,
    valid_sets=[valid_data],
    valid_names=["valid"],
    callbacks=validation_callbacks,
)

# Score the hold-out rows using the model's recorded best iteration.
y_valid_pred = model_cv.predict(X_valid, num_iteration=model_cv.best_iteration)
valid_mse = mean_squared_error(y_valid, y_valid_pred)
rmse = np.sqrt(valid_mse)
print(f"\n[Local Validation RMSE]: {rmse:.4f}")
print("Best iteration:", model_cv.best_iteration)

X_train: (895057, 60) X_valid: (223765, 60)

[Local Validation용 LGBM 학습 시작]
Training until validation scores don't improve for 100 rounds
[50]	valid's rmse: 16473.6
[100]	valid's rmse: 10695.1
[150]	valid's rmse: 9143.88
[200]	valid's rmse: 8493.7
[250]	valid's rmse: 8113.42
[300]	valid's rmse: 7825.39
[350]	valid's rmse: 7586.56
[400]	valid's rmse: 7413.49
[450]	valid's rmse: 7260.98
[500]	valid's rmse: 7136.19
[550]	valid's rmse: 7008.65
[600]	valid's rmse: 6908.71
[650]	valid's rmse: 6821.69
[700]	valid's rmse: 6750.55
[750]	valid's rmse: 6683.63
[800]	valid's rmse: 6615.34
[850]	valid's rmse: 6567.71
[900]	valid's rmse: 6505.73
[950]	valid's rmse: 6456.3
[1000]	valid's rmse: 6406.12
[1050]	valid's rmse: 6359.79
[1100]	valid's rmse: 6317.07
[1150]	valid's rmse: 6286.57
[1200]	valid's rmse: 6259.34
[1250]	valid's rmse: 6225.24
[1300]	valid's rmse: 6195.08
[1350]	valid's rmse: 6163.05
[1400]	valid's rmse: 6143.01
[1450]	valid's rmse: 6121.43
[1500]	valid's rmse: 6097.42
[1550]	valid's

In [7]:
# Train the final model on the full training set.
train_data_full = lgb.Dataset(X_full_encoded, label=y_full)

# Number of boosting rounds: reuse the best iteration from the local-validation
# model when that cell has been run AND it recorded a positive value; otherwise
# fall back to a fixed budget. A missing model or a best_iteration of 0 / None
# must not be passed through as num_boost_round.
FALLBACK_BOOST_ROUNDS = 2000
cv_best_iteration = getattr(globals().get("model_cv"), "best_iteration", None)
if cv_best_iteration and cv_best_iteration > 0:
    final_num_boost_round = cv_best_iteration
else:
    final_num_boost_round = FALLBACK_BOOST_ROUNDS
# Make the value actually used visible in the cell output.
print("num_boost_round:", final_num_boost_round)

print("\n[최종 제출용 모델 학습 시작]")
final_model = lgb.train(
    final_params,
    train_data_full,
    num_boost_round=final_num_boost_round,
    # Only the training set is monitored here, so the logged RMSE is a
    # training error, not a validation score.
    valid_sets=[train_data_full],
    valid_names=["train"],
    callbacks=[
        lgb.log_evaluation(200),
    ],
)

# Save the model for later reuse (optional).
OUTPUT_DIR = "./Data/04_final_model"
os.makedirs(OUTPUT_DIR, exist_ok=True)

MODEL_PATH = os.path.join(OUTPUT_DIR, "lgbm_final_model.pkl")
# `pickle` is already imported in the notebook's import cell.
# NOTE: unpickling runs arbitrary code — only load this file from a trusted source.
with open(MODEL_PATH, "wb") as f:
    pickle.dump(final_model, f)

print("✅ 최종 모델 저장 완료:", MODEL_PATH)


[최종 제출용 모델 학습 시작]
[200]	train's rmse: 7879.12
[400]	train's rmse: 6689.1
[600]	train's rmse: 6127.49
[800]	train's rmse: 5801.92
[1000]	train's rmse: 5565.55
[1200]	train's rmse: 5377.19
[1400]	train's rmse: 5219.81
[1600]	train's rmse: 5096.55
[1800]	train's rmse: 4994.54
[2000]	train's rmse: 4897.29
✅ 최종 모델 저장 완료: ./Data/04_final_model/lgbm_final_model.pkl


In [10]:
# Load sample_submission and build the submission file from it.
SUBMISSION_DIR = "./submission"
os.makedirs(SUBMISSION_DIR, exist_ok=True)

SAMPLE_SUB_PATH = "./Data/sample_submission.csv"
sample = pd.read_csv(SAMPLE_SUB_PATH)

print("sample_submission shape:", sample.shape)
print(sample.head())

# Predict on the encoded test rows. final_model is trained without an
# early-stopping callback, so no best iteration is selected; predict with the
# model's default iteration handling instead of passing best_iteration.
test_pred = final_model.predict(X_test_encoded)
test_pred_int = np.round(test_pred).astype(int)  # round predictions to whole numbers

SUBMIT_PATH = os.path.join(SUBMISSION_DIR, "submission_lgbm_top60.csv")

if "target" in sample.columns:
    sample["target"] = test_pred_int
    sample.to_csv(SUBMIT_PATH, index=False)
    print("✅ 최종 제출 파일 저장 완료:", SUBMIT_PATH)
else:
    # Without a 'target' column the predictions were never written into
    # `sample`; saving it would produce a file that only contains the sample's
    # own values while reporting success, so nothing is written.
    print("⚠ sample_submission에 'target' 컬럼이 없습니다.")
    print("현재 sample columns:", sample.columns)
    print("⚠ 제출 파일을 저장하지 않았습니다.")

sample_submission shape: (9272, 1)
   target
0  179048
1   84820
2  248141
3  180991
4  295430
✅ 최종 제출 파일 저장 완료: ./submission/submission_lgbm_top60.csv
