<a href="https://colab.research.google.com/github/veryHapppy/study_ai/blob/main/Kaggle/house_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install optuna
#!pip install catboost
from urllib.request import urlopen
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import optuna
from sklearn.model_selection import cross_val_score
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import VotingClassifier
import seaborn as sns
from sklearn.model_selection import learning_curve
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.linear_model import Ridge  # 스태킹 최종 모델은 보통 Ridge를 많이 씁니다

import warnings
warnings.filterwarnings('ignore', category=UserWarning)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Colab-only setup: mount Google Drive at /content/drive so the CSV files
# read in the following cells (under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training set, then discard two groups of outlier rows.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv/house_prices_train.csv')

# Very large living area (> 4000) sold for less than 300000.
large_but_cheap = (data['GrLivArea'] > 4000) & (data['SalePrice'] < 300000)
data = data.drop(index=data.index[large_but_cheap])

# Overall condition rated 2 yet sold for more than 300000.
# (Evaluated on the already-filtered frame, as the drops are sequential.)
poor_but_pricey = (data['OverallCond'] == 2) & (data['SalePrice'] > 300000)
data = data.drop(index=data.index[poor_but_pricey])


In [None]:
def feature_engineering(data, neighborhood_mapping, ordinal_mapping):
    """Add the engineered model features to ``data`` and return it.

    The frame is modified in place (columns are assigned directly on ``data``)
    and the very same object is returned, so callers may use either the return
    value or the frame they passed in. ``SalePrice`` is not read, which lets
    the function run on the unlabeled test set as well.

    Args:
        data: DataFrame with the raw columns GrLivArea, TotalBsmtSF,
            YearBuilt, YearRemodAdd, MSSubClass, MSZoning, Neighborhood,
            GarageCars, FullBath, HalfBath, BsmtFullBath and BsmtHalfBath.
        neighborhood_mapping: dict mapping a Neighborhood label to its rank.
        ordinal_mapping: dict mapping "<MSSubClass>_<MSZoning>" to its rank.

    Returns:
        ``data`` with TotalFinishedSF, FinalYear, Decade, shape,
        SubClass_Zoning_Rank, Neighborhood_Rank and TotalBath added, and
        missing GarageCars values replaced by 0.
    """
    # A missing basement area is counted as 0, consistent with how the
    # basement bath counts are handled below; otherwise one NaN would wipe out
    # the whole sum even though GrLivArea is known.
    total_sf = data['GrLivArea'] + data['TotalBsmtSF'].fillna(0)
    data['TotalFinishedSF'] = np.log1p(total_sf)

    # Most recent of construction / remodel year, bucketed into decades.
    data['FinalYear'] = np.maximum(data['YearBuilt'], data['YearRemodAdd'])
    data['Decade'] = (data['FinalYear'] // 10) * 10

    # The rank mappings are built by the caller, so no SalePrice is needed here.
    data['shape'] = data['MSSubClass'].astype(str) + "_" + data['MSZoning'].astype(str)
    # Combinations absent from the mapping get rank 0.
    data['SubClass_Zoning_Rank'] = data['shape'].map(ordinal_mapping).fillna(0)

    # Neighborhoods absent from the mapping fall back to rank 13.
    data['Neighborhood_Rank'] = data['Neighborhood'].map(neighborhood_mapping).fillna(13)

    data['GarageCars'] = data['GarageCars'].fillna(0)

    # Half baths count as 0.5; every operand is NaN-guarded so a single
    # missing count cannot turn the total into NaN.
    data['TotalBath'] = (data['FullBath'].fillna(0) +
                         (0.5 * data['HalfBath'].fillna(0)) +
                         data['BsmtFullBath'].fillna(0) +
                         (0.5 * data['BsmtHalfBath'].fillna(0)))
    return data

In [None]:
# Rank neighborhoods by median SalePrice (rank 1 = lowest median).
neighborhood_order = (
    data.groupby('Neighborhood')['SalePrice']
    .median()
    .sort_values()
    .index
)
neighborhood_mapping = dict(zip(neighborhood_order, range(1, len(neighborhood_order) + 1)))

# Rank each "<MSSubClass>_<MSZoning>" combination the same way.
data['shape'] = data['MSSubClass'].astype(str) + "_" + data['MSZoning'].astype(str)
ordered_labels = (
    data.groupby(['shape'])['SalePrice']
    .median()
    .sort_values()
    .index
)
ordinal_mapping = dict(zip(ordered_labels, range(1, len(ordered_labels) + 1)))

# Adds the engineered columns to `data` in place; the return value is not needed.
feature_engineering(data, neighborhood_mapping, ordinal_mapping)

features = [
    'OverallQual', 'LotArea', 'OverallCond', 'TotalFinishedSF', 'Decade',
    'SubClass_Zoning_Rank', 'Neighborhood_Rank', 'GarageCars', 'TotalBath',
    'Fireplaces',
]

x = data[features]
y = data['SalePrice']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Numeric pipeline: median imputation followed by standardisation.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Every selected feature is numeric, so one transformer covers them all.
preprocessor = ColumnTransformer([
    ('num', num_transformer, features),
])

# Fit the preprocessing on the training split only; train on log1p(price).
X_train_transformed = preprocessor.fit_transform(X_train)
y_train_log = np.log1p(y_train)

def objective(trial):
    """Optuna objective: mean 5-fold CV R2 of a weighted boosted-tree vote.

    Samples hyperparameters for an XGBoost, a LightGBM and a CatBoost
    regressor plus one voting weight per model, combines them in a
    VotingRegressor and scores it with cross-validation on
    ``X_train_transformed`` / ``y_train_log``.

    Args:
        trial: the Optuna trial used to draw every hyperparameter.

    Returns:
        Mean R2 over the 5 cross-validation folds.
    """
    # The suggest_* calls below run in the same order as before
    # (xgb -> lgbm -> cat -> weights); keyword arguments are evaluated
    # left to right, so the sampling sequence is unchanged.
    xgb = XGBRegressor(
        n_estimators=trial.suggest_int('xgb_n', 100, 500),
        max_depth=trial.suggest_int('xgb_depth', 3, 7),
        colsample_bytree=trial.suggest_float('xgb_colsample', 0.6, 0.8),
        reg_alpha=trial.suggest_float('xgb_alpha', 1e-3, 10.0, log=True),
        reg_lambda=trial.suggest_float('xgb_lambda', 1e-3, 10.0, log=True),
        learning_rate=trial.suggest_float('xgb_lr', 0.01, 0.1),
        n_jobs=-1,
        random_state=42,
    )

    lgbm = LGBMRegressor(
        n_estimators=trial.suggest_int('lgbm_n', 100, 500),
        num_leaves=trial.suggest_int('lgbm_leaves', 15, 50),
        learning_rate=trial.suggest_float('lgbm_lr', 0.01, 0.1),
        verbose=-1,
        n_jobs=-1,
        random_state=42,
    )

    cat = CatBoostRegressor(
        iterations=trial.suggest_int('cat_iter', 100, 500),
        depth=trial.suggest_int('cat_depth', 3, 7),
        learning_rate=trial.suggest_float('cat_lr', 0.01, 0.1),
        l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        random_strength=trial.suggest_float('random_strength', 0.1, 10.0),
        logging_level='Silent',  # keep CatBoost quiet
        thread_count=-1,
        random_state=42,
    )

    # One blending weight per base model, in estimator order.
    ensemble_weights = [
        trial.suggest_float('w_xgb', 0.5, 2.0),
        trial.suggest_float('w_lgbm', 0.5, 2.0),
        trial.suggest_float('w_cat', 0.5, 2.0),
    ]

    voting_model = VotingRegressor(
        estimators=[('xgb', xgb), ('lgbm', lgbm), ('cat', cat)],
        weights=ensemble_weights,
    )

    # Regression task, so the folds are scored with R2.
    return cross_val_score(
        voting_model, X_train_transformed, y_train_log, cv=5, scoring='r2'
    ).mean()

# 6. Run the hyperparameter search.
# TPESampler is Optuna's default sampler; seeding it makes the 70 trials (and
# therefore study.best_params used by the next cell) reproducible across
# Restart & Run All instead of changing on every execution.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=70)

print(f"최고 R2 점수: {study.best_value:.4f}")

[I 2025-12-21 07:20:14,887] A new study created in memory with name: no-name-ccdff4af-c432-4412-b00b-7aa50be8ed2b
[I 2025-12-21 07:20:18,076] Trial 0 finished with value: 0.8930027426224371 and parameters: {'xgb_n': 424, 'xgb_depth': 5, 'xgb_colsample': 0.7185818629687888, 'xgb_alpha': 1.540840376776914, 'xgb_lambda': 0.002205994303275656, 'xgb_lr': 0.08485011111639221, 'lgbm_n': 342, 'lgbm_leaves': 29, 'lgbm_lr': 0.07203974701618351, 'cat_iter': 191, 'cat_depth': 4, 'cat_lr': 0.08982842104355808, 'l2_leaf_reg': 6.337757879265702, 'random_strength': 3.0036898045831717, 'w_xgb': 0.8420579984496273, 'w_lgbm': 1.2418475800907713, 'w_cat': 1.1202986863776805}. Best is trial 0 with value: 0.8930027426224371.
[I 2025-12-21 07:20:24,065] Trial 1 finished with value: 0.8903390600713841 and parameters: {'xgb_n': 182, 'xgb_depth': 6, 'xgb_colsample': 0.7754990869704212, 'xgb_alpha': 0.004228083307055865, 'xgb_lambda': 0.009680039815364895, 'xgb_lr': 0.06964937619102345, 'lgbm_n': 282, 'lgbm_leav

최고 R2 점수: 0.9003


In [None]:


best = study.best_params

# 1. Base regressors rebuilt from the best Optuna trial.
xgb_final = XGBRegressor(
    n_estimators=best['xgb_n'],
    max_depth=best['xgb_depth'],
    learning_rate=best['xgb_lr'],
    colsample_bytree=best['xgb_colsample'],
    reg_alpha=best['xgb_alpha'],
    reg_lambda=best['xgb_lambda'],
    random_state=42,
)

lgbm_final = LGBMRegressor(
    n_estimators=best['lgbm_n'],
    num_leaves=best['lgbm_leaves'],
    learning_rate=best['lgbm_lr'],
    random_state=42,
    verbose=-1
)

cat_final = CatBoostRegressor(
    iterations=best['cat_iter'],
    depth=best['cat_depth'],
    learning_rate=best['cat_lr'],
    l2_leaf_reg=best['l2_leaf_reg'],
    random_strength=best['random_strength'],
    random_state=42,
    logging_level='Silent'
)

# 2. Voting and stacking ensembles over the same three base regressors.
voting_model = VotingRegressor(
    estimators=[('xgb', xgb_final), ('lgbm', lgbm_final), ('cat', cat_final)],
    weights=[best['w_xgb'], best['w_lgbm'], best['w_cat']]
)

stacking_model = StackingRegressor(
    estimators=[('xgb', xgb_final), ('lgbm', lgbm_final), ('cat', cat_final)],
    final_estimator=Ridge(),  # regression meta-learner (Ridge instead of LogisticRegression)
    cv=5
)

# Transform the hold-out split with the imputer/scaler statistics learned on
# X_train. This must be transform(), not fit_transform(): re-fitting here
# would leak hold-out statistics into the evaluation and would leave the
# preprocessor pickled below out of sync with the models, which were trained
# on X_train_transformed.
X_test_real_transformed = preprocessor.transform(X_test)

# 3. Fit the ensembles directly on the already-transformed training matrix.
# (Fitting a full Pipeline would require the raw X_train; the pre-transformed
# data is reused here for speed.)
voting_model.fit(X_train_transformed, y_train_log)
stacking_model.fit(X_train_transformed, y_train_log)

# 4. Predict the hold-out split (still in log1p space).
log_predictions1 = voting_model.predict(X_test_real_transformed)
log_predictions2 = stacking_model.predict(X_test_real_transformed)

# 5. Map the predictions back to actual prices.
final_predictions1 = np.expm1(log_predictions1)
final_predictions2 = np.expm1(log_predictions2)

# 6. Hold-out R2, computed in log space because the models were trained on
# log1p(SalePrice).
y_test_log = np.log1p(y_test)
print(f"voting 검증 R2: {voting_model.score(X_test_real_transformed, y_test_log):.4f}")
print(f"stacking 검증 R2: {stacking_model.score(X_test_real_transformed, y_test_log):.4f}")

# 7. Persist preprocessor + model together so raw feature frames can be
# passed straight to predict() after loading. Both steps are already fitted.
final_voting_pipeline = Pipeline([('pre', preprocessor), ('model', voting_model)])
final_stacking_pipeline = Pipeline([('pre', preprocessor), ('model', stacking_model)])

joblib.dump(final_voting_pipeline, 'house_price_voting_best.pkl')
joblib.dump(final_stacking_pipeline, 'house_price_stacking_best.pkl')
print("저장되었습니다")

voting 검증 R2: 0.9134
stacking 검증 R2: 0.9130
저장되었습니다


In [None]:
# 1. Load the Kaggle test set.
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv/house_prices_test.csv')

# Rebuild both rank mappings from the training frame `data`, so this cell
# does not depend on the variables created in the training cell.
neighborhood_order = (
    data.groupby('Neighborhood')['SalePrice']
    .median()
    .sort_values()
    .index
)
neighborhood_mapping = dict(zip(neighborhood_order, range(1, len(neighborhood_order) + 1)))

data['shape'] = data['MSSubClass'].astype(str) + "_" + data['MSZoning'].astype(str)
ordered_labels = (
    data.groupby(['shape'])['SalePrice']
    .median()
    .sort_values()
    .index
)
ordinal_mapping = dict(zip(ordered_labels, range(1, len(ordered_labels) + 1)))

# Add the engineered columns the saved pipeline selects by name.
test_df = feature_engineering(test_df, neighborhood_mapping, ordinal_mapping)

# 2. Load the saved voting pipeline (preprocessor + model).
loaded_model = joblib.load('house_price_voting_best.pkl')

# 3. Predict; test_df now contains every column the pipeline needs.
log_predictions = loaded_model.predict(test_df)

# 4. Undo the log1p transform applied to the training target.
final_predictions = np.expm1(log_predictions)

# 5. Build the submission frame: Id plus the predicted SalePrice.
submission = test_df[["Id"]].assign(SalePrice=final_predictions)

# 6. Write the CSV.
submission.to_csv('submission_house_price.csv', index=False)
print("집값 예측 최종 제출 파일이 생성되었습니다.")

집값 예측 최종 제출 파일이 생성되었습니다.
