<a href="https://colab.research.google.com/github/sun9huni/first-repository/blob/main/%EA%B9%80%EC%84%B1%ED%9B%88_%ED%9A%8C%EA%B7%80_%EC%A7%91%EA%B0%92%EC%98%88%EC%B8%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# 20255.43899

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import skew, uniform, randint
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# 데이터 전처리 함수
def preprocess_data(df):
    df.drop(['Id','PoolQC' , 'MiscFeature', 'Alley', 'Fence','FireplaceQu'], axis=1 , inplace=True)
    num_cols = df.select_dtypes(include=['number']).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].mean())
    features_index = df.dtypes[df.dtypes != 'object'].index
    skew_features = df[features_index].apply(lambda x : skew(x))
    skew_features_top = skew_features[skew_features > 1]
    df[skew_features_top.index] = np.log1p(df[skew_features_top.index])
    df_ohe = pd.get_dummies(df)
    return df_ohe, skew_features_top.index

# RMSE 평가 함수
def get_rmse(model, X_test, y_test):
    pred = model.predict(X_test)
    pred_exp = np.expm1(pred)
    mse = mean_squared_error(np.expm1(y_test), pred_exp)
    rmse = np.sqrt(mse)
    print('{0} 로그 변환된 RMSE: {1}'.format(model.__class__.__name__,np.round(rmse, 3)))
    return rmse

# RandomizedSearchCV 기반 하이퍼파라미터 튜닝
def get_best_model_random(model, params, X_features, y_target):
    random_model = RandomizedSearchCV(
        model, param_distributions=params, n_iter=20,
        scoring='neg_mean_squared_error', cv=5, random_state=42
    )
    random_model.fit(X_features, y_target)
    rmse = np.sqrt(-1 * random_model.best_score_)
    print('{0} 5 CV 최적 평균 RMSE: {1:.4f}, 최적 파라미터: {2}'.format(
        model.__class__.__name__, rmse, random_model.best_params_))
    return random_model.best_estimator_

# 스태킹 기반 데이터 생성 함수 (shuffle=True, random_state=42)
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        X_tr, y_tr = X_train_n[train_index], y_train_n[train_index]
        X_te = X_train_n[valid_index]
        model.fit(X_tr, y_tr)
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)
        test_pred[:, folder_counter] = model.predict(X_test_n)
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)
    return train_fold_pred, test_pred_mean

# 블렌딩(단순 평균) 예측 함수
def blending_predict(models, X):
    preds = [model.predict(X) for model in models]
    blended_pred = np.mean(preds, axis=0)
    return blended_pred

# 데이터 로드 및 전처리
house_df_org = pd.read_csv('train.csv')
house_df = house_df_org.copy()
house_df_ohe, skew_features_top_index = preprocess_data(house_df)

# 이상치 제거
cond1 = house_df_ohe['GrLivArea'] > np.log1p(4000)
cond2 = house_df_ohe['SalePrice'] < np.log1p(500000)
outlier_index = house_df_ohe[cond1 & cond2].index
house_df_ohe.drop(outlier_index , axis=0, inplace=True)

# 학습 데이터 분리
y_target = house_df_ohe['SalePrice']
X_features = house_df_ohe.drop('SalePrice',axis=1, inplace=False)
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=156)

# 하이퍼파라미터 탐색 공간 정의
ridge_params = {
    'alpha': uniform(0.01, 20),
    'fit_intercept': [True, False],
    'max_iter': randint(1000, 5000)
}
lasso_params = {
    'alpha': uniform(0.0001, 10),
    'fit_intercept': [True, False],
    'max_iter': randint(1000, 5000)
}
xgb_params = {
    'n_estimators': randint(500, 2000),
    'learning_rate': uniform(0.01, 0.1),
    'max_depth': randint(2, 8)
}
lgbm_params = {
    'n_estimators': randint(500, 2000),
    'learning_rate': uniform(0.01, 0.1),
    'max_depth': randint(2, 8)
}

print("하이퍼파라미터 튜닝 시작...")
best_ridge = get_best_model_random(Ridge(), ridge_params, X_features, y_target)
best_lasso = get_best_model_random(Lasso(), lasso_params, X_features, y_target)
best_xgb = get_best_model_random(XGBRegressor(random_state=156), xgb_params, X_features, y_target)
best_lgbm = get_best_model_random(LGBMRegressor(random_state=156), lgbm_params, X_features, y_target)
print("하이퍼파라미터 튜닝 완료.")

# 최적 파라미터로 모델 정의
ridge_reg = Ridge(**best_ridge.get_params())
lasso_reg = Lasso(**best_lasso.get_params())
xgb_reg = XGBRegressor(**best_xgb.get_params())
lgbm_reg = LGBMRegressor(**best_lgbm.get_params())

print("\n개별 모델 학습 시작...")
ridge_reg.fit(X_train, y_train)
lasso_reg.fit(X_train, y_train)
xgb_reg.fit(X_train, y_train)
lgbm_reg.fit(X_train, y_train)
print("개별 모델 학습 완료.")

# 개별 모델 RMSE 평가
models = [ridge_reg, lasso_reg, xgb_reg, lgbm_reg]
print("\n개별 모델 RMSE:")
for model in models:
    get_rmse(model, X_test, y_test)

# 블렌딩(단순 평균) RMSE 평가
blended_pred = blending_predict(models, X_test)
blended_rmse = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(blended_pred)))
print(f'블렌딩(단순 평균) RMSE: {blended_rmse:.3f}')

# 스태킹 기반 데이터 생성
X_train_n = X_train.values
X_test_n = X_test.values
y_train_n = y_train.values

print("\n스태킹 기반 데이터 생성 시작...")
ridge_train, ridge_test = get_stacking_base_datasets(ridge_reg, X_train_n, y_train_n, X_test_n, 5)
lasso_train, lasso_test = get_stacking_base_datasets(lasso_reg, X_train_n, y_train_n, X_test_n, 5)
xgb_train, xgb_test = get_stacking_base_datasets(xgb_reg, X_train_n, y_train_n, X_test_n, 5)
lgbm_train, lgbm_test = get_stacking_base_datasets(lgbm_reg, X_train_n, y_train_n, X_test_n, 5)
print("스태킹 기반 데이터 생성 완료.")

Stack_final_X_train = np.concatenate((ridge_train, lasso_train, xgb_train, lgbm_train), axis=1)
Stack_final_X_test = np.concatenate((ridge_test, lasso_test, xgb_test, lgbm_test), axis=1)

# 최종 메타 모델(Lasso) 정의 및 학습
meta_model_lasso = Lasso(alpha=0.0005)
print("\n메타 모델 학습 시작...")
meta_model_lasso.fit(Stack_final_X_train, y_train)
print("메타 모델 학습 완료.")
final_pred_stacking = meta_model_lasso.predict(Stack_final_X_test)

# 스태킹 회귀 모델 RMSE 평가
mse_stacking = mean_squared_error(y_test , final_pred_stacking)
rmse_stacking = np.sqrt(mse_stacking)
print('\n스태킹 회귀 모델의 최종 RMSE 값은:', rmse_stacking)

# 제출용 Test 데이터 전처리
test_df = pd.read_csv('./test.csv')
test_df.drop(['Id','PoolQC' , 'MiscFeature', 'Alley', 'Fence','FireplaceQu'], axis=1 , inplace=True)
num_cols_test = test_df.select_dtypes(include=['number']).columns
test_df[num_cols_test] = test_df[num_cols_test].fillna(test_df[num_cols_test].mean())
skew_cols_for_test = [col for col in skew_features_top_index if col != 'SalePrice']
test_df[skew_cols_for_test] = np.log1p(test_df[skew_cols_for_test])
test_ohe1 = pd.get_dummies(test_df)
test_ohe_processed = test_ohe1.reindex(columns=X_features.columns, fill_value=0)
print("\n테스트 데이터 전처리 완료. 전처리 후 테스트 데이터 shape:", test_ohe_processed.shape)

# 전체 학습 데이터로 기반모델 학습 및 테스트 데이터 예측
X_features_np = X_features.values
y_target_np = y_target.values
test_ohe_processed_np = test_ohe_processed.values

print("\n스태킹 모델 테스트 데이터 예측 시작...")
ridge_test_pred_for_submission = get_stacking_base_datasets(ridge_reg, X_features_np, y_target_np, test_ohe_processed_np, 5)[1]
lasso_test_pred_for_submission = get_stacking_base_datasets(lasso_reg, X_features_np, y_target_np, test_ohe_processed_np, 5)[1]
xgb_test_pred_for_submission = get_stacking_base_datasets(xgb_reg, X_features_np, y_target_np, test_ohe_processed_np, 5)[1]
lgbm_test_pred_for_submission = get_stacking_base_datasets(lgbm_reg, X_features_np, y_target_np, test_ohe_processed_np, 5)[1]

Stack_final_X_test_for_submission = np.concatenate((
    ridge_test_pred_for_submission,
    lasso_test_pred_for_submission,
    xgb_test_pred_for_submission,
    lgbm_test_pred_for_submission
), axis=1)

final_test_pred = meta_model_lasso.predict(Stack_final_X_test_for_submission)
print("스태킹 모델 테스트 데이터 예측 완료.")

# 블렌딩 방식 최종 제출 예측값도 생성
blended_pred_submission = blending_predict(models, test_ohe_processed)
blended_pred_submission_exp = np.expm1(blended_pred_submission)
blended_pred_submission_exp = np.nan_to_num(blended_pred_submission_exp, posinf=1e10, neginf=0)

# 예측값 복원 및 제출 파일 생성
final_test_pred_exp = np.expm1(final_test_pred)
final_test_pred_exp = np.nan_to_num(final_test_pred_exp, posinf=1e10, neginf=0)

sample = pd.read_csv('./sample_submission.csv')
# 스태킹 결과
sample["SalePrice"] = final_test_pred_exp
sample.to_csv("./submission_stacking.csv", index=False)
print("\nsubmission_stacking.csv 파일이 성공적으로 생성되었습니다.")
print("스태킹 제출 파일의 예측값 shape:", final_test_pred_exp.shape)

# 블렌딩 결과
sample_blend = pd.read_csv('./sample_submission.csv')
sample_blend["SalePrice"] = blended_pred_submission_exp
sample_blend.to_csv("./submission_blending.csv", index=False)
print("submission_blending.csv 파일이 성공적으로 생성되었습니다.")
print("블렌딩 제출 파일의 예측값 shape:", blended_pred_submission_exp.shape)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
스태킹 기반 데이터 생성 완료.

메타 모델 학습 시작...
메타 모델 학습 완료.

스태킹 회귀 모델의 최종 RMSE 값은: 0.1114668129494901

테스트 데이터 전처리 완료. 전처리 후 테스트 데이터 shape: (292, 266)

스태킹 모델 테스트 데이터 예측 시작...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2906
[LightGBM] [Info] Number of data points in the train set: 932, number of used features: 154
[LightGBM] [Info] Start training from score 12.019843
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000870 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2943
[LightGBM] [Info] Number of data points in the train set: 933, number of used features: 159
[LightGBM] [Info] Start trai