In [15]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, skew
from scipy.stats import skew
from scipy.stats import boxcox
from scipy.special import boxcox1p

In [16]:
train = pd.read_csv("../input/House_Prices/train.csv")
test = pd.read_csv("../input/House_Prices/test.csv")

output_dir = "../submission/"
os.makedirs(output_dir, exist_ok=True)

In [17]:
def fill_missing_values(df):
    df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
    none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']
    for col in none_cols:
        df[col] = df[col].fillna("None")

    zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']
    for col in zero_cols:
        df[col] = df[col].fillna(0)
        
    return df

In [18]:
def convert_dtypes(df):
    cols_to_str = ['MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
    for col in cols_to_str:
        df[col] = df[col].apply(str)
    return df

In [19]:
# 1. 外れ値の削除
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 20000)].index)

# 2. データの結合
ntrain = train.shape[0] 
ntest = test.shape[0]
y_train = np.log1p(train["SalePrice"])
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)

# 3. 欠損値補完
all_data = fill_missing_values(all_data)

# 4. 特徴量エンジニアリング
all_data = convert_dtypes(all_data)

# 5. 特徴量合成
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# 6. 数値特徴量の抽出。
numeric_feats = all_data.select_dtypes(include=["number"]).columns

# 7. 歪度の計算
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})

# 8. Box-Cox変換の適用
skewness = skewness[abs(skewness) > 0.75]
skewed_features = skewness.index
l_opt = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], l_opt)

# 9. ダミー変数化
all_data = pd.get_dummies(all_data)

# 10. 再分割 
train = all_data[:ntrain]
test = all_data[ntrain:]

In [20]:
train.drop("Id", axis=1)

Unnamed: 0,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,5.831328,19.212182,2.440268,14.187527,14.187527,8.059126,11.170327,0.000000,7.483296,11.692623,...,False,False,False,True,False,False,False,False,True,False
1,6.221214,19.712205,2.259674,14.145138,14.145138,0.000000,12.062832,0.000000,8.897844,12.792276,...,False,False,False,True,False,False,False,False,True,False
2,5.914940,20.347241,2.440268,14.184404,14.185966,7.646538,10.200343,0.000000,9.917060,11.892039,...,False,False,False,True,False,False,False,False,True,False
3,5.684507,19.691553,2.440268,14.047529,14.135652,0.000000,8.274266,0.000000,10.468500,11.354094,...,False,False,False,True,True,False,False,False,False,False
4,6.314735,21.325160,2.602594,14.182841,14.182841,9.391827,10.971129,0.000000,10.221051,12.510588,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,5.744420,18.960528,2.259674,14.181278,14.182841,0.000000,0.000000,0.000000,11.990298,11.990298,...,False,False,False,True,False,False,False,False,True,False
1456,6.337529,20.994868,2.259674,14.148295,14.164038,7.003881,11.473247,7.659675,10.692807,13.385607,...,False,False,False,True,False,False,False,False,True,False
1457,5.859551,19.476345,2.440268,14.089451,14.192207,0.000000,8.823108,0.000000,11.759412,12.528113,...,False,False,False,True,False,False,False,False,True,False
1458,5.914940,19.760176,2.055642,14.103852,14.176584,0.000000,5.321541,12.206046,0.000000,12.338074,...,False,False,False,True,False,False,False,False,True,False


In [21]:
all_data.isnull().any(axis=1)

0       False
1       False
2       False
3       False
4       False
        ...  
2914    False
2915    False
2916    False
2917    False
2918    False
Length: 2919, dtype: bool

In [22]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

n_folds = 5
kf = KFold(n_folds, shuffle=True, random_state=42)

In [23]:
def cv_rmse(model):
    mse = -cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf)
    rmse = np.sqrt(mse)
    return rmse

In [24]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso

model_lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))

score = cv_rmse(model_lasso)

print(f"Lasso score: {score.mean():.4f} (std: {score.std():.4f})")

Lasso score: 0.1271 (std: 0.0235)


In [25]:
# 1. 全データでの学習
model_lasso.fit(train, y_train)

# 2. テストデータの予測
log_predictions = model_lasso.predict(test)

# 3. 逆変換
# 対数スケール(log)で予測された値を、元の金額(ドル)に戻す
final_predictions = np.expm1(log_predictions)

# 4. 提出ファイルの作成
# IDを取得するために、元のtest.csvを読み込む。
test_ids = pd.read_csv('../input/House_Prices/test.csv')['Id']

submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": final_predictions
})

# CSVに出力
submission.to_csv("submission.csv", index=False)
print("Submission file created successfully!")

Submission file created successfully!


In [26]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

model_xgb = xgb.XGBRegressor(random_state=42,
                            n_jobs=1,
                            objective="reg:squarederror",
                            colsample_bytree=0.7,
                            learning_rate=0.05,
                            max_depth=3,
                            n_estimators=1000,
                            subsample=0.9,
                            )
param_grid = {
    'gamma': [0.03, 0.05, 0.07],
}

grid_search = GridSearchCV(
    estimator=model_xgb,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error', # 評価指標：RMSE
    verbose=1
)

grid_search.fit(train, y_train)

print(f"最良のパラメータ: {grid_search.best_params_}")
print(f"最良のスコア (RMSE): {-grid_search.best_score_:.4f}")

Fitting 5 folds for each of 3 candidates, totalling 15 fits
最良のパラメータ: {'gamma': 0.03}
最良のスコア (RMSE): 0.1243


In [27]:
# 現在のモデルが受け付ける全パラメータを表示
print(model_xgb.get_params().keys())

dict_keys(['objective', 'base_score', 'booster', 'callbacks', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'device', 'early_stopping_rounds', 'enable_categorical', 'eval_metric', 'feature_types', 'feature_weights', 'gamma', 'grow_policy', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_bin', 'max_cat_threshold', 'max_cat_to_onehot', 'max_delta_step', 'max_depth', 'max_leaves', 'min_child_weight', 'missing', 'monotone_constraints', 'multi_strategy', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'random_state', 'reg_alpha', 'reg_lambda', 'sampling_method', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])


In [30]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.7,
                            learning_rate=0.05,
                            max_depth=3,
                            n_estimators=1000,
                            subsample=0.9,
                            )

model_xgb.fit(train, y_train)

xgb_pred = np.expm1(model_xgb.predict(test))

ensemble = final_predictions * 0.5 + xgb_pred * 0.5

submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": xgb_pred
})

save_path = os.path.join(output_dir, "submission_xgboost.csv")
submission.to_csv(save_path, index=False)