In [1]:
#Import Library
import pandas as pd
import numpy as np

# Model & tools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load data train & test
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

print(train.shape, test.shape)

(1460, 81) (1459, 80)


# **Feature Engineering**

In [3]:
test_ID = test["Id"] #save Id for submission
all_data = pd.concat([train.drop("SalePrice", axis=1), test], axis=0) 
all_data.columns = all_data.columns.str.replace(" ", "_")
all_data = pd.get_dummies(all_data) #one-hot encoding
all_data = all_data.fillna(all_data.median()) #Fill in missing values with median
X = all_data[:train.shape[0]]
X_test_final = all_data[train.shape[0]:]
y = np.log1p(train["SalePrice"]) # Log-transform target

# **Split Train/Validation**

In [4]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# **Baseline Model**

In [5]:
ridge = Ridge(alpha=10)

#XGBRegressor
xgb = XGBRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8, random_state=42,verbosity=0 
)

#LGBMRegressor
X.columns = X.columns.str.replace(' ', '_')
lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,                  
    min_data_in_leaf=5,
    min_gain_to_split=0.0,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l1=0.1,                   
    lambda_l2=0.1,
    force_row_wise=True,            
    random_state=42,
    verbose=-1 
)

# **Make Stacking Regressor**

In [6]:
ridge_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=10))
])

stack_model = StackingRegressor(
    estimators=[
        ('ridge', ridge_pipe),
        ('xgb', xgb),
        ('lgbm', lgbm)
    ],
    final_estimator=Ridge(alpha=1),
    cv=5,
    passthrough=False  
)

stack_model.fit(X_train, y_train)
val_score = stack_model.score(X_val, y_val)
print(f"Stacking R^2 (Validation): {val_score:.4f}")

Stacking R^2 (Validation): 0.9115


# **Evaluation**

In [7]:
y_pred = stack_model.predict(X_val)
df_eval = pd.DataFrame({
    'Actual': y_val,
    'Predicted': y_pred
})
df_eval['Error'] = df_eval['Actual'] - df_eval['Predicted']
df_eval['Error_Percent'] = (df_eval['Error'] / df_eval['Actual']) * 100
r2 = r2_score(y_val, y_pred)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")

R²: 0.9115
RMSE: 0.1285


# **Prediction on test set**

In [8]:
#Prediction on Test Set
preds = stack_model.predict(X_test_final)
preds = np.expm1(preds)

#create submission file
submission = pd.DataFrame({
    "Id": test_ID,
    "SalePrice": preds
})

submission.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv
