In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# ===============================================================
# 1. IMPORT LIBRARIES
# ===============================================================
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# ===============================================================
# 2. LOAD DATA
# ===============================================================
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test  = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

print(train.shape, test.shape)
train.head()


(1460, 81) (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
# ===============================================================
# 3. REMOVE OUTLIERS (optional but improves CV)
# Example: remove houses with too large GrLivArea but low price
# ===============================================================
train = train[train['GrLivArea'] < 4500]

# Reset index after filtering
train.reset_index(drop=True, inplace=True)


In [3]:
# ===============================================================
# 4. TARGET VARIABLE
# Log-transform to fix skew
# ===============================================================
y = np.log1p(train['SalePrice'])

# Drop ID and target
train.drop(['SalePrice', 'Id'], axis=1, inplace=True)
test_ids = test['Id']
test.drop(['Id'], axis=1, inplace=True)




In [4]:
# ===============================================================
# 5. COMBINE TRAIN + TEST FOR CONSISTENT PREPROCESSING
# ===============================================================
full = pd.concat([train, test], axis=0)

# Identify numeric & categorical columns
numeric_features = full.select_dtypes(include=[np.number]).columns
categorical_features = full.select_dtypes(include=['object']).columns

print("Numeric:", len(numeric_features), "Categorical:", len(categorical_features))


Numeric: 36 Categorical: 43


In [5]:
# ===============================================================
# 6. HANDLE MISSING VALUES
# - numeric: fill with median
# - categorical: fill with mode
# ===============================================================
for col in numeric_features:
    full[col].fillna(full[col].median(), inplace=True)

for col in categorical_features:
    full[col].fillna(full[col].mode()[0], inplace=True)


In [6]:
# ===============================================================
# 7. ONE-HOT ENCODING
# ===============================================================
full = pd.get_dummies(full, drop_first=True)

# Split back to train/test
X = full[:len(y)]
X_test = full[len(y):]


In [7]:
# ===============================================================
# 8. STANDARDIZATION
# ===============================================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)


In [8]:
# ===============================================================
# 9. DEFINE MODELS
# ===============================================================
lasso = Lasso(alpha=0.0005, max_iter=50000)
ridge = Ridge(alpha=10)
elastic = ElasticNet(alpha=0.0005, l1_ratio=0.5)

# STACKING MODEL (base models + meta model)
estimators = [
    ('ridge', ridge),
    ('lasso', lasso),
    ('elastic', elastic)
]

stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=300, random_state=42),
    passthrough=True
)


In [10]:
# ===============================================================
# 10. CROSS-VALIDATION FUNCTION
# ===============================================================
def rmse_cv(model):
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X_scaled, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse

print("Lasso CV RMSE:", rmse_cv(lasso).mean())
print("Ridge CV RMSE:", rmse_cv(ridge).mean())
print("ElasticNet CV RMSE:", rmse_cv(elastic).mean())
print("Stacking CV RMSE:", rmse_cv(stack_model).mean())


Lasso CV RMSE: 0.11646228969343413
Ridge CV RMSE: 0.11980750154010342
ElasticNet CV RMSE: 0.11826105982459294
Stacking CV RMSE: 0.11604062779739584


In [11]:
# ===============================================================
# 11. TRAIN FINAL STACKING MODEL
# ===============================================================
stack_model.fit(X_scaled, y)
preds = stack_model.predict(X_test_scaled)

# Reverse the log-transform
final_preds = np.expm1(preds)


In [12]:
# ===============================================================
# 12. CREATE SUBMISSION FILE
# ===============================================================
submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": final_preds
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,Id,SalePrice
0,1461,126445.078401
1,1462,158250.62555
2,1463,181117.351666
3,1464,195335.897701
4,1465,194153.964035
