In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.impute import KNNImputer

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ID = test['Id']

# Remove outliers (GrLivArea > 4000 and SalePrice < 300000)
train = train[~((train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000))]

# Feature engineering
all_data = pd.concat((train.drop(['Id', 'SalePrice'], axis=1), test.drop('Id', axis=1))).reset_index(drop=True)

# Ensure numeric columns for age calculations
all_data['YrSold'] = all_data['YrSold'].astype(float)
all_data['YearBuilt'] = all_data['YearBuilt'].astype(float)
all_data['YearRemodAdd'] = all_data['YearRemodAdd'].astype(float)

# Advanced features
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['RemodAge'] = all_data['YrSold'] - all_data['YearRemodAdd']
all_data['QualCond'] = all_data['OverallQual'] * all_data['OverallCond']

# Handle missing values
# KNN Imputer for LotFrontage
imputer = KNNImputer(n_neighbors=5)
all_data['LotFrontage'] = imputer.fit_transform(all_data[['LotFrontage']])[:, 0]

# Fill other missing values
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
all_data[numeric_feats] = all_data[numeric_feats].fillna(0)
categorical_feats = all_data.dtypes[all_data.dtypes == "object"].index
all_data[categorical_feats] = all_data[categorical_feats].fillna('None')

# Skew transformation for numeric features
skewed_feats = all_data[numeric_feats].apply(lambda x: x.skew()).sort_values(ascending=False)
skewness = skewed_feats[skewed_feats > 0.75].index
all_data[skewness] = np.log1p(all_data[skewness])

# Target encoding for Neighborhood (mean SalePrice per neighborhood from train)
neigh_map = train.groupby('Neighborhood')['SalePrice'].mean().to_dict()
all_data['Neighborhood_Encoded'] = all_data['Neighborhood'].map(neigh_map)
all_data['Neighborhood_Encoded'] = all_data['Neighborhood_Encoded'].fillna(train['SalePrice'].mean())  # Fix: Use train SalePrice mean

# One-hot encoding for categoricals
all_data = pd.get_dummies(all_data.drop('Neighborhood', axis=1))  # Drop Neighborhood since encoded
all_data = all_data.fillna(all_data.mean())

# Split back to train/test
ntrain = train.shape[0]
X = all_data[:ntrain]
X_test = all_data[ntrain:]
y = np.log1p(train['SalePrice'])  # Log transform target

# Define base models
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=3))
GBoost = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.05, max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=5)
model_xgb = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3,
                         min_child_weight=1.7817, n_estimators=1500, reg_alpha=0.4640, reg_lambda=0.8571,
                         subsample=0.5213, verbosity=0, random_state=7, n_jobs=-1)
model_lgb = LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=600,
                          max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319,
                          feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6, min_sum_hessian_in_leaf=11)
model_cat = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, silent=True, random_seed=42)

# Stacking setup with Lasso meta-learner
estimators = [
    ('lasso', lasso),
    ('enet', ENet),
    ('gboost', GBoost),
    ('xgb', model_xgb),
    ('lgb', model_lgb),
    ('cat', model_cat)
]
stack = StackingRegressor(estimators=estimators, final_estimator=Lasso(alpha=0.0005), cv=10, n_jobs=-1)

# Local CV evaluation (RMSLE)
def rmsle_cv(model):
    kf = KFold(10, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse

score = rmsle_cv(stack)
print(f"Stacking CV score: {score.mean():.5f} (std: {score.std():.5f})")

# Fit and predict
stack.fit(X, y)
preds = np.expm1(stack.predict(X_test))

# Create submission
submission = pd.DataFrame({"Id": test_ID, "SalePrice": preds})
submission.to_csv("new_submission.csv", index=False)
print("New submission created! Submit to Kaggle to check your score.")

Stacking CV score: 0.10591 (std: 0.01383)
New submission created! Submit to Kaggle to check your score.
