In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.impute import KNNImputer

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Keep the test Ids aside: the 'Id' column is dropped from the features below
# and is needed again when the submission file is written.
test_ID = test['Id']

# Remove outliers (GrLivArea > 4000 and SalePrice < 300000)
# The index is reset after filtering so `train` is numbered 0..n-1 again.
# The combined feature frame built later also gets a fresh 0..n-1 index, so
# without the reset anything taken from `train` (e.g. the target column) would
# carry gaps in its index and misalign under index-based pandas operations.
is_outlier = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train[~is_outlier].reset_index(drop=True)

# Load previous submission for blending
prev_submission = pd.read_csv('new_submission.csv')  # Replace with your 0.11868 submission file

# Feature engineering
# Train and test features are stacked (train rows first) so that every
# transformation below is applied to both sets in exactly the same way.
all_data = pd.concat((train.drop(['Id', 'SalePrice'], axis=1), test.drop('Id', axis=1))).reset_index(drop=True)

# Ensure numeric columns
for year_col in ('YrSold', 'YearBuilt', 'YearRemodAdd'):
    all_data[year_col] = all_data[year_col].astype(float)

# Handle missing values
# Imputation runs BEFORE the derived features are built: a NaN in any single
# component (e.g. TotalBsmtSF) would otherwise turn the whole derived total
# into NaN, which the zero-fill would then replace with 0 instead of the sum
# of the known components.
# NOTE(review): the imputer is given LotFrontage as its only column, so there
# are no other features to measure neighbour distance on; this presumably
# degenerates to filling with the column mean - confirm, and consider passing
# additional numeric columns.
imputer = KNNImputer(n_neighbors=5)
all_data['LotFrontage'] = imputer.fit_transform(all_data[['LotFrontage']])[:, 0]
raw_numeric_feats = all_data.select_dtypes(include=[np.number]).columns
all_data[raw_numeric_feats] = all_data[raw_numeric_feats].fillna(0)
categorical_feats = all_data.select_dtypes(exclude=[np.number]).columns
all_data[categorical_feats] = all_data[categorical_feats].fillna('None')

# Advanced features
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['RemodAge'] = all_data['YrSold'] - all_data['YearRemodAdd']
all_data['QualCond'] = all_data['OverallQual'] * all_data['OverallCond']
all_data['TotalBath'] = all_data['FullBath'] + all_data['BsmtFullBath'] + 0.5 * (all_data['HalfBath'] + all_data['BsmtHalfBath'])
all_data['TotalPorchSF'] = all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch']

# Skew transformation
# Recomputed here so the derived features above are included as well.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
skewed_feats = all_data[numeric_feats].apply(lambda x: x.skew()).sort_values(ascending=False)
skewness = skewed_feats[skewed_feats > 0.75].index
# log1p returns -inf at -1 and NaN below it. HouseAge and RemodAge are
# differences of two year columns and can be negative, so only columns whose
# minimum is non-negative are log-transformed.
is_non_negative = (all_data[skewness].min() >= 0).to_numpy()
skewness = skewness[is_non_negative]
all_data[skewness] = np.log1p(all_data[skewness])

# Ordinal encoding for Neighborhood (instead of target encoding)
# Neighborhoods are ranked by their median SalePrice in `train`; values that
# do not appear in `train` receive the rank one past the highest.
# NOTE(review): the ranking is computed from the target over all training
# rows, so cross-validation folds see information from their own validation
# rows - the CV score may be slightly optimistic.
neigh_order = train.groupby('Neighborhood')['SalePrice'].median().sort_values().index
neigh_map = {neigh: i for i, neigh in enumerate(neigh_order)}
all_data['Neighborhood_Encoded'] = all_data['Neighborhood'].map(neigh_map).fillna(len(neigh_order))

# One-hot encoding for categoricals
all_data = pd.get_dummies(all_data.drop('Neighborhood', axis=1))
# Safety net: fill anything still missing with the column mean.
all_data = all_data.fillna(all_data.mean())

# Split back to train/test
# The first `ntrain` rows of the stacked frame are the training rows.
ntrain = train.shape[0]
X = all_data.iloc[:ntrain]
X_test = all_data.iloc[ntrain:]
# The target is modelled on the log1p scale; predictions are mapped back with expm1.
y = np.log1p(train['SalePrice'])
assert X.shape[0] == y.shape[0], "feature matrix and target must have the same number of rows"

# Define base models (simplified ensemble)
# Hyper-parameters are gathered in one dict per model so each configuration
# can be read (and tuned) at a glance before the estimator is constructed.
xgb_params = dict(
    colsample_bytree=0.5,
    learning_rate=0.05,
    max_depth=3,
    n_estimators=1000,
    reg_alpha=0.5,
    reg_lambda=1.0,
    subsample=0.6,
    verbosity=0,
    random_state=7,
    n_jobs=-1,
)
lgb_params = dict(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=500,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.25,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=10,
    random_state=42,
)
cat_params = dict(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    silent=True,
    random_seed=42,
)

model_xgb = XGBRegressor(**xgb_params)
model_lgb = LGBMRegressor(**lgb_params)
model_cat = CatBoostRegressor(**cat_params)

# Stacking setup with Ridge meta-learner
# Each base model is paired with the short name the stacker knows it by.
estimators = list(zip(('xgb', 'lgb', 'cat'), (model_xgb, model_lgb, model_cat)))
meta_learner = Ridge(alpha=1.0)
stack = StackingRegressor(estimators=estimators, final_estimator=meta_learner, cv=5, n_jobs=-1)

# Local CV evaluation (RMSLE)
def rmsle_cv(model, X_data=None, y_data=None, n_splits=5, random_state=42):
    """Return the per-fold cross-validated RMSE of `model`.

    The score is the square root of the mean squared error between the
    model's predictions and `y_data`. When the target is on the log1p scale
    (as the notebook's `y` is), this is the RMSLE of the original values.

    Parameters
    ----------
    model : estimator
        Any scikit-learn compatible regressor; it is cloned and refitted on
        every fold by `cross_val_score`.
    X_data, y_data : array-like, optional
        Features and target to evaluate on. When omitted, the notebook-level
        `X` and `y` are used, which matches the original one-argument call.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    random_state : int, default 42
        Seed for the fold shuffling, so repeated calls use the same folds.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    # Fall back to the notebook globals so existing `rmsle_cv(model)` calls
    # keep working unchanged.
    if X_data is None:
        X_data = X
    if y_data is None:
        y_data = y
    kf = KFold(n_splits, shuffle=True, random_state=random_state)
    # The scorer reports the negated MSE, so flip the sign before the root.
    neg_mse = cross_val_score(model, X_data, y_data, scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

score = rmsle_cv(stack)
print(f"Stacking CV score: {score.mean():.5f} (std: {score.std():.5f})")

# Fit and predict
stack.fit(X, y)
# The model was trained on log1p(SalePrice); expm1 maps predictions back to prices.
preds = np.expm1(stack.predict(X_test))

# Blend with previous submission (0.7 new + 0.3 previous)
# The previous submission is aligned to the test Ids instead of being added
# by row position, so a differently ordered file cannot silently blend each
# house with another house's price.
prev_prices = prev_submission.set_index('Id')['SalePrice'].reindex(test_ID.to_numpy())
if prev_prices.isna().any():
    # Ids absent from the previous file (or empty prices) would otherwise
    # end up as NaN rows in the submission.
    raise ValueError("Previous submission has no SalePrice for some test Ids; cannot blend.")
new_weight, prev_weight = 0.7, 0.3
blended_preds = new_weight * preds + prev_weight * prev_prices.to_numpy()

# Create submission
submission = pd.DataFrame({"Id": test_ID, "SalePrice": blended_preds})
submission.to_csv("blended_submission.csv", index=False)
print("Blended submission created! Submit to Kaggle to check your score.")

Stacking CV score: 0.11211 (std: 0.00792)
Blended submission created! Submit to Kaggle to check your score.
