In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Read the competition train/test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Keep the test Ids aside: the 'Id' column is dropped from the feature matrix
# below and is needed again when the submission file is written.
test_ID = test['Id']

# Earlier submission files, reused further down as blend components.
prev_submission_1 = pd.read_csv('new_submission.csv')  # Your 0.11868 submission
prev_submission_2 = pd.read_csv('blended_submission.csv')  # Your 0.12146 submission

# Report Neighborhood values that occur in test but never in train.
test_neighs = set(test['Neighborhood'])
train_neighs = set(train['Neighborhood'])
unseen_neighs = test_neighs - train_neighs
print(f"Unseen Neighborhoods in test: {unseen_neighs}")

# Drop training outliers: very large living area (GrLivArea > 4000) sold at a
# low price (SalePrice < 300000). Rows where the mask is False are kept.
is_outlier = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train[~is_outlier]

# Feature engineering
# Stack train and test predictors into one frame (train rows first) so every
# transformation below is applied identically to both; the index is renumbered.
train_features = train.drop(columns=['Id', 'SalePrice'])
test_features = test.drop(columns='Id')
all_data = pd.concat([train_features, test_features], ignore_index=True)

# Year columns are cast to float before being used in arithmetic.
for year_col in ('YrSold', 'YearBuilt', 'YearRemodAdd'):
    all_data[year_col] = all_data[year_col].astype(float)

# Derived features. The order of these assignments fixes the column order of
# the final matrix, so it is kept exactly as is.
# Total square footage: basement + first floor + second floor (missing -> 0).
all_data['TotalSF'] = all_data[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].fillna(0).sum(axis=1)
# Years between construction / last remodel and the sale.
all_data['HouseAge'] = all_data['YrSold'].sub(all_data['YearBuilt'])
all_data['RemodAge'] = all_data['YrSold'].sub(all_data['YearRemodAdd'])
# Interaction of overall quality and overall condition.
all_data['QualCond'] = all_data['OverallQual'].mul(all_data['OverallCond'])
# Bathrooms: full baths count 1, half baths count 0.5 (missing -> 0).
full_baths = all_data['FullBath'].fillna(0) + all_data['BsmtFullBath'].fillna(0)
half_baths = all_data['HalfBath'].fillna(0) + all_data['BsmtHalfBath'].fillna(0)
all_data['TotalBath'] = full_baths + 0.5 * half_baths
# Combined area of the four porch types (missing -> 0).
porch_cols = ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
all_data['TotalPorchSF'] = all_data[porch_cols].fillna(0).sum(axis=1)
# Interaction of overall quality and total square footage.
all_data['Qual_SF'] = all_data['OverallQual'].mul(all_data['TotalSF'])

# Handle missing values
# LotFrontage: fill gaps with the median LotFrontage of the row's own
# Neighborhood. A neighborhood with no observed value stays NaN here and is
# caught by the numeric zero-fill just below.
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# Split columns by dtype family with select_dtypes rather than comparing each
# dtype to "object": that comparison treats every non-object dtype as numeric,
# so text columns stored with a dedicated pandas string dtype would be
# zero-filled instead of receiving the 'None' category.
numeric_feats = all_data.select_dtypes(include='number').columns
# Remaining numeric gaps become 0.
all_data[numeric_feats] = all_data[numeric_feats].fillna(0)
categorical_feats = all_data.select_dtypes(exclude='number').columns
# Missing categoricals become their own explicit 'None' level, which is later
# one-hot encoded like any other category.
all_data[categorical_feats] = all_data[categorical_feats].fillna('None')

# Cap extreme values at the 99th percentile of the *training* rows, so the
# threshold is not influenced by the test set.
ntrain = len(train)
for col in ['LotArea', 'GrLivArea', 'TotalSF']:
    # Raw columns are read from `train`; derived columns (absent from `train`)
    # are read from the first `ntrain` rows of `all_data`, i.e. the train part.
    reference = train[col] if col in train.columns else all_data[col].iloc[:ntrain]
    upper_limit = reference.quantile(0.99)
    all_data[col] = all_data[col].clip(upper=upper_limit)

# Compress the skewed area features with log(1 + x).
skewed_feats = ['LotArea', 'GrLivArea', 'TotalSF', 'TotalPorchSF']
all_data[skewed_feats] = all_data[skewed_feats].apply(np.log1p)

# Frequency-encode Neighborhood: each row gets the number of rows (train and
# test combined) that share its Neighborhood.
neigh_counts = all_data['Neighborhood'].value_counts()
neigh_freq = neigh_counts.to_dict()
all_data['Neighborhood_Encoded'] = all_data['Neighborhood'].map(neigh_freq)

# One-hot encode the remaining categoricals; the raw Neighborhood column is
# dropped first because its frequency encoding replaces it.
without_neighborhood = all_data.drop(columns='Neighborhood')
all_data = pd.get_dummies(without_neighborhood)
# Fill anything still missing with the column mean.
column_means = all_data.mean()
all_data = all_data.fillna(column_means)

# Undo the train/test stacking: the first `ntrain` rows are the training set.
X = all_data.iloc[:ntrain]
X_test = all_data.iloc[ntrain:]
# The target is modelled as log(1 + SalePrice).
sale_price = train['SalePrice']
y = np.log1p(sale_price)

# Base learners. Hyper-parameters are kept in one dict per model and passed
# through unchanged, in the same keyword order.
xgb_params = {
    'colsample_bytree': 0.5,
    'learning_rate': 0.05,
    'max_depth': 3,
    'n_estimators': 800,
    'reg_alpha': 0.7,
    'reg_lambda': 1.2,
    'subsample': 0.6,
    'verbosity': 0,
    'random_state': 7,
    'n_jobs': -1,
}
model_xgb = XGBRegressor(**xgb_params)

lgb_params = {
    'objective': 'regression',
    'num_leaves': 4,
    'learning_rate': 0.05,
    'n_estimators': 400,
    'max_bin': 50,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 0.3,
    'min_data_in_leaf': 5,
    'min_sum_hessian_in_leaf': 10,
    'random_state': 42,
}
model_lgb = LGBMRegressor(**lgb_params)

cat_params = {
    'iterations': 600,
    'learning_rate': 0.05,
    'depth': 5,
    'silent': True,
    'random_seed': 42,
}
model_cat = CatBoostRegressor(**cat_params)

gboost_params = {
    'n_estimators': 1500,
    'learning_rate': 0.05,
    'max_depth': 3,
    'max_features': 'sqrt',
    'min_samples_leaf': 10,
    'min_samples_split': 10,
    'loss': 'huber',
    'random_state': 5,
}
model_gboost = GradientBoostingRegressor(**gboost_params)

# Stacking ensemble: the four boosted models feed a Ridge meta-learner, with
# 5-fold internal cross-validation and all available cores.
estimators = list(zip(
    ('xgb', 'lgb', 'cat', 'gboost'),
    (model_xgb, model_lgb, model_cat, model_gboost),
))
meta_learner = Ridge(alpha=1.0)
stack = StackingRegressor(estimators=estimators, final_estimator=meta_learner, cv=5, n_jobs=-1)

# Local CV evaluation (RMSLE)
def rmsle_cv(model):
    """Return the per-fold RMSE of *model* on the module-level ``X`` / ``y``.

    Uses a shuffled 5-fold split with a fixed seed (42). Because ``y`` holds
    log1p-transformed sale prices, the RMSE is measured on the log scale.

    Args:
        model: Estimator accepted by ``cross_val_score``.

    Returns:
        Array with one RMSE value per fold.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    # The scorer reports *negative* MSE, so flip the sign before the root.
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=folds)
    return np.sqrt(-neg_mse)

# Cross-validated estimate of the stacked model's error (one value per fold).
score = rmsle_cv(stack)
print(f"Stacking CV score: {score.mean():.5f} (std: {score.std():.5f})")

# Train on the full training set, then predict the test set. The model works
# on log1p(SalePrice), so expm1 maps predictions back to prices.
log_preds = stack.fit(X, y).predict(X_test)
preds = np.expm1(log_preds)

# Triple-blend with previous submissions (0.5 new + 0.3 prev_1 + 0.2 prev_2)
# Look the earlier predictions up by Id instead of relying on row position:
# a previous file that was sorted differently would otherwise be blended
# against the wrong houses without any error.
prev_prices_1 = prev_submission_1.set_index('Id')['SalePrice'].reindex(test_ID)
prev_prices_2 = prev_submission_2.set_index('Id')['SalePrice'].reindex(test_ID)
# reindex yields NaN for Ids a previous file does not contain; fail loudly
# rather than writing NaN prices into the submission.
if prev_prices_1.isna().any() or prev_prices_2.isna().any():
    raise ValueError("Previous submissions do not cover every test Id; cannot blend.")
blended_preds = 0.5 * preds + 0.3 * prev_prices_1.to_numpy() + 0.2 * prev_prices_2.to_numpy()

# Create submission: one row per test Id with the blended price.
submission = pd.DataFrame({"Id": test_ID, "SalePrice": blended_preds})
submission.to_csv("triple_submission.csv", index=False)
print("Triple-blended submission created! Submit to Kaggle to check your score.")

Unseen Neighborhoods in test: set()
Stacking CV score: 0.11258 (std: 0.00904)
Triple-blended submission created! Submit to Kaggle to check your score.
