In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from catboost import CatBoostRegressor

# Read the competition tables; the test Ids are kept for the submission file.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ID = test['Id']

# Best earlier submission; it is blended with the new predictions further down.
prev_submission = pd.read_csv('new_submission.csv')

# Report Neighborhood values that occur in test but never in train.
unseen_neighs = set(test['Neighborhood']) - set(train['Neighborhood'])
print(f"Unseen Neighborhoods in test: {unseen_neighs}")

# Drop outliers: rows with GrLivArea > 4000 that sold for SalePrice < 300000.
outlier_mask = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train[~outlier_mask]

# Stack train (without Id/SalePrice) on top of test (without Id) so that every
# feature below is built identically for both; train rows come first.
train_feats = train.drop(columns=['Id', 'SalePrice'])
test_feats = test.drop(columns='Id')
all_data = pd.concat((train_feats, test_feats)).reset_index(drop=True)

# Cast the year columns to float before they are used in arithmetic.
for year_col in ('YrSold', 'YearBuilt', 'YearRemodAdd'):
    all_data[year_col] = all_data[year_col].astype(float)

# Engineered features. Missing components count as 0 in the sums.
# Total floor area: basement + first floor + second floor.
area_parts = all_data[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].fillna(0)
all_data['TotalSF'] = area_parts['TotalBsmtSF'] + area_parts['1stFlrSF'] + area_parts['2ndFlrSF']

# Years between construction / last remodel and the sale.
all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['RemodAge'] = all_data['YrSold'] - all_data['YearRemodAdd']

# Bathroom count, with half baths weighted 0.5.
bath_parts = all_data[['FullBath', 'BsmtFullBath', 'HalfBath', 'BsmtHalfBath']].fillna(0)
all_data['TotalBath'] = (
    bath_parts['FullBath']
    + bath_parts['BsmtFullBath']
    + 0.5 * (bath_parts['HalfBath'] + bath_parts['BsmtHalfBath'])
)

# Combined area of the four porch types.
porch_parts = all_data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].fillna(0)
all_data['TotalPorchSF'] = (
    porch_parts['OpenPorchSF']
    + porch_parts['EnclosedPorch']
    + porch_parts['3SsnPorch']
    + porch_parts['ScreenPorch']
)

# Interaction term: overall quality rating times above-ground living area.
all_data['Qual_GrLiv'] = all_data['OverallQual'] * all_data['GrLivArea']

# Handle missing values
# LotFrontage: fill gaps with the median LotFrontage of the same Neighborhood
# (computed over the combined train+test rows in all_data).
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
all_data['GarageArea'] = all_data['GarageArea'].fillna(0)  # No garage
all_data['TotalBsmtSF'] = all_data['TotalBsmtSF'].fillna(0)  # No basement

# Split columns by "is numeric" instead of "is not object". Comparing dtypes
# against "object" would treat any non-object, non-numeric column (e.g. pandas'
# dedicated string dtype) as numeric and try to zero-fill it. For plain
# int/float/object frames both approaches select the same columns.
numeric_feats = all_data.select_dtypes(include='number').columns
all_data[numeric_feats] = all_data[numeric_feats].fillna(0)
categorical_feats = all_data.select_dtypes(exclude='number').columns
# A missing category is encoded as its own 'None' level.
all_data[categorical_feats] = all_data[categorical_feats].fillna('None')

# Apply log1p to the area-type features listed as skewed.
skewed_feats = ['LotArea', 'GrLivArea', 'TotalSF', 'TotalPorchSF', 'GarageArea', 'TotalBsmtSF']
all_data[skewed_feats] = all_data[skewed_feats].apply(np.log1p)

# Frequency-encode Neighborhood: each value is replaced by its row count in
# train; values that train never saw get the median of those counts.
neigh_counts = train['Neighborhood'].value_counts()
neigh_freq = neigh_counts.to_dict()
median_freq = np.median(neigh_counts.to_numpy())
neigh_encoded = all_data['Neighborhood'].map(neigh_freq)
all_data['Neighborhood_Encoded'] = neigh_encoded.fillna(median_freq)

# One-hot encode the remaining categoricals; the raw Neighborhood column is
# dropped first because it is already represented by Neighborhood_Encoded.
all_data = pd.get_dummies(all_data.drop(columns='Neighborhood'))
# Safety net: mean-fill any NaN that might still be present.
all_data = all_data.fillna(all_data.mean())

# Split back to train/test. all_data was built with the train rows first, so
# the first ntrain rows are the training features and the rest are test.
ntrain = train.shape[0]
X = all_data.iloc[:ntrain]
X_test = all_data.iloc[ntrain:]
# Target is log1p(SalePrice). train lost rows to the outlier filter, so its
# index has gaps, while X is indexed 0..ntrain-1. Resetting the index keeps
# y label-aligned with X for any index-based pandas operation; row order and
# values are unchanged.
y = np.log1p(train['SalePrice']).reset_index(drop=True)

# Define tuned CatBoost model
model_cat = CatBoostRegressor(iterations=600, learning_rate=0.05, depth=4, silent=True, random_seed=42,
                              l2_leaf_reg=7.0, bagging_temperature=2.0)

# Local CV evaluation (RMSLE)
def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model`` on the module-level X and y.

    Uses 5 shuffled folds with a fixed random_state of 42. In this script y is
    log1p(SalePrice), so the RMSE on y is an RMSLE on the prices.

    Args:
        model: estimator passed to sklearn's ``cross_val_score``.

    Returns:
        The square roots of the per-fold mean squared errors, one value per fold.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    # The scorer reports negated MSE; flip the sign before taking the root.
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=folds)
    return np.sqrt(-neg_mse)

score = rmsle_cv(model_cat)
print(f"CatBoost CV score: {score.mean():.5f} (std: {score.std():.5f})")

# Fit on all training rows and predict; expm1 undoes the log1p applied to the target.
model_cat.fit(X, y)
preds = np.expm1(model_cat.predict(X_test))

# Blend with best previous submission (0.05 new + 0.95 prev).
# If the previous file has an Id column, look its prices up by Id so that a
# differently ordered file cannot pair a price with the wrong house. Without
# an Id column, fall back to pairing by row position.
if 'Id' in prev_submission.columns:
    prev_prices = prev_submission.set_index('Id')['SalePrice'].reindex(test_ID.values).values
else:
    prev_prices = prev_submission['SalePrice'].values
# Fail loudly instead of writing NaN or mis-sized predictions to the submission.
if len(prev_prices) != len(preds) or pd.isna(prev_prices).any():
    raise ValueError("prev_submission must supply exactly one non-missing SalePrice per test row")
blended_preds = 0.05 * preds + 0.95 * prev_prices

# Create submission
submission = pd.DataFrame({"Id": test_ID, "SalePrice": blended_preds})
submission.to_csv("tuned_catboost_submission.csv", index=False)
print(f"Tuned CatBoost submission created at {pd.Timestamp.now('Asia/Kolkata').strftime('%I:%M %p IST, %A, %B %d, %Y')}! Submit to Kaggle to check your score.")

Unseen Neighborhoods in test: set()
CatBoost CV score: 0.11450 (std: 0.00797)
Tuned CatBoost submission created at 10:04 PM IST, Saturday, October 11, 2025! Submit to Kaggle to check your score.
