In [1]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score


In [16]:
# Raw competition files, indexed by the house Id.
X_full = pd.read_csv('../dataset/train.csv', index_col='Id')
X_test_full = pd.read_csv('../dataset/test.csv', index_col='Id')

# Work on copies so the raw frames stay untouched.
X = X_full.copy()
X_test = X_test_full.copy()

# The target is modelled on the log1p scale; predictions are mapped back
# with expm1 when the submission is written.
y = np.log1p(X.pop('SalePrice'))

# Hold out 10% of the rows for a quick validation score.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Column groups consumed by the ColumnTransformer, split by dtype.
categorical_cols = [
    name for name, dtype in X_train.dtypes.items() if dtype == "object"
]
numerical_cols = [
    name for name, dtype in X_train.dtypes.items() if dtype in ['int64', 'float64']
]

In [17]:
# Base learners of the stack, as (name, estimator) pairs.
estimators = [
    (
        'xgb',
        XGBRegressor(
            n_estimators=2000,
            learning_rate=0.03,
            max_depth=3,
            subsample=0.9,
            colsample_bytree=0.7,
            random_state=42,
        ),
    ),
    (
        'rf',
        RandomForestRegressor(n_estimators=100, random_state=42),
    ),
    (
        'lasso',
        Lasso(alpha=0.0005, random_state=42),
    ),
]

# RidgeCV blends the base learners' cross-validated (5-fold) predictions.
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=RidgeCV(),
    cv=5,
    n_jobs=-1,
)

In [18]:
class CustomDataCleaner(BaseEstimator, TransformerMixin):
    """Impute missing ``LotFrontage`` with the median of the row's ``Neighborhood``.

    The medians are learned in ``fit`` and re-used in ``transform``, so rows
    passed to ``transform`` are imputed with statistics of the fitted data.

    Parameters
    ----------
    lotfrontage_median : object, default=0
        Retained only so existing constructor calls keep working. It is
        stored unchanged and is not used for imputation. ``fit`` does not
        overwrite it: scikit-learn expects constructor parameters to stay
        untouched by ``fit`` so ``get_params``/``clone`` stay predictable.

    Attributes
    ----------
    lotfrontage_median_ : pandas.Series
        Median ``LotFrontage`` per ``Neighborhood``, indexed by neighborhood.
    lotfrontage_global_median_ : float
        Median ``LotFrontage`` over all fitted rows. Used as a fallback for
        neighborhoods that were not seen in ``fit`` or whose ``LotFrontage``
        was entirely missing there.
    """

    def __init__(self, lotfrontage_median=0):
        self.lotfrontage_median = lotfrontage_median

    def fit(self, X, y=None):
        """Learn the per-neighborhood and overall ``LotFrontage`` medians.

        ``X`` must be a DataFrame with ``Neighborhood`` and ``LotFrontage``
        columns; ``y`` is ignored. Returns ``self``.
        """
        self.lotfrontage_median_ = X.groupby('Neighborhood')['LotFrontage'].median()
        self.lotfrontage_global_median_ = X['LotFrontage'].median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``LotFrontage`` values filled."""
        X = X.copy()
        # Neighborhood-specific median first ...
        fill_values = X['Neighborhood'].map(self.lotfrontage_median_)
        # ... then the overall median where the neighborhood has no usable median.
        fill_values = fill_values.fillna(self.lotfrontage_global_median_)
        X['LotFrontage'] = X['LotFrontage'].fillna(fill_values)
        return X

In [19]:
class RareLabelEncoder(BaseEstimator, TransformerMixin):
    """Collapse infrequent categories of object-dtype columns into ``'Rare'``.

    Parameters
    ----------
    threshold : float, default=0.01
        A category is treated as rare when its relative frequency in the
        fitted data is strictly below this value.

    Attributes
    ----------
    rare_labels : dict
        Maps each object-dtype column seen in ``fit`` to a pandas Index of
        its rare categories.
    """

    def __init__(self, threshold=0.01):
        self.threshold = threshold
        # Filled by fit(); starts empty so transform() before fit() only copies.
        self.rare_labels = {}

    def fit(self, X, y=None):
        """Record, per object-dtype column of ``X``, the categories below ``threshold``.

        ``y`` is ignored. Returns ``self``.
        """
        rare_labels = {}
        for col in X.select_dtypes(include=['object']).columns:
            # Relative frequency of each category (missing values are not
            # counted by value_counts' default settings).
            freq = X[col].value_counts(normalize=True)
            rare_labels[col] = freq[freq < self.threshold].index
        self.rare_labels = rare_labels
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every recorded rare category replaced by ``'Rare'``."""
        X = X.copy()
        for col, rare_cats in self.rare_labels.items():
            # Vectorised replacement: only entries that match a rare category
            # are changed; everything else, including missing values, is kept.
            X[col] = X[col].mask(X[col].isin(rare_cats), 'Rare')
        return X

In [20]:
class SkewnessTransformer(BaseEstimator, TransformerMixin):
    """Apply ``log1p`` to strongly skewed numeric columns.

    Parameters
    ----------
    skew_threshold : float, default=0.75
        A column is transformed when the absolute value of its sample
        skewness in the fitted data exceeds this value.

    Attributes
    ----------
    skewed_features : list
        Names of the columns selected in ``fit``, ordered from highest to
        lowest skewness.
    """

    def __init__(self, skew_threshold=0.75):
        self.skew_threshold = skew_threshold
        # Filled by fit(); starts empty so transform() before fit() only copies.
        self.skewed_features = []

    def fit(self, X, y=None):
        """Select the int64/float64 columns of ``X`` to be log-transformed.

        ``y`` is ignored. Returns ``self``.
        """
        numeric = X.select_dtypes(include=['int64', 'float64'])
        skewness = numeric.skew()
        is_skewed = skewness.abs() > self.skew_threshold
        # log1p(x) is -inf at x == -1 and NaN below it, so a column is only
        # eligible when every fitted value lies above -1.
        is_log_safe = numeric.min() > -1
        selected = skewness[is_skewed & is_log_safe]
        self.skewed_features = selected.sort_values(ascending=False).index.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``log1p`` applied to the selected columns."""
        X = X.copy()
        for feature in self.skewed_features:
            X[feature] = np.log1p(X[feature])  # log(1 + x)
        return X

In [25]:
# Categorical columns: missing values become the literal category 'None',
# then every category is one-hot encoded. Categories not seen during fit
# are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: mean imputation.
# The previous `SimpleImputer(fill_value=0)` left `strategy` at its default
# ('mean'), and `fill_value` is only honoured with strategy='constant', so
# the 0 was never used. The strategy is now spelled out; behaviour is the
# same as before. Use strategy='constant', fill_value=0 if zero-filling is
# what is actually wanted.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# End-to-end model: DataFrame-level cleaning steps first, then the
# array-producing preprocessor, then the stacked regressor.
pipeline = Pipeline(steps=[
    ('cleaner', CustomDataCleaner()),
    ('rare_label_encoder', RareLabelEncoder()),
    ('skewness_transformer', SkewnessTransformer()),
    ('preprocessor', preprocessor),       # imputation + one-hot encoding
    ('model', stacking_model),
])


In [26]:
# Fit every preprocessing step and the stacked model on the training split.
pipeline.fit(X_train, y_train)
print("Training completed!")

Training completed!


In [27]:
# Hold-out score: RMSE on the log1p(SalePrice) scale.
y_pred = pipeline.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"RMSE: {rmse:.4f}")

# 5-fold cross-validation over the full training data. The scorer returns
# negated RMSE values, hence the sign flip when reporting.
scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
print(f"Cross-Validation RMSE: {-scores.mean():.4f}")

RMSE: 0.0993
Cross-Validation RMSE: 0.1174


In [24]:
# Predict on the test set; the model outputs log1p(SalePrice).
test_log_pred = pipeline.predict(X_test)

# Undo the log1p transform and key the prices by the test set's Id index.
y_pred = pd.Series(np.expm1(test_log_pred), index=X_test.index)

# Write the submission file with 'SalePrice' as the value column header.
y_pred.to_csv('submission.csv', header=['SalePrice'])