In [135]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics  import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer, SimpleImputer
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [136]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [137]:
train_df['HouseAge'] = train_df['YrSold'] - train_df['YearBuilt']
test_df['HouseAge'] = test_df['YrSold'] - test_df['YearBuilt']
train_df['Total_SF'] = train_df['TotalBsmtSF'] + train_df['1stFlrSF'] + train_df['2ndFlrSF']
test_df['Total_SF'] = test_df['TotalBsmtSF'] + test_df['1stFlrSF'] + test_df['2ndFlrSF']
train_df['Remodel_Age'] = train_df['YrSold'] - train_df['YearRemodAdd']
test_df['Remodel_Age'] = test_df['YrSold'] - test_df['YearRemodAdd']

train_df.drop(columns=['Id', 'YrSold', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'YearRemodAdd'], inplace=True)
test_df.drop(columns=['Id', 'YrSold', 'YearBuilt',  'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'YearRemodAdd'], inplace=True)


In [138]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, nan_thresh=0.5):
        self.nan_thresh = nan_thresh
        self.drop_columns_ = []
    
    def fit(self, X, y=None):
        nan_ratio = X.isnull().sum() / len(X)
        self.drop_columns_ = nan_ratio[nan_ratio > self.nan_thresh].index.tolist()
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        if self.drop_columns_:
            X_copy.drop(columns=self.drop_columns_, inplace=True)
        return X_copy

In [139]:
class OutlierCapper(BaseEstimator, TransformerMixin):
    def __init__(self, z_thresh=3.0):
        self.z_thresh = z_thresh
        self.bounds_ = {}

    def fit(self, X, y=None):
        numeric_cols = X.select_dtypes(include=np.number).columns
        for col in numeric_cols:
            mean = X[col].mean()
            std = X[col].std()
            upper_bound = mean + self.z_thresh * std
            lower_bound = mean - self.z_thresh * std
            self.bounds_[col] = (lower_bound, upper_bound)
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col, (lower, upper) in self.bounds_.items():
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].clip(lower=lower, upper=upper)
        return X_copy

In [140]:
class CategoricalTypeConverter(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        self.object_columns = []
    
    def fit(self, x, y=None):
        print("\n--- Fitting CategoricalTypeConverter ---")
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)
        
        self.object_columns = x.select_dtypes(include='object').columns.tolist()
        if not self.object_columns:
            return self
        
        print(f"Found object columns to encode: {self.object_columns}")
        self._encoder.fit(x[self.object_columns])
        return self
    
    def transform(self, x):
        print("\n--- Transforming with CategoricalTypeConverter ---")
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)
        
        xcopy = x.copy()

        if not self.object_columns:
            return xcopy
        
        encoded = self._encoder.transform(xcopy[self.object_columns])

        encoded_df = pd.DataFrame(
            encoded,
            columns=self._encoder.get_feature_names_out(self.object_columns),
            index=xcopy.index
        )

        xcopy.drop(columns=self.object_columns, inplace=True)
        x_final = pd.concat([xcopy, encoded_df], axis=1)
        
        print(f"Successfully encoded and added {len(encoded_df.columns)} new columns.")
        return x_final

In [141]:
class NaNImputer(BaseEstimator, TransformerMixin):
    """
    A robust imputer that learns all columns of a given type from the training
    data and is prepared to impute any of them in the test data.
    """
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.numerical_imputer = None
        self.categorical_imputer = None
        self.numerical_cols = []
        self.categorical_cols = []

    def fit(self, x, y=None):
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)
        
        self.numerical_cols = x.select_dtypes(include=np.number).columns.tolist()
        self.categorical_cols = x.select_dtypes(include='object').columns.tolist()

        if self.numerical_cols:
            self.numerical_imputer = KNNImputer(n_neighbors=self.n_neighbors)
            self.numerical_imputer.fit(x[self.numerical_cols])
            
        if self.categorical_cols:
            self.categorical_imputer = SimpleImputer(strategy='most_frequent')
            self.categorical_imputer.fit(x[self.categorical_cols])

        return self
    
    def transform(self, x):
        xcopy = x.copy()
        
        # Apply the transformation to all learned columns.
        if self.numerical_cols and self.numerical_imputer:
            xcopy[self.numerical_cols] = self.numerical_imputer.transform(xcopy[self.numerical_cols])
        
        if self.categorical_cols and self.categorical_imputer:
            xcopy[self.categorical_cols] = self.categorical_imputer.transform(xcopy[self.categorical_cols])
        
        return xcopy

In [142]:
preprocessing_pipeline = Pipeline(steps=[
    ('nan_dropper', ColumnDropper(nan_thresh=0.5)), 
    ('outlier_capper', OutlierCapper(z_thresh=3.0)),
    ('imputer', NaNImputer(n_neighbors=5)),
    ('encoder', CategoricalTypeConverter())
])

In [143]:
def train_and_test(xtrain, ytrain, xtest, ytest, model):
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5, 6],
        'subsample': [0.6, 0.7, 0.8],
        'colsample_bytree': [0.6, 0.7, 0.8],
        'n_estimators': [500, 1000, 2000],
        'gamma': [0, 0.1, 0.2]
    }

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=25,
        scoring='neg_root_mean_squared_error',
        cv=5,
        verbose=1,
        random_state=42
    )

    random_search.fit(xtrain, ytrain)
    print("\n--- Hyperparameter Tuning Results ---")
    print(f"Best parameters found: {random_search.best_params_}")

    best_model = random_search.best_estimator_
    
    ypred_log = best_model.predict(xtest)

    ypred_original = np.expm1(ypred_log)
    ytest_original = np.expm1(ytest)
    
    mse = mean_squared_error(ytest_original, ypred_original)
    rmse = np.sqrt(mse)
    
    print(f'Mean Squared Error (MSE): {mse:.2f}')
    print(f'Root Mean Squared Error (RMSE): ${rmse:.2f}')

In [144]:
model1 = xgb.XGBRegressor(
    random_state=42,
    n_jobs=-1
)

x = train_df.drop(columns=['SalePrice'])
x = preprocessing_pipeline.fit_transform(x)
y = np.log1p(train_df['SalePrice'])

# xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
# train_and_test(xtrain, ytrain, xtest, ytest, model1)


--- Fitting CategoricalTypeConverter ---
Found object columns to encode: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']

--- Transforming with CategoricalTypeConverter ---
Successfully encoded and added 235 new columns.


In [145]:
# model2 = xgb.XGBRegressor(
#     subsample=0.6,
#     n_estimators=1000,
#     max_depth=6,
#     learning_rate=0.01,
#     gamma=0,
#     colsample_bytree=0.6
# )
# model2.fit(x, y)
# test_pred = np.expm1(model2.predict(preprocessing_pipeline.transform(test_df)))

In [146]:
models = {
    'Lasso': Pipeline([('scaler', StandardScaler()), ('model', Lasso(alpha=0.0005, random_state=42))]),
    'Ridge': Pipeline([('scaler', StandardScaler()), ('model', Ridge(alpha=10.0, random_state=42))]),
    'ElasticNet': Pipeline([('scaler', StandardScaler()), ('model', ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42))]),
    'RandomForest': RandomForestRegressor(n_estimators=500, max_depth=6, random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05, max_depth=4, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=4, random_state=42, n_jobs=-1)
}

In [147]:
test_df = preprocessing_pipeline.transform(test_df)


--- Transforming with CategoricalTypeConverter ---
Successfully encoded and added 235 new columns.




In [148]:
test_df[test_df.isnull().any(axis=1)]

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,LowQualFinSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial


In [149]:
predictions = {}

print("--- Training models and making predictions ---")
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(x, y)
    predictions[name] = model.predict(test_df)

predictions_df = pd.DataFrame(predictions)

ensemble_predictions_log = predictions_df.mean(axis=1)
final_predictions = np.expm1(ensemble_predictions_log)

--- Training models and making predictions ---
Training Lasso...
Training Ridge...
Training ElasticNet...
Training RandomForest...
Training GradientBoosting...
Training XGBoost...


In [150]:
submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': final_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!")
print(submission_df.head())

KeyError: 'Id'