In [63]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics  import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso, Ridge, ElasticNet, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [34]:
# Read the raw train/test splits from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Keep the test identifiers aside: 'Id' is dropped from the feature frames in the
# feature-engineering cell, but the submission file is keyed on it.
test_id = test_df['Id']

In [35]:
def _add_engineered_features(df):
    """Return a copy of ``df`` with derived features added and their sources removed.

    Adds, in this order:
      * ``HouseAge``    = ``YrSold - YearBuilt``
      * ``Total_SF``    = ``TotalBsmtSF + 1stFlrSF + 2ndFlrSF``
      * ``Remodel_Age`` = ``YrSold - YearRemodAdd``

    and then drops ``Id`` together with every raw column used above.  The
    input frame is not modified, so the same logic is applied identically to
    the train and test splits instead of being copy-pasted per split.

    Raises:
        KeyError: if any of the required raw columns is missing (for example
            when the frame has already been through this function).
    """
    engineered = df.assign(
        HouseAge=df['YrSold'] - df['YearBuilt'],
        Total_SF=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'],
        Remodel_Age=df['YrSold'] - df['YearRemodAdd'],
    )
    return engineered.drop(
        columns=['Id', 'YrSold', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'YearRemodAdd']
    )


train_df = _add_engineered_features(train_df)
test_df = _add_engineered_features(test_df)


In [36]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Remove columns whose fraction of missing values exceeds ``nan_thresh``.

    The columns to remove are chosen in ``fit`` and stored in
    ``drop_columns_``; ``transform`` removes exactly those columns.
    """

    def __init__(self, nan_thresh=0.5):
        self.nan_thresh = nan_thresh
        self.drop_columns_ = []

    def fit(self, X, y=None):
        """Record every column of ``X`` whose missing-value ratio is above the threshold."""
        missing_share = X.isnull().sum() / len(X)
        too_sparse = missing_share > self.nan_thresh
        self.drop_columns_ = missing_share[too_sparse].index.tolist()
        return self

    def transform(self, X):
        """Return a new frame without the columns recorded during ``fit``."""
        if not self.drop_columns_:
            # Nothing to remove: still hand back a copy so the caller's frame is untouched.
            return X.copy()
        return X.drop(columns=self.drop_columns_)

In [37]:
class OutlierCapper(BaseEstimator, TransformerMixin):
    """Clip numeric columns to ``mean ± z_thresh * std`` learned during ``fit``.

    Parameters:
        z_thresh: number of standard deviations around the mean that values
            are allowed to span before being clipped.

    Attributes:
        bounds_: mapping ``column -> (lower, upper)`` computed by the most
            recent ``fit`` call.
    """

    def __init__(self, z_thresh=3.0):
        self.z_thresh = z_thresh
        # Kept for backward compatibility: an unfitted capper transforms as a no-op.
        self.bounds_ = {}

    def fit(self, X, y=None):
        """Compute clipping bounds for every numeric column of ``X``."""
        # Build a fresh mapping so that refitting the same instance on a frame
        # with different columns does not keep stale bounds from an earlier fit.
        bounds = {}
        for col in X.select_dtypes(include=np.number).columns:
            mean = X[col].mean()
            std = X[col].std()
            bounds[col] = (mean - self.z_thresh * std, mean + self.z_thresh * std)
        self.bounds_ = bounds
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each fitted column clipped to its bounds.

        Columns seen during ``fit`` but absent from ``X`` are skipped.
        """
        X_copy = X.copy()
        for col, (lower, upper) in self.bounds_.items():
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].clip(lower=lower, upper=upper)
        return X_copy

In [38]:
class CategoricalTypeConverter(BaseEstimator, TransformerMixin):
    """One-hot encode every ``object``-dtype column and keep the remaining columns.

    The object columns are discovered in ``fit``; ``transform`` replaces them
    with the encoder's indicator columns appended after the remaining columns.
    Categories unseen during ``fit`` are ignored by the underlying encoder.
    """

    def __init__(self):
        self._encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        self.object_columns = []

    @staticmethod
    def _as_frame(data):
        """Return ``data`` unchanged if it is a DataFrame, otherwise wrap it in one."""
        return data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)

    def fit(self, x, y=None):
        """Find the object columns of ``x`` and fit the one-hot encoder on them."""
        print("\n--- Fitting CategoricalTypeConverter ---")
        frame = self._as_frame(x)

        self.object_columns = frame.select_dtypes(include='object').columns.tolist()
        if self.object_columns:
            print(f"Found object columns to encode: {self.object_columns}")
            self._encoder.fit(frame[self.object_columns])
        return self

    def transform(self, x):
        """Return a new frame with the fitted object columns replaced by indicators."""
        print("\n--- Transforming with CategoricalTypeConverter ---")
        working = self._as_frame(x).copy()

        # No object columns were found during fit: pass the data through as a copy.
        if not self.object_columns:
            return working

        indicator_df = pd.DataFrame(
            self._encoder.transform(working[self.object_columns]),
            columns=self._encoder.get_feature_names_out(self.object_columns),
            index=working.index,
        )
        remaining = working.drop(columns=self.object_columns)
        x_final = pd.concat([remaining, indicator_df], axis=1)

        print(f"Successfully encoded and added {len(indicator_df.columns)} new columns.")
        return x_final

In [39]:
class NaNImputer(BaseEstimator, TransformerMixin):
    """Fill missing values: KNN for numeric columns, most-frequent for object columns.

    Parameters:
        n_neighbors: neighbour count passed to ``KNNImputer`` for the numeric
            columns.

    The numeric / object column lists are determined in ``fit`` and the same
    columns are imputed in ``transform``.
    """

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.numerical_imputer = None
        self.categorical_imputer = None
        self.numerical_cols = []
        self.categorical_cols = []

    def fit(self, x, y=None):
        """Fit one imputer per dtype group present in ``x``."""
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)

        self.numerical_cols = x.select_dtypes(include=np.number).columns.tolist()
        self.categorical_cols = x.select_dtypes(include='object').columns.tolist()

        # Start from a clean slate so a refit on data lacking one dtype group
        # does not keep an imputer fitted on a previous dataset.
        self.numerical_imputer = None
        self.categorical_imputer = None

        if self.numerical_cols:
            self.numerical_imputer = KNNImputer(n_neighbors=self.n_neighbors)
            self.numerical_imputer.fit(x[self.numerical_cols])

        if self.categorical_cols:
            self.categorical_imputer = SimpleImputer(strategy='most_frequent')
            self.categorical_imputer.fit(x[self.categorical_cols])

        return self

    def transform(self, x):
        """Return a copy of ``x`` with the fitted columns imputed."""
        # Mirror fit(): accept array-likes as well, so column selection below
        # works on the same kind of object the imputers were fitted on.
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)

        xcopy = x.copy()
        if self.numerical_cols and self.numerical_imputer is not None:
            xcopy[self.numerical_cols] = self.numerical_imputer.transform(xcopy[self.numerical_cols])

        if self.categorical_cols and self.categorical_imputer is not None:
            xcopy[self.categorical_cols] = self.categorical_imputer.transform(xcopy[self.categorical_cols])

        return xcopy

In [40]:
# Preprocessing chain, applied in this order:
#   1. drop columns that are more than 50% missing,
#   2. clip numeric columns at 3 standard deviations from the mean,
#   3. impute the remaining missing values,
#   4. one-hot encode the object columns.
preprocessing_pipeline = Pipeline([
    ('nan_dropper', ColumnDropper(nan_thresh=0.5)),
    ('outlier_capper', OutlierCapper(z_thresh=3.0)),
    ('imputer', NaNImputer(n_neighbors=5)),
    ('encoder', CategoricalTypeConverter()),
])

In [41]:
# Search spaces for the hyperparameter search.
# The linear models are tuned inside a Pipeline whose final step is named
# 'model', hence the 'model__' prefix on their parameter names.
param_grid_lasso = dict(
    model__alpha=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
)

param_grid_ridge = dict(
    model__alpha=[0.01, 0.1, 1, 10, 100, 200],
)

param_grid_elasticnet = dict(
    model__alpha=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
    model__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0],
)

# The tree-based models are tuned directly, so their parameter names are unprefixed.
param_grid_rf = dict(
    n_estimators=[100, 200, 500, 1000],
    max_depth=[5, 10, 15, 20, None],
    max_features=['sqrt', 'log2', 1.0],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

param_grid_gb = dict(
    n_estimators=[500, 1000, 2000],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5, 6],
    subsample=[0.6, 0.7, 0.8, 0.9],
    max_features=['sqrt', 'log2'],
)

param_grid_xgb = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5, 6],
    subsample=[0.6, 0.7, 0.8],
    colsample_bytree=[0.6, 0.7, 0.8],
    n_estimators=[500, 1000, 2000],
    gamma=[0, 0.1, 0.2],
)

# Grids are looked up by position, so this order must match the order in which
# the models are iterated when tuning.
param_list = [
    param_grid_lasso,
    param_grid_ridge,
    param_grid_elasticnet,
    param_grid_rf,
    param_grid_gb,
    param_grid_xgb,
]

In [42]:
def train_and_test(xtrain, xtest, ytrain, ytest, model, idx,
                   param_distributions=None, n_iter=25, cv=5, random_state=42):
    """Tune ``model`` with a randomized search and report hold-out error.

    The best estimator found on ``(xtrain, ytrain)`` predicts ``xtest``; both
    the predictions and ``ytest`` are passed through ``np.expm1`` before the
    error is computed, so the targets are expected to be on the ``log1p``
    scale.

    Parameters:
        xtrain, ytrain: data used for the cross-validated search.
        xtest, ytest: hold-out data used only for the printed MSE / RMSE.
        model: estimator (or pipeline) to tune.
        idx: position of the model's search space in the module-level
            ``param_list``; used only when ``param_distributions`` is None.
        param_distributions: optional explicit search space.  Supplying it
            removes the dependency on the ordering of ``param_list``.
        n_iter: number of parameter settings sampled by the search.
        cv: number of cross-validation folds (or any value accepted by
            ``RandomizedSearchCV``).
        random_state: seed for the parameter sampling.

    Returns:
        dict: ``best_params_`` of the fitted search.
    """
    if param_distributions is None:
        # Original behaviour: pick the grid by position in the global list.
        param_distributions = param_list[idx]

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        verbose=1,
        random_state=random_state
    )

    random_search.fit(xtrain, ytrain)
    print("\n--- Hyperparameter Tuning Results ---")
    print(f"Best parameters found: {random_search.best_params_}")

    best_model = random_search.best_estimator_

    ypred_log = best_model.predict(xtest)

    # Undo the log1p transform so the error is reported in original units.
    ypred_original = np.expm1(ypred_log)
    ytest_original = np.expm1(ytest)

    mse = mean_squared_error(ytest_original, ypred_original)
    rmse = np.sqrt(mse)

    print(f'Mean Squared Error (MSE): {mse:.2f}')
    print(f'Root Mean Squared Error (RMSE): ${rmse:.2f}')

    return random_search.best_params_

In [43]:
# XGBoost regressor with a fixed seed, using all available cores.
model1 = xgb.XGBRegressor(
    random_state=42,
    n_jobs=-1
)

# Feature matrix: every column except the target, run through the preprocessing pipeline.
# NOTE(review): the pipeline is fitted here on all training rows, i.e. before the
# hold-out split and cross-validation folds are created in later cells, so the
# preprocessing statistics are computed with the validation rows included.
x = train_df.drop(columns=['SalePrice'])
x = preprocessing_pipeline.fit_transform(x)
# Target on the log1p scale; train_and_test maps predictions back with np.expm1.
y = np.log1p(train_df['SalePrice'])

# Disabled single-model tuning run, kept for reference.
# NOTE(review): the argument order below does not match the signature
# train_and_test(xtrain, xtest, ytrain, ytest, model, idx) and `idx` is missing;
# fix both before re-enabling.
# xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
# train_and_test(xtrain, ytrain, xtest, ytest, model1)


--- Fitting CategoricalTypeConverter ---
Found object columns to encode: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']

--- Transforming with CategoricalTypeConverter ---
Successfully encoded and added 235 new columns.


In [44]:
# Disabled experiment: a single XGBoost model with hand-picked hyperparameters,
# fitted on the full training data and used to predict the test set.
# model2 = xgb.XGBRegressor(
#     subsample=0.6,
#     n_estimators=1000,
#     max_depth=6,
#     learning_rate=0.01,
#     gamma=0,
#     colsample_bytree=0.6
# )
# model2.fit(x, y)
# test_pred = np.expm1(model2.predict(preprocessing_pipeline.transform(test_df)))

In [45]:
# Candidate estimators to tune.
# The linear models are wrapped in a Pipeline with a StandardScaler; the
# tree-based models are used directly.
# The insertion order matters: the tuning loop looks up each model's search
# space in `param_list` by position.
models = {
    'Lasso': Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', Lasso(random_state=42)),
    ]),
    'Ridge': Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', Ridge(random_state=42)),
    ]),
    'ElasticNet': Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42)),
    ]),
    'RandomForest': RandomForestRegressor(
        n_estimators=500, max_depth=6, random_state=42, n_jobs=-1,
    ),
    'GradientBoosting': GradientBoostingRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=4, random_state=42,
    ),
    'XGBoost': xgb.XGBRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=4, random_state=42, n_jobs=-1,
    ),
}

In [46]:
# Apply the preprocessing fitted on the training features to the test features.
# NOTE(review): this rebinds `test_df` to the transformed frame, so the cell is
# not idempotent — re-running it would feed already-encoded data back into the
# pipeline. Re-run from the data-loading cell instead.
test_df = preprocessing_pipeline.transform(test_df)


--- Transforming with CategoricalTypeConverter ---
Successfully encoded and added 235 new columns.




In [47]:
# Sanity check: show any test rows that still contain a missing value.
test_df.loc[test_df.isna().any(axis='columns')]

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,LowQualFinSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial


In [48]:
predictions = {}
best_params = {}

# Fixed seed so the hold-out split — and therefore the reported MSE/RMSE and
# the selected hyperparameters — are reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

print("--- Training models and making predictions ---")
# `idx` selects the model's search space by position in `param_list`, so the
# iteration order of `models` must match the order of `param_list`.
for idx, (name, model) in enumerate(models.items()):
    print(f"Training {name}...")
    best_params[name] = train_and_test(xtrain, xtest, ytrain, ytest, model, idx)

--- Training models and making predictions ---
Training Lasso...
Fitting 5 folds for each of 7 candidates, totalling 35 fits


  model = cd_fast.enet_coordinate_descent(



--- Hyperparameter Tuning Results ---
Best parameters found: {'model__alpha': 0.01}
Mean Squared Error (MSE): 356885593.92
Root Mean Squared Error (RMSE): $18891.42
Training Ridge...
Fitting 5 folds for each of 6 candidates, totalling 30 fits





--- Hyperparameter Tuning Results ---
Best parameters found: {'model__alpha': 200}
Mean Squared Error (MSE): 330597599.43
Root Mean Squared Error (RMSE): $18182.34
Training ElasticNet...
Fitting 5 folds for each of 25 candidates, totalling 125 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



--- Hyperparameter Tuning Results ---
Best parameters found: {'model__l1_ratio': 0.5, 'model__alpha': 0.01}
Mean Squared Error (MSE): 292584509.24
Root Mean Squared Error (RMSE): $17105.10
Training RandomForest...
Fitting 5 folds for each of 25 candidates, totalling 125 fits

--- Hyperparameter Tuning Results ---
Best parameters found: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 1.0, 'max_depth': 20}
Mean Squared Error (MSE): 588792954.77
Root Mean Squared Error (RMSE): $24265.06
Training GradientBoosting...
Fitting 5 folds for each of 25 candidates, totalling 125 fits

--- Hyperparameter Tuning Results ---
Best parameters found: {'subsample': 0.6, 'n_estimators': 2000, 'max_features': 'sqrt', 'max_depth': 5, 'learning_rate': 0.01}
Mean Squared Error (MSE): 380576159.12
Root Mean Squared Error (RMSE): $19508.36
Training XGBoost...
Fitting 5 folds for each of 25 candidates, totalling 125 fits

--- Hyperparameter Tuning Results ---
Best parameter

In [49]:
# The linear models were tuned inside a Pipeline, so their best parameters carry
# a 'model__' prefix; strip it so they can be passed straight to the estimator
# constructors. Parameters of the other models are used unchanged.
best_params_cleaned = {
    name: (
        {key.replace('model__', ''): value for key, value in params.items()}
        if name in ['Lasso', 'Ridge', 'ElasticNet']
        else params
    )
    for name, params in best_params.items()
}

# Rebuild every model with its tuned hyperparameters (same seeds as before).
tuned_models = {}
tuned_models['XGBoost'] = xgb.XGBRegressor(
    **best_params_cleaned['XGBoost'], random_state=42, n_jobs=-1
)
tuned_models['GradientBoosting'] = GradientBoostingRegressor(
    **best_params_cleaned['GradientBoosting'], random_state=42
)
tuned_models['RandomForest'] = RandomForestRegressor(
    **best_params_cleaned['RandomForest'], random_state=42, n_jobs=-1
)
tuned_models['Lasso'] = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Lasso(**best_params_cleaned['Lasso'], random_state=42)),
])
tuned_models['Ridge'] = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Ridge(**best_params_cleaned['Ridge'], random_state=42)),
])
tuned_models['ElasticNet'] = Pipeline([
    ('scaler', StandardScaler()),
    ('model', ElasticNet(**best_params_cleaned['ElasticNet'], random_state=42)),
])

print("Successfully initialized all tuned models.")

Successfully initialized all tuned models.


In [59]:
# Simple averaging ensemble: fit every tuned model on the full training data
# and average their log-scale predictions for the test set.
final_model_predictions = {}

print("\n--- Training final ensemble models on full training data ---")
for model_name, estimator in tuned_models.items():
    print(f"Training {model_name}...")
    estimator.fit(x, y)
    final_model_predictions[model_name] = estimator.predict(test_df)

# One column per model, one row per test sample.
predictions_df = pd.DataFrame(final_model_predictions)

# Average on the log scale, then map back to the original target scale.
ensemble_predictions_log = predictions_df.mean(axis='columns')
final_predictions = np.expm1(ensemble_predictions_log)

print("\n--- Final ensemble predictions generated successfully! ---")


--- Training final ensemble models on full training data ---
Training XGBoost...
Training GradientBoosting...
Training RandomForest...
Training Lasso...
Training Ridge...
Training ElasticNet...

--- Final ensemble predictions generated successfully! ---


In [64]:
# Stacked ensemble: the tuned models are the base learners and a RidgeCV
# meta-learner combines their 5-fold out-of-fold predictions.
base_models = [(label, estimator) for label, estimator in tuned_models.items()]
stacking = StackingRegressor(estimators=base_models, final_estimator=RidgeCV(), cv=5, n_jobs=-1)

In [65]:
# Fit the stack on the full training data, predict on the log scale, then map
# the predictions back to the original target scale.
stacking.fit(x, y)
stacked_predictions_log = stacking.predict(test_df)
final_predictions = np.expm1(stacked_predictions_log)

In [66]:
# Pair the saved test identifiers with the predicted prices and write the
# submission file to the working directory.
submission_columns = {'Id': test_id, 'SalePrice': final_predictions}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!")
print(submission_df.head())

Submission file 'submission.csv' created successfully!
     Id      SalePrice
0  1461  123636.365258
1  1462  161488.591214
2  1463  186743.460412
3  1464  196524.789047
4  1465  188969.175155
