In [118]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from scipy import stats
from sklearn.metrics  import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer, SimpleImputer
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [119]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [120]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, nan_thresh=0.5):
        self.nan_thresh = nan_thresh
        self.drop_columns_ = []
    
    def fit(self, X, y=None):
        nan_ratio = X.isnull().sum() / len(X)
        self.drop_columns_ = nan_ratio[nan_ratio > self.nan_thresh].index.tolist()
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        if self.drop_columns_:
            X_copy.drop(columns=self.drop_columns_, inplace=True)
        return X_copy

In [121]:
class OutlierCapper(BaseEstimator, TransformerMixin):
    def __init__(self, z_thresh=3.0):
        self.z_thresh = z_thresh
        self.bounds_ = {}

    def fit(self, X, y=None):
        numeric_cols = X.select_dtypes(include=np.number).columns
        for col in numeric_cols:
            mean = X[col].mean()
            std = X[col].std()
            upper_bound = mean + self.z_thresh * std
            lower_bound = mean - self.z_thresh * std
            self.bounds_[col] = (lower_bound, upper_bound)
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col, (lower, upper) in self.bounds_.items():
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].clip(lower=lower, upper=upper)
        return X_copy

In [122]:
class CategoricalTypeConverter(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        self.object_columns = []
    
    def fit(self, x, y=None):
        print("\n--- Fitting CategoricalTypeConverter ---")
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)
        
        self.object_columns = x.select_dtypes(include='object').columns.tolist()
        if not self.object_columns:
            return self
        
        print(f"Found object columns to encode: {self.object_columns}")
        self._encoder.fit(x[self.object_columns])
        return self
    
    def transform(self, x):
        print("\n--- Transforming with CategoricalTypeConverter ---")
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)
        
        xcopy = x.copy()

        if not self.object_columns:
            return xcopy
        
        encoded = self._encoder.transform(xcopy[self.object_columns])

        encoded_df = pd.DataFrame(
            encoded,
            columns=self._encoder.get_feature_names_out(self.object_columns),
            index=xcopy.index
        )

        xcopy.drop(columns=self.object_columns, inplace=True)
        x_final = pd.concat([xcopy, encoded_df], axis=1)
        
        print(f"Successfully encoded and added {len(encoded_df.columns)} new columns.")
        return x_final

In [123]:
class NaNImputer(BaseEstimator, TransformerMixin):
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.numerical_imputer = None
        self.categorical_imputer = None
        self.numerical_cols = []
        self.categorical_cols = []

    def fit(self, x, y=None):
        print("\n--- Fitting NaNImputer ---")
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)
        
        self.numerical_cols = [col for col in x.select_dtypes(include=np.number) if x[col].isnull().any()]
        self.categorical_cols = [col for col in x.select_dtypes(include='object') if x[col].isnull().any()]

        if self.numerical_cols:
            print(f"Fitting KNNImputer on columns: {self.numerical_cols}")
            self.numerical_imputer = KNNImputer(n_neighbors=self.n_neighbors)
            self.numerical_imputer.fit(x[self.numerical_cols])
            
        if self.categorical_cols:
            print(f"Fitting SimpleImputer (mode) on columns: {self.categorical_cols}")
            self.categorical_imputer = SimpleImputer(strategy='most_frequent')
            self.categorical_imputer.fit(x[self.categorical_cols])

        return self
    
    def transform(self, x):
        print("\n--- Transforming with NaNImputer ---")
        if not isinstance(x, pd.DataFrame):
            x = pd.DataFrame(x)

        xcopy = x.copy()

        if self.numerical_cols and self.numerical_imputer:
            imputed_num = self.numerical_imputer.transform(xcopy[self.numerical_cols])
            xcopy[self.numerical_cols] = imputed_num
            print(f"Imputed NaNs in numerical columns: {self.numerical_cols}")
        
        if self.categorical_cols and self.categorical_imputer:
            imputed_cat = self.categorical_imputer.transform(xcopy[self.categorical_cols])
            xcopy[self.categorical_cols] = imputed_cat
            print(f"Imputed NaNs in categorical columns: {self.categorical_cols}")
        
        return xcopy

In [124]:
preprocessing_pipeline = Pipeline(steps=[
    ('nan_dropper', ColumnDropper(nan_thresh=0.5)), 
    ('outlier_capper', OutlierCapper(z_thresh=3.0)),
    ('imputer', NaNImputer(n_neighbors=5)),
    ('encoder', CategoricalTypeConverter())
])

In [125]:
processed_df = preprocessing_pipeline.fit_transform(train_df)
processed_df


--- Fitting NaNImputer ---
Fitting KNNImputer on columns: ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
Fitting SimpleImputer (mode) on columns: ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

--- Transforming with NaNImputer ---
Imputed NaNs in numerical columns: ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
Imputed NaNs in categorical columns: ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

--- Fitting CategoricalTypeConverter ---
Found object columns to encode: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60.0,65.0,8450.0,7.0,5.00000,2003.0,2003,196.0,706.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,20.0,80.0,9600.0,6.0,8.00000,1976.0,1976,0.0,978.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,60.0,68.0,11250.0,7.0,5.00000,2001.0,2002,162.0,486.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,70.0,60.0,9550.0,7.0,5.00000,1915.0,1970,0.0,216.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,60.0,84.0,14260.0,8.0,5.00000,2000.0,2000,350.0,655.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60.0,62.0,7917.0,6.0,5.00000,1999.0,2000,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1456,1457,20.0,85.0,13175.0,6.0,6.00000,1978.0,1988,119.0,790.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1457,1458,70.0,66.0,9042.0,7.0,8.91374,1941.0,2006,0.0,275.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1458,1459,20.0,68.0,9717.0,5.0,6.00000,1950.0,1996,0.0,49.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
def train_and_test(xtrain, ytrain, xtest, ytest, model):
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5, 6],
        'subsample': [0.6, 0.7, 0.8],
        'colsample_bytree': [0.6, 0.7, 0.8],
        'n_estimators': [500, 1000, 2000],
        'gamma': [0, 0.1, 0.2]
    }

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=25,
        scoring='neg_root_mean_squared_error',
        cv=5,
        verbose=1,
        random_state=42
    )

    random_search.fit(xtrain, ytrain)
    print("\n--- Hyperparameter Tuning Results ---")
    print(f"Best parameters found: {random_search.best_params_}")

    best_model = random_search.best_estimator_
    
    ypred_log = best_model.predict(xtest)

    ypred_original = np.expm1(ypred_log)
    ytest_original = np.expm1(ytest)
    
    mse = mean_squared_error(ytest_original, ypred_original)
    rmse = np.sqrt(mse)
    
    print(f'Mean Squared Error (MSE): {mse:.2f}')
    print(f'Root Mean Squared Error (RMSE): ${rmse:.2f}')

In [127]:
model1 = xgb.XGBRegressor(
    random_state=42,
    n_jobs=-1
)

x = processed_df.drop(columns=['SalePrice'])
y = np.log1p(processed_df['SalePrice'])

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
train_and_test(xtrain, ytrain, xtest, ytest, model1)

Fitting 5 folds for each of 25 candidates, totalling 125 fits

--- Hyperparameter Tuning Results ---
Best parameters found: {'subsample': 0.6, 'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6}
Mean Squared Error (MSE): 429713329.56
Root Mean Squared Error (RMSE): $20729.53


In [135]:
model2 = xgb.XGBRegressor(
    subsample=0.6,
    n_estimators=500,
    max_depth=3,
    learning_rate=0.1,
    gamma=0,
    colsample_bytree=0.8
)
model2.fit(x, y)
test_pred = np.expm1(model2.predict(preprocessing_pipeline.transform(test_df)))


--- Transforming with NaNImputer ---
Imputed NaNs in numerical columns: ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
Imputed NaNs in categorical columns: ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']

--- Transforming with CategoricalTypeConverter ---
Successfully encoded and added 235 new columns.




In [136]:
submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': test_pred
})

submission_df.to_csv('submission.csv', index=False)

print("\n✅ Submission file 'submission.csv' created successfully!")
print(submission_df.head())


✅ Submission file 'submission.csv' created successfully!
     Id      SalePrice
0  1461  119360.367188
1  1462  151176.140625
2  1463  181911.203125
3  1464  185476.468750
4  1465  182003.703125
