In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [29]:
!pip install dagshub
import dagshub
dagshub.init(repo_owner='dshan21', repo_name='ML_ASS_1', mlflow=True)



In [30]:
!pip install mlflow



In [31]:
import pandas as pd

# Read the competition's training CSV and report its (rows, columns) shape.
train_file_path = "../input/house-prices-advanced-regression-techniques/train.csv"
dataset_df = pd.read_csv(filepath_or_buffer=train_file_path)
n_rows, n_cols = dataset_df.shape
print((n_rows, n_cols))

(1460, 81)


In [32]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# Both frames still contain the SalePrice target at this point.
X_train, X_test = train_test_split(
    dataset_df,
    test_size=0.2,
    random_state=42,
)

In [33]:
# X_train.head()
# X_test.head()

In [34]:
# Remove the "Id" column from both splits, modifying each frame in place.
for split_frame in (X_train, X_test):
    split_frame.drop(columns=["Id"], inplace=True)

# FEATURE CLEANING

In [35]:
# Ordinal encoding for the quality/condition grades; a missing grade (NaN) maps to 0.
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}

# Graded columns to encode; each one gains a numeric "<name>Num" companion column.
_GRADED_COLUMNS = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']


def _add_quality_scores(frame):
    """Add ordinal grade columns and combined quality scores to ``frame`` in place.

    For every column in ``_GRADED_COLUMNS`` a ``<column>Num`` column is created by
    mapping the grade through ``quality_map``. Four summary columns are then added:
    ``OverallScore`` (OverallQual * OverallCond) and ``ExterScore``, ``GarageScore``,
    ``BsmtScore`` (quality grade + condition grade of the respective group).
    """
    for graded in _GRADED_COLUMNS:
        frame[graded + 'Num'] = frame[graded].map(quality_map)

    frame['OverallScore'] = frame['OverallQual'] * frame['OverallCond']
    frame['ExterScore'] = frame['ExterQualNum'] + frame['ExterCondNum']
    frame['GarageScore'] = frame['GarageQualNum'] + frame['GarageCondNum']
    frame['BsmtScore'] = frame['BsmtQualNum'] + frame['BsmtCondNum']


# Apply the identical transformation to the training and the hold-out split.
_add_quality_scores(X_train)
_add_quality_scores(X_test)

In [36]:
# X_train.info()
# X_test.info()

In [37]:
# dataset_df.fillna(dataset_df.mean(), inplace=True)
# X_train.isna().mean()

In [38]:
# X_test.isna().mean()

In [39]:
import json
import mlflow
import mlflow.sklearn
mlflow.set_experiment("House_Price_Regression_Object_OHE")

allFeatures = []
highest_value_mapping = {}

# Collapse every non-numeric column into a single binary "<column>_num" flag:
# 1 when the row holds the category with the highest mean SalePrice, otherwise 0.
# The run is opened as a context manager so it is always closed, even if a step
# below raises (a run left active would make the next mlflow.start_run() fail).
with mlflow.start_run():
    # X_train.dtypes is evaluated once before the loop starts, so adding and
    # dropping columns inside the loop does not disturb the iteration.
    for colName, dataType in X_train.dtypes.items():
        allFeatures.append(colName)
        if dataType in ['int64', 'float64']:
            continue

        # The "best" category is learned from the training rows only.
        highest_value_row = X_train.groupby(colName)['SalePrice'].mean().idxmax()
        highest_value_mapping[colName] = highest_value_row

        # Encode BOTH splits with that same training-derived category. Deriving a
        # separate category from X_test's SalePrice would leak hold-out labels into
        # the features and could give the two splits different encodings, so the
        # logged mapping would no longer describe what the model is evaluated on.
        for frame in (X_train, X_test):
            frame[colName + '_num'] = np.where(frame[colName] == highest_value_row, 1, 0)
            frame.drop(columns=[colName], inplace=True)

    with open("highest_value_mapping.json", "w") as f:
        json.dump(highest_value_mapping, f)

    # Log the JSON file in MLflow
    mlflow.log_artifact("highest_value_mapping.json")


🏃 View run fortunate-mule-513 at: https://dagshub.com/dshan21/ML_ASS_1.mlflow/#/experiments/2/runs/310b075f53804f80899273b2bdd7a384
🧪 View experiment at: https://dagshub.com/dshan21/ML_ASS_1.mlflow/#/experiments/2


In [40]:
# Fill every remaining missing value with the column's most frequent value.
# The modes are learned from the training split only and reused for the hold-out
# split, so the hold-out rows are preprocessed exactly as unseen data would be and
# none of their statistics leak into the preprocessing.
train_modes = X_train.mode().iloc[0]
X_train.fillna(train_modes, inplace=True)
X_test.fillna(train_modes, inplace=True)

In [41]:
# X_train.isna().mean()

In [42]:
# X_test.isna().mean()

In [43]:
# Give both splits a fresh 0..n-1 index, discarding the row labels left over from
# the shuffled split.
X_train, X_test = (frame.reset_index(drop=True) for frame in (X_train, X_test))

# Feature Engineering

In [44]:

def _add_engineered_features(frame):
    """Add the derived size, bathroom, age, porch, basement and garage columns in place.

    Columns created, in this order: TotalSF, TotalBath, AgeAtSale,
    YearsSinceRemodel, GarageAge, TotalPorchSF, HasPorch, HasDeck, HasBasement,
    TotalFinishedBsmtSF, TotalUsableBsmtSF, HasGarage, GarageCapacityValue.
    """
    # Overall living area and bathroom count (half baths count as 0.5).
    frame['TotalSF'] = frame['1stFlrSF'] + frame['2ndFlrSF'] + frame['TotalBsmtSF']
    frame['TotalBath'] = (
        frame['FullBath']
        + (0.5 * frame['HalfBath'])
        + frame['BsmtFullBath']
        + (0.5 * frame['BsmtHalfBath'])
    )

    # Ages measured relative to the year of sale; a missing garage age becomes -1.
    sale_year = frame['YrSold']
    frame['AgeAtSale'] = sale_year - frame['YearBuilt']
    frame['YearsSinceRemodel'] = sale_year - frame['YearRemodAdd']
    frame['GarageAge'] = (sale_year - frame['GarageYrBlt']).fillna(-1)

    # Porch / deck area and presence flags.
    frame['TotalPorchSF'] = (
        frame['OpenPorchSF']
        + frame['EnclosedPorch']
        + frame['3SsnPorch']
        + frame['ScreenPorch']
    )
    frame['HasPorch'] = (frame['TotalPorchSF'] > 0).astype(int)
    frame['HasDeck'] = (frame['WoodDeckSF'] > 0).astype(int)

    # Basement presence and finished / usable area.
    frame['HasBasement'] = (frame['TotalBsmtSF'] > 0).astype(int)
    frame['TotalFinishedBsmtSF'] = frame['BsmtFinSF1'] + frame['BsmtFinSF2']
    frame['TotalUsableBsmtSF'] = frame['TotalFinishedBsmtSF'] + frame['BsmtUnfSF']

    # Garage presence and a cars-times-area capacity proxy (missing product -> 0).
    frame['HasGarage'] = (frame['GarageArea'] > 0).astype(int)
    frame['GarageCapacityValue'] = (frame['GarageCars'] * frame['GarageArea']).fillna(0)


# Apply the same feature set to the training and the hold-out split.
_add_engineered_features(X_train)
_add_engineered_features(X_test)





# X_train.drop(columns=["1stFlrSF", "2ndFlrSF", "TotalBsmtSF", "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath"], inplace=True)
# X_test.drop(columns=["1stFlrSF", "2ndFlrSF", "TotalBsmtSF", "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath"], inplace=True)


# Feature Selection

In [45]:
mlflow.set_experiment("House_Price_Regression_Correlation")

# The run is opened as a context manager so it is always closed, even when the
# correlation or file-writing steps raise; a run left active would make the next
# mlflow.start_run() call fail.
with mlflow.start_run():
    # Correlation of every column with the target (DataFrame.corr's default
    # method), strongest positive correlation first.
    corr_with_target = pd.DataFrame(X_train.corr()['SalePrice'].sort_values(ascending=False))
    print("Top correlations with SalePrice:")
    # print(corr_with_target.head(15))

    # Keep columns whose correlation with SalePrice exceeds 0.5. SalePrice itself
    # (correlation 1.0) is deliberately part of the list so the target travels
    # with the selected features until it is split off later.
    high_corr_features = corr_with_target[corr_with_target['SalePrice'] > 0.5].index.tolist()

    print(f"Selected {len(high_corr_features)} features based on correlation")
    print(high_corr_features)

    # Persist the selected column names as a JSON object ({name: 1}) and attach
    # the file to the MLflow run.
    high_corr_features_dict = dict.fromkeys(high_corr_features, 1)

    with open('my_list.json', "w") as f:
        json.dump(high_corr_features_dict, f)

    mlflow.log_artifact("my_list.json")


# Restrict both splits to the selected columns (same columns, same order).
X_train = X_train[high_corr_features]
X_test = X_test[high_corr_features]


X_train.info()
X_test.info()

Top correlations with SalePrice:
Selected 21 features based on correlation
['SalePrice', 'OverallQual', 'TotalSF', 'GrLivArea', 'GarageCapacityValue', 'ExterQualNum', 'GarageCars', 'GarageArea', 'TotalBath', 'TotalUsableBsmtSF', 'TotalBsmtSF', 'BsmtQualNum', '1stFlrSF', 'ExterScore', 'OverallScore', 'FullBath', 'BsmtQual_num', 'TotRmsAbvGrd', 'YearBuilt', 'KitchenQual_num', 'YearRemodAdd']
🏃 View run gentle-trout-795 at: https://dagshub.com/dshan21/ML_ASS_1.mlflow/#/experiments/3/runs/9ebf5f6e56294ee381115126237132bf
🧪 View experiment at: https://dagshub.com/dshan21/ML_ASS_1.mlflow/#/experiments/3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   SalePrice            1168 non-null   int64  
 1   OverallQual          1168 non-null   int64  
 2   TotalSF              1168 non-null   int64  
 3   GrLivArea            1168 non-nul

# TRAINING

In [46]:
# Detach the SalePrice target from each split; pop() removes the column from the
# feature frame and returns it as a Series.
y_train, y_test = X_train.pop('SalePrice'), X_test.pop('SalePrice')

In [47]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

# Two-step pipeline; both steps are placeholders that the parameter grid replaces.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', Ridge()),
])

# Scaling options tried with every model; None means "no scaling step".
scalers = [StandardScaler(), MinMaxScaler(), None]

# Candidate model families.
regressors = [
    Ridge(random_state=42),
    Lasso(random_state=42),
    ElasticNetCV(cv=5),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    XGBRegressor(random_state=42, verbosity=0),
]

# Shuffled 5-fold cross-validation with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Ridge / Lasso: sweep the regularisation strength.
linear_grid = {
    'scaler': scalers,
    'regressor': [Ridge(random_state=42), Lasso(random_state=42)],
    'regressor__alpha': [0.01, 0.1, 1.0, 10.0],
}

# ElasticNetCV tunes itself internally, so only the scaler varies.
elastic_net_grid = {
    'scaler': scalers,
    'regressor': [ElasticNetCV(cv=5)],
}

# Random forest / gradient boosting with their default hyper-parameters.
tree_ensemble_grid = {
    'scaler': scalers,
    'regressor': [RandomForestRegressor(random_state=42), GradientBoostingRegressor(random_state=42)],
}

# XGBoost: small sweep over size, depth and learning rate.
xgboost_grid = {
    'scaler': scalers,
    'regressor': [XGBRegressor(random_state=42, verbosity=0)],
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [3, 5],
    'regressor__learning_rate': [0.01, 0.1],
}

# 24 + 3 + 6 + 24 = 57 candidate configurations in total.
param_grid = [linear_grid, elastic_net_grid, tree_ensemble_grid, xgboost_grid]


In [48]:
# Exhaustive search over param_grid, scored by negative MSE on the shared K-fold
# splitter; training scores are kept as well and all CPU cores are used.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kfold,
    n_jobs=-1,
    return_train_score=True,
    verbose=2,
)

In [49]:
# Run the full cross-validated search on the training features and target.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 57 candidates, totalling 285 fits


In [50]:
# Report the winning configuration and its mean cross-validation score
# (negative MSE, so values closer to zero are better).
best_cv_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print(f"Best parameters: {best_cv_params}")
print(f"Best cross-validation score: {best_cv_score:.4f}")

Best parameters: {'regressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=200, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...), 'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 200, 'scaler': StandardScaler()}
Best cross-validation score: -849756662.1708


In [51]:
# Tabulate every candidate's cross-validation results, best-ranked first.
results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')

In [52]:
# Keep a handle on the winning pipeline. GridSearchCV was built without a `refit`
# argument, so with its default (refit=True) this is the best candidate refitted
# on the full training data.
best_pipeline = grid_search.best_estimator_

In [53]:
# Show the dtype of every selected feature column (the target was already popped).
print(X_train.dtypes)

OverallQual              int64
TotalSF                  int64
GrLivArea                int64
GarageCapacityValue      int64
ExterQualNum             int64
GarageCars               int64
GarageArea               int64
TotalBath              float64
TotalUsableBsmtSF        int64
TotalBsmtSF              int64
BsmtQualNum              int64
1stFlrSF                 int64
ExterScore               int64
OverallScore             int64
FullBath                 int64
BsmtQual_num             int64
TotRmsAbvGrd             int64
YearBuilt                int64
KitchenQual_num          int64
YearRemodAdd             int64
dtype: object


In [54]:
mlflow.set_experiment("House_Price_Regression")
with mlflow.start_run(run_name="grid_search_house_prices") as active_run:
    run_id = active_run.info.run_id
    print(f"MLflow Run ID: {run_id}")

    # Static description of the search setup.
    run_config = {
        "cv_folds": kfold.n_splits,
        "scoring_metric": "neg_mean_squared_error",
        "random_state": 42,
        "dataset_size": X_train.shape,
    }
    mlflow.log_params(run_config)

    # Run the search inside the tracked run.
    print("Starting grid search...")
    grid_search.fit(X_train, y_train)

    # Cross-validation result of the winner; the scorer is negative MSE, so flip
    # the sign back before taking the square root.
    mlflow.log_param("best_params", grid_search.best_params_)
    best_score = -grid_search.best_score_
    rmse = np.sqrt(best_score)
    mlflow.log_metrics({"best_cv_mse": best_score, "best_cv_rmse": rmse})

    # Evaluate the refitted winner on the hold-out split.
    best_pipeline = grid_search.best_estimator_
    test_preds = best_pipeline.predict(X_test)
    test_mse = mean_squared_error(y_test, test_preds)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, test_preds)
    mlflow.log_metrics({"test_mse": test_mse, "test_rmse": test_rmse, "test_r2": test_r2})

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV RMSE: {rmse:.4f}")
    print(f"Test RMSE: {test_rmse:.4f}")
    print(f"Test R²: {test_r2:.4f}")

    # Store the fitted pipeline and register it as a new model version.
    mlflow.sklearn.log_model(
        best_pipeline,
        "best_house_price_model",
        registered_model_name="HousePriceRegressionModel",
    )

    # Attach the ten best-ranked candidates as a CSV artifact.
    results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
    top_results = results.head(10)
    top_results.to_csv('top_model_results.csv')
    mlflow.log_artifact('top_model_results.csv')


MLflow Run ID: 99ab3c17d097475eadb6409503bdec5b
Starting grid search...
Fitting 5 folds for each of 57 candidates, totalling 285 fits
Best parameters: {'regressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=200, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...), 'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 200, 's

Registered model 'HousePriceRegressionModel' already exists. Creating a new version of this model...
2025/04/09 17:24:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: HousePriceRegressionModel, version 20
Created version '20' of model 'HousePriceRegressionModel'.


🏃 View run grid_search_house_prices at: https://dagshub.com/dshan21/ML_ASS_1.mlflow/#/experiments/0/runs/99ab3c17d097475eadb6409503bdec5b
🧪 View experiment at: https://dagshub.com/dshan21/ML_ASS_1.mlflow/#/experiments/0
