In [1]:
!pip install dagshub mlflow

Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow
  Downloading mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-3.5.2-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.7.1-py3-none-any.whl.metadata (1.4 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.2.3-py3-none-any.whl.metadata (12 kB)
Collecting dagshub-annotation-converter>=0.1.5 (from dagshub)
  Downloading dagshub_annotation_converter-0.1.8-py3-none-any.whl.metadata (2.5 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow)
  Downloading mlflow_skinny-2.21.3-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from ml

In [2]:
import dagshub
dagshub.init(repo_owner='zeliz22', repo_name='ML_House-Pricing', mlflow=True)




Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=e408d233-596c-4b2b-97fd-7703fb4ee1d1&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=41d044a92283a92791d4ba34c34cca5470f59adb7b2d3a42734ea13764024ab9




Output()

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [4]:
df =  pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

In [5]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
col_with_nulls = list(col for col in X_train.columns if X_train[col].isna().sum() >0)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
from typing import Dict, Optional, Union

class DataCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, 
                 numeric_strategy: str = 'mean',
                 categorical_strategy: str = 'most_frequent',
                 numeric_fill_value: Optional[Union[int, float]] = None,
                 categorical_fill_value: Optional[str] = None,
                 drop_threshold: float = 0.8):

        self.numeric_strategy = numeric_strategy
        self.categorical_strategy = categorical_strategy
        self.numeric_fill_value = numeric_fill_value
        self.categorical_fill_value = categorical_fill_value
        self.drop_threshold = drop_threshold
        self.numeric_impute_values_ = {}
        self.categorical_impute_values_ = {}
        self.columns_to_drop_ = []

    def fit(self, X: pd.DataFrame, y=None):
        """Learn imputation values from the data"""
        
        # Identify columns to drop
        null_ratios = X.isnull().mean()
        self.columns_to_drop_ = list(null_ratios[null_ratios > self.drop_threshold].index)
        X_clean = X.drop(columns=self.columns_to_drop_)
        
        # Separate numeric and categorical columns
        numeric_cols = X_clean.select_dtypes(include=np.number).columns
        categorical_cols = X_clean.select_dtypes(exclude=np.number).columns
        
        # Calculate numeric imputation values
        for col in numeric_cols:
            if self.numeric_strategy == 'mean':
                self.numeric_impute_values_[col] = X_clean[col].mean()
            elif self.numeric_strategy == 'median':
                self.numeric_impute_values_[col] = X_clean[col].median()
            elif self.numeric_strategy == 'constant':
                if self.numeric_fill_value is None:
                    raise ValueError("numeric_fill_value must be specified for constant strategy")
                self.numeric_impute_values_[col] = self.numeric_fill_value
            elif self.numeric_strategy != 'drop':
                raise ValueError(f"Unknown numeric strategy: {self.numeric_strategy}")
        
        # Calculate categorical imputation values
        for col in categorical_cols:
            if self.categorical_strategy == 'most_frequent':
                self.categorical_impute_values_[col] = X_clean[col].mode()[0]
            elif self.categorical_strategy == 'constant':
                if self.categorical_fill_value is None:
                    raise ValueError("categorical_fill_value must be specified for constant strategy")
                self.categorical_impute_values_[col] = self.categorical_fill_value
            elif self.categorical_strategy != 'drop':
                raise ValueError(f"Unknown categorical strategy: {self.categorical_strategy}")
        
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the learned imputation to new data"""
        
        # Drop high-null columns
        X_clean = X.drop(columns=self.columns_to_drop_)
        
        # Separate numeric and categorical columns
        numeric_cols = X_clean.select_dtypes(include=np.number).columns
        categorical_cols = X_clean.select_dtypes(exclude=np.number).columns
        
        # Apply numeric imputation
        for col in numeric_cols:
            if col in self.numeric_impute_values_:
                X_clean[col] = X_clean[col].fillna(self.numeric_impute_values_[col])
            elif self.numeric_strategy == 'drop':
                X_clean = X_clean.dropna(subset=[col])
        
        # Apply categorical imputation
        for col in categorical_cols:
            if col in self.categorical_impute_values_:
                X_clean[col] = X_clean[col].fillna(self.categorical_impute_values_[col])
            elif self.categorical_strategy == 'drop':
                X_clean = X_clean.dropna(subset=[col])
        
        return X_clean

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit and transform in one step"""
        return self.fit(X, y).transform(X)

In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

class CustomEncoder:
    def __init__(self, threshold = 3):
        self.threshold = threshold
        
        # Initialize encoders
        self.one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        
        # Store feature names for one-hot encoding
        self.one_hot_feature_names = None
        
    def fit(self, X, y=None):

        cat_cols = [col for col in X.columns if X[col].dtype == 'object']
        s = X[cat_cols].nunique()

        self.ordinal_cols = list(s[s > self.threshold].index)
        self.one_hot_cols = list(s[s <= self.threshold].index)

        if self.one_hot_cols:
            self.one_hot_encoder.fit(X[self.one_hot_cols])
            self.one_hot_feature_names = self.one_hot_encoder.get_feature_names_out(self.one_hot_cols)
        
        if self.ordinal_cols:
            self.ordinal_encoder.fit(X[self.ordinal_cols])
        
        return self
    
    def transform(self, X):

        X_transformed = X.copy()
        
        # Apply One-Hot Encoding
        if self.one_hot_cols:
            one_hot_encoded = self.one_hot_encoder.transform(X[self.one_hot_cols])
            one_hot_df = pd.DataFrame(one_hot_encoded, columns=self.one_hot_feature_names, index=X.index)
            X_transformed = pd.concat([X_transformed, one_hot_df], axis=1)
            X_transformed.drop(self.one_hot_cols, axis=1, inplace=True)
        
        # Apply Ordinal Encoding
        if self.ordinal_cols:
            ordinal_encoded = self.ordinal_encoder.transform(X[self.ordinal_cols])
            ordinal_df = pd.DataFrame(ordinal_encoded, columns=self.ordinal_cols, index=X.index)
            X_transformed[self.ordinal_cols] = ordinal_df
        
        return X_transformed
    
    def fit_transform(self, X, y = None):
        return self.fit(X).transform(X)

In [9]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class CorrelationFeatureDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.features_to_drop = []
        self.high_corr_pairs = []
        
    def fit(self, X, y): 
        X_corr = X.copy()
        X_corr['SalesPrice'] = y
        corr_matrix = X_corr.corr().abs()
        
        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                
                if corr_matrix.iloc[i, j] > self.threshold:
                    self.high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))
                    
        for feat1, feat2, _ in self.high_corr_pairs:
            if abs(X[feat1].corr(y)) < abs(X[feat2].corr(y)):
                self.features_to_drop.append(feat1)
            else:
                self.features_to_drop.append(feat2)
        
        self.features_to_drop = list(set(self.features_to_drop))
        return self

    
    def transform(self, X):
      return X.drop(columns=self.features_to_drop)
        
    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)
    

In [10]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, make_scorer
# 1. Define evaluation metrics
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
def log_rmse(y_true, y_pred):
    return rmse(np.log1p(y_true), np.log1p(y_pred))
# 2. Create custom scorer
log_rmse_scorer = make_scorer(log_rmse, greater_is_better=False)
# 3. Set up K-Fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# 4. Define pipeline
pipeline_linear = Pipeline([
    ('cleaner', DataCleaner(numeric_strategy = 'median')),
    ('encoder', CustomEncoder(threshold=3)),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
# 5. Define parameter grid
param_grid = {
    'scaler': [StandardScaler(), RobustScaler(), MinMaxScaler(), None]
}
# 6. Set up GridSearchCV with K-Fold
grid_search = GridSearchCV(
    estimator=pipeline_linear,
    param_grid=param_grid,
    cv=kfold,  # Using our K-Fold here
    scoring=log_rmse_scorer,
    refit=True,
    verbose=2,
    n_jobs=-1  # Use all available cores
)
# 7. Fit the model
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
# Training set evaluation
train_preds = best_model.predict(X_train)
train_log_rmse = log_rmse(y_train, train_preds)
print(f"\nTraining Log RMSE: {train_log_rmse:.4f}")
# Test set evaluation
test_preds = best_model.predict(X_test)
test_log_rmse = log_rmse(y_test, test_preds)
print(f"\nTest Log RMSE: {test_log_rmse:.4f}")
best_param = grid_search.best_params_
print(f"Best parameters: {best_param}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
           ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_13/1305265050.py", line 12, in log_rmse
  File "/tmp/ipykernel_13/1305265050.py", line 10, in rmse
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_regression.py", line 442, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
                                          ^^^^^^^^^^^^^^^^^^^
  File "/usr/


Training Log RMSE: 0.1492

Test Log RMSE: 0.1859
Best parameters: {'scaler': RobustScaler()}


In [11]:
import mlflow

experiment_name = 'Linear Model'
run_name = 'LinearRegression using median for numericals'

mlflow.set_experiment(experiment_name)

with mlflow.start_run(run_name=run_name):
    mlflow.log_params(best_param)
    mlflow.log_metric("train_log_rmse", train_log_rmse)
    mlflow.log_metric("test_log_rmse", test_log_rmse)
    mlflow.sklearn.log_model(best_model, "linear_model")
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("cv_strategy", "KFold-5")
    mlflow.log_param("param_grid", str(param_grid))



🏃 View run LinearRegression using median for numericals at: https://dagshub.com/zeliz22/ML_House-Pricing.mlflow/#/experiments/4/runs/7bd48323f83c44339aeb8ddedf2e1b25
🧪 View experiment at: https://dagshub.com/zeliz22/ML_House-Pricing.mlflow/#/experiments/4


In [12]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor

# 1. Define evaluation metrics
def log_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))

# 2. Create custom scorer
log_rmse_scorer = make_scorer(log_rmse, greater_is_better=False)

# 3. Set up K-Fold cross-validation
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# 4. Updated pipeline using RandomForest
pipeline_random_forest = Pipeline([
    ('cleaner', DataCleaner()),
    ('encoder', CustomEncoder(threshold=3)),
    ('correlation_dropper', CorrelationFeatureDropper(threshold=0.8)),
    ('feature_selector', RFE(
        estimator=RandomForestRegressor(
            n_estimators=50,
            max_depth=5,
            random_state=42,
            n_jobs=2
        ),
        step=1,
        n_features_to_select=15
    )),
    ('model', RandomForestRegressor(
        random_state=42,
        n_jobs=2
    ))
])

# 5. Corrected parameter grid (using double underscores)
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [5, 10],
    'model__max_features': ['sqrt']
}

# 6. GridSearch setup
grid_search = GridSearchCV(
    estimator=pipeline_random_forest,
    param_grid=param_grid,
    cv=kfold,
    scoring=log_rmse_scorer,
    refit=True,
    verbose=3,
    n_jobs=2
)

print("Starting optimized grid search...")
grid_search.fit(X_train, y_train)
print("Grid search completed!")

# 7. Results output (fixed attribute names)
best_model = grid_search.best_estimator_
train_preds = best_model.predict(X_train)
train_log_rmse = log_rmse(y_train, train_preds)
print(f"\nTraining Log RMSE: {train_log_rmse:.4f}")

test_preds = best_model.predict(X_test)
test_log_rmse = log_rmse(y_test, test_preds)
print(f"Test Log RMSE: {test_log_rmse:.4f}")
print(f"Best parameters: {grid_search.best_params_}")

Starting optimized grid search...
[CV] END ............................scaler=StandardScaler(); total time=   0.3s
[CV] END ............................scaler=StandardScaler(); total time=   0.2s
[CV] END ..............................scaler=RobustScaler(); total time=   0.2s
[CV] END ..............................scaler=MinMaxScaler(); total time=   0.3s
[CV] END ........................................scaler=None; total time=   0.3s
[CV] END ............................scaler=StandardScaler(); total time=   0.2s
[CV] END ..............................scaler=RobustScaler(); total time=   0.2s
[CV] END ..............................scaler=RobustScaler(); total time=   0.2s
[CV] END ..............................scaler=MinMaxScaler(); total time=   0.3s
[CV] END ........................................scaler=None; total time=   0.3s
[CV] END ............................scaler=StandardScaler(); total time=   0.3s
[CV] END ..............................scaler=RobustScaler(); total time=  

In [13]:
import mlflow

experiment_name = 'Random Forest Regressor Model'
run_name = 'RandomForestRegressor with both RFE and correlation_dropper'

mlflow.set_experiment(experiment_name)

with mlflow.start_run(run_name=run_name):
    mlflow.log_params(best_param)
    mlflow.log_metric("train_log_rmse", train_log_rmse)
    mlflow.log_metric("test_log_rmse", test_log_rmse)
    mlflow.sklearn.log_model(best_model, "linear_model")
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("cv_strategy", "KFold-5")
    mlflow.log_param("param_grid", str(param_grid))



🏃 View run RandomForestRegressor with both RFE and correlation_dropper at: https://dagshub.com/zeliz22/ML_House-Pricing.mlflow/#/experiments/5/runs/c387ed3f922f4b5c85ffd68f987e570d
🧪 View experiment at: https://dagshub.com/zeliz22/ML_House-Pricing.mlflow/#/experiments/5


In [14]:
import numpy as np
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, make_scorer

# 1. Define evaluation metrics
def log_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))

# 2. Create custom scorer
log_rmse_scorer = make_scorer(log_rmse, greater_is_better=False)

# 3. Set up K-Fold cross-validation (reduced to 3 folds)
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# 4. Simplified pipeline
pipeline_XGB = Pipeline([
    ('cleaner', DataCleaner()),
    ('encoder', CustomEncoder(threshold=3)),
    ('correlation_dropper', CorrelationFeatureDropper(threshold=0.8)),
    ('feature_selector', RFE(
        estimator=xgb.XGBRegressor(
            n_estimators=30,  # Reduced from 50
            max_depth=3,     # Reduced depth
            random_state=42,
            n_jobs=1
        ),
        step=1,
        n_features_to_select=15  # Fixed value for initial run
    )),
    ('model', xgb.XGBRegressor(
        random_state=42,
        n_jobs=2,  # Reduced parallelization
        objective='reg:squarederror'
    ))
])

# 5. Corrected parameter grid (using proper double underscores)
param_grid = {
    'model__n_estimators': [100],      # Fixed: model__n_estimators
    'model__learning_rate': [0.05],    # Fixed: model__learning_rate
    'model__max_depth': [5],           # Fixed: model__max_depth
    'model__subsample': [0.8]          # Fixed: model__subsample
}

# 6. Faster GridSearch configuration
grid_search = GridSearchCV(
    estimator=pipeline_XGB,
    param_grid=param_grid,
    cv=kfold,
    scoring=log_rmse_scorer,
    refit=True,
    verbose=3,  # More detailed progress
    n_jobs=2    # Further reduced parallelization
)

print("Starting optimized grid search...")
grid_search.fit(X_train, y_train)
print("Grid search completed!")

# 7. Results output (fixed attribute names)
best_model = grid_search.best_estimator_  # Fixed: best_estimator_
train_preds = best_model.predict(X_train)
train_log_rmse = log_rmse(y_train, train_preds)
print(f"\nTraining Log RMSE: {train_log_rmse:.4f}")

test_preds = best_model.predict(X_test)
test_log_rmse = log_rmse(y_test, test_preds)
print(f"Test Log RMSE: {test_log_rmse:.4f}")
print(f"Best parameters: {grid_search.best_params_}")  # Fixed: best_params_

Starting optimized grid search...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Grid search completed!

Training Log RMSE: 0.0954
Test Log RMSE: 0.1478
Best parameters: {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__n_estimators': 100, 'model__subsample': 0.8}


In [15]:
import mlflow

experiment_name = 'XGBRegressor Model'
run_name = 'XGBRegressor with both: RFE and correlation_dropper'

mlflow.set_experiment(experiment_name)

with mlflow.start_run(run_name=run_name):
    mlflow.log_params(best_param)
    mlflow.log_metric("train_log_rmse", train_log_rmse)
    mlflow.log_metric("test_log_rmse", test_log_rmse)
    mlflow.sklearn.log_model(best_model, "linear_model")
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("cv_strategy", "KFold-5")
    mlflow.log_param("param_grid", str(param_grid))



🏃 View run XGBRegressor with both: RFE and correlation_dropper at: https://dagshub.com/zeliz22/ML_House-Pricing.mlflow/#/experiments/6/runs/3020c68a52a74779a84fbf5c044d8a7c
🧪 View experiment at: https://dagshub.com/zeliz22/ML_House-Pricing.mlflow/#/experiments/6
