In [2]:
import boto3
import pandas as pd
import os
import io
import sys

# Configuration Access Keys
current_dir = os.getcwd()
project_root_dir = os.path.dirname(current_dir)
sys.path.insert(0, project_root_dir)

from steps.s3_ingest_data import *
from steps.clean_data import *
from config.access_keys import *
import pandas as pd
import logging
if __name__ == "__main__":
    logging.info("Starting S3CSVReader...")
    reader = S3CSVReader(bucket_name=S3_BUCKET_NAME, region_name=AWS_REGION, 
                         aws_access_key_id=S3_AWS_ACCESS_KEY_ID, aws_secret_access_key=S3_AWS_SECRET_ACCESS_KEY)
    # Read the CSV file from S3
    df = reader.read_csv(s3_key=S3_KEY, encoding='utf-8')
    
    # Clean, transform, and split the data.
    processed_df = clean_data(df)
    X_train, X_val, y_train, y_val = split_data(processed_df)

2025-08-12 18:06:16,198 - root - INFO - Starting S3CSVReader...
2025-08-12 18:06:16,199 - steps.s3_ingest_data - INFO - Initializing S3 client with explicit AWS access keys.
2025-08-12 18:06:16,529 - steps.s3_ingest_data - INFO - S3CSVReader initialized for bucket 'housepred-data' in region 'us-east-1'.
2025-08-12 18:06:16,529 - steps.s3_ingest_data - INFO - Attempting to read 'House_Price_Prediction_Dataset.csv' from bucket 'housepred-data' with encoding 'utf-8'.
2025-08-12 18:06:16,738 - steps.s3_ingest_data - INFO - Successfully fetched object 'House_Price_Prediction_Dataset.csv'.
2025-08-12 18:06:16,765 - steps.s3_ingest_data - INFO - Successfully loaded 'House_Price_Prediction_Dataset.csv' into DataFrame. Shape: (2000, 10)
2025-08-12 18:06:16,765 - root - INFO - Starting data cleaning process...
2025-08-12 18:06:16,771 - root - INFO - Data preprocessing completed successfully.
2025-08-12 18:06:16,771 - root - INFO - Dividing data into training and validation sets...
2025-08-12 18:

In [3]:
# Import necessary libraries
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll import scope

  import pkg_resources


In [6]:
# B. Define the objective function to be minimized
# Hyperopt minimizes the objective function, so we return the negative of the score
# (e.g., negative R-squared or negative mean squared error)


def objective(params):
    """
    Objective function to minimize. It trains a RandomForestRegressor with the
    given hyperparameters and returns the negative mean cross-validated R-squared score.
    """
    print(f"Evaluating with parameters: {params}")

    # Initialize the model with the given hyperparameters
    regressor = RandomForestRegressor(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_leaf=int(params['min_samples_leaf']),
        max_features=params['max_features'],
        random_state=42,
        verbose=1,
        n_jobs=-1  # Use all available cores
    )

    # Use 5-fold cross-validation to evaluate the model
    # We use 'r2' as the scoring metric. A higher R-squared is better, so we
    # negate it for hyperopt to minimize.
    score = cross_val_score(regressor, X_train, y_train, cv=5, scoring='r2').mean()

    # Hyperopt minimizes the value, so we return the negative R-squared score
    return {'loss': -score, 'status': STATUS_OK}


In [7]:
# C. Define the search space for the hyperparameters
# 'hp' functions are used to define the different types of distributions
# for hyperopt to sample from during the search.
space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 500, 10)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 25, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', 0.8, 1.0])
}

# D. Run the Bayesian optimization
# `fmin` is the main function for running the search.
# `tpe.suggest` is the tree-structured Parzen estimator algorithm for Bayesian search.
# `max_evals` is the number of different hyperparameter combinations to try.

import mlflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
mlflow.set_experiment(experiment_name="hyper_opt_test")
mlflow.autolog(silent=True)
with mlflow.start_run():
    trials = Trials()
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=2,  # Try 50 different combinations
        trials=trials,
        rstate=np.random.default_rng(42),  # For reproducibility
        verbose=True,  # Print progress,
        show_progressbar=True  # Show progress bar,
    )

    # E. Print the results
    print("\n--- Optimization Complete ---")
    print("Best hyperparameters found:")
    # Hyperopt returns indices for `hp.choice`, so we map them back to the original values
    best_params = {
        'n_estimators': int(best['n_estimators']),
        'max_depth': int(best['max_depth']),
        'min_samples_leaf': int(best['min_samples_leaf']),
        'max_features': ['sqrt', 'log2', 0.8, 1.0][best['max_features']]
    }
    print(best_params)

    # Get the best score from the trials object
    best_score = -trials.best_trial['result']['loss']
    print(f"Best R-squared score: {best_score:.4f}")

  0%|          | 0/2 [00:00<?, ?trial/s, best loss=?]

2025-08-12 18:07:22,006 - hyperopt.tpe - INFO - build_posterior_wrapper took 0.000546 seconds
2025-08-12 18:07:22,006 - hyperopt.tpe - INFO - TPE using 0 trials


Evaluating with parameters: {'max_depth': 18, 'max_features': 'log2', 'min_samples_leaf': 7, 'n_estimators': 420}
  0%|          | 0/2 [00:00<?, ?trial/s, best loss=?]

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.

[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    0.0s

[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:    0.1s

[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed:    0.2s finished

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.

[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s

[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.0s

[Parallel(n_jobs=10)]: Done 420 out of 420 | elapsed:    0.1s finished

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.

[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s

[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.0s

[Parallel(n_jobs=10)]: Done 420 out of 420 | elapsed:    0.1s finished

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.

[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s

[Parallel(n

 50%|█████     | 1/2 [00:21<00:21, 21.45s/trial, best loss: 0.020886635018933265]

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.

[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s

[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.0s

[Parallel(n_jobs=10)]: Done 420 out of 420 | elapsed:    0.0s finished

2025-08-12 18:07:43,453 - hyperopt.tpe - INFO - build_posterior_wrapper took 0.000632 seconds
2025-08-12 18:07:43,453 - hyperopt.tpe - INFO - TPE using 1/1 trials with best loss 0.020887


Evaluating with parameters: {'max_depth': 6, 'max_features': 1.0, 'min_samples_leaf': 6, 'n_estimators': 190}
 50%|█████     | 1/2 [00:21<00:21, 21.45s/trial, best loss: 0.020886635018933265]

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.

[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    0.0s

[Parallel(n_jobs=-1)]: Done 190 out of 190 | elapsed:    0.1s finished

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.

[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s

[Parallel(n_jobs=10)]: Done 190 out of 190 | elapsed:    0.0s finished

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.

[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s

[Parallel(n_jobs=10)]: Done 190 out of 190 | elapsed:    0.0s finished

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.

[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s

[Parallel(n_jobs=10)]: Done 190 out of 190 | elapsed:    0.0s finished



 50%|█████     | 1/2 [01:56<01:56, 116.96s/trial, best loss: 0.020886635018933265]
🏃 View run resilient-rat-478 at: http://127.0.0.1:8080/#/experiments/311994977521697831/runs/e905cb3ad6e044eabbf486fe0f3f9901
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/311994977521697831


KeyboardInterrupt: 