In [1]:
import boto3
import pandas as pd
import os
import io
import sys

# Configuration Access Keys
current_dir = os.getcwd()
project_root_dir = os.path.dirname(current_dir)
sys.path.insert(0, project_root_dir)

from steps.s3_ingest_data import *
from steps.clean_data import *
from config.access_keys import *
import pandas as pd
import logging
if __name__ == "__main__":
    logging.info("Starting S3CSVReader...")
    reader = S3CSVReader(bucket_name=S3_BUCKET_NAME, region_name=AWS_REGION, 
                         aws_access_key_id=S3_AWS_ACCESS_KEY_ID, aws_secret_access_key=S3_AWS_SECRET_ACCESS_KEY)
    # Read the CSV file from S3
    df = reader.read_csv(s3_key=S3_KEY, encoding='utf-8')
    
    # Clean, transform, and split the data.
    processed_df = clean_data(df)
    X_train, X_val, y_train, y_val = split_data(processed_df)

2025-08-07 13:42:35,253 - root - INFO - Starting S3CSVReader...
2025-08-07 13:42:35,254 - steps.s3_ingest_data - INFO - Initializing S3 client with explicit AWS access keys.
2025-08-07 13:42:35,620 - steps.s3_ingest_data - INFO - S3CSVReader initialized for bucket 'housepred-data' in region 'us-east-1'.
2025-08-07 13:42:35,621 - steps.s3_ingest_data - INFO - Attempting to read 'House_Price_Prediction_Dataset.csv' from bucket 'housepred-data' with encoding 'utf-8'.
2025-08-07 13:42:35,933 - steps.s3_ingest_data - INFO - Successfully fetched object 'House_Price_Prediction_Dataset.csv'.
2025-08-07 13:42:35,964 - steps.s3_ingest_data - INFO - Successfully loaded 'House_Price_Prediction_Dataset.csv' into DataFrame. Shape: (2000, 10)
2025-08-07 13:42:35,964 - root - INFO - Starting data cleaning process...
2025-08-07 13:42:35,972 - root - INFO - Data preprocessing completed successfully.
2025-08-07 13:42:35,973 - root - INFO - Dividing data into training and validation sets...
2025-08-07 13:

In [None]:
# Build Linear Regression and Track with MLflow
import mlflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
mlflow.set_experiment(experiment_name="test_experiment")
mlflow.autolog()
with mlflow.start_run():
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    # Initialize the Linear Regression model
    model = LinearRegression()

    # Train the model
    model.fit(X_train, y_train)

    # Model params
    hpyerparameters = model.get_params()

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # Evaluate the model
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

2025/08/07 13:42:48 INFO mlflow.tracking.fluent: Experiment with name 'test_experiment' does not exist. Creating a new experiment.
2025/08/07 13:42:48 INFO mlflow.bedrock: Enabled auto-tracing for Bedrock. Note that MLflow can only trace boto3 service clients that are created after this call. If you have already created one, please recreate the client by calling `boto3.client`.
2025/08/07 13:42:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for boto3.
2025/08/07 13:42:57 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Mean Squared Error: 78632086666.03287
R-squared: -0.010710420220792383
🏃 View run smiling-shrimp-37 at: http://127.0.0.1:8080/#/experiments/126250616536704321/runs/52732a67fa63444f907388bb6066a195
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/126250616536704321


In [4]:
# Build Random Forest Regressor and Track with MLflow
from sklearn.ensemble import RandomForestRegressor

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
mlflow.set_experiment(experiment_name="test_experiment")
mlflow.autolog()
with mlflow.start_run():

    # Initialize the Random Forest Regressor model
    model = RandomForestRegressor()

    # Train the model
    model.fit(X_train, y_train)

    # Model params
    hpyerparameters = model.get_params()

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

2025/08/07 13:43:31 INFO mlflow.bedrock: Enabled auto-tracing for Bedrock. Note that MLflow can only trace boto3 service clients that are created after this call. If you have already created one, please recreate the client by calling `boto3.client`.
2025/08/07 13:43:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for boto3.
2025/08/07 13:43:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


🏃 View run likeable-smelt-849 at: http://127.0.0.1:8080/#/experiments/126250616536704321/runs/0a91a847b42d451db806491e6f171389
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/126250616536704321


In [5]:
# Build Gradient Boosting Regressor and Track with MLflow
from sklearn.ensemble import GradientBoostingRegressor

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
mlflow.set_experiment(experiment_name="test_experiment")
mlflow.autolog()
with mlflow.start_run():
    # Initialize the Gradient Boosting Regressor model
    model = GradientBoostingRegressor()

    # Train the model
    model.fit(X_train, y_train)

    # Model params
    hpyerparameters = model.get_params()

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

2025/08/07 13:43:37 INFO mlflow.bedrock: Enabled auto-tracing for Bedrock. Note that MLflow can only trace boto3 service clients that are created after this call. If you have already created one, please recreate the client by calling `boto3.client`.
2025/08/07 13:43:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for boto3.
2025/08/07 13:43:38 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


🏃 View run adventurous-colt-292 at: http://127.0.0.1:8080/#/experiments/126250616536704321/runs/c0384a9ae02548ed85920623067352df
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/126250616536704321
