In [2]:
import awswrangler as wr
import numpy as np
import mlflow

%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


# Hyperparams tuning


In [3]:
mlflow_server = "http://localhost:5000"

mlflow.set_tracking_uri(mlflow_server)

In [4]:
X_train = wr.s3.read_csv("s3://data/final/train/bigmart_X_train.csv")
y_train = wr.s3.read_csv("s3://data/final/train/bigmart_y_train.csv")

X_test = wr.s3.read_csv("s3://data/final/test/bigmart_X_test.csv")
y_test = wr.s3.read_csv("s3://data/final/test/bigmart_y_test.csv")

In [5]:
from plots import plot_correlation_with_target

target_col = y_train.columns[0]
correlation_plot = plot_correlation_with_target(X_train, y_train, target_col=target_col)

In [6]:
import datetime
import optuna

from mlflow.models import infer_signature
from mlflow_aux import get_or_create_experiment

from sklearn.metrics import mean_squared_error, r2_score

optuna.logging.set_verbosity(optuna.logging.ERROR)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score


def objective(trial, X, y):
    model_type = trial.suggest_categorical(
        "model_type", ["Linear", "Ridge", "Lasso", "ElasticNet"]
    )

    if model_type == "Linear":
        model = LinearRegression()
    elif model_type == "Ridge":
        alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
        model = Ridge(alpha=alpha)
    elif model_type == "Lasso":
        alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
        model = Lasso(alpha=alpha)
    else:
        alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
        l1_ratio = trial.suggest_uniform("l1_ratio", 0, 1)
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

    scores = cross_val_score(
        model, X, y.values.ravel(), cv=5, scoring="neg_mean_squared_error"
    )
    rmse = np.sqrt(-scores.mean())
    return rmse

In [8]:
experiment_id = get_or_create_experiment("BigMart Sales")
print(f"Experiment ID: {experiment_id}")


run_name_parent = "best_linear_model_" + datetime.datetime.now().strftime(
    "%Y%m%d-%H%M%S"
)

Experiment ID: 1


In [9]:
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent):
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=50)

    best_params = study.best_params
    mlflow.log_params(best_params)
    mlflow.log_metric("best_rmse", study.best_value)

    if best_params.get("model_type") == "Linear":
        best_model = LinearRegression()
    elif best_params.get("model_type") == "Ridge":
        best_model = Ridge(alpha=best_params["alpha"])
    elif best_params.get("model_type") == "Lasso":
        best_model = Lasso(alpha=best_params["alpha"])
    else:
        best_model = ElasticNet(
            alpha=best_params["alpha"], l1_ratio=best_params["l1_ratio"]
        )

    best_model.fit(X_train, y_train.values.ravel())

    y_pred = best_model.predict(X_test)
    test_rmse = mean_squared_error(y_test, y_pred, squared=False)
    test_r2 = r2_score(y_test, y_pred)
    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("test_r2", test_r2)

    mlflow.log_figure(correlation_plot, "correlation_plot.png")

    artifact_path = "model"
    signature = infer_signature(X_train, best_model.predict(X_train))
    mlflow.sklearn.log_model(
        best_model,
        artifact_path,
        signature=signature,
        registered_model_name="bigmart_sales_model_dev",
    )

    model_uri = mlflow.get_artifact_uri(artifact_path)

  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  l1_ratio = trial.suggest_uniform("l1_ratio", 0, 1)
  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  l1_ratio = trial.suggest_uniform("l1_ratio", 0, 1)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  l1_ratio = trial.suggest_uniform("l1_ratio", 0, 1)
  alpha = trial.suggest_loguniform("alpha", 1e-5, 1)
  l1_ratio = trial.suggest_uniform("l1_ratio", 0, 1)
  alpha = trial.suggest_loguniform("alpha",

## Model testing


In [10]:
loaded_model = mlflow.sklearn.load_model(model_uri)

Downloading artifacts: 100%|██████████| 9/9 [00:00<00:00, 1410.43it/s]


In [11]:
y_pred = loaded_model.predict(X_test)
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {test_rmse}")
print(f"Test R^2: {test_r2}")

sample_data = X_test.iloc[0:1, :]
sample_prediction = loaded_model.predict(sample_data)
print(f"Sample Prediction: {sample_prediction}")
print(sample_data)

Test RMSE: 1155.4416116643067
Test R^2: 0.3976157403072126
Sample Prediction: [1168.06928302]
   Item_Weight  Item_Visibility  Item_MRP  Outlet_Establishment_Year  \
0     -1.43105         -1.08861         1                       2004   

   Outlet_Size  Outlet_Location_Type  Outlet_Type_Supermarket Type1  \
0            0                     1                           True   

   Outlet_Type_Supermarket Type2  
0                          False  




In [12]:
sample_data

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2
0,-1.43105,-1.08861,1,2004,0,1,True,False


## Model registering


In [13]:
from mlflow import MlflowClient

client = MlflowClient()
name = "bigmart_sales_model_prod"
desc = "This regressor predicts sales for BigMart products"

client.create_registered_model(name=name, description=desc)

tags = best_model.get_params()
tags["model"] = type(best_model).__name__
tags["rmse"] = test_rmse

result = client.create_model_version(
    name=name, source=model_uri, run_id=model_uri.split("/")[-3], tags=tags
)


client.set_registered_model_alias(name, "champion", result.version)

2024/06/22 00:26:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: bigmart_sales_model_prod, version 1
