In [7]:
import os
import numpy as np
import pandas as pd

import mlflow
from mlflow.models import infer_signature

from datetime import datetime, timedelta
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

In [2]:
os.getenv("MLFLOW_TRACKING_URI")

'http://mlflow-service:5000'

In [3]:
# Получим датасет California housing
housing = datasets.fetch_california_housing(as_frame=True)
# Объединим фичи и таргет в один np.array
data = pd.concat([housing["data"], pd.DataFrame(housing["target"])], axis=1)

In [4]:
FEATURES = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup",
    "Latitude", "Longitude"
]
TARGET = "MedHouseVal"

In [5]:
# Сделать препроцессинг
# Разделить на фичи и таргет
X, y = data[FEATURES], data[TARGET]

# Разделить данные на обучение и тест
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Обучить стандартизатор на train
scaler = StandardScaler()
X_train_fitted = scaler.fit_transform(X_train)
X_test_fitted = scaler.transform(X_test)

# Обучить стандартизатор на train
scaler = StandardScaler()
X_train_fitted = scaler.fit_transform(X_train)
X_test_fitted = scaler.transform(X_test)

In [6]:
name = "MedHouseExp_2"
experiment_id = mlflow.create_experiment(name)
mlflow.set_experiment(experiment_id)

2025/04/16 12:13:55 INFO mlflow.tracking.fluent: Experiment with name '134601518219341040' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlops-webinar-example/mlflow/499020991183303194', creation_time=1744805635163, experiment_id='499020991183303194', last_update_time=1744805635163, lifecycle_stage='active', name='134601518219341040', tags={}>

In [8]:
models = dict(zip(["RandomForest", "LinearRegression", "HistGB"], 
                  [RandomForestRegressor(), LinearRegression(), HistGradientBoostingRegressor()]))

In [9]:
def train_model(model, name, X_train, X_test, y_train, y_test):

    # Обучить модель
    model.fit(X_train, y_train)

    # Сделать predict
    prediction = model.predict(X_test)

    # Получить описание данных
    signature = infer_signature(X_test, prediction)
    # Сохранить модель в артифактори
    model_info = mlflow.sklearn.log_model(model, name, signature=signature)
    # Сохранить метрики модели
    mlflow.evaluate(
        model_info.model_uri,
        data=X_test,
        targets=y_test.values,
        model_type="regressor",
        evaluators=["default"],
    )

In [13]:
with mlflow.start_run(run_name="Parent_Run", experiment_id=experiment_id, description="Example description") as parent_run:
    for model in models.keys():
        with mlflow.start_run(run_name=model, experiment_id=experiment_id, nested=True) as child_run:
            train_model(models[model], model, X_train_fitted, X_test_fitted, y_train, y_test)



Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2025/04/16 12:29:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest at: http://mlflow-service:5000/#/experiments/134601518219341040/runs/cd844fc8f0ad4abc9d286b59b8db94a1.
2025/04/16 12:29:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/134601518219341040.
2025/04/16 12:29:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run Parent_Run at: http://mlflow-service:5000/#/experiments/134601518219341040/runs/4c0662b5f2a547aeb09008769f162f0f.
2025/04/16 12:29:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow-service:5000/#/experiments/134601518219341040.


OSError: [Errno 28] No space left on device: '/tmp/tmpg2ygki9o'