Настройка окружения и поднятие Tracking Server и Model Registry

In [2]:
import os
import mlflow
from dotenv import load_dotenv

load_dotenv()

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

Взятие данных, на которых обучалась самая первая модель

In [3]:
import psycopg
import pandas as pd

TABLE_NAME = "clean_flats_full_info"

connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD")
}
assert all([var_value != "" for var_value in list(postgres_credentials.values())])

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:
    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

initial_df = pd.DataFrame(data, columns=columns)
initial_df.head(2) 
initial_df.shape

(111438, 20)

In [4]:
y = initial_df['target']
x = initial_df.drop(['id', 'price', 'building_id', 'target'], axis=1)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

Обучение модели

In [6]:

from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from category_encoders import CatBoostEncoder
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor


binary_cat_features = X_train[['studio', 'is_apartment', 'has_elevator']]
other_cat_features = X_train[['building_type_int']]
num_features = X_train.select_dtypes(['float'])

preprocessor = ColumnTransformer(
    [
    ('binary', OneHotEncoder(drop='if_binary'), binary_cat_features.columns.tolist()),
    ('cat', CatBoostEncoder(return_df=False), other_cat_features.columns.tolist()),
    ('num', StandardScaler(), num_features.columns.tolist())
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

model = CatBoostRegressor(loss_function='RMSE')

pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model)
    ],
    verbose=False
)


pipeline.fit(x, y) 


Learning rate set to 0.086221
0:	learn: 4644429.3051424	total: 191ms	remaining: 3m 10s
1:	learn: 4448730.1355351	total: 255ms	remaining: 2m 7s
2:	learn: 4272980.3219782	total: 283ms	remaining: 1m 34s
3:	learn: 4109636.8160604	total: 307ms	remaining: 1m 16s
4:	learn: 3972864.5250887	total: 346ms	remaining: 1m 8s
5:	learn: 3854783.9347811	total: 378ms	remaining: 1m 2s
6:	learn: 3747565.2201052	total: 414ms	remaining: 58.8s
7:	learn: 3646661.9868296	total: 444ms	remaining: 55.1s
8:	learn: 3560514.9210432	total: 474ms	remaining: 52.2s
9:	learn: 3488839.7407046	total: 498ms	remaining: 49.3s
10:	learn: 3425832.2952293	total: 527ms	remaining: 47.4s
11:	learn: 3364669.5221480	total: 569ms	remaining: 46.9s
12:	learn: 3309670.3324935	total: 623ms	remaining: 47.3s
13:	learn: 3251613.2057629	total: 663ms	remaining: 46.7s
14:	learn: 3202077.0079585	total: 683ms	remaining: 44.8s
15:	learn: 3165052.5470155	total: 711ms	remaining: 43.7s
16:	learn: 3125473.6823777	total: 748ms	remaining: 43.2s
17:	lear

0.02120854295068797

Оценка модели и вычисление метрик

In [7]:
from sklearn.model_selection import StratifiedKFold, cross_validate

cv_strategy = StratifiedKFold(n_splits=5)
cv_res = cross_validate(
        model,
        x,
        y,
        cv=cv_strategy,
        n_jobs=-1,
        scoring=['neg_root_mean_squared_error', 'r2']
        )
for key, value in cv_res.items():
    cv_res[key] = round(value.mean(), 3)

print(cv_res)



Learning rate set to 0.083234
Learning rate set to 0.083234
0:	learn: 4657560.2126359	total: 85.3ms	remaining: 1m 25s
0:	learn: 4652829.2629840	total: 89.3ms	remaining: 1m 29s
1:	learn: 4472315.3713683	total: 118ms	remaining: 58.8s
1:	learn: 4450870.8622106	total: 122ms	remaining: 1m
2:	learn: 4293046.4568701	total: 162ms	remaining: 53.8s
2:	learn: 4278507.9520468	total: 164ms	remaining: 54.5s
3:	learn: 4147442.4400676	total: 212ms	remaining: 52.7s
3:	learn: 4127833.6002510	total: 216ms	remaining: 53.8s
4:	learn: 4017907.0079418	total: 253ms	remaining: 50.4s
4:	learn: 3989190.8136114	total: 278ms	remaining: 55.3s
5:	learn: 3895043.6361427	total: 303ms	remaining: 50.2s
5:	learn: 3865162.9334941	total: 314ms	remaining: 52s
6:	learn: 3781375.0128851	total: 339ms	remaining: 48.1s
6:	learn: 3757834.2428921	total: 354ms	remaining: 50.3s
7:	learn: 3690847.9700259	total: 372ms	remaining: 46.1s
7:	learn: 3662020.2407738	total: 389ms	remaining: 48.2s
8:	learn: 3608567.4549235	total: 414ms	remain

In [8]:
# посчитаем r2 метрику на тестовых данных

test_metric = model.score(X_test, y_test) 
test_metric

0.02120854295068797

Сохраним метрики в словарь

In [9]:
metrics = {}

metrics['cv_neg_root_mean_squared_error'] = cv_res['test_neg_root_mean_squared_error']
metrics['cv_r2'] = cv_res['test_r2']
metrics['test_r2'] = test_metric

print(metrics)

{'cv_neg_root_mean_squared_error': -2444091.78, 'cv_r2': 0.748, 'test_r2': 0.02120854295068797}


Создадим новый эксперимент, залогируем модель и метрики

In [10]:
import mlflow

EXPERIMENT_NAME = 'experiment_project_sprint_2_v2' 
RUN_NAME = "basiline_model_registry_0"
REGISTRY_MODEL_NAME = "baseline_flat_model"

pip_requirements = "../requirements.txt"
signature = mlflow.models.infer_signature(x, y)
input_example = x[:10]

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

if not experiment:
    experiment = mlflow.create_experiment(EXPERIMENT_NAME)

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    model_info = mlflow.catboost.log_model( 
            await_registration_for=60,
			cb_model=model,
            artifact_path="models",
            registered_model_name=REGISTRY_MODEL_NAME,
            signature=signature,
            input_example=input_example,
            pip_requirements=pip_requirements
		)

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'baseline_flat_model' already exists. Creating a new version of this model...
2024/08/25 18:33:05 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: baseline_flat_model, version 2
Created version '2' of model 'baseline_flat_model'.


In [11]:
with mlflow.start_run(run_id=run_id, experiment_id=experiment_id) as run:
    mlflow.log_metrics(metrics)