In [11]:
%%time
import json
import os
from collections import defaultdict
from statistics import median

import joblib
import mlflow
import optuna
import psycopg
import yaml
from catboost import CatBoostClassifier
from numpy import array
from optuna.integration import MLflowCallback
from sklearn.metrics import confusion_matrix, f1_score, log_loss, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

from my_helpers import print_types, transform_data
from pgsqlconn import fetch_data

# Still disabled: not referenced by the cells visible in this notebook.
#import pandas as pd
#import numpy as np
#from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
#from sklearn.metrics import roc_curve

#!pip install optuna==3.4.0
#!pip install mlflow==2.7.1

#!pip show scikit-learn # Version: 1.3.1
#!pip install scikit-learn==1.4.0
import sklearn
print('sklearn version:', sklearn.__version__)

sklearn version: 1.4.0
CPU times: user 141 µs, sys: 32 µs, total: 173 µs
Wall time: 164 µs


In [15]:
def load_data(path, loader):
    """Open the text file at ``path`` and return ``loader(file)``.

    Args:
        path: Path of the file to read.
        loader: Callable that accepts the open text-mode file object and
            returns the parsed content (e.g. ``yaml.safe_load``, ``json.load``).

    Returns:
        Whatever ``loader`` returns for the opened file.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # machine's locale when the file contains non-ASCII text.
    with open(path, 'r', encoding='utf-8') as file:
        return loader(file)

# Locations of the stored parameters, model and results inside the project folder.
project_path = './mle-project-sprint-2-v001/mlflow_server'
path_to_params = f'{project_path}/models/params.yaml'
path_to_model = f'{project_path}/models/fitted_model.pkl'
path_to_results = f'{project_path}/results/cv_res.json'

# YAML parameters, joblib-serialized model object and JSON results, in that order.
params = load_data(path_to_params, yaml.safe_load)
model = joblib.load(path_to_model)
results = load_data(path_to_results, json.load)

# Sanity check: the size of each loaded object, shown as the cell output.
tuple(map(len, (params, model, results)))

(9, 2, 6)

In [16]:
def get_global():
    """Print the notebook-level configuration constants, one per line.

    Reads the module globals ``TABLE_NAME``, ``EXPERIMENT_NAME``, ``RUN_NAME``
    and ``REGISTRY_MODEL_NAME`` and writes each as ``NAME: value`` to stdout.
    """
    print('TABLE_NAME:', TABLE_NAME)
    print('EXPERIMENT_NAME:', EXPERIMENT_NAME)
    print('RUN_NAME:', RUN_NAME)
    print('REGISTRY_MODEL_NAME:', REGISTRY_MODEL_NAME)

# Notebook-wide settings, listed in the order get_global() prints them.
TABLE_NAME = 'clean_flats'                                    # table passed to fetch_data
EXPERIMENT_NAME = 'Спринт 3/9: 2 спринт → Тема 5/5: Проект'   # MLflow experiment name
RUN_NAME = 'ETL'                                              # name given to the MLflow run
REGISTRY_MODEL_NAME = 'model_sprint_2'                        # name used when registering the model

# Echo the settings so they are visible in the cell output.
get_global()

TABLE_NAME: clean_flats
EXPERIMENT_NAME: Спринт 3/9: 2 спринт → Тема 5/5: Проект
RUN_NAME: ETL
REGISTRY_MODEL_NAME: model_sprint_2


In [17]:
# Load the table named by TABLE_NAME through the project helper `fetch_data`.
df = fetch_data(TABLE_NAME)
# The extra transformation step is currently switched off.
#df = transform_data(df)
# Summarise the loaded columns grouped by dtype (see the printed output below).
print_types(df, TABLE_NAME)

Data types of clean_flats:
  int64 (12):
    - id, floor, is_apartment, rooms, price
    - building_id, build_year, building_type_int, flats_count, floors_total
    - has_elevator, flat_id
  float64 (6):
    - kitchen_area, living_area, total_area, latitude, longitude
    - ceiling_height


In [6]:
# Point the MLflow client at the URI stored in the TRACKING_SERVER_CONN
# environment variable. os.getenv returns None when the variable is unset.
# NOTE(review): confirm that the unset case is handled as intended.
mlflow.set_tracking_uri(os.getenv('TRACKING_SERVER_CONN'))

In [7]:
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID

In [8]:
%%time
# NOTE(review): none of these column names appear among the `clean_flats`
# columns printed by print_types above — confirm the feature/target names
# against the table that is actually loaded into `df`.
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"
test_size = 0.2

# Stratified hold-out split. The seed is fixed so the split is the same on
# every run; otherwise trials accumulated in the persisted Optuna study would
# be scored on different data and their values would not be comparable.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    stratify=df[target],
    shuffle=True,
    random_state=0,
)

def objective(trial: optuna.Trial) -> float:
    """Optuna objective: cross-validated ROC AUC of a CatBoost classifier.

    Samples ``learning_rate``, ``depth``, ``l2_leaf_reg`` and
    ``random_strength`` from ``trial``, fits a ``CatBoostClassifier`` on each
    of two stratified folds of the notebook-level ``X_train`` / ``y_train``
    and scores it on the held-out fold.

    The medians over folds of the secondary metrics (``err1``, ``err2``,
    ``precision``, ``recall``, ``f1``, ``logloss``) are attached to the trial
    as user attributes so they are kept alongside the objective value.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Median ROC AUC over the validation folds.
    """
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }

    model = CatBoostClassifier(**param)
    skf = StratifiedKFold(n_splits=2)
    metrics = defaultdict(list)

    for train_index, val_index in skf.split(X_train, y_train):
        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]

        model.fit(train_x, train_y)
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        # ravel() on a binary confusion matrix yields (tn, fp, fn, tp):
        # err1 is the false-positive share, err2 the false-negative share.
        _, err1, err2, _ = confusion_matrix(val_y, prediction, normalize='all').ravel()

        fold_scores = {
            "err1": err1,
            "err2": err2,
            "auc": roc_auc_score(val_y, probas),
            "precision": precision_score(val_y, prediction),
            "recall": recall_score(val_y, prediction),
            "f1": f1_score(val_y, prediction),
            # Log loss is defined on predicted probabilities, not hard labels.
            "logloss": log_loss(val_y, probas),
        }
        for name, value in fold_scores.items():
            metrics[name].append(value)

    # Plain floats keep the values serialisable for the study storage.
    summary = {name: float(median(values)) for name, values in metrics.items()}

    # Keep the secondary metrics with the trial instead of discarding them.
    for name, value in summary.items():
        if name != "auc":
            trial.set_user_attr(name, value)

    return summary["auc"]


# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = (
    experiment.experiment_id
    if experiment
    else mlflow.create_experiment(EXPERIMENT_NAME)
)

# Open the run only long enough to capture its id; the id is reused further
# down as the parent-run tag for the trials and to resume the run for logging.
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    run_id = run.info.run_id

# Optuna callback that records every trial in MLflow under the metric name
# "AUC". The extra kwargs place each trial run in `experiment_id` and tag it
# with the parent run id captured above.
mlflc = MLflowCallback(
    tracking_uri=os.getenv('TRACKING_SERVER_CONN'),
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs={'experiment_id': experiment_id, 'tags': {MLFLOW_PARENT_RUN_ID: run_id}}
    # Earlier attempt, kept for reference: the parent id passed as a top-level kwarg.
    #mlflow_kwargs={'experiment_id': experiment_id, MLFLOW_PARENT_RUN_ID: run_id}
)

# NOTE(review): STUDY_NAME and STUDY_DB_NAME are not defined in any cell
# visible here — they must exist in the kernel before this cell runs.
# load_if_exists=True reuses a stored study of the same name, so trials
# accumulate across executions of this cell.
study = optuna.create_study(direction='maximize', 
                            study_name=STUDY_NAME, storage=STUDY_DB_NAME, 
                            load_if_exists=True,
                            sampler=optuna.samplers.TPESampler())

# Run 10 more trials of `objective`, reporting each one through the callback.
study.optimize(objective, n_trials=10, callbacks=[mlflc])

# Best parameters over the whole stored study, not only this session's trials.
best_params = study.best_params

# The trial count includes trials loaded from the existing study.
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")
print(run_id)

[I 2024-04-24 13:41:45,701] Using an existing study with name 'churn_model' instead of creating a new one.
[I 2024-04-24 13:41:47,472] Trial 132 finished with value: 0.8181092895880762 and parameters: {'learning_rate': 0.057647231745658424, 'depth': 1, 'l2_leaf_reg': 3.4447367334736443, 'random_strength': 4.664390244724367}. Best is trial 108 with value: 0.8273315781718285.
[I 2024-04-24 13:41:49,360] Trial 133 finished with value: 0.821568517599551 and parameters: {'learning_rate': 0.04338721214889323, 'depth': 3, 'l2_leaf_reg': 3.1885961991058323, 'random_strength': 4.355040569167024}. Best is trial 108 with value: 0.8273315781718285.
[I 2024-04-24 13:41:51,217] Trial 134 finished with value: 0.8223877330091818 and parameters: {'learning_rate': 0.04959568216807987, 'depth': 2, 'l2_leaf_reg': 3.717864694116411, 'random_strength': 4.533215773549718}. Best is trial 108 with value: 0.8273315781718285.
[I 2024-04-24 13:41:53,478] Trial 135 finished with value: 0.8196821614209511 and param

Number of finished trials: 142
Best params: {'learning_rate': 0.0873446334064004, 'depth': 1, 'l2_leaf_reg': 3.6987968672196763, 'random_strength': 1.9133151232051957}
CPU times: user 15.2 s, sys: 2.57 s, total: 17.7 s
Wall time: 20.8 s


In [10]:
# Final model: the fixed CatBoost settings plus the tuned hyper-parameters.
param = {
    "loss_function": "Logloss",
    "task_type": "CPU",
    "random_seed": 0,
    "iterations": 300,
    "verbose": False,
}
param.update(best_params)

model_best = CatBoostClassifier(**param)

# Fit on the full training part and score on the held-out test part.
model_best.fit(X_train, y_train)
prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]

metrics = {}
# ravel() on a binary confusion matrix yields (tn, fp, fn, tp):
# err1 is the false-positive share, err2 the false-negative share.
_, metrics["err1"], metrics["err2"], _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
metrics["auc"] = roc_auc_score(y_test, probas)
metrics["precision"] = precision_score(y_test, prediction)
metrics["recall"] = recall_score(y_test, prediction)
metrics["f1"] = f1_score(y_test, prediction)
# Log loss is defined on predicted probabilities, not hard labels.
metrics["logloss"] = log_loss(y_test, probas)

# Inputs for model logging: pinned requirements file, I/O signature inferred
# from the test features and predictions, and a small input sample.
pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

# Resume the run created earlier so the model, metrics and parameters are
# attached to it.
with mlflow.start_run(run_id=run_id) as run:
    # CatBoost-flavor model, registered under REGISTRY_MODEL_NAME.
    model_info = mlflow.catboost.log_model(
        cb_model=model_best,
        input_example=input_example,
        artifact_path="cv",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        await_registration_for=60,
        pip_requirements=pip_requirements)
    # The sklearn-flavor copy goes to its own artifact path: logging it to
    # "cv" as well would overwrite the MLmodel/requirements files of the
    # registered CatBoost model above.
    mlflow.sklearn.log_model(model_best, "cv_sklearn", signature=signature)
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    mlflow.set_tag("Training Info", "Tuning hyperparameters of CatBoostClassifier + Optuna + TPE")

 - mlflow (current: 2.12.1, required: mlflow==2.7.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
Registered model 'CBClf_tuning' already exists. Creating a new version of this model...
2024/04/24 13:42:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 60 seconds for model version to finish creation. Model name: CBClf_tuning, version 12
Created version '12' of model 'CBClf_tuning'.


In [59]:
# Fetch one recorded run by id and display it.
# NOTE(review): the run id is hard-coded; confirm it still exists on the
# tracking server being used.
run = mlflow.get_run('0fc7e50678e848b1b912ebeed5030e97')
# `run_id` replaces the deprecated `run_uuid` alias. The body is a no-op, so
# this only re-opens the run and closes it again.
with mlflow.start_run(run_id=run.info.run_id):
    ...
run

<Run: data=<RunData: metrics={'AUC': 0.8209890619263405}, params={'depth': '4',
 'l2_leaf_reg': '4.9559340801613185',
 'learning_rate': '0.004658938745949589',
 'random_strength': '0.31802107073453'}, tags={'datetime_complete': '2024-04-24 12:47:59.101677',
 'datetime_start': '2024-04-24 12:47:57.637569',
 'depth_distribution': 'IntDistribution(high=12, log=False, low=1, step=1)',
 'direction': 'MAXIMIZE',
 'l2_leaf_reg_distribution': 'FloatDistribution(high=5.0, log=False, low=0.1, '
                             'step=None)',
 'learning_rate_distribution': 'FloatDistribution(high=0.1, log=True, '
                               'low=0.001, step=None)',
 'mlflow.parentRunId': '0fc7e50678e848b1b912ebeed5030e97',
 'mlflow.runName': '42',
 'mlflow.source.name': '/home/mle-user/mle-project/mle-mlflow/.venv_mle_mlflow/lib/python3.10/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'mle-user',
 'number': '42',
 'random_strength_distribution': 'FloatDistrib