In [3]:
from dotenv import load_dotenv
import os
import psycopg
import pandas as pd
import mlflow

load_dotenv()

TABLE_NAME = "users_churn"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = "5000"

EXPERIMENT_NAME = 'bayesian_search_experiment'
RUN_NAME = 'model_bayesian_search'
REGISTRY_MODEL_NAME = 'churn_model_nikolaistepanov'

STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model"

In [4]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [5]:
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(5)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,No,Mailed check,20.65,1022.95,,...,,,,,Female,0,No,No,No,0
1,21,8779-QRDMV,2019-11-01,2019-12-01,Month-to-month,Yes,Electronic check,39.65,39.65,DSL,...,Yes,No,No,Yes,Male,1,No,No,,1
2,22,1680-VDCWW,2019-02-01,NaT,One year,No,Bank transfer (automatic),19.8,202.25,,...,,,,,Male,0,Yes,No,No,0
3,23,1066-JKSGK,2019-11-01,2019-12-01,Month-to-month,No,Mailed check,20.15,20.15,,...,,,,,Male,0,No,No,No,1
4,24,3638-WEABW,2015-04-01,NaT,Two year,Yes,Credit card (automatic),59.9,3505.1,DSL,...,No,Yes,No,No,Female,0,Yes,No,Yes,0


In [6]:
from sklearn.model_selection import train_test_split

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "begin_date"
stratify_column = "target"
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, shuffle=False)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, log_loss
from catboost import CatBoostClassifier
from collections import defaultdict
from numpy import array
from statistics import median
import optuna

In [12]:
def objective(trial: optuna.Trial) -> float:
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }
    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]
        
        model.fit(train_x, train_y)
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, prediction)
        
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)


    err_1 = median(array(metrics['err1']))
    err_2 = median(array(metrics['err2']))
    auc = median(array(metrics["auc"]))
    precision = median(array(metrics["precision"]))
    recall = median(array(metrics["recall"]))
    f1 = median(array(metrics["f1"]))
    logloss = median(array(metrics["logloss"]))
		

    return auc

In [19]:
MLFLOW_PARENT_RUN_ID = 'mlflow.parentRunId'

In [39]:
from optuna.integration.mlflow import MLflowCallback

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id
    

prediction=model_best.predict(X_test)
pip_requirements = './requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    model_info = mlflow.catboost.log_model(
        await_registration_for=60, 
        cb_model=model_best,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements)

    cv_info = mlflow.sklearn.log_model(study, artifact_path='cv')
 
    mlflow.log_params(best_params)
    mlflow.log_metric("best_auc", study.best_value)

mlflc = MLflowCallback(
    tracking_uri=f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}',
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs={
        'experiment_id': experiment_id,
        'tags': {
            MLFLOW_PARENT_RUN_ID: run_id
        }
    }
)

study = optuna.create_study(direction='maximize', study_name=STUDY_NAME, storage=STUDY_DB_NAME, load_if_exists=True, sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=10, callbacks=[mlflc])
best_params = study.best_params

print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_nikolaistepanov' already exists. Creating a new version of this model...
2024/05/30 00:33:14 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_nikolaistepanov, version 14
Created version '14' of model 'churn_model_nikolaistepanov'.
  mlflc = MLflowCallback(
[I 2024-05-30 00:33:16,458] Using an existing study with name 'churn_model' instead of creating a new one.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-30 00:33:17,672] Trial 75 finished with value: 0.8087973612990528 and parameters: {'learning_rate': 0.04827835018733121, 'depth': 6, 'l2_leaf_reg': 2.191958962960526, 'random_strength': 4.1180883558503485}. Best is trial 47 with value: 0.8278679897759735.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-05-30 00:33:19,020] Trial 76 finished with value: 0.82384209141

Number of finished trials: 85
Best params: {'learning_rate': 0.03478410895840877, 'depth': 3, 'l2_leaf_reg': 0.9283989554180812, 'random_strength': 4.697350643371248}


In [40]:
run_id

'219d98fa4e1f48c6b4c3af6d9e6bd587'

In [37]:
model_best = CatBoostClassifier(**best_params)
model_best.fit(X_train, y_train)
 
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    model_info = mlflow.catboost.log_model(
        await_registration_for=60, 
        cb_model=model_best,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements)

    cv_info = mlflow.sklearn.log_model(study, artifact_path='cv')
 
    mlflow.log_params(best_params)
    mlflow.log_metric("best_auc", study.best_value)

0:	learn: 0.6792202	total: 1.73ms	remaining: 1.73s
1:	learn: 0.6630192	total: 3.34ms	remaining: 1.67s
2:	learn: 0.6469986	total: 4.79ms	remaining: 1.59s
3:	learn: 0.6330420	total: 6.28ms	remaining: 1.56s
4:	learn: 0.6207282	total: 7.71ms	remaining: 1.53s
5:	learn: 0.6101373	total: 9.24ms	remaining: 1.53s
6:	learn: 0.6016137	total: 10.8ms	remaining: 1.54s
7:	learn: 0.5917794	total: 12.3ms	remaining: 1.53s
8:	learn: 0.5801058	total: 13.8ms	remaining: 1.51s
9:	learn: 0.5699085	total: 15.5ms	remaining: 1.54s
10:	learn: 0.5630179	total: 17.2ms	remaining: 1.54s
11:	learn: 0.5552007	total: 18.7ms	remaining: 1.54s
12:	learn: 0.5500764	total: 20.1ms	remaining: 1.52s
13:	learn: 0.5435016	total: 21.6ms	remaining: 1.52s
14:	learn: 0.5391561	total: 23ms	remaining: 1.51s
15:	learn: 0.5317646	total: 24.6ms	remaining: 1.51s
16:	learn: 0.5280441	total: 26ms	remaining: 1.5s
17:	learn: 0.5245995	total: 27.3ms	remaining: 1.49s
18:	learn: 0.5189176	total: 28.9ms	remaining: 1.49s
19:	learn: 0.5137003	total:

Registered model 'churn_model_nikolaistepanov' already exists. Creating a new version of this model...
2024/05/30 00:31:38 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_nikolaistepanov, version 13
Created version '13' of model 'churn_model_nikolaistepanov'.


<optuna.study.study.Study at 0x7f5b2e9a8b50>

In [38]:
run_id

'e11773eaff4746d5895ebe507d85e384'

In [36]:
with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"select * from tags join runs on runs.run_uuid = 'e416670506ab4c56a09846a941233f28' and runs.lifecycle_stage = 'active' and runs.status = 'FINISHED' and tags.value = 'e416670506ab4c56a09846a941233f28' and tags.key = 'mlflow.parentRunId'")
        result = cur.fetchall()

result

[('mlflow.parentRunId',
  'e416670506ab4c56a09846a941233f28',
  '0c098b60cd3b4742845604c9d37af858',
  'e416670506ab4c56a09846a941233f28',
  'model_bayesian_search',
  'UNKNOWN',
  '',
  '',
  'mle-user',
  'FINISHED',
  1717028277022,
  1717028277476,
  '',
  'active',
  's3://s3-student-mle-20240325-d3a8040a07/10/e416670506ab4c56a09846a941233f28/artifacts',
  10,
  None),
 ('mlflow.parentRunId',
  'e416670506ab4c56a09846a941233f28',
  '489c0df0cfbf4d8ca4f998ad591e62ba',
  'e416670506ab4c56a09846a941233f28',
  'model_bayesian_search',
  'UNKNOWN',
  '',
  '',
  'mle-user',
  'FINISHED',
  1717028277022,
  1717028277476,
  '',
  'active',
  's3://s3-student-mle-20240325-d3a8040a07/10/e416670506ab4c56a09846a941233f28/artifacts',
  10,
  None),
 ('mlflow.parentRunId',
  'e416670506ab4c56a09846a941233f28',
  '4f12710ae045460ba25285d983c0ee7e',
  'e416670506ab4c56a09846a941233f28',
  'model_bayesian_search',
  'UNKNOWN',
  '',
  '',
  'mle-user',
  'FINISHED',
  1717028277022,
  17170282774