In [12]:
import os
import psycopg
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

TABLE_NAME = "users_churn"

connection = {"target_session_attrs": "read-write"}#"sslmode": "verify-full"
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}
connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:
    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,No,Mailed check,20.65,1022.95,,...,,,,,Female,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,...,,,,,Female,0,Yes,Yes,Yes,0
2,148,6837-BJYDQ,2019-11-01,NaT,One year,No,Mailed check,19.6,61.35,,...,,,,,Male,0,No,No,No,0
3,482,0486-LGCCH,2019-03-01,NaT,Two year,No,Mailed check,19.65,225.75,,...,,,,,Male,0,Yes,Yes,No,0
4,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,No,Female,0,Yes,No,,0


Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,No,Mailed check,20.65,1022.95,,...,,,,,Female,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,...,,,,,Female,0,Yes,Yes,Yes,0
2,148,6837-BJYDQ,2019-11-01,NaT,One year,No,Mailed check,19.6,61.35,,...,,,,,Male,0,No,No,No,0
3,482,0486-LGCCH,2019-03-01,NaT,Two year,No,Mailed check,19.65,225.75,,...,,,,,Male,0,Yes,Yes,No,0
4,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,No,Female,0,Yes,No,,0


## Задача №1

In [26]:
import os
import optuna
import mlflow
import numpy as np
from collections import defaultdict
from catboost import CatBoostClassifier
from optuna.integration.mlflow import MLflowCallback
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

In [52]:
#os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
#os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("S3_ACCESS_KEY")
#os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("S3_SECRET_KEY")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

EXPERIMENT_NAME = "churn_fio"
RUN_NAME = "model_bayesian_search"

STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model"


def objective(trial: optuna.Trial) -> float:
    param = {
       "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
       "depth": trial.suggest_int("depth", 1, 12),
       "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
       "random_strength": trial.suggest_float("random_strength", 0.1, 5),
       "loss_function": "Logloss",
       "task_type": "CPU",
       "random_seed": 0,
       "iterations": 300,
       "verbose": False,
     }
    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        
        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]
    
        model.fit(train_x, train_y)
    
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, prediction)
        
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)

    err_1 = np.mean(metrics['err1'])
    err_2 = np.mean(metrics['err2'])
    auc = np.mean(metrics['auc'])
    precision = np.mean(metrics['precision'])
    recall = np.mean(metrics['recall'])
    f1 = np.mean(metrics['f1'])
    logloss = np.mean(metrics['logloss'])

    return auc


#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    

mlflc = MLflowCallback(
    tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs = {
        "experiment_id": experiment_id, 
        "tags": {MLFLOW_PARENT_RUN_ID: run_id}
    }
)

study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(),
    direction='maximize',
    study_name=STUDY_NAME,
    storage=STUDY_DB_NAME,
    load_if_exists=True
)
study.optimize(objective, n_trials=20, callbacks=[mlflc]) 
best_params = study.best_params

print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")

  mlflc = MLflowCallback(
  mlflc = MLflowCallback(
[I 2024-04-08 17:50:18,075] A new study created in RDB with name: churn_model
[I 2024-04-08 17:50:18,075] A new study created in RDB with name: churn_model
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-04-08 17:50:19,410] Trial 0 finished with value: 0.7554531285644153 and parameters: {'learning_rate': 0.005172850059863517, 'depth': 3, 'l2_leaf_reg': 0.4158177278765839, 'random_strength': 1.4653413116093768}. Best is trial 0 with value: 0.7554531285644153.
[I 2024-04-08 17:50:19,410] Trial 0 finished with value: 0.7554531285644153 and parameters: {'learning_rate': 0.005172850059863517, 'depth': 3, 'l2_leaf_reg': 0.4158177278765839, 'random_strength': 1.4653413116093768}. Best is trial 0 with value: 0.7554531285644153.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-04-08 17:50:21,471] Trial 1 fi

[I 2024-04-08 17:50:32,497] Trial 6 finished with value: 0.80027125839604 and parameters: {'learning_rate': 0.016176612127534015, 'depth': 3, 'l2_leaf_reg': 3.3685524682383483, 'random_strength': 1.4420455275635158}. Best is trial 4 with value: 0.812919584072314.
[I 2024-04-08 17:50:32,497] Trial 6 finished with value: 0.80027125839604 and parameters: {'learning_rate': 0.016176612127534015, 'depth': 3, 'l2_leaf_reg': 3.3685524682383483, 'random_strength': 1.4420455275635158}. Best is trial 4 with value: 0.812919584072314.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-04-08 17:50:35,250] Trial 7 finished with value: 0.7003295031498556 and parameters: {'learning_rate': 0.001981372724212374, 'depth': 9, 'l2_leaf_reg': 2.2764285559433555, 'random_strength': 1.2542666296961833}. Best is trial 4 with value: 0.812919584072314.
[I 2024-04-08 17:50:35,250] Trial 7 finished with value: 0.7003295031498556 and parameters: {'le

[I 2024-04-08 17:50:45,734] Trial 13 finished with value: 0.8092751237118158 and parameters: {'learning_rate': 0.09894637960646284, 'depth': 5, 'l2_leaf_reg': 1.3541518996027775, 'random_strength': 2.6369775712399885}. Best is trial 4 with value: 0.812919584072314.
[I 2024-04-08 17:50:45,734] Trial 13 finished with value: 0.8092751237118158 and parameters: {'learning_rate': 0.09894637960646284, 'depth': 5, 'l2_leaf_reg': 1.3541518996027775, 'random_strength': 2.6369775712399885}. Best is trial 4 with value: 0.812919584072314.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-04-08 17:50:51,372] Trial 14 finished with value: 0.6596923687495551 and parameters: {'learning_rate': 0.027340298845964152, 'depth': 10, 'l2_leaf_reg': 4.909719251386411, 'random_strength': 0.13379821605431141}. Best is trial 4 with value: 0.812919584072314.
[I 2024-04-08 17:50:51,372] Trial 14 finished with value: 0.6596923687495551 and parameter

Number of finished trials: 20
Best params: {'learning_rate': 0.0531094096530869, 'depth': 6, 'l2_leaf_reg': 2.7915603359196863, 'random_strength': 4.853467764191209}
Number of finished trials: 20
Best params: {'learning_rate': 0.0531094096530869, 'depth': 6, 'l2_leaf_reg': 2.7915603359196863, 'random_strength': 4.853467764191209}


In [53]:
REGISTRY_MODEL_NAME = "best_model_bayesian_search"

best_model = CatBoostClassifier(**best_params, verbose=False).fit(X_train, y_train)

mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_id=run_id) as run: 
    mlflow.log_params(best_params) 
     
    mlflow.sklearn.log_model(
        sk_model=best_model, 
        artifact_path="cv", 
        registered_model_name=REGISTRY_MODEL_NAME 
    )

Registered model 'best_model_bayesian_search' already exists. Creating a new version of this model...
2024/04/08 17:51:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: best_model_bayesian_search, version 3
Registered model 'best_model_bayesian_search' already exists. Creating a new version of this model...
2024/04/08 17:51:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: best_model_bayesian_search, version 3
Created version '3' of model 'best_model_bayesian_search'.
Created version '3' of model 'best_model_bayesian_search'.


In [54]:
run_id

'f21ad48293de4e7786281a1a20612ba0'

'f21ad48293de4e7786281a1a20612ba0'