In [1]:
import os
import optuna
import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)
import numpy as np
from dotenv import load_dotenv
import psycopg2 as psycopg
load_dotenv()


True

In [2]:
connection = {
    "sslmode": "require", 
    "target_session_attrs": "read-write",
    "host": os.environ.get("DB_DESTINATION_HOST"), 
    "port": os.environ.get("DB_DESTINATION_PORT"),
    "dbname": os.environ.get("DB_DESTINATION_NAME"),
    "user": os.environ.get("DB_DESTINATION_USER"),
    "password": os.environ.get("DB_DESTINATION_PASSWORD"),
}
assert all([var_value != "" for var_value in list(connection.values())])

TABLE_NAME = "clean_users_churn"

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score,roc_auc_score,recall_score,confusion_matrix,log_loss,precision_score
from sklearn.model_selection import train_test_split

In [4]:
os.environ['MLFLOW_S3_ENDPOINT_URL']='https://storage.yandexcloud.net'
os.environ['AWS_BUCKET_NAME']=os.environ.get("S3_BUCKET_NAME")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [5]:
df.columns

Index(['id', 'customer_id', 'begin_date', 'end_date', 'type',
       'paperless_billing', 'payment_method', 'monthly_charges',
       'total_charges', 'internet_service', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'gender', 'senior_citizen', 'partner', 'dependents', 'multiple_lines',
       'target'],
      dtype='object')

In [6]:
features = ['type',
    'paperless_billing', 'payment_method', 'monthly_charges',
    'total_charges', 'internet_service', 'online_security', 'online_backup',
    'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
    'gender', 'senior_citizen', 'partner', 'dependents', 'multiple_lines']
target = ["target"]

split_column = 'begin_date'
stratify_column = target
test_size = 0.2

df = df.sort_values(by=[split_column]).reset_index()

X_train, X_test, y_train, y_test = train_test_split(
    df[features], 
    df[target], 
    test_size=test_size, 
    shuffle=False)

In [7]:
cat_features = ['type',
    'paperless_billing', 'payment_method', 
    'internet_service', 'online_security', 'online_backup',
    'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
    'gender', 'senior_citizen', 'partner', 'dependents', 'multiple_lines']
num_features = ["monthly_charges", "total_charges"]

In [8]:
from autofeat import AutoFeatClassifier

transformations = ["1/","log", "abs", "sqrt","^2"]

afc = AutoFeatClassifier(
    categorical_cols = cat_features,
    feateng_cols = num_features,
    feateng_steps=2,
    max_gb=1,
    transformations = transformations,
    n_jobs=-1
)

X_train_features = afc.fit_transform(X_train,y_train)
X_test_features = afc.transform(X_test)
X_train_features

  y = column_or_1d(y, warn=True)


Unnamed: 0,monthly_charges,total_charges,cat_type_Month-to-month,cat_type_One year,cat_type_Two year,cat_paperless_billing_No,cat_paperless_billing_Yes,cat_payment_method_Bank transfer (automatic),cat_payment_method_Credit card (automatic),cat_payment_method_Electronic check,...,cat_gender_Male,cat_senior_citizen_0,cat_senior_citizen_1,cat_partner_No,cat_partner_Yes,cat_dependents_No,cat_dependents_Yes,cat_multiple_lines_No,cat_multiple_lines_Yes,monthly_charges**2/total_charges
0,117.8,8684.8,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.597831
1,104.15,7689.95,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.410571
2,92.45,6440.25,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.327123
3,108.05,7532.15,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.549996
4,108.6,7690.9,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.533495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,75.15,525.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,10.757186
5630,76.05,231.8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,24.950830
5631,69.05,318.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,14.969867
5632,25.9,135.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,4.968963


In [9]:
y_train

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1
...,...
5629,0
5630,1
5631,1
5632,1


In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

X_train_scaled = pd.DataFrame(X_train_scaled,columns = X_train_features.columns)
X_train_scaled

Unnamed: 0,monthly_charges,total_charges,cat_type_Month-to-month,cat_type_One year,cat_type_Two year,cat_paperless_billing_No,cat_paperless_billing_Yes,cat_payment_method_Bank transfer (automatic),cat_payment_method_Credit card (automatic),cat_payment_method_Electronic check,...,cat_gender_Male,cat_senior_citizen_0,cat_senior_citizen_1,cat_partner_No,cat_partner_Yes,cat_dependents_No,cat_dependents_Yes,cat_multiple_lines_No,cat_multiple_lines_Yes,monthly_charges**2/total_charges
0,1.646922,2.630566,-0.907359,1.717822,-0.647413,-0.816617,0.816617,1.709795,-0.574754,-0.667073,...,0.989758,0.448356,-0.448356,-0.905735,0.905735,0.698089,-0.698089,-1.035048,1.035048,-0.369644
1,1.198080,2.184561,-0.907359,-0.582133,1.544609,-0.816617,0.816617,1.709795,-0.574754,-0.667073,...,0.989758,-2.230370,2.230370,-0.905735,0.905735,0.698089,-0.698089,-1.035048,1.035048,-0.430938
2,0.813359,1.624303,-0.907359,-0.582133,1.544609,1.224564,-1.224564,-0.584865,1.739874,-0.667073,...,-1.010348,-2.230370,2.230370,-0.905735,0.905735,-1.432482,1.432482,-1.035048,1.035048,-0.458252
3,1.326321,2.113817,-0.907359,-0.582133,1.544609,-0.816617,0.816617,-0.584865,-0.574754,1.499087,...,0.989758,0.448356,-0.448356,1.104076,-1.104076,0.698089,-0.698089,-1.035048,1.035048,-0.385302
4,1.344406,2.184987,-0.907359,-0.582133,1.544609,-0.816617,0.816617,-0.584865,1.739874,-0.667073,...,0.989758,0.448356,-0.448356,-0.905735,0.905735,0.698089,-0.698089,-1.035048,1.035048,-0.390703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0.244496,-1.027587,1.102099,-0.582133,-0.647413,-0.816617,0.816617,-0.584865,-0.574754,1.499087,...,-1.010348,0.448356,-0.448356,1.104076,-1.104076,0.698089,-0.698089,0.966139,-0.966139,2.628396
5630,0.274090,-1.159033,1.102099,-0.582133,-0.647413,-0.816617,0.816617,-0.584865,-0.574754,1.499087,...,0.989758,-2.230370,2.230370,1.104076,-1.104076,0.698089,-0.698089,-1.035048,1.035048,7.274259
5631,0.043915,-1.120164,1.102099,-0.582133,-0.647413,-0.816617,0.816617,-0.584865,-0.574754,-0.667073,...,-1.010348,-2.230370,2.230370,1.104076,-1.104076,0.698089,-0.698089,0.966139,-0.966139,4.007290
5632,-1.374952,-1.202429,1.102099,-0.582133,-0.647413,-0.816617,0.816617,1.709795,-0.574754,-0.667073,...,0.989758,0.448356,-0.448356,-0.905735,0.905735,-1.432482,1.432482,-1.035048,1.035048,0.733795


In [16]:
print(f"Размер выборки для обучения: {X_train_scaled.shape}")
print(f"Размер выборки для теста: {X_test_scaled.shape}")

Размер выборки для обучения: (5634, 36)
Размер выборки для теста: (1409, 36)


In [17]:
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
from optuna.integration.mlflow import MLflowCallback

In [68]:
mlflow.end_run()

In [70]:
EXPERIMENT_NAME = 'optuna_search_new'
RUN_NAME = "model_bayesian_search"

STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model"

#MLFLOW_PARENT_RUN_ID = 'parentRunId'

REGISTRY_MODEL_NAME = 'catboost_optuna'
np.random.seed(42)

def objective(trial: optuna.Trial) -> float:
    param =  {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }
 
    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for i, (train_index, val_index) in enumerate(skf.split(X_train_scaled, y_train)):
        
        train_x = X_train_scaled.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train_scaled.iloc[val_index]
        val_y = y_train.iloc[val_index]

        model.fit(train_x,train_y)
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:,1]

        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, prediction)
        
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)


    # ваш код здесь #
    err_1 = np.median(np.array(metrics['err1']))
    err_2 = np.median(np.array(metrics['err2']))
    auc = np.median(np.array(metrics['auc']))
    precision = np.median(np.array(metrics['precision']))
    recall = np.median(np.array(metrics['recall']))
    f1 = np.median(np.array(metrics['f1']))
    logloss = np.median(np.array(metrics['logloss']))
		

    return auc
 

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id
    
STUDY_DB_NAME = "sqlite:///optuna.db"
STUDY_NAME = "bagging-optimization-study"

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    model_info = mlflow.sklearn.log_model( 
            sk_model=afc, 
            artifact_path='cv', 
            registered_model_name="autofeat_model_cosmo"
        )


mlflc = MLflowCallback(
        tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
        metric_name="AUC",
        create_experiment=False,
        mlflow_kwargs={'experiment_id': experiment_id, 'tags': {'mlflow.parentRunId': run_id}}
    )
    
study = optuna.create_study(direction='maximize', study_name=STUDY_NAME, storage=STUDY_DB_NAME, load_if_exists=True, sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=10, callbacks=[mlflc])
best_params = study.best_params

model_best = CatBoostClassifier(**best_params)
model_best.fit(X_train_scaled,y_train)
prediction = model_best.predict(X_test_scaled)

pip_requirements = './requirements.txt'
signature = mlflow.models.infer_signature(X_test_scaled, prediction)
input_example = X_test_scaled[:10]


print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")
print(f"run_id: {run_id}")

Successfully registered model 'autofeat_model_cosmo'.
2024/09/15 21:05:32 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: autofeat_model_cosmo, version 1
Created version '1' of model 'autofeat_model_cosmo'.
  mlflc = MLflowCallback(
[I 2024-09-15 21:05:33,045] Using an existing study with name 'bagging-optimization-study' instead of creating a new one.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-09-15 21:05:34,009] Trial 123 finished with value: 0.8597520537403536 and parameters: {'learning_rate': 0.0018690941463717996, 'depth': 1, 'l2_leaf_reg': 3.252529509463389, 'random_strength': 4.411183976770571}. Best is trial 28 with value: 0.8638600516705786.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-09-15 21:05:34,911] Trial 124 finished with value: 0.8404334333529166 and parameters: {'learning_rate': 0.0017890825628358966, 'depth': 1, 'l2_leaf_reg': 3.6096348933121476, 'random_str

0:	learn: 0.6923053	total: 1.44ms	remaining: 1.44s
1:	learn: 0.6910789	total: 2.86ms	remaining: 1.43s
2:	learn: 0.6901315	total: 4.06ms	remaining: 1.35s
3:	learn: 0.6892682	total: 5.46ms	remaining: 1.36s
4:	learn: 0.6884725	total: 6.67ms	remaining: 1.33s
5:	learn: 0.6876615	total: 7.95ms	remaining: 1.32s
6:	learn: 0.6868635	total: 9.26ms	remaining: 1.31s
7:	learn: 0.6861047	total: 10.4ms	remaining: 1.29s
8:	learn: 0.6850298	total: 11.7ms	remaining: 1.29s
9:	learn: 0.6837702	total: 12.9ms	remaining: 1.28s
10:	learn: 0.6830477	total: 14.1ms	remaining: 1.26s
11:	learn: 0.6823013	total: 15.5ms	remaining: 1.27s
12:	learn: 0.6813622	total: 16.6ms	remaining: 1.26s
13:	learn: 0.6801638	total: 17.9ms	remaining: 1.26s
14:	learn: 0.6790760	total: 19.2ms	remaining: 1.26s
15:	learn: 0.6782227	total: 20.4ms	remaining: 1.25s
16:	learn: 0.6771822	total: 21.7ms	remaining: 1.26s
17:	learn: 0.6764566	total: 23ms	remaining: 1.25s
18:	learn: 0.6756994	total: 24.2ms	remaining: 1.25s
19:	learn: 0.6747861	tot

In [47]:
run_id

'9c0823790129436e9422c91ef764ed26'

In [38]:
q = f"""select *
            from tags
            join runs on runs.run_uuid = 'd4aa4d5323be42f8a03f7e81bc231a4e'
                and runs.lifecycle_stage = 'active'
                and runs.status = 'FINISHED'
                and tags.value = 'd4aa4d5323be42f8a03f7e81bc231a4e'
                and tags.key = 'mlflow.parentRunId'
    """

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(q)
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

run_sel = pd.DataFrame(data, columns=columns)
run_sel

Unnamed: 0,key,value,run_uuid,run_uuid.1,name,source_type,source_name,entry_point_name,user_id,status,start_time,end_time,source_version,lifecycle_stage,artifact_uri,experiment_id,deleted_time
0,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,0669bc6cbb2346d2afff849a7dd74979,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,
1,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,45370447efa046e8b498085f6da42c6d,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,
2,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,48ce0bee82c54a178b6cbd067cd23682,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,
3,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,8c9360381a8d464bb0d37521c0c7b37f,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,
4,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,b08c8309547045a29aff296edf63af6b,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,
5,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,b6de1afa948e43d0b5e2813c01f02917,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,
6,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,b87b62a136ff478fb91afa2e4b5050d1,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,
7,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,c68ab7943d1a48ddbb08bd1e44c49281,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,
8,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,e2fd5bb832054093884210bb075a6436,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,
9,mlflow.parentRunId,d4aa4d5323be42f8a03f7e81bc231a4e,ec8d7d354efe4bb887f18c9f337c1626,d4aa4d5323be42f8a03f7e81bc231a4e,model_bayesian_search,UNKNOWN,,,mle-user,FINISHED,1726430688780,1726430688860,,active,s3://s3-student-mle-20240729-393dbfd5ab/37/d4a...,37,


In [48]:
model_best = CatBoostClassifier(**best_params)
model_best.fit(X_train_scaled,y_train)
prediction = model_best.predict(X_test_scaled)

pip_requirements = './requirements.txt'
signature = mlflow.models.infer_signature(X_test_scaled, prediction)
input_example = X_test_scaled[:10]



0:	learn: 0.6923053	total: 1.7ms	remaining: 1.69s
1:	learn: 0.6910789	total: 3.2ms	remaining: 1.6s
2:	learn: 0.6901315	total: 4.46ms	remaining: 1.48s
3:	learn: 0.6892682	total: 5.65ms	remaining: 1.41s
4:	learn: 0.6884725	total: 6.87ms	remaining: 1.37s
5:	learn: 0.6876615	total: 8.02ms	remaining: 1.33s
6:	learn: 0.6868635	total: 9.32ms	remaining: 1.32s
7:	learn: 0.6861047	total: 11.2ms	remaining: 1.38s
8:	learn: 0.6850298	total: 12.3ms	remaining: 1.35s
9:	learn: 0.6837702	total: 13.5ms	remaining: 1.33s
10:	learn: 0.6830477	total: 14.7ms	remaining: 1.32s
11:	learn: 0.6823013	total: 16ms	remaining: 1.31s
12:	learn: 0.6813622	total: 17.1ms	remaining: 1.3s
13:	learn: 0.6801638	total: 18.4ms	remaining: 1.3s
14:	learn: 0.6790760	total: 19.6ms	remaining: 1.28s
15:	learn: 0.6782227	total: 20.7ms	remaining: 1.27s
16:	learn: 0.6771822	total: 22.9ms	remaining: 1.32s
17:	learn: 0.6764566	total: 24.1ms	remaining: 1.31s
18:	learn: 0.6756994	total: 26ms	remaining: 1.34s
19:	learn: 0.6747861	total: 27.

In [50]:
model_info = mlflow.catboost.log_model( 
            cb_model=model_best, 
			signature=signature, 
            input_example=input_example, 
            artifact_path='cv', 
            registered_model_name=REGISTRY_MODEL_NAME,
            await_registration_for=60,
            pip_requirements=pip_requirements)

Registered model 'catboost_optuna' already exists. Creating a new version of this model...
2024/09/15 20:29:13 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: catboost_optuna, version 5
Created version '5' of model 'catboost_optuna'.


In [51]:
run_id

'9c0823790129436e9422c91ef764ed26'

In [49]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id,run_id=run_id) as run:
    run_id = run.info.run_id
    
    # логируем метрики эксперимента
    #mlflow.log_metrics(metrics)
    #mlflow.log_params(best_params)
    # логируем модель
    #cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model( 
            cb_model=model_best, 
			signature=signature, 
            input_example=input_example, 
            artifact_path='cv', 
            registered_model_name=REGISTRY_MODEL_NAME,
            await_registration_for=60,
            pip_requirements=pip_requirements)
    
run_id

Exception: Run with UUID 3ae145633bfc47c7bd100ffe128bf8a1 is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [29]:
EXPERIMENT_NAME = 'Grid_search'
RUN_NAME = 'model_randomized_search' # ваш код здесь
REGISTRY_MODEL_NAME = 'linearregression_gscv'

from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'iterations': [1, 2, 3],
    'l2_leaf_reg': [1, 5, 10, 15, 20],
} 

model = CatBoostClassifier(
    loss_function = loss_function,
    task_type = task_type ,
    random_seed = random_seed,
    iterations = iterations,
    verbose = verbose
    )

cv = RandomizedSearchCV(
    estimator=model,
    cv=2,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    param_distributions=param_distributions
)

clf = cv.fit(X_train,y_train)
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

model_best = clf.best_estimator_

model_best.fit(X_train_scaled, y_train)

rediction = model_best.predict(X_test_scaled)
probas = model_best.predict_proba(X_test_scaled)[:, 1]

# расчёт метрик качества
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test,prediction,normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision =  precision_score(y_test,prediction)
recall = recall_score(y_test,prediction)
f1 = f1_score(y_test,prediction)
logloss = log_loss(y_test,prediction)

# сохранение метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# дополнительные метрики из результатов кросс-валидации
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean()
metrics['std_fit_time'] = cv_results['std_fit_time'].mean()
metrics["mean_test_score"] = cv_results['mean_test_score'].mean()
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics['best_score'] = clf.best_score_

# настройки для логирования в MLFlow
pip_requirements = './requirements.txt'
signature = mlflow.models.infer_signature(X_test_scaled, prediction)
input_example = X_test[:10]

metrics

  inputs = _infer_schema(model_input) if model_input is not None else None


{'err1': 0.36621717530163234,
 'err2': 0.45564229950319374,
 'auc': 0.7067686798507794,
 'precision': 0.5544041450777202,
 'recall': 0.963963963963964,
 'f1': 0.7039473684210527,
 'logloss': 13.813749347142128,
 'mean_fit_time': 0.11650856137275696,
 'std_fit_time': 0.002990823984146118,
 'mean_test_score': 0.7399982250621229,
 'std_test_score': 0.04674298899538518,
 'best_score': 0.7868299609513667}

In [30]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    # логируем метрики эксперимента
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    # логируем модель
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model( 
            cb_model=model_best, 
			signature=signature, 
            input_example=input_example, 
            artifact_path='models', 
            registered_model_name=REGISTRY_MODEL_NAME,
            await_registration_for=60,
            pip_requirements=pip_requirements)
    
run_id

Registered model 'linearregression_gscv' already exists. Creating a new version of this model...
2024/09/11 15:02:52 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: linearregression_gscv, version 2
Created version '2' of model 'linearregression_gscv'.


'5a2f1e7706394c3796f0edd7038aa1e3'