In [4]:
from dotenv import load_dotenv
import os
import psycopg
import pandas as pd
import mlflow

load_dotenv()

TABLE_NAME = "users_churn"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = "5000"

EXPERIMENT_NAME = 'search_params_experiment'
RUN_NAME = 'model_grid_search'
REGISTRY_MODEL_NAME = 'churn_model_nikolaistepanov'

In [5]:
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(5)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,No,Mailed check,20.65,1022.95,,...,,,,,Female,0,No,No,No,0
1,21,8779-QRDMV,2019-11-01,2019-12-01,Month-to-month,Yes,Electronic check,39.65,39.65,DSL,...,Yes,No,No,Yes,Male,1,No,No,,1
2,22,1680-VDCWW,2019-02-01,NaT,One year,No,Bank transfer (automatic),19.8,202.25,,...,,,,,Male,0,Yes,No,No,0
3,23,1066-JKSGK,2019-11-01,2019-12-01,Month-to-month,No,Mailed check,20.15,20.15,,...,,,,,Male,0,No,No,No,1
4,24,3638-WEABW,2015-04-01,NaT,Two year,Yes,Credit card (automatic),59.9,3505.1,DSL,...,No,Yes,No,No,Female,0,Yes,No,Yes,0


In [6]:
from sklearn.model_selection import train_test_split

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "begin_date"
stratify_column = "target"
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=test_size, shuffle=False)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


In [7]:
from catboost import CatBoostClassifier

loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

params = {
    'depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'iterations': [1, 2, 3],
    'l2_leaf_reg': [1, 5, 10, 15, 20],
} 

model = CatBoostClassifier(
    iterations=iterations,
    random_seed=random_seed,
    loss_function=loss_function,
    task_type=task_type,
    verbose=verbose
)

In [9]:
from sklearn.model_selection import GridSearchCV

cv = GridSearchCV(estimator=model, param_grid=params, cv=2, n_jobs=-1)

clf = cv.fit(X_train, y_train)
cv_results = pd.DataFrame(clf.cv_results_)

In [10]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [13]:
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, log_loss

best_params = clf.best_params_

model_best = CatBoostClassifier(
    random_seed=random_seed,
    loss_function=loss_function,
    task_type=task_type,
    verbose=verbose,
    **best_params
)

model_best.fit(X_train, y_train)

prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]

# расчёт метрик качества
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# сохранение метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# дополнительные метрики из результатов кросс-валидации
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean() # среднее время обучения
metrics['std_fit_time'] = cv_results['std_fit_time'].mean() # стандартное отклонение времени обучения
metrics['mean_test_score'] = cv_results['mean_test_score'].mean() # средний результат на тесте
metrics['std_test_score'] = cv_results['std_test_score'].mean() # стандартное отклонение результата на тесте
metrics['best_score'] = clf.best_score_ # лучший результат кросс-валидации

In [14]:
metrics

{'err1': 0.0773598296664301,
 'err2': 0.17317246273953157,
 'auc': 0.6400870601543247,
 'precision': 0.6912181303116147,
 'recall': 0.36472346786248133,
 'f1': 0.4774951076320939,
 'logloss': 13.660263243284994,
 'mean_fit_time': 0.043903219964769154,
 'std_fit_time': 0.007196700308057997,
 'mean_test_score': 0.7591006981422317,
 'std_test_score': 0.028866011911805302,
 'best_score': 0.7878949236776713}

In [21]:
# настройки для логирования в MLFlow
pip_requirements = './requirements.txt' # файл с зависимостями
signature = mlflow.models.infer_signature(X_test, prediction) # сигнатура модели
input_example = X_test[:10]

if mlflow.get_experiment_by_name(EXPERIMENT_NAME):
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
else:
    experiment_id = mlflow.create_experiment(name=EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    mlflow.log_metrics(metrics)
    
    mlflow.log_params(best_params)
    
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')

    model_info = mlflow.catboost.log_model( 
        cb_model=model_best,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        await_registration_for=60,
        pip_requirements=pip_requirements
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_nikolaistepanov' already exists. Creating a new version of this model...
2024/05/28 21:00:41 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_nikolaistepanov, version 8
Created version '8' of model 'churn_model_nikolaistepanov'.


In [23]:
run_id

'1892d8c81cbc455b82326cdecf40dfa6'

In [30]:
from sklearn.model_selection import RandomizedSearchCV

RUN_NAME = 'model_random_search'

params = {
    'depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'iterations': [1, 2, 3],
    'l2_leaf_reg': [1, 5, 10, 15, 20],
}

cv = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=20, cv=2, random_state=random_seed, n_jobs=-1)

clf = cv.fit(X_train, y_train)

cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

model_best = CatBoostClassifier(
    random_seed=random_seed,
    loss_function=loss_function,
    task_type=task_type,
    verbose=verbose,
    **best_params
)

model_best.fit(X_train, y_train)

model = model_best

prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]

# расчёт метрик качества
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# сохранение метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# дополнительные метрики из результатов кросс-валидации
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean() # среднее время обучения
metrics['std_fit_time'] = cv_results['std_fit_time'].mean() # стандартное отклонение времени обучения
metrics['mean_test_score'] = cv_results['mean_test_score'].mean() # средний результат на тесте
metrics['std_test_score'] = cv_results['std_test_score'].mean() # стандартное отклонение результата на тесте
metrics['best_score'] = clf.best_score_ # лучший результат кросс-валидации

# настройки для логирования в MLFlow
pip_requirements = './requirements.txt' # файл с зависимостями
signature = mlflow.models.infer_signature(X_test, prediction) # сигнатура модели
input_example = X_test[:10]

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    mlflow.log_metrics(metrics)
    
    mlflow.log_params(best_params)
    
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')

    model_info = mlflow.catboost.log_model( 
        cb_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        await_registration_for=60,
        pip_requirements=pip_requirements
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_nikolaistepanov' already exists. Creating a new version of this model...
2024/05/28 21:28:49 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_nikolaistepanov, version 9
Created version '9' of model 'churn_model_nikolaistepanov'.


In [31]:
run_id

'9d972ef4cb9746419ef95284d4a95f74'