In [1]:
import os
import psycopg
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

TABLE_NAME = "users_churn"

# Credentials of the destination database are read from the environment
# (the .env file is loaded by load_dotenv() in the import section).
postgres_credentials = {
    option: os.getenv(env_name)
    for option, env_name in (
        ("host", "DB_DESTINATION_HOST"),
        ("port", "DB_DESTINATION_PORT"),
        ("dbname", "DB_DESTINATION_NAME"),
        ("user", "DB_DESTINATION_USER"),
        ("password", "DB_DESTINATION_PASSWORD"),
    )
}

# Session options come first, then the credentials.
# An "sslmode": "verify-full" option is intentionally left disabled here.
connection = {"target_session_attrs": "read-write", **postgres_credentials}

# Read the whole table; both context managers are released on exit.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    # The first item of every cursor description entry is the column name.
    columns = [description[0] for description in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,No,Mailed check,20.65,1022.95,,...,,,,,Female,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,...,,,,,Female,0,Yes,Yes,Yes,0


## Задача №1

In [7]:
import os
import mlflow
from numpy import linspace
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, confusion_matrix, roc_auc_score

In [8]:
# Grid-search experiment: tune a CatBoost classifier with GridSearchCV,
# refit it with the best hyperparameters and collect the quality metrics
# that the following cell logs to MLflow.

# --- configuration --------------------------------------------------------
TABLE_NAME = "users_churn"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "churn_fio"
RUN_NAME = "model_grid_search"
REGISTRY_MODEL_NAME = "churn_model_grid_search"

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "begin_date"
#stratify_column = ["type"]
test_size = 0.2

# --- train / test split ---------------------------------------------------
# Rows are ordered by `split_column` and split without shuffling, so the
# test part is the last `test_size` share of the ordered rows.
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

# --- hyperparameter search ------------------------------------------------
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

# Every combination of these values is evaluated (5 * 2 * 5 * 5 candidates).
params = {
    "learning_rate": linspace(start=0.001, stop=0.1, num=5),
    "depth": [2, 5],
    "l2_leaf_reg": linspace(start=0.1, stop=5, num=5),
    "random_strength": linspace(start=0.1, stop=5, num=5),
}

model = CatBoostClassifier(
    iterations=iterations,
    loss_function=loss_function,
    random_seed=random_seed,
    task_type=task_type,
    verbose=verbose,
)

cv = GridSearchCV(
    estimator=model,
    param_grid=params,
    n_jobs=-1,
    cv=2,
)

clf = cv.fit(X_train, y_train)

# Uncomment when artifacts are stored in S3-compatible object storage:
#os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
#os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("S3_ACCESS_KEY")
#os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("S3_SECRET_KEY")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

# --- final model ----------------------------------------------------------
# Train a fresh classifier on the training part with the best parameters.
model = CatBoostClassifier(
    iterations=iterations,
    loss_function=loss_function,
    random_seed=random_seed,
    task_type=task_type,
    verbose=verbose,
    **best_params,
)

model.fit(X_train, y_train)

prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]  # probability of the positive class

# --- quality metrics ------------------------------------------------------
metrics = {}

# For a binary problem ravel() yields (tn, fp, fn, tp); with normalize='all'
# each value is a share of all test rows.
# Type I error = false positives, type II error = false negatives.
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)  # area under the ROC curve
# NOTE: scikit-learn emits a warning and returns 0 for precision/F1 when the
# model predicts no positive samples at all.
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# Log loss must be computed on predicted probabilities, not on hard labels.
logloss = log_loss(y_test, probas)

# Store the test-set metrics.
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# Cross-validation statistics, averaged over all evaluated candidates.
metrics["mean_fit_time"] = cv_results["mean_fit_time"].mean()  # mean fit time
metrics["std_fit_time"] = cv_results["std_fit_time"].mean()  # mean std of fit time
metrics["mean_test_score"] = cv_results["mean_test_score"].mean()  # mean validation score
metrics["std_test_score"] = cv_results["std_test_score"].mean()  # mean std of validation score
metrics["best_score"] = clf.best_score_  # best cross-validation score

  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# MLflow logging settings
pip_requirements = "./requirements.txt"  # file with the model dependencies
signature = mlflow.models.infer_signature(X_test, prediction)  # model signature
input_example = X_test[:10]  # first ten test rows as an input example

# get_experiment_by_name() returns None for an unknown experiment, so create
# the experiment on first use instead of failing with AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    # Keep the fitted search object next to the final model.
    cv_info = mlflow.sklearn.log_model(cv, artifact_path="cv")
    # Log the final model and register it in the model registry.
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        pip_requirements=pip_requirements,
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'churn_model_grid_search'.
2024/04/06 14:19:25 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_grid_search, version 1
Created version '1' of model 'churn_model_grid_search'.


In [22]:
# Show the id of the MLflow run created by the logging cell.
run_id

'ad2700c020bc459380e764899bb99d1b'

## Задача №2

In [24]:
# Random-search experiment: tune a CatBoost classifier with
# RandomizedSearchCV, refit it with the best hyperparameters and collect the
# quality metrics that the following cell logs to MLflow.

# --- configuration --------------------------------------------------------
TABLE_NAME = "users_churn"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "churn_fio"
RUN_NAME = "model_random_search"
REGISTRY_MODEL_NAME = "churn_model_random_search"

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = "begin_date"
#stratify_column = ["type"]
test_size = 0.2

# --- train / test split ---------------------------------------------------
# Rows are ordered by `split_column` and split without shuffling, so the
# test part is the last `test_size` share of the ordered rows.
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

# --- hyperparameter search ------------------------------------------------
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

# Candidate values; n_iter combinations are sampled from them.
params = {
    "learning_rate": linspace(start=0.001, stop=0.1, num=5),
    "depth": [2, 5],
    "l2_leaf_reg": linspace(start=0.1, stop=5, num=5),
    "random_strength": linspace(start=0.1, stop=5, num=5),
}

model = CatBoostClassifier(
    iterations=iterations,
    loss_function=loss_function,
    random_seed=random_seed,
    task_type=task_type,
    verbose=verbose,
)

cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_jobs=-1,
    cv=2,
    n_iter=20,
    # Fix the sampling seed so the same candidates are drawn on every run.
    random_state=random_seed,
)

clf = cv.fit(X_train, y_train)

# Uncomment when artifacts are stored in S3-compatible object storage:
#os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
#os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("S3_ACCESS_KEY")
#os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("S3_SECRET_KEY")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

# --- final model ----------------------------------------------------------
# Train a fresh classifier on the training part with the best parameters.
model = CatBoostClassifier(
    iterations=iterations,
    loss_function=loss_function,
    random_seed=random_seed,
    task_type=task_type,
    verbose=verbose,
    **best_params,
)

model.fit(X_train, y_train)

prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]  # probability of the positive class

# --- quality metrics ------------------------------------------------------
metrics = {}

# For a binary problem ravel() yields (tn, fp, fn, tp); with normalize='all'
# each value is a share of all test rows.
# Type I error = false positives, type II error = false negatives.
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)  # area under the ROC curve
# NOTE: scikit-learn emits a warning and returns 0 for precision/F1 when the
# model predicts no positive samples at all.
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# Log loss must be computed on predicted probabilities, not on hard labels.
logloss = log_loss(y_test, probas)

# Store the test-set metrics.
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# Cross-validation statistics, averaged over all evaluated candidates.
metrics["mean_fit_time"] = cv_results["mean_fit_time"].mean()  # mean fit time
metrics["std_fit_time"] = cv_results["std_fit_time"].mean()  # mean std of fit time
metrics["mean_test_score"] = cv_results["mean_test_score"].mean()  # mean validation score
metrics["std_test_score"] = cv_results["std_test_score"].mean()  # mean std of validation score
metrics["best_score"] = clf.best_score_  # best cross-validation score

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# MLflow logging settings
pip_requirements = "./requirements.txt"  # file with the model dependencies
signature = mlflow.models.infer_signature(X_test, prediction)  # model signature
input_example = X_test[:10]  # first ten test rows as an input example

# get_experiment_by_name() returns None for an unknown experiment, so create
# the experiment on first use instead of failing with AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    # Keep the fitted search object next to the final model.
    cv_info = mlflow.sklearn.log_model(cv, artifact_path="cv")
    # Log the final model and register it in the model registry.
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        pip_requirements=pip_requirements,
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'churn_model_random_search'.
2024/04/06 14:20:46 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_random_search, version 1
Created version '1' of model 'churn_model_random_search'.


In [26]:
# Show the id of the MLflow run created by the logging cell.
run_id

'f6fcd8feb4d04eb691f31c65ab9960e8'