In [1]:
import os

import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)

In [3]:
import numpy as np
from dotenv import load_dotenv
import psycopg2 as psycopg
load_dotenv()

connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.environ.get("DB_DESTINATION_HOST"), 
    "port": os.environ.get("DB_DESTINATION_PORT"),
    "dbname": os.environ.get("DB_DESTINATION_NAME"),
    "user": os.environ.get("DB_DESTINATION_USER"),
    "password": os.environ.get("DB_DESTINATION_PASSWORD"),
}
assert all([var_value != "" for var_value in list(postgres_credentials.values())])

connection.update(postgres_credentials)

# определим название таблицы, в которой хранятся наши данные.
TABLE_NAME = "clean_users_churn"

# эта конструкция создаёт контекстное управление для соединения с базой данных 
# оператор with гарантирует, что соединение будет корректно закрыто после выполнения всех операций 
# закрыто оно будет даже в случае ошибки, чтобы не допустить "утечку памяти"
with psycopg.connect(**connection) as conn:

    # создаёт объект курсора для выполнения запросов к базе данных
    # с помощью метода execute() выполняется SQL-запрос для выборки данных из таблицы TABLE_NAME
    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
                
        # извлекаем все строки, полученные в результате выполнения запроса
        data = cur.fetchall()

        # получает список имён столбцов из объекта курсора
        columns = [col[0] for col in cur.description]

# создаёт объект DataFrame из полученных данных и имён столбцов. 
# это позволяет удобно работать с данными в Python, используя библиотеку Pandas.
df = pd.DataFrame(data, columns=columns)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score,roc_auc_score,recall_score,confusion_matrix,log_loss,precision_score
from sklearn.model_selection import train_test_split

In [24]:
os.environ['MLFLOW_S3_ENDPOINT_URL']='https://storage.yandexcloud.net'
os.environ['AWS_BUCKET_NAME']=os.environ.get("S3_BUCKET_NAME")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [8]:
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = 'begin_date'
stratify_column = target
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features], 
    df[target], 
    test_size=test_size, 
    shuffle=False)

In [9]:
cat_features = ["senior_citizen"]
num_features = ["monthly_charges", "total_charges"]

In [12]:
from autofeat import AutoFeatClassifier

transformations = ["1/","log", "abs", "sqrt","^2"]

afc = AutoFeatClassifier(
    categorical_cols = cat_features,
    feateng_cols = num_features,
    feateng_steps=2,
    max_gb=1,
    transformations = transformations,
    n_jobs=-1
)

X_train_features = afc.fit_transform(X_train,y_train)
X_test_features = afc.transform(X_test)

In [13]:
X_train_features

Unnamed: 0,monthly_charges,total_charges,cat_senior_citizen_0.0,cat_senior_citizen_1.0,monthly_charges**2/total_charges
0,117.80,8684.80,1.0,0.0,1.597831
1,104.15,7689.95,0.0,1.0,1.410571
2,92.45,6440.25,0.0,1.0,1.327123
3,108.05,7532.15,1.0,0.0,1.549996
4,108.60,7690.90,1.0,0.0,1.533495
...,...,...,...,...,...
5629,75.15,525.00,1.0,0.0,10.757186
5630,76.05,231.80,0.0,1.0,24.950830
5631,69.05,318.50,0.0,1.0,14.969867
5632,25.90,135.00,1.0,0.0,4.968963


In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

X_train_scaled

array([[ 1.64692241,  2.63056585,  0.44835615, -0.44835615, -0.36964437],
       [ 1.19808034,  2.18456065, -2.23036977,  2.23036977, -0.43093812],
       [ 0.81335857,  1.62430261, -2.23036977,  2.23036977, -0.45825248],
       ...,
       [ 0.04391502, -1.12016374, -2.23036977,  2.23036977,  4.0072904 ],
       [-1.37495204, -1.20242936,  0.44835615, -0.44835615,  0.73379464],
       [-0.72224031, -1.10864207, -2.23036977,  2.23036977,  1.09776977]])

In [17]:
print(f"Размер выборки для обучения: {X_train_scaled.shape}")
print(f"Размер выборки для теста: {X_test_scaled.shape}")

Размер выборки для обучения: (5634, 5)
Размер выборки для теста: (1409, 5)


In [18]:
from sklearn.model_selection import GridSearchCV

In [20]:
EXPERIMENT_NAME = 'Grid_search'
RUN_NAME = 'model_grid_search' # ваш код здесь
REGISTRY_MODEL_NAME = 'linearregression_gscv'

loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

params = {
    'depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'iterations': [1, 2, 3],
    'l2_leaf_reg': [1, 5, 10, 15, 20],
} 

model = CatBoostClassifier(
    loss_function = loss_function,
    task_type = task_type ,
    random_seed = random_seed,
    iterations = iterations,
    verbose = verbose
    )

cv = GridSearchCV(
    estimator=model,
    cv=2,
    scoring='accuracy',
    n_jobs=-1,
    param_grid=params
)

clf = cv.fit(X_train_scaled,y_train)

cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

model_best = clf.best_estimator_

model_best.fit(X_train_scaled, y_train)

<catboost.core.CatBoostClassifier at 0x7fca3275cd30>

In [27]:


prediction = model_best.predict(X_test_scaled)
probas = model_best.predict_proba(X_test_scaled)[:, 1]

# расчёт метрик качества
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test,prediction,normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision =  precision_score(y_test,prediction)
recall = recall_score(y_test,prediction)
f1 = f1_score(y_test,prediction)
logloss = log_loss(y_test,prediction)

# сохранение метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# дополнительные метрики из результатов кросс-валидации
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean()
metrics['std_fit_time'] = cv_results['std_fit_time'].mean()
metrics["mean_test_score"] = cv_results['mean_test_score'].mean()
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics['best_score'] = clf.best_score_

# настройки для логирования в MLFlow
pip_requirements = './requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

metrics

  inputs = _infer_schema(model_input) if model_input is not None else None


{'err1': 0.36621717530163234,
 'err2': 0.45564229950319374,
 'auc': 0.7028098084625676,
 'precision': 0.5544041450777202,
 'recall': 0.963963963963964,
 'f1': 0.7039473684210527,
 'logloss': 13.813749347142128,
 'mean_fit_time': 0.0642505168914795,
 'std_fit_time': 0.015160491731431748,
 'mean_test_score': 0.6605190707214138,
 'std_test_score': 0.12627381374985208,
 'best_score': 0.6879659211927582}

In [28]:
#experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    # логируем метрики эксперимента
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    # логируем модель
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model( 
            cb_model=model_best, 
			signature=signature, 
            input_example=input_example, 
            artifact_path='models', 
            registered_model_name=REGISTRY_MODEL_NAME,
            await_registration_for=60,
            pip_requirements=pip_requirements)
    
run_id

Successfully registered model 'linearregression_gscv'.
2024/09/11 14:55:08 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: linearregression_gscv, version 1
Created version '1' of model 'linearregression_gscv'.


'2c685ded47ad4f17820d3ea045e2c045'

In [29]:
EXPERIMENT_NAME = 'Grid_search'
RUN_NAME = 'model_randomized_search' # ваш код здесь
REGISTRY_MODEL_NAME = 'linearregression_gscv'

from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'iterations': [1, 2, 3],
    'l2_leaf_reg': [1, 5, 10, 15, 20],
} 

model = CatBoostClassifier(
    loss_function = loss_function,
    task_type = task_type ,
    random_seed = random_seed,
    iterations = iterations,
    verbose = verbose
    )

cv = RandomizedSearchCV(
    estimator=model,
    cv=2,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    param_distributions=param_distributions
)

clf = cv.fit(X_train,y_train)
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

model_best = clf.best_estimator_

model_best.fit(X_train_scaled, y_train)

rediction = model_best.predict(X_test_scaled)
probas = model_best.predict_proba(X_test_scaled)[:, 1]

# расчёт метрик качества
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test,prediction,normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision =  precision_score(y_test,prediction)
recall = recall_score(y_test,prediction)
f1 = f1_score(y_test,prediction)
logloss = log_loss(y_test,prediction)

# сохранение метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# дополнительные метрики из результатов кросс-валидации
metrics['mean_fit_time'] = cv_results['mean_fit_time'].mean()
metrics['std_fit_time'] = cv_results['std_fit_time'].mean()
metrics["mean_test_score"] = cv_results['mean_test_score'].mean()
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics['best_score'] = clf.best_score_

# настройки для логирования в MLFlow
pip_requirements = './requirements.txt'
signature = mlflow.models.infer_signature(X_test_scaled, prediction)
input_example = X_test[:10]

metrics

  inputs = _infer_schema(model_input) if model_input is not None else None


{'err1': 0.36621717530163234,
 'err2': 0.45564229950319374,
 'auc': 0.7067686798507794,
 'precision': 0.5544041450777202,
 'recall': 0.963963963963964,
 'f1': 0.7039473684210527,
 'logloss': 13.813749347142128,
 'mean_fit_time': 0.11650856137275696,
 'std_fit_time': 0.002990823984146118,
 'mean_test_score': 0.7399982250621229,
 'std_test_score': 0.04674298899538518,
 'best_score': 0.7868299609513667}

In [30]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    # логируем метрики эксперимента
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    # логируем модель
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model( 
            cb_model=model_best, 
			signature=signature, 
            input_example=input_example, 
            artifact_path='models', 
            registered_model_name=REGISTRY_MODEL_NAME,
            await_registration_for=60,
            pip_requirements=pip_requirements)
    
run_id

Registered model 'linearregression_gscv' already exists. Creating a new version of this model...
2024/09/11 15:02:52 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: linearregression_gscv, version 2
Created version '2' of model 'linearregression_gscv'.


'5a2f1e7706394c3796f0edd7038aa1e3'