In [27]:
import os

import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)

TABLE_NAME = "clean_users_churn"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = 'churn_ujhmaster'
RUN_NAME = "autofeat" 
REGISTRY_MODEL_NAME = 'churn_model_ujhmaster_autofeat'

In [2]:
import psycopg2 as psycopg
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.environ.get("DB_DESTINATION_HOST"), 
    "port": os.environ.get("DB_DESTINATION_PORT"),
    "dbname": os.environ.get("DB_DESTINATION_NAME"),
    "user": os.environ.get("DB_DESTINATION_USER"),
    "password": os.environ.get("DB_DESTINATION_PASSWORD"),
}
assert all([var_value != "" for var_value in list(postgres_credentials.values())])

connection.update(postgres_credentials)
with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

In [3]:
df.columns

Index(['id', 'customer_id', 'begin_date', 'end_date', 'type',
       'paperless_billing', 'payment_method', 'monthly_charges',
       'total_charges', 'internet_service', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'gender', 'senior_citizen', 'partner', 'dependents', 'multiple_lines',
       'target'],
      dtype='object')

In [4]:
features = ['begin_date','type',
       'paperless_billing', 'payment_method', 'monthly_charges',
       'total_charges', 'internet_service', 'online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
       'gender', 'senior_citizen', 'partner', 'dependents', 'multiple_lines']
target = ['target']

In [6]:
from sklearn.model_selection import train_test_split

split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
) 

In [7]:
from autofeat import AutoFeatClassifier

In [12]:
cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
]
num_features = ["monthly_charges", "total_charges"]

features = cat_features + num_features

transformations = ["1/","log", "abs", "sqrt"]

afc = AutoFeatClassifier(
    categorical_cols = cat_features,
    #feateng_cols = num_features,
    feateng_steps=1,
    max_gb=1,
    transformations = transformations,
    n_jobs=-1
)

X_train_features = afc.fit_transform(X_train[features],y_train)
X_test_features = afc.transform(X_test[features])

  y = column_or_1d(y, warn=True)


In [14]:
X_train_features.columns

Index(['monthly_charges', 'total_charges', 'cat_paperless_billing_No',
       'cat_paperless_billing_Yes',
       'cat_payment_method_Bank transfer (automatic)',
       'cat_payment_method_Credit card (automatic)',
       'cat_payment_method_Electronic check',
       'cat_payment_method_Mailed check', 'cat_internet_service_DSL',
       'cat_internet_service_Fiber optic', 'cat_online_security_No',
       'cat_online_security_Yes', 'cat_online_backup_No',
       'cat_online_backup_Yes', 'cat_device_protection_No',
       'cat_device_protection_Yes', 'cat_tech_support_No',
       'cat_tech_support_Yes', 'cat_streaming_tv_No', 'cat_streaming_tv_Yes',
       'cat_streaming_movies_No', 'cat_streaming_movies_Yes',
       'cat_gender_Female', 'cat_gender_Male', 'cat_senior_citizen_0',
       'cat_senior_citizen_1', 'cat_partner_No', 'cat_partner_Yes',
       'cat_dependents_No', 'cat_dependents_Yes', 'cat_multiple_lines_No',
       'cat_multiple_lines_Yes'],
      dtype='object')

In [20]:
import mlflow

os.environ['MLFLOW_S3_ENDPOINT_URL']='https://storage.yandexcloud.net'
os.environ['AWS_BUCKET_NAME']=os.environ.get("S3_BUCKET_NAME")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")



In [21]:
artifact_path = "afc"
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path)


run_id

'11a58a00e1d44806865424aeda268343'

In [23]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(penalty='l2',C=0.2)
pipline = Pipeline([
            ('preprocessor', afc),
            ('model', model)
        ]
    )
pipline.fit(X_train[features], y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [24]:
prediction = pipline.predict(X_test[features])
probas = pipline.predict_proba(X_test[features])

In [26]:
from sklearn.metrics import f1_score,roc_auc_score,recall_score,confusion_matrix,log_loss,precision_score

# заведите словарь со всеми метриками
metrics = {}

# посчитайте метрики из модуля sklearn.metrics
# err_1 — ошибка первого рода
# err_2 — ошибка второго рода
_, err1, _, err2 = confusion_matrix(y_test,prediction,normalize='all').ravel()
auc = roc_auc_score(y_test,probas[:,1])
precision = precision_score(y_test,prediction)
recall = recall_score(y_test,prediction)
f1 = f1_score(y_test,prediction)
logloss = log_loss(y_test,prediction)

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

print(metrics)

{'err1': 0.11000709723207949, 'err2': 0.24414478353442157, 'auc': 0.7385548401699141, 'precision': 0.6893787575150301, 'recall': 0.5165165165165165, 'f1': 0.590557939914163, 'logloss': 12.202145256642215}


In [28]:
pip_requirements = "./requirements.txt"
metadata = {'model_type': 'monthly'}
input_example = X_test[features][:10]
signature = mlflow.models.infer_signature(X_test[features], prediction)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id
    
    # логируем метрики эксперимента
    mlflow.log_metrics(metrics)
    
    # логируем модель
    model_info = mlflow.sklearn.log_model( 
            sk_model=pipline, 
			pip_requirements=pip_requirements,
            signature=signature, 
            metadata=metadata,
            input_example=input_example, 
            artifact_path='models', 
            registered_model_name=REGISTRY_MODEL_NAME,
            await_registration_for=60
		)


experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
# получаем данные о запуске эксперимента по его уникальному идентификатору
run = mlflow.get_run(run_id)


# проверяем, что статус запуска эксперимента изменён на 'FINISHED'
# это утверждение (assert) можно использовать для автоматической проверки того, 
# что эксперимент был завершён успешно
assert 'FINISHED' == run.info.status

run_id

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'churn_model_ujhmaster_autofeat'.
2024/09/10 12:39:56 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_ujhmaster_autofeat, version 1
Created version '1' of model 'churn_model_ujhmaster_autofeat'.


'a3f323b2f779478fb14a5376a0906722'