In [6]:
import os

import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
    Binarizer
)

TABLE_NAME = "clean_users_churn"

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = 'churn_ujhmaster'
RUN_NAME = "preprocessing" 
REGISTRY_MODEL_NAME = 'churn_model_ujhmaster_b2c'

In [15]:
import psycopg2 as psycopg
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.environ.get("DB_DESTINATION_HOST"), 
    "port": os.environ.get("DB_DESTINATION_PORT"),
    "dbname": os.environ.get("DB_DESTINATION_NAME"),
    "user": os.environ.get("DB_DESTINATION_USER"),
    "password": os.environ.get("DB_DESTINATION_PASSWORD"),
}
assert all([var_value != "" for var_value in list(postgres_credentials.values())])

connection.update(postgres_credentials)
with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

In [8]:
df.dtypes.value_counts()

object            15
int64              3
datetime64[ns]     2
float64            2
Name: count, dtype: int64

In [16]:
obj_df = df.select_dtypes(include="object")
obj_df = obj_df.drop(['customer_id'],axis=1)
obj_df.head()

Unnamed: 0,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,partner,dependents,multiple_lines
0,One year,No,Credit card (automatic),Fiber optic,No,No,Yes,No,Yes,Yes,Male,Yes,No,Yes
1,Month-to-month,Yes,Bank transfer (automatic),Fiber optic,No,Yes,Yes,No,Yes,Yes,Male,No,No,Yes
2,Month-to-month,Yes,Electronic check,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Male,No,No,No
3,Two year,No,Credit card (automatic),Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Female,Yes,Yes,Yes
4,Two year,No,Bank transfer (automatic),Fiber optic,Yes,No,Yes,No,Yes,Yes,Male,No,Yes,Yes


In [19]:
binary_features = obj_df.nunique() == 2
binary_features.head(20)

type                 False
paperless_billing     True
payment_method       False
internet_service      True
online_security       True
online_backup         True
device_protection     True
tech_support          True
streaming_tv          True
streaming_movies      True
gender                True
partner               True
dependents            True
multiple_lines        True
dtype: bool

In [26]:
cat_columns = ["type", "payment_method", "internet_service", "gender"]
df[cat_columns].head()

Unnamed: 0,type,payment_method,internet_service,gender
0,One year,Credit card (automatic),Fiber optic,Male
1,Month-to-month,Bank transfer (automatic),Fiber optic,Male
2,Month-to-month,Electronic check,Fiber optic,Male
3,Two year,Credit card (automatic),Fiber optic,Female
4,Two year,Bank transfer (automatic),Fiber optic,Male


In [28]:
encoder_oh = OneHotEncoder(
    categories="auto",
    drop="first",
    sparse_output=False,
    max_categories=10,
    handle_unknown="ignore"
    )

# применение OneHotEncoder к данным. Преобразование категориальных данных в массив
encoded_features = encoder_oh.fit_transform(df[cat_columns].to_numpy())

# преобразование полученных признаков в DataFrame и установка названий колонок
# get_feature_names_out() - получение имён признаков после преобразования
encoded_df = pd.DataFrame(encoded_features,columns = encoder_oh.get_feature_names_out())

# конкатенация исходного DataFrame с новым DataFrame, содержащим закодированные категориальные признаки
# axis=1 означает конкатенацию по колонкам
obj_df = pd.concat([obj_df, encoded_df], axis=1)

obj_df.head(2)

Unnamed: 0,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,...,partner,dependents,multiple_lines,x0_One year,x0_Two year,x1_Credit card (automatic),x1_Electronic check,x1_Mailed check,x2_Fiber optic,x3_Male
0,One year,No,Credit card (automatic),Fiber optic,No,No,Yes,No,Yes,Yes,...,Yes,No,Yes,1.0,0.0,1.0,0.0,0.0,1.0,1.0
1,Month-to-month,Yes,Bank transfer (automatic),Fiber optic,No,Yes,Yes,No,Yes,Yes,...,No,No,Yes,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [81]:
num_columns = ["monthly_charges", "total_charges"]

n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None


# SplineTransformer
encoder_spl = SplineTransformer(n_knots=n_knots,degree=degree_spline)
encoded_features = encoder_spl.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_spl.get_feature_names_out(num_columns)
)
num_df = pd.concat([num_df, encoded_df], axis=1)

# QuantileTransformer
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoded_features = encoder_q.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_q.get_feature_names_out(num_columns))
encoded_df.columns = [col + f"_q_{n_quantiles}" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

# RobustScaler
encoder_rb = RobustScaler()
encoded_features = encoder_rb.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_rb.get_feature_names_out(num_columns))
encoded_df.columns = [col + f"_robust" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

# PolynomialFeatures
encoder_pol = PolynomialFeatures(degree=degree)
encoded_features = encoder_pol.fit_transform(df[num_columns].to_numpy())

cols = [x for x in range(1 + len(num_columns))]
encoded_df = pd.DataFrame(encoded_features, columns=encoder_pol.get_feature_names_out(num_columns)).drop(num_columns,axis=1)
#encoded_df.columns = [encoded_df.columns[1 + len(num_columns):]]
num_df = pd.concat([num_df, encoded_df], axis=1)

# KBinsDiscretizer
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample)
encoded_features = encoder_kbd.fit_transform(df[num_columns].to_numpy())

encoded_df = pd.DataFrame(encoded_features, columns=encoder_kbd.get_feature_names_out(num_columns))
encoded_df.columns = [col + f"_bin" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)


num_df.head(2)

Unnamed: 0,monthly_charges,total_charges,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,...,1,monthly_charges^2,monthly_charges total_charges,total_charges^2,monthly_charges^3,monthly_charges^2 total_charges,monthly_charges total_charges^2,total_charges^3,monthly_charges_bin,total_charges_bin
0,100.35,5681.1,0.0,0.000749,0.141401,0.587844,0.263281,0.006725,0.0,0.009622,...,1.0,10070.1225,570098.385,32274897.21,1010537.0,57209370.0,3238786000.0,183356900000.0,4.0,3.0
1,103.7,5036.3,0.0,0.000335,0.117146,0.574238,0.298249,0.010033,0.0,0.020946,...,1.0,10753.69,522264.31,25364317.69,1115158.0,54158810.0,2630280000.0,127742300000.0,4.0,2.0


In [82]:
num_columns = ["monthly_charges", "total_charges"]

n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None

encoder_spl = SplineTransformer(n_knots=n_knots,degree=degree_spline)
encoder_q   = QuantileTransformer(n_quantiles=n_quantiles)
encoder_rb  = RobustScaler()
encoder_pol = PolynomialFeatures(degree=degree)
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample)

numeric_transformer = ColumnTransformer(transformers=[
    ('spl', encoder_spl, num_columns), 
    ('q', encoder_q, num_columns), 
    ('rb', encoder_rb, num_columns), 
    ('pol', encoder_pol, num_columns), 
    ('kbd', encoder_kbd, num_columns)
])

cat_columns = ["type", "payment_method", "internet_service", "gender"]

encoder_oh = OneHotEncoder(
    categories="auto",
    drop="first",
    sparse_output=False,
    max_categories=10,
    handle_unknown="ignore"
    )

categorical_transformer = Pipeline(steps=[('encoder', encoder_oh)])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_columns), 
    ('cat', categorical_transformer, cat_columns)
], 
n_jobs=-1)

encoded_features = preprocessor.fit_transform(df)

transformed_df = pd.DataFrame(encoded_features, columns=preprocessor.get_feature_names_out())

df =  pd.concat([df, transformed_df], axis=1)
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,num__pol__total_charges^3,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__gender_Male
0,13,8091-TTVAX,2015-04-01,NaT,One year,No,Credit card (automatic),100.35,5681.1,Fiber optic,...,183356900000.0,4.0,3.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
1,14,0280-XJGEX,2015-09-01,2019-10-01,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Fiber optic,...,127742300000.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [83]:
preprocessor

In [84]:
os.environ['MLFLOW_S3_ENDPOINT_URL']='https://storage.yandexcloud.net'
os.environ['AWS_BUCKET_NAME']=os.environ.get("S3_BUCKET_NAME")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.sklearn.log_model(preprocessor, "column_transformer") 



In [85]:
df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,num__pol__total_charges^3,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__gender_Male
0,13,8091-TTVAX,2015-04-01,NaT,One year,No,Credit card (automatic),100.35,5681.1,Fiber optic,...,183356900000.0,4.0,3.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
1,14,0280-XJGEX,2015-09-01,2019-10-01,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Fiber optic,...,127742300000.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,15,5129-JLPIS,2018-01-01,NaT,Month-to-month,Yes,Electronic check,105.5,2686.05,Fiber optic,...,19379490000.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,17,3655-SNQYZ,2014-05-01,NaT,Two year,No,Credit card (automatic),113.25,7895.15,Fiber optic,...,492131500000.0,4.0,4.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,19,9959-WOFKT,2014-03-01,NaT,Two year,No,Bank transfer (automatic),106.7,7382.25,Fiber optic,...,402315000000.0,4.0,4.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [86]:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(
    df.drop(['id','customer_id','begin_date','end_date','target'],axis=1),
    df['target'],
    stratify=df['target'])

In [87]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty='l2',C=0.2)
pipline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ]
    )
pipline.fit(X_tr, y_tr)

In [88]:
prediction = pipline.predict(X_val)
probas = pipline.predict_proba(X_val)

In [89]:
from sklearn.metrics import f1_score,roc_auc_score,recall_score,confusion_matrix,log_loss,precision_score

# заведите словарь со всеми метриками
metrics = {}

# посчитайте метрики из модуля sklearn.metrics
# err_1 — ошибка первого рода
# err_2 — ошибка второго рода
_, err1, _, err2 = confusion_matrix(y_val,prediction,normalize='all').ravel()
auc = roc_auc_score(y_val,probas[:,1])
precision = precision_score(y_val,prediction)
recall = recall_score(y_val,prediction)
f1 = f1_score(y_val,prediction)
logloss = log_loss(y_val,prediction)

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

print(metrics)

{'err1': 0.10448608745031232, 'err2': 0.13798977853492334, 'auc': 0.734908108251227, 'precision': 0.5690866510538641, 'recall': 0.5203426124197003, 'f1': 0.5436241610738256, 'logloss': 8.350829405315046}


In [90]:
class CustomMlflowModel(mlflow.pyfunc.PythonModel):

    def __init__(self, model):
        super().__init__()
        self._model = model 
    
    def predict(self, context, model_input):
        probas = self._model.predict(model_input)
        
        return probas

custom_model = CustomMlflowModel(pipline)

In [91]:
RUN_NAME = "addfeatures_model_pipeline"
DIR_PATH = 'dataframe'
REGISTRY_MODEL_NAME = "churn_model_ujhmaster_addfeatures"

pip_requirements = "./requirements.txt"
metadata = {'model_type': 'monthly'}
input_example = X_val[:10]
signature = mlflow.models.infer_signature(X_val, prediction)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id
    
    # логируем метрики эксперимента
    mlflow.log_metrics(metrics)
    
    # логируем модель
    model_info = mlflow.sklearn.log_model( 
            sk_model=pipline, 
			pip_requirements=pip_requirements,
            signature=signature, 
            metadata=metadata,
            input_example=input_example, 
            artifact_path='models', 
            registered_model_name=REGISTRY_MODEL_NAME,
            await_registration_for=60
		)


experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
# получаем данные о запуске эксперимента по его уникальному идентификатору
run = mlflow.get_run(run_id)


# проверяем, что статус запуска эксперимента изменён на 'FINISHED'
# это утверждение (assert) можно использовать для автоматической проверки того, 
# что эксперимент был завершён успешно
assert 'FINISHED' == run.info.status

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'churn_model_ujhmaster_addfeatures'.
2024/09/09 21:26:01 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_ujhmaster_addfeatures, version 1
Created version '1' of model 'churn_model_ujhmaster_addfeatures'.


In [92]:
run

<Run: data=<RunData: metrics={'auc': 0.734908108251227,
 'err1': 0.10448608745031232,
 'err2': 0.13798977853492334,
 'f1': 0.5436241610738256,
 'logloss': 8.350829405315046,
 'precision': 0.5690866510538641,
 'recall': 0.5203426124197003}, params={}, tags={'mlflow.log-model.history': '[{"run_id": "b3c4811f95a84358aced422597bfa438", '
                             '"artifact_path": "models", "utc_time_created": '
                             '"2024-09-09 21:26:01.337190", "flavors": '
                             '{"python_function": {"model_path": "model.pkl", '
                             '"predict_fn": "predict", "loader_module": '
                             '"mlflow.sklearn", "python_version": "3.10.12", '
                             '"env": {"conda": "conda.yaml", "virtualenv": '
                             '"python_env.yaml"}}, "sklearn": '
                             '{"pickled_model": "model.pkl", '
                             '"sklearn_version": "1.3.1", '
               