In [14]:
import os
import mlflow
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)

# --- Source data -----------------------------------------------------------
# Table that the loading cell reads with a SELECT *.
TABLE_NAME = "users_churn"

# --- MLflow tracking server --------------------------------------------------
# Host and port are combined into the tracking/registry URI further down.
TRACKING_SERVER_PORT = 5000
TRACKING_SERVER_HOST = "127.0.0.1"

# --- MLflow naming -----------------------------------------------------------
# Experiment, run and registered-model names used by the logging cells.
REGISTRY_MODEL_NAME = "churn_model_fio"
EXPERIMENT_NAME = "churn_fio"
RUN_NAME = "preprocessing"

In [15]:
import psycopg
from dotenv import load_dotenv

# Populate the process environment before the DB_DESTINATION_* lookups below.
load_dotenv()

# Credentials come exclusively from environment variables; nothing is hardcoded.
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

# Session attributes first, credentials layered on top.
# NOTE: "sslmode": "verify-full" is not passed here.
connection = {"target_session_attrs": "read-write", **postgres_credentials}

# Both context managers are released when the block exits (cursor first, then connection).
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    # The first item of each cursor description entry is the column name.
    columns = [col[0] for col in cur.description]

# Materialise the fetched rows as a DataFrame with the database column names.
df = pd.DataFrame(data, columns=columns)

df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,No,Mailed check,20.65,1022.95,,...,,,,,Female,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,...,,,,,Female,0,Yes,Yes,Yes,0


## Задача №1

In [37]:
# Categorical columns that will be one-hot encoded.
cat_columns = ["type", "payment_method", "internet_service", "gender"]

# One-hot encoder configuration:
#   categories="auto"       - categories are determined automatically from the data
#   drop="first"            - drop the first category of each feature to avoid
#                             the multicollinearity trap
#   handle_unknown="ignore" - do not raise when an unknown category is encountered
#   max_categories=10       - upper limit on the number of categories per feature
#   sparse_output=False     - return a regular dense array instead of a sparse matrix
encoder_oh = OneHotEncoder(
    categories="auto",
    drop="first",
    handle_unknown="ignore",
    max_categories=10,
    sparse_output=False,
)

# Fit the encoder on the categorical columns and encode them in one step.
encoded_features = encoder_oh.fit_transform(df[cat_columns])

# Wrap the encoded array in a DataFrame; get_feature_names_out() supplies the
# names of the generated columns.
encoded_df = pd.DataFrame(
    data=encoded_features,
    columns=encoder_oh.get_feature_names_out(),
)

# Only the encoded features are kept; they are not joined back to the source frame.
obj_df = encoded_df

obj_df.head(2)

Unnamed: 0,type_One year,type_Two year,payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,internet_service_Fiber optic,internet_service_None,gender_Male
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


## Задача №2

In [18]:
# Numeric columns that every transformer in this cell is fitted on.
num_columns = ["monthly_charges", "total_charges"]

# Keep only rows where both numeric columns are present.
df = df.dropna(subset=num_columns)

# Hyperparameters of the transformers below. Each transformer reads its value
# from here, so the settings and the generated column names cannot drift apart.
n_knots = 3
degree_spline = 4
n_quantiles = 100
degree = 3
n_bins = 5
encode = "ordinal"
strategy = "uniform"
subsample = None


def _fit_to_frame(encoder, suffix=None):
    """Fit ``encoder`` on ``df[num_columns]`` and return the result as a DataFrame.

    Parameters
    ----------
    encoder
        Transformer exposing ``fit_transform`` and ``get_feature_names_out``.
    suffix : str, optional
        When given, output columns are named ``<input column><suffix>``; this is
        only valid for transformers that emit exactly one column per input
        column. When omitted, the encoder's own feature names are used.

    Returns
    -------
    pandas.DataFrame
        Transformed values with a fresh default index.
    """
    values = encoder.fit_transform(df[num_columns])
    if suffix is None:
        names = encoder.get_feature_names_out()
    else:
        names = [f"{col}{suffix}" for col in num_columns]
    return pd.DataFrame(values, columns=names)


# The encoder objects stay module-level because later cells reuse them.
encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree_spline)
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoder_rb = RobustScaler()
encoder_pol = PolynomialFeatures(degree=degree)
encoder_kbd = KBinsDiscretizer(
    n_bins=n_bins,
    encode=encode,
    strategy=strategy,
    subsample=subsample,
)

# Column order matches the order of the transformers: spline, quantile,
# robust, polynomial, bins. All frames share a default index, so a single
# column-wise concat lines the rows up.
num_df = pd.concat(
    [
        _fit_to_frame(encoder_spl),
        _fit_to_frame(encoder_q, suffix=f"_q_{n_quantiles}"),
        _fit_to_frame(encoder_rb, suffix="_robust"),
        _fit_to_frame(encoder_pol),
        _fit_to_frame(encoder_kbd, suffix="_bin"),
    ],
    axis=1,
)

num_df.head(2)

Unnamed: 0,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,...,total_charges,monthly_charges^2,monthly_charges total_charges,total_charges^2,monthly_charges^3,monthly_charges^2 total_charges,monthly_charges total_charges^2,total_charges^3,monthly_charges_bin,total_charges_bin
0,0.034259,0.433936,0.48159,0.050214,2.168151e-07,0.0,0.014515,0.334777,0.555277,0.095311,...,1022.95,426.4225,21123.9175,1046427.0,8805.624625,436208.896375,21608710.0,1070442000.0,0.0,0.0
1,0.023507,0.388355,0.519449,0.068676,1.316872e-05,0.0,0.016892,0.350947,0.545446,0.086646,...,894.3,622.5025,22312.785,799772.5,15531.437375,556703.98575,19954320.0,715236500.0,0.0,0.0


## Задача №3

In [19]:
# Every numeric encoder is applied to the same set of numeric columns; the
# step names become the prefixes of the generated feature names.
numeric_transformer = ColumnTransformer(
    transformers=[
        (step_name, step_encoder, num_columns)
        for step_name, step_encoder in (
            ("spl", encoder_spl),
            ("q", encoder_q),
            ("rb", encoder_rb),
            ("pol", encoder_pol),
            ("kbd", encoder_kbd),
        )
    ]
)

# Categorical branch: a one-step pipeline around the one-hot encoder.
categorical_transformer = Pipeline(steps=[("encoder", encoder_oh)])

# Top-level preprocessor: the numeric branch on num_columns, the categorical
# branch on cat_columns; n_jobs=-1 asks for all available workers.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_columns),
        ("cat", categorical_transformer, cat_columns),
    ],
    n_jobs=-1,
)

# Fit on the full frame and transform it in one pass.
encoded_features = preprocessor.fit_transform(df)

# Label the resulting matrix with the feature names reported by the preprocessor.
transformed_df = pd.DataFrame(
    encoded_features,
    columns=preprocessor.get_feature_names_out(),
)

transformed_df.head(2)

Unnamed: 0,num__spl__monthly_charges_sp_0,num__spl__monthly_charges_sp_1,num__spl__monthly_charges_sp_2,num__spl__monthly_charges_sp_3,num__spl__monthly_charges_sp_4,num__spl__monthly_charges_sp_5,num__spl__total_charges_sp_0,num__spl__total_charges_sp_1,num__spl__total_charges_sp_2,num__spl__total_charges_sp_3,...,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_None,cat__gender_Male
0,0.034259,0.433936,0.48159,0.050214,2.168151e-07,0.0,0.014515,0.334777,0.555277,0.095311,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.023507,0.388355,0.519449,0.068676,1.316872e-05,0.0,0.016892,0.350947,0.545446,0.086646,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


## Задача №4

In [21]:
# Bare expression: the notebook shows the preprocessor's representation as the cell output.
preprocessor

In [67]:
# Tracking and registry share the same server address.
server_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(server_uri)
mlflow.set_registry_uri(server_uri)

# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = (
    experiment.experiment_id
    if experiment
    else mlflow.create_experiment(EXPERIMENT_NAME)
)

# Store the preprocessor as an artifact of a new run.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.sklearn.log_model(preprocessor, "preprocessor")



## Задача №5

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [29]:
# Stratified 80/20 split with a fixed seed.
# The label column is removed from the feature frame so it cannot leak into
# anything derived from X_train / X_test (model inputs, logged signature,
# logged input example). The preprocessor selects its columns by name, so the
# fitted pipeline is unaffected.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["target"]),
    df["target"],
    test_size=0.2,
    random_state=42,
    stratify=df["target"],
)

In [44]:
# Classifier with default hyperparameters.
model = LogisticRegression()

# Preprocessing and classifier chained so both are fitted on the training split.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)
pipeline.fit(X_train, y_train)

In [48]:
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, confusion_matrix, roc_auc_score

# Hard class predictions and the probability of the positive class.
preds = pipeline.predict(X_test)
probs = pipeline.predict_proba(X_test)[:, 1]

# For a binary problem ravel() returns (tn, fp, fn, tp). With normalize='all'
# each entry is a share of all test samples, so:
#   err1 - false-positive share (type I error)
#   err2 - false-negative share (type II error)
_, err1, err2, _ = confusion_matrix(y_test, preds, normalize='all').ravel()

auc = roc_auc_score(y_test, probs)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
# Log loss is defined on predicted probabilities, not on hard labels.
logloss = log_loss(y_test, probs)

# Metric name -> value, in the form later passed to mlflow.log_metrics.
metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
pip_requirements = "./requirements.txt"
# NOTE(review): the signature is inferred from a positional array (X_test.values)
# and from the true labels rather than from predictions; consider a column-based
# signature once schema inference on this frame has been verified.
signature = mlflow.models.infer_signature(X_test.values, y_test)
input_example = X_test.head(10)
metadata = {"model_type": "logistic_regression"}


mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    # Log the whole fitted pipeline, not the bare classifier: the metrics, the
    # signature and the input example all describe raw rows, which only the
    # pipeline (preprocessor + model) can score.
    model_info = mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
        metadata=metadata,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
    )

  outputs = _infer_schema(model_output) if model_output is not None else None
Successfully registered model 'churn_model_fio'.
2024/03/26 20:18:16 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_fio, version 1
Created version '1' of model 'churn_model_fio'.
