## Загрузка данных

In [1]:
import os
import psycopg
import pandas as pd
from dotenv import load_dotenv
load_dotenv()


# Fixed connection options: require TLS and ask for a session that accepts read-write transactions.
connection = {"sslmode": "require", "target_session_attrs": "read-write"}

# Credentials come from the environment (populated from .env by load_dotenv above).
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

# os.getenv returns None (not "") for an unset variable, so check truthiness to catch
# both unset and empty values. Only the setting names are reported, never their values.
missing_credentials = [key for key, value in postgres_credentials.items() if not value]
assert not missing_credentials, f"Missing database settings: {missing_credentials}"

connection.update(postgres_credentials)

# Name of the table that stores the dataset.
TABLE_NAME = "users_churn"

# The outer `with` manages the connection and the inner one manages the cursor, so both are
# released even if the query fails.
with psycopg.connect(**connection) as conn:
    with conn.cursor() as cur:
        # TABLE_NAME is a constant defined above, not user input, so the f-string is safe here.
        cur.execute(f"SELECT * FROM {TABLE_NAME}")

        # Fetch every row returned by the query.
        data = cur.fetchall()

        # Column names are the first field of each entry in the cursor description.
        columns = [col[0] for col in cur.description]

# Build a DataFrame from the fetched rows and column names for further work in pandas.
df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,No,Mailed check,20.65,1022.95,,...,,,,,Female,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,...,,,,,Female,0,Yes,Yes,Yes,0
2,148,6837-BJYDQ,2019-11-01,NaT,One year,No,Mailed check,19.6,61.35,,...,,,,,Male,0,No,No,No,0
3,482,0486-LGCCH,2019-03-01,NaT,Two year,No,Mailed check,19.65,225.75,,...,,,,,Male,0,Yes,Yes,No,0
4,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,No,Female,0,Yes,No,,0


## Обучение модели

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# Recode the 0/1 flag as 'No'/'Yes' so it is handled like the other categorical columns.
# Values that are not 0/1 (for example labels produced by an earlier run of this cell) are
# kept unchanged, which makes the cell safe to re-run; a plain dict .map() would turn
# already-recoded values into NaN on the second run.
df['senior_citizen'] = df['senior_citizen'].map(
    lambda value: {1: 'Yes', 0: 'No'}.get(value, value)
)

# Feature lists are derived from dtypes. `target` is excluded explicitly because the whole
# frame (label included) is passed to the pipeline as X below.
numeric_features = (
    df.select_dtypes(['float']).columns
    .drop('target', errors='ignore')
    .tolist()
)
cat_features = (
    df.select_dtypes(include='object').columns
    .drop('customer_id')
    .drop('target', errors='ignore')
    .tolist()
)

# Stratified 80/20 split with a fixed seed for reproducibility.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])

# Numeric columns: mean imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: missing values become the explicit category 'unknown', then one-hot
# encoding (one column per binary feature, categories unseen during fit are ignored).
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', drop='if_binary'))
])

# Only the listed columns reach the model; everything else is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', cat_transformer, cat_features),
    ],
    remainder='drop'
)

model = LogisticRegression()

pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)
# The full training frame is passed as X; the preprocessor selects the feature columns by name.
pipeline.fit(df_train, df_train['target'])

### Пререквизиты:

In [5]:
# Rebind `model` to the fitted pipeline (preprocessing + classifier) so that the cells below
# call predict / predict_proba on the whole chain rather than on the bare estimator.
model = pipeline

In [6]:
# Column index 1 of predict_proba's output, kept as a 1-D array of scores for the test rows.
probs = model.predict_proba(df_test)[:, 1]
# Hard class labels for the same rows.
preds = model.predict(df_test)

In [7]:
# Features and label for the hold-out set. The label column is removed from X_test so it
# does not end up in anything derived from the features (model signature, input example).
# The pipeline selects its feature columns by name, so it does not need `target` at
# prediction time.
X_test = df_test.drop(columns=['target'])
y_test = df_test['target']

## Задача №1

In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, confusion_matrix, roc_auc_score

# Dictionary that collects every metric logged for this model.
metrics = {}

# With normalize='all' each cell of the confusion matrix is a share of all test rows.
# ravel() flattens the binary matrix in the order: tn, fp, fn, tp.
#   err1 — type I error  (false positives)
#   err2 — type II error (false negatives)
_, err1, err2, _ = confusion_matrix(y_test, preds, normalize='all').ravel()

# Score-based metrics use the predicted probabilities, not the hard labels.
auc = roc_auc_score(y_test, probs)
logloss = log_loss(y_test, probs)

# Label-based metrics use the hard predictions.
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

# Store the metric values in the dictionary.
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

## Задача №2

## Задача №3

In [10]:
import mlflow
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Names used in the tracking server and the model registry.
EXPERIMENT_NAME = "churn_fio"
RUN_NAME = "model_0_registry"
REGISTRY_MODEL_NAME = "churn_model_fio"

# Tracking and registry share the same local server.
uri = "http://127.0.0.1:5000"
mlflow.set_registry_uri(uri)
mlflow.set_tracking_uri(uri)

# Artifacts and metadata stored next to the model.
metadata = {"model_type": "logistic_regression"}
pip_requirements = "./requirements.txt"
input_example = X_test.head(10)
# NOTE(review): the signature is inferred from a bare array (`.values`) and from the true
# labels, while the input example is a DataFrame — confirm this is the intended schema.
signature = mlflow.models.infer_signature(X_test.values, y_test)

# Reuse the experiment if it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = (
    experiment.experiment_id if experiment else mlflow.create_experiment(EXPERIMENT_NAME)
)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    # Log the fitted pipeline and register it as a new version in one call.
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
        metadata=metadata,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
    )

Registered model 'churn_model_fio' already exists. Creating a new version of this model...
2024/03/22 20:27:33 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_fio, version 3
Created version '3' of model 'churn_model_fio'.


## Задача №4

In [72]:
# Load the model back from the location recorded when it was logged and score the test set.
loaded_model = mlflow.sklearn.load_model(model_uri=model_info.model_uri)
model_predictions = loaded_model.predict(X_test)

# Predictions must be integer class labels. Checking the dtype kind accepts any integer
# width; `dtype == int` only matches NumPy's default integer type, which is
# platform-dependent.
assert model_predictions.dtype.kind in ("i", "u")

print(model_predictions[:10])

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

[0 0 0 0 0 0 0 0 0 0]


## Задача №5

In [20]:
client = mlflow.MlflowClient()

# Fetch every registered version of the model by name.
models = client.search_model_versions(filter_string=f"name = '{REGISTRY_MODEL_NAME}'")
print(f"Model info:\n {models}")

# Take the last two entries of the returned list.
# NOTE(review): this relies on the order in which the registry returns versions and on at
# least two versions existing — confirm these are the intended versions.
model_name_1, model_version_1, model_stage_1 = (
    models[-1].name,
    models[-1].version,
    models[-1].current_stage,
)
model_name_2, model_version_2, model_stage_2 = (
    models[-2].name,
    models[-2].version,
    models[-2].current_stage,
)

print(f"Текущий stage модели 1: {model_stage_1}")
print(f"Текущий stage модели 2: {model_stage_2}")

# Move each of the two versions to its new stage.
client.transition_model_version_stage(
    name=model_name_1, version=model_version_1, stage="production"
)
client.transition_model_version_stage(
    name=model_name_2, version=model_version_2, stage="staging"
)

# Rename the model in the registry. After this the old name no longer exists, so the cell
# cannot be re-run without first updating REGISTRY_MODEL_NAME.
client.rename_registered_model(name=REGISTRY_MODEL_NAME, new_name=f"{REGISTRY_MODEL_NAME}_b2c")

Model info:
 [<ModelVersion: aliases=[], creation_timestamp=1711139253536, current_stage='None', description='', last_updated_timestamp=1711139253536, name='churn_model_fio', run_id='b402c0a52e3d4686bcf245bd571822d6', run_link='', source='s3://s3-student-mle-20240212-2f26464482/8/b402c0a52e3d4686bcf245bd571822d6/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='3'>, <ModelVersion: aliases=[], creation_timestamp=1711055508379, current_stage='Production', description='', last_updated_timestamp=1711122291575, name='churn_model_fio', run_id='5a05ff50324241a6ba13ab30840f235a', run_link='', source='s3://s3-student-mle-20240212-2f26464482/8/5a05ff50324241a6ba13ab30840f235a/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>, <ModelVersion: aliases=[], creation_timestamp=1711054924463, current_stage='Staging', description='', last_updated_timestamp=1711122267394, name='churn_model_fio', run_id='60152352a6b144abb766c0c518b39a