In [4]:
import os
import psycopg
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

# Source table with the churn dataset.
TABLE_NAME = "users_churn"

# Database credentials are taken from the environment (populated by load_dotenv()).
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

# Full set of connection keyword arguments: session attributes plus credentials.
# Note: "sslmode": "verify-full" is intentionally not set here.
connection = {"target_session_attrs": "read-write", **postgres_credentials}

# Pull the whole table; both context managers are released when the block exits.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    columns = [column_meta[0] for column_meta in cur.description]
    data = cur.fetchall()

df = pd.DataFrame(data, columns=columns)

# Preview the first two rows.
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,17,8191-XWSZG,2015-10-01,NaT,One year,No,Mailed check,20.65,1022.95,,...,,,,,Female,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,...,,,,,Female,0,Yes,Yes,Yes,0


In [7]:
from sklearn.model_selection import train_test_split

# Columns fed to the model: service/customer attributes followed by the two
# charge columns.
features = [
    "paperless_billing",
    "payment_method",
    "internet_service",
    "online_security",
    "online_backup",
    "device_protection",
    "tech_support",
    "streaming_tv",
    "streaming_movies",
    "gender",
    "senior_citizen",
    "partner",
    "dependents",
    "multiple_lines",
    "monthly_charges",
    "total_charges",
]

# Name of the label column.
target = "target"

# Chronological hold-out: rows are ordered by `split_column` and, because
# shuffling is disabled, the last `test_size` share of rows becomes the test set.
split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=test_size, shuffle=False
)

## Задача №1

In [14]:
from autofeat import AutoFeatClassifier

# Columns passed to autofeat as categorical.
cat_features = [
    "paperless_billing",
    "payment_method",
    "internet_service",
    "online_security",
    "online_backup",
    "device_protection",
    "tech_support",
    "streaming_tv",
    "streaming_movies",
    "gender",
    "senior_citizen",
    "partner",
    "dependents",
    "multiple_lines",
]
# Columns left as numeric inputs.
num_features = ["monthly_charges", "total_charges"]

features = [*cat_features, *num_features]

# Transformations autofeat may apply, with a single feature-engineering step.
transformations = ("1/", "log", "abs", "sqrt")

afc = AutoFeatClassifier(
    feateng_steps=1,
    transformations=transformations,
    categorical_cols=cat_features,
    n_jobs=-1,
)

# Fit the feature generator on the training split only, then apply it to the
# test split.
X_train_features = afc.fit_transform(X_train, y_train)
X_test_features = afc.transform(X_test)

## Задача №2

In [24]:
import mlflow

# Location of the MLflow tracking server.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Experiment and run under which the fitted feature generator is stored.
EXPERIMENT_NAME = "churn_fio"
RUN_NAME = "autofeat"

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Reuse the experiment when it already exists; otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = (
    experiment.experiment_id
    if experiment
    else mlflow.create_experiment(EXPERIMENT_NAME)
)

# Artifact sub-directory for the logged transformer.
artifact_path = "afc"

# Log the fitted AutoFeatClassifier as an sklearn-flavoured model artifact.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path)

2024-03-28 17:58:41,199 INFO: Found credentials in environment variables.


## Задача №3

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, confusion_matrix, roc_auc_score


# Baseline classifier trained on the autofeat-engineered features.
# max_iter is raised above the default of 100 because the default fit stopped
# at the iteration limit (ConvergenceWarning). Scaling the features would be
# the more thorough remedy if the warning persists.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_features, y_train)

# Drop test rows for which the feature transformation produced NaNs.
# The mask is applied positionally to both features and labels, so re-running
# this cell on already-filtered data is a no-op instead of misaligning (or
# failing on) the label index.
# NOTE(review): assumes afc.transform keeps the row order and row count of
# X_test, so row i of X_test_features corresponds to row i of y_test.
nan_mask = X_test_features.isna().any(axis=1).to_numpy()
X_test_features = X_test_features[~nan_mask]
y_test = y_test[~nan_mask]
# Keep labels and features on the same index, as the original cell did.
y_test.index = X_test_features.index

# Hard class predictions and positive-class probabilities.
preds = model.predict(X_test_features)
probs = model.predict_proba(X_test_features)[:, 1]

# For a binary problem ravel() yields (tn, fp, fn, tp), normalised over all
# samples: err1 is the false-positive share (type I error) and err2 is the
# false-negative share (type II error).
_, err1, err2, _ = confusion_matrix(y_test, preds, normalize='all').ravel()
auc = roc_auc_score(y_test, probs)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
# Log loss is defined on predicted probabilities, not on hard 0/1 labels.
logloss = log_loss(y_test, probs)

# Metrics collected for logging to MLflow.
metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
# Name under which the classifier is registered in the MLflow Model Registry.
REGISTRY_MODEL_NAME = "churn_model_fio"

# The registry is served by the same server as tracking.
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Packaging details stored alongside the model.
pip_requirements = "./requirements.txt"
metadata = {"model_type": "logistic_regression"}
input_example = X_test_features.head(10)
signature = mlflow.models.infer_signature(X_test_features.values, y_test)

# Log the evaluation metrics and the model in one run; registering the model
# creates a new version under REGISTRY_MODEL_NAME and waits up to 60 seconds
# for the registration to finish.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=60,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
        metadata=metadata,
    )

Registered model 'churn_model_fio' already exists. Creating a new version of this model...
2024/03/28 18:09:05 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_fio, version 2
Created version '2' of model 'churn_model_fio'.
