In [8]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, log_loss
from sklearn.impute import SimpleImputer
import os
import mlflow
from dotenv import load_dotenv
from joblib import Memory
import joblib
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from catboost import CatBoostClassifier, Pool
from tqdm.notebook import tqdm
from utils.classes import DropColumns, ProcessNumericColumns, ProcessCategoricalColumns, \
ProcessDateColumns, ProcessBooleanColumns, RemoveOutliers, LabelEncodeColumns

In [2]:
load_dotenv()

True

# Предварительная обработка данных

In [3]:
# Загружаем данные

In [4]:
data = pd.read_csv("data/train_ver2.csv", low_memory=False)

In [9]:
# Для каждого шага предобработки данных напишем свой трансформер
# В ./utils/classes.py

In [7]:
data['segmento'].value_counts()

segmento
02 - PARTICULARES     7960220
03 - UNIVERSITARIO    4935579
01 - TOP               562142
Name: count, dtype: int64

In [8]:
# Определяем стобцы для трансформации
numeric_columns = ['age', 'antiguedad', 'renta']
datetime_columns = ['fecha_dato', 'fecha_alta']
columns_to_drop = ['tipodom', 'nomprov', 'ult_fec_cli_1t', 'conyuemp']

categorical_columns = [
    'ind_empleado', 'pais_residencia', 'indrel_1mes', 
    'tiprel_1mes', 'canal_entrada', 'segmento'
]

boolean_columns = [
    col for col in data.columns if data[col].nunique() == 2 and not col in ['conyuemp']
]

In [9]:
# Приводим типы, заполняем пропущеные значения, удаляем выбросы
preprocess_data_pipeline = Pipeline([
    ('drop_columns', DropColumns(columns=columns_to_drop)),
    ('process_numeric', ProcessNumericColumns(columns=numeric_columns)),
    ('remove_outliers', RemoveOutliers(columns=numeric_columns)),
    ('process_categorical', ProcessCategoricalColumns(columns=categorical_columns)),
    ('process_boolean', ProcessBooleanColumns(columns=boolean_columns)),
    ('process_dates', ProcessDateColumns(columns=datetime_columns)),
])

# Выполняем преобразования стобцов
column_transformer = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), numeric_columns)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Собираем в пайплайн
pipeline = Pipeline([
    ('data_preprocessing', preprocess_data_pipeline),
    ('feature_transformation', column_transformer)
])

In [10]:
processed_data = pipeline.fit_transform(data)

  X[col] = X[col].fillna(False)
  X[col] = X[col].fillna(False)
  X[col] = X[col].fillna(False)
  X[col] = X[col].fillna(False)


In [11]:
feature_names = column_transformer.get_feature_names_out()
data_transformed = pd.DataFrame(processed_data, columns=feature_names)

In [None]:
print(feature_names)

In [12]:
del data
del processed_data

In [13]:
# Сохраним данные локально и в mlflow
local_artifact_dir = "artifacts"
os.makedirs(local_artifact_dir, exist_ok=True)

pipeline_path = os.path.join(local_artifact_dir, "pipeline.pkl")
joblib.dump(pipeline, pipeline_path)

['artifacts/pipeline.pkl']

In [14]:
data_transformed.to_parquet("data/data_transformed.parquet")

In [27]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "final_pr_eda_experiment"
RUN_NAME = "eda"

assets_dir = "assets"
os.makedirs(assets_dir, exist_ok=True)

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [16]:
if mlflow.get_experiment_by_name(EXPERIMENT_NAME):
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
else:
    experiment_id = mlflow.create_experiment(name=EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_param("imputer_strategy", "constant")
    mlflow.log_artifact(pipeline_path, artifact_path="models")

🏃 View run eda at: http://127.0.0.1:5000/#/experiments/1/runs/8229e6c2fcc54161855e9695a55a303a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [17]:
del data_transformed

# Обучение моделей

In [3]:
# Загрузим подготовленные данные
data = pd.read_parquet("data/data_transformed.parquet")

In [4]:
data.head(3)

Unnamed: 0,age,antiguedad,renta,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,fecha_alta,ind_nuevo,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,-0.25845,-1.076658,-0.364576,2015-01-28,1375586,N,ES,1,2015-01-12,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.996262,-0.632153,-1.581645,2015-01-28,1050611,N,ES,0,2012-08-10,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.996262,-0.632153,0.458929,2015-01-28,1050612,N,ES,0,2012-08-10,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Преобразуем данные и оставим в качестве признаков "fecha_dato", "age", "renta", "sexo"

In [5]:
product_cols = [col for col in data.columns if col.endswith("_ult1")]

interactions = data[["ncodpers", "fecha_dato", "age", "renta", "sexo"] + product_cols]

In [6]:
del data

In [7]:
# Разобъем данные по времени
train_test_global_time_split_date = pd.Timestamp("2016-01-01")

train_test_global_time_split_idx = interactions["fecha_dato"] < train_test_global_time_split_date
interactions_train = interactions[train_test_global_time_split_idx]
interactions_test = interactions[~train_test_global_time_split_idx]

In [8]:
del interactions

In [9]:
# Количество клиентов в train и test
clients_train = interactions_train["ncodpers"].drop_duplicates()
clients_test = interactions_test["ncodpers"].drop_duplicates()

print('Количество пользователей в train:', len(clients_train))
print('Количество пользователей в test:', len(clients_test))

Количество пользователей в train: 846293
Количество пользователей в test: 857043


In [10]:
interactions_train.sample(3)

Unnamed: 0,ncodpers,fecha_dato,age,renta,sexo,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
1301594,179535,2015-03-28,1.094206,-0.714311,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1111652,1031231,2015-02-28,-0.996262,-0.647322,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3148781,294614,2015-06-28,0.29491,-1.592502,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
X_train = interactions_train.drop(columns=['fecha_dato', 'ncodpers'] + product_cols)
y_train = interactions_train[product_cols]

X_train_sample = X_train.sample(frac=0.1, random_state=42) # Для упрощения работы оставим 10% данных, иначе падает ядро
y_train_sample = y_train.loc[X_train_sample.index]

In [15]:
train_pool = Pool(
    data=X_train_sample,
    label=y_train_sample,
    # cat_features=['segmento']
)

In [16]:
cb_model = CatBoostClassifier(
    learning_rate=0.1,
    iterations=100,
    loss_function='MultiLogloss',
    verbose=10,
    thread_count=-1,
    random_seed=42
)

cb_model.fit(train_pool)

0:	learn: 0.6125787	total: 799ms	remaining: 1m 19s
10:	learn: 0.2612970	total: 7.34s	remaining: 59.4s
20:	learn: 0.1727913	total: 13.8s	remaining: 51.8s
30:	learn: 0.1462277	total: 20.4s	remaining: 45.4s
40:	learn: 0.1378228	total: 27s	remaining: 38.8s
50:	learn: 0.1350306	total: 33.8s	remaining: 32.4s
60:	learn: 0.1340344	total: 40.5s	remaining: 25.9s
70:	learn: 0.1336518	total: 47.1s	remaining: 19.2s
80:	learn: 0.1334636	total: 53.7s	remaining: 12.6s
90:	learn: 0.1333795	total: 1m	remaining: 5.94s
99:	learn: 0.1333151	total: 1m 6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f9e92f85ab0>

In [17]:
# Посчитаем метрики на тестовой выборке
X_test = interactions_test.drop(columns=product_cols)
y_test = interactions_test[product_cols]

test_pool = Pool(
    data=X_test,
    label=y_test,
    # cat_features=['segmento']
)

In [18]:
preds = cb_model.predict_proba(test_pool)
predicted_labels = (preds > 0.5).astype(int)
predicted_df = pd.DataFrame(predicted_labels, columns=product_cols)

In [20]:
accuracy = accuracy_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels, average='weighted')
precision = precision_score(y_test, predicted_labels, average='weighted')
recall = recall_score(y_test, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"F1-Score (Weighted): {f1}")
print(f"Precision (Weighted): {precision}")
print(f"Recall (Weighted): {recall}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.43838484002362815
F1-Score (Weighted): 0.3648248891142275
Precision (Weighted): 0.29607137863673366
Recall (Weighted): 0.47584798173915055


In [22]:
metrics = {
    "accuracy": accuracy,
    "f1": f1,
    "precision": precision,
    "recall": recall
}

In [23]:
cb_model.save_model('artifacts/catboost_model.bin')

In [25]:
pip_requirements = './requirements.txt'
input_example = X_test[:10]

In [28]:
REGISTRY_MODEL_NAME = 'baseline_model'
if mlflow.get_experiment_by_name(EXPERIMENT_NAME):
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
else:
    experiment_id = mlflow.create_experiment(name=EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)

    model_info = mlflow.sklearn.log_model( 
        sk_model=cb_model,
        artifact_path="artifacts",
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        input_example=input_example,
        await_registration_for=60,
    )

Registered model 'baseline_model' already exists. Creating a new version of this model...
2024/12/01 14:51:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 60 seconds for model version to finish creation. Model name: baseline_model, version 2


🏃 View run eda at: http://127.0.0.1:5000/#/experiments/1/runs/d10e838a730d4f048edfc454b424d264
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '2' of model 'baseline_model'.
