In [4]:
import logging
import os
from typing import Any, Dict, List
import catboost as cb
import optuna
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

from configs.config import settings
from models.model_validator import create_validator
from utils.basic_utils import save_pickle, save_shap, save_toml

# Location of the training-data parquet file.
# Raw string literal: in a normal literal the Windows backslashes are parsed as
# escape sequences ("\1" becomes the control character U+0001 and "\p", "\s",
# "\d", "\c" are invalid escapes), which silently corrupts the path.
train_data = r'D:\python\1_Internship\scoring_pipeline\data\congo_featuretools1807_prepared.parquet'

class Tuning:
    """Optuna-based hyperparameter search for a CatBoost classifier.

    The search space is read from ``settings.HYPERPARAMETERS.tuning``. Each
    trial is scored by its test AUC (in percent) minus a penalty that grows
    with the distance between the observed train/test AUC gap and a target gap.
    """

    def __init__(
        self,
        X_train: pd.DataFrame,
        y_train: pd.Series,
        X_test: pd.DataFrame,
        y_test: pd.Series,
    ) -> None:
        """Store the train/test split and the feature lists from settings."""
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test

        features_cfg = settings.FEATURES
        self.feature_list = features_cfg.feature_list
        self.cat_feature_list = features_cfg.cat_feature_list

    def _new_model(self, params):
        """Build an unfitted CatBoostClassifier from ``params`` plus the fixed settings."""
        return cb.CatBoostClassifier(
            **params,
            random_seed=settings.HYPERPARAMETERS.random_seed,
            cat_features=self.cat_feature_list,
            verbose=False,
        )

    @staticmethod
    def _auc_pct(model, X, y) -> float:
        """Return ROC AUC of ``model`` on ``(X, y)`` scaled to percent."""
        positive_scores = model.predict_proba(X)[:, 1]
        return roc_auc_score(y, positive_scores) * 100

    @staticmethod
    def _sample_param(trial, name, spec):
        """Ask ``trial`` for a value of ``name`` using the suggest call matching its kind.

        ``spec`` is the entry configured for ``name`` in the tuning search space.
        """
        if name in ('bootstrap_type', 'boosting_type'):
            # categorical: spec is the collection of choices
            return trial.suggest_categorical(name, spec)
        if name in ('depth', 'n_estimators'):
            # integer range: first two items are the bounds
            return trial.suggest_int(name, spec[0], spec[1])
        if name == 'l2_leaf_reg':
            # float range sampled on a grid: exactly (start, end, step)
            low, high, step = spec
            return trial.suggest_float(name, low, high, step=step)
        if name == 'learning_rate':
            # float range: exactly (start, end)
            low, high = spec
            return trial.suggest_float(name, low, high)
        # any other parameter: float range taken from the first two items
        return trial.suggest_float(name, spec[0], spec[1])

    def objective(self, trial, optimal_difference) -> float:
        """Score one Optuna trial.

        Args:
            trial: Optuna trial used to sample the hyperparameters.
            optimal_difference: target absolute train/test AUC gap, in AUC percent points.

        Returns:
            float: test AUC (percent) minus the cubed distance between the
            observed train/test gap and ``optimal_difference``.
        """
        search_space = settings.HYPERPARAMETERS.tuning
        sampled = {}
        for name in search_space:
            sampled[name] = self._sample_param(trial, name, search_space[name])

        model = self._new_model(sampled)
        # the test split doubles as the early-stopping evaluation set
        model.fit(
            self.X_train,
            self.y_train,
            eval_set=(self.X_test, self.y_test),
            early_stopping_rounds=50,
            verbose=False,
        )

        train_auc = self._auc_pct(model, self.X_train, self.y_train)
        test_auc = self._auc_pct(model, self.X_test, self.y_test)
        difference = abs(train_auc - test_auc)

        penalty_weight = 3
        difference_penalty = abs(difference - optimal_difference)
        obj_val = test_auc - (difference_penalty ** penalty_weight)

        logging.info(f"\n>>> Train AUC: {train_auc}, Test AUC: {test_auc}, Difference: {difference} <<<")
        return obj_val

    def optimize_hyperparameters(self) -> Dict[str, Any]:
        """Run one 30-trial study per target gap (3 to 6) and return the chosen parameters.

        After each study a model is refitted on the study's best parameters
        (no eval set, no early stopping) and re-scored. A candidate replaces
        the current choice when its test AUC is strictly higher and its
        train/test gap is at most 6 points; the gap limit is not applied while
        no parameters have been accepted yet.

        Returns:
            Dict[str, Any]: the selected hyperparameters.
        """
        final_params = None
        mx_test_auc = 0

        for optimal_difference in range(3, 7):

            def run_trial(trial):
                return self.objective(trial, optimal_difference)

            study = optuna.create_study(direction='maximize')
            study.optimize(run_trial, n_trials=30)
            best_params = study.best_params

            model = self._new_model(best_params)
            model.fit(self.X_train, self.y_train, verbose=False)

            train_auc = self._auc_pct(model, self.X_train, self.y_train)
            test_auc = self._auc_pct(model, self.X_test, self.y_test)
            logging.info(
                f"#### Optimal Difference {optimal_difference}, Train AUC: {train_auc}, Test AUC: {test_auc} ####\n\n"
            )

            gap = abs(train_auc - test_auc)
            improves = test_auc > mx_test_auc
            gap_acceptable = final_params is None or gap <= 6
            if improves and gap_acceptable:
                mx_test_auc = test_auc
                final_params = best_params
                logging.info(f"New optimal parameters found: {final_params}")
                logging.info(
                    f"Optimal Test AUC: {test_auc}, Train AUC: {train_auc}, Difference: {gap}")

        logging.info(f"Final optimal parameters: {final_params}")
        logging.info(f"Final optimal Test AUC: {mx_test_auc}")

        return final_params

def fit(df: pd.DataFrame, run_time, model_path, run_iteration) -> cb.CatBoostClassifier:
    """
    Train the final CatBoostClassifier and save the model artifacts.

    The input dataframe is split into train and test by the ``is_train``
    column. If tuning is enabled in settings, hyperparameters are searched
    first and written back to ``settings.HYPERPARAMETERS.training``. If
    cross-validation is enabled, its AUC scores are logged. The model is then
    fitted on the train split, train/test AUC are logged, and the validator,
    TOML, pickled model and SHAP explainer are saved.

    Args:
        df (pd.DataFrame): main_sample df with defined factors, a ``target``
            column and an ``is_train`` flag (1 = train, 0 = test).
        run_time: not used by this function; kept for caller compatibility.
        model_path: destination passed to ``save_pickle`` for the fitted model.
        run_iteration: identifier used to name the ``run_<run_iteration>``
            artifact directory.

    Returns:
        object: The trained CatBoost model
    """

    feature_list = settings.FEATURES.feature_list
    cat_feature_list = settings.FEATURES.cat_feature_list

    # split into train and test
    train_mask = df['is_train'] == 1
    test_mask = df['is_train'] == 0

    X_train = df.loc[train_mask].reset_index(drop=True)[feature_list]
    y_train = df.loc[train_mask, ['target']].reset_index(drop=True)

    X_test = df.loc[test_mask].reset_index(drop=True)[feature_list]
    y_test = df.loc[test_mask, ['target']].reset_index(drop=True)

    # init model and fit
    logging.info('------- Fitting the model...')

    if settings.TUNING.enabled:
        tuning_obj = Tuning(X_train, y_train, X_test, y_test)
        best_params = tuning_obj.optimize_hyperparameters()
        # the tuned parameters replace the configured ones for the rest of the run
        settings.HYPERPARAMETERS.training = best_params

    # Check whether cross-validation is enabled
    if settings.MODEL.Cross_validation_enabled:
        auc_scores = cross_validation(df, settings.HYPERPARAMETERS.training)
        logging.info(f'Cross-validation AUC scores: {auc_scores}')
        logging.info(f'Mean Cross-validation AUC: {np.mean(auc_scores)}')

    cbm = cb.CatBoostClassifier(
        **settings.HYPERPARAMETERS.training,
        random_seed=settings.HYPERPARAMETERS.random_seed,
        cat_features=cat_feature_list,
        verbose=False,
    )
    cbm.fit(X_train, y_train, verbose=False)

    train_auc = roc_auc_score(y_train, cbm.predict_proba(X_train)[:, 1]) * 100
    test_auc = roc_auc_score(y_test, cbm.predict_proba(X_test)[:, 1]) * 100
    difference = abs(train_auc - test_auc)

    logging.info("Final model training complete.")
    logging.info(f"Final parameters: {settings.HYPERPARAMETERS.training}")
    logging.info(f"Train AUC: {train_auc}, Test AUC: {test_auc}, Difference: {difference}")

    # directory layout for the current run
    run_dir = os.path.join(os.getcwd(), settings.PATHS.artifacts, f'run_{run_iteration}')
    model_artifact_dir = f'{run_dir}/model_artifact'
    shap_path = f'{model_artifact_dir}/{settings.METADATA.model_name}_explainer.bz2'

    # Create the artifact directory (and run_dir as its parent) up front instead
    # of retrying after an OSError: the previous retry path duplicated the save
    # logic, failed with FileExistsError when the directory already existed,
    # and was the only path that refreshed the SHAP explainer.
    os.makedirs(model_artifact_dir, exist_ok=True)

    create_validator(
        model_artifact_dir,
        df[feature_list],
        'target',
        'variable_validator',
    )
    save_toml(model_artifact_dir)
    # save model in pickle file
    save_pickle(cbm, model_path)
    logging.info('------- Model saved...')

    # always saved, so the explainer on disk matches the model saved above
    save_shap(cbm, shap_path)
    logging.info('------- Shap explainer saved...')
    return cbm

def cross_validation(
        df: pd.DataFrame, model_params: Dict[str, Any], n_splits: int = 5, random_state: int = 42
    ) -> List[float]:
    """
    Perform cross-validation using CatBoostClassifier.

    Args:
    df (pd.DataFrame): DataFrame containing all features and the target variable.
    model_params (Dict[str, Any]): Parameters to be used for CatBoostClassifier.
    n_splits (int): Number of splits for cross-validation.
    random_state (int): Random state for reproducibility.

    Returns:
    List[float]: List of AUC scores for each fold.

    This function performs k-fold cross-validation using CatBoostClassifier and returns the AUC scores for each fold.
    """

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    auc_scores = []

   


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject