In [1]:
import os
import pickle
import warnings

warnings.filterwarnings("ignore", category=Warning)

import structlog
from ml_assemblr.main_components.data_pod import DataPod
from home_credit_helper.config import cfg
from ml_assemblr.transfromer.model.xgb_model import XGBModel
from home_credit_helper.evaluation.main import evaluation

In [2]:
# Restore the pickled feature-selection state from the research cache.
# NOTE: pickle.load executes arbitrary code from the file; only load caches you produced yourself.
states_path = cfg.research_cache_path / "05_feature_selection_states.pkl"
with open(states_path, "rb") as states_file:
    feature_selection_states: dict = pickle.load(states_file)

In [3]:
# Unpack the restored state into the notebook-level names used by the elimination loop.
states = feature_selection_states

dp_untrain: DataPod = states["dp_untrain"]
xgb_config = states["xgb_config"]

feature_names = states["feature_names"]
current_feature_names = states["current_feature_names"]
total_original_features_count = states["total_original_features_count"]
low_important_feature_count = states["low_important_feature_count"]

metric_tolerance = states["metric_tolerance"]
auroc_full = states["auroc_full"]

# "count" is optional in the saved state; without it the loop starts from index 0.
count = states.get("count", 0)


In [4]:
# A bare `raise` outside an `except` block has no active exception to
# re-raise, so this cell always fails with RuntimeError.
# NOTE(review): presumably a deliberate stop so that "Run All" halts before
# the long-running elimination loop — confirm, or replace with an explicit guard.
raise

RuntimeError: No active exception to reraise

In [7]:
# Logger used by the elimination loop; keyword arguments passed to
# logger.info(...) show up as key=value pairs in the emitted log lines.
logger = structlog.get_logger()

In [8]:
# Recursive feature elimination, resumable from `count`.
#
# Every entry of `feature_names` is one step:
#   * a regular feature name: retrain without that feature, compare the
#     validation AUROC with `auroc_full`, and drop the feature permanently
#     when the AUROC loss is below `metric_tolerance`;
#   * the sentinels "first_loop" / "last_loop": retrain on the current
#     feature set and only log the metrics (nothing is removed in this cell).
#
# State is checkpointed every 5 steps and after the final step, so an
# interrupted run can be resumed by reloading the pickle.
SENTINEL_STEPS = frozenset(("first_loop", "last_loop"))

checkpoint_path = cfg.research_cache_path / "05_feature_selection_states.pkl"
# Checkpoints are first written next to the real file and then swapped in, so
# an interrupt or crash in the middle of a dump cannot truncate the only
# checkpoint (which also carries `dp_untrain`).
checkpoint_tmp_path = os.fspath(checkpoint_path) + ".tmp"

while count < len(feature_names):

    feature_name_to_eval = feature_names[count]
    is_sentinel_step = feature_name_to_eval in SENTINEL_STEPS

    logging_context = {} if is_sentinel_step else {"evaluating_feature": feature_name_to_eval}

    logger.info(
        f"{count + 1} / {len(feature_names)} recursive feature elimination processing",
        **logging_context,
    )

    # Candidate feature set: everything still selected, minus the feature under evaluation.
    feature_names_to_train = current_feature_names[:]
    if not is_sentinel_step:
        feature_names_to_train.remove(feature_name_to_eval)

    # Work on a copy so the untrained pod can be reused by every step.
    dp: DataPod = dp_untrain.copy()
    dp.main_column_type.features = feature_names_to_train

    # Fit one XGBModel per entry of the cross-validation index map.
    cv_count = len(dp.variables["cv_idx_map"]["cv_split_idx_in_column_type"])
    for i in range(cv_count):
        xgb_model = XGBModel(**xgb_config, fit_on_split="train", cv_idx=i)
        dp = dp.fit_transform(xgb_model)

    dp = evaluation(dp, "auroc_valid", fast_mode=True)
    auroc_iteration = dp.variables["evaluation"]["objective_auroc_valid"]

    # Positive diff: the model got worse without the evaluated feature.
    auroc_diff = auroc_full - auroc_iteration

    metric_context = {
        "auroc_diff": auroc_diff,
        "auroc_iteration": auroc_iteration,
        "auroc_full": auroc_full,
        "remaining_features": len(current_feature_names),
    }

    if feature_name_to_eval == "first_loop":
        logger.info(
            f"remove all features with 0 importance ({low_important_feature_count} features)",
            **metric_context,
        )
    elif feature_name_to_eval == "last_loop":
        logger.info(
            "final auroc after recursive feature elimination",
            **metric_context,
        )
    elif auroc_diff >= metric_tolerance:
        logger.info(
            f"keep feature: '{feature_name_to_eval}'",
            **metric_context,
        )
    else:
        # Report the count as it will be once the feature is removed.
        metric_context["remaining_features"] -= 1
        logger.info(
            f"remove feature: '{feature_name_to_eval}'",
            **metric_context,
        )
        current_feature_names.remove(feature_name_to_eval)
        # The reference AUROC may only move up, never down.
        auroc_full = max(auroc_full, auroc_iteration)

    count += 1

    if count % 5 == 0 or count == len(feature_names):

        feature_selection_states = {
            "dp_untrain": dp_untrain,
            "feature_names": feature_names,
            "current_feature_names": current_feature_names,
            "low_important_feature_count": low_important_feature_count,
            "metric_tolerance": metric_tolerance,
            "total_original_features_count": total_original_features_count,
            "xgb_config": xgb_config,
            "auroc_full": auroc_full,
            "count": count,
        }
        # Dump to the temp file, then swap it in with os.replace: the previous
        # checkpoint stays intact until the new one is completely written.
        with open(checkpoint_tmp_path, "wb") as f:
            pickle.dump(feature_selection_states, f)
        os.replace(checkpoint_tmp_path, checkpoint_path)
        logger.info(
            "save feature_selection_states",
            count=count,
            recent_evaluated_features=feature_name_to_eval,
        )


logger.info("DONE!!")
logger.info(
    f"remaining {len(current_feature_names)} features from original {total_original_features_count} features"
)

2024-04-21 16:19:29 [info     ] 51 / 448 recursive feature elimination processing evaluating_feature=LIVINGAPARTMENTS_AVG
2024-04-21 16:20:20 [info     ] keep feature: 'LIVINGAPARTMENTS_AVG' auroc_diff=0.000991475478971715 auroc_full=0.7729092416754991 auroc_iteration=0.7719177661965274 remaining_features=408
2024-04-21 16:20:20 [info     ] 52 / 448 recursive feature elimination processing evaluating_feature=PREV_APPS_REFUSED_NAME_SELLER_INDUSTRY_INFREQUENT_SKLEARN_MEAN
2024-04-21 16:21:15 [info     ] keep feature: 'PREV_APPS_REFUSED_NAME_SELLER_INDUSTRY_INFREQUENT_SKLEARN_MEAN' auroc_diff=0.0007050271763884286 auroc_full=0.7729092416754991 auroc_iteration=0.7722042144991107 remaining_features=408
2024-04-21 16:21:15 [info     ] 53 / 448 recursive feature elimination processing evaluating_feature=PREV_APPS_APPROVED_PRODUCT_COMBINATION_CASH_X_SELL_LOW_MEAN
2024-04-21 16:22:11 [info     ] keep feature: 'PREV_APPS_APPROVED_PRODUCT_COMBINATION_CASH_X_SELL_LOW_MEAN' auroc_diff=0.00123663931

KeyboardInterrupt: 

In [9]:
# Show how many features are still selected and the last ten entries of the list.
# `remaining_feature_names` is the same list object as `current_feature_names`, not a copy.
remaining_feature_names = current_feature_names
remaining_count = len(remaining_feature_names)
print(remaining_count)
remaining_feature_names[-10:]

300


['OWN_CAR_AGE',
 'NAME_EDUCATION_TYPE_HIGHER_EDUCATION',
 'PAYMENT_RATE',
 'CODE_GENDER_M',
 'INT_PAY_DPD_MEAN',
 'DAYS_EMPLOYED_PERC',
 'AMT_GOODS_PRICE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_3',
 'EXT_SOURCE_2']

In [10]:
# Persist the selected feature names to the research cache.
selected_features_path = cfg.research_cache_path / "06_selected_features.pkl"
with open(selected_features_path, "wb") as selected_features_file:
    pickle.dump(remaining_feature_names, selected_features_file)