Inspired by https://www.kaggle.com/code/mavillan/lofo-feat-selection-with-lightgbm.

This version uses a GroupKFold cross-validation setup, with folds grouped by `GameRulesetName`.

In [None]:
import warnings
import pandas as pd
import seaborn as sb
from pathlib import Path
import matplotlib.pyplot as plt
from warnings import simplefilter
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OrdinalEncoder
from catboost import (CatBoostRegressor, Pool,
                      EShapCalcType, EFeaturesSelectionAlgorithm, EFstrType)

# Blanket filter: suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/CatBoost deprecation and chained-assignment
# warnings raised by later cells.
warnings.filterwarnings("ignore")
# Explicitly ignore pandas PerformanceWarning as well (already covered by the
# blanket filter above, so this line only documents intent).
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
# IPython shell escape (not Python): enables the Jupyter widgets extension.
# Presumably required for the interactive chart requested via `plot=True` in
# the feature-selection cell — TODO confirm.
!jupyter nbextension enable --py widgetsnbextension

In [None]:
%%time
# Directory that holds the competition input files.
path_raw = Path("/kaggle/input/um-game-playing-strength-of-mcts-variants")

# Read the training table, then echo its (rows, columns) shape as the cell output.
train_csv = path_raw / "train.csv"
df_train = pd.read_csv(train_csv)
df_train.shape

In [None]:
def preprocess_data(
    df_train: pd.DataFrame,
    df_test: pd.DataFrame = None,
    scale_utility: bool = True
):
    """Prepare the train (and optionally test) frame for modelling.

    Steps, in order:

    1. Split the ``agent1`` / ``agent2`` strings on ``-`` into four sub-field
       columns each (selection, exploration constant, playout, score bounds).
    2. Collect numerical (``int64`` / ``float64``) and categorical (``object``)
       columns from ``df_train``, leaving out the id, the target columns and
       the ruleset name / free-text rule columns.
    3. Drop numerical columns that are entirely NaN or have zero standard
       deviation in ``df_train``.
    4. Ordinal-encode the categorical columns with an encoder fitted on
       ``df_train`` only; categories that appear only in ``df_test`` are
       encoded as ``-1`` instead of raising.
    5. Optionally add ``utility_agent1_scaled`` to ``df_train``.

    The frames passed in are modified in place (new columns are added and the
    categorical columns are overwritten with integer codes) and are also
    returned for convenience.

    Args:
        df_train: Training frame; must contain ``agent1``, ``agent2`` and,
            when ``scale_utility`` is true, ``utility_agent1``.
        df_test: Optional test frame that receives the same agent split and
            the same categorical encoding.
        scale_utility: When true, add ``utility_agent1_scaled`` computed as
            ``(utility_agent1 + 1) / 2``, i.e. a [-1, 1] target mapped to
            [0, 1].

    Returns:
        ``(df_train, numerical_cols, categorical_cols)`` when ``df_test`` is
        ``None``, otherwise
        ``(df_train, df_test, numerical_cols, categorical_cols)``.
    """
    agent_subfields = ('selection', 'exploration_const', 'playout', 'score_bounds')

    def split_agent_fields(df):
        # The first dash-separated token is discarded and the remaining ones
        # are assigned positionally to the four sub-field columns.
        # NOTE(review): this expects exactly five tokens per agent string —
        # confirm against the data.
        for agent in ('agent1', 'agent2'):
            sub_cols = [f'{agent}_{field}' for field in agent_subfields]
            df[sub_cols] = df[agent].str.split('-', expand=True).iloc[:, 1:]
        return df

    df_train = split_agent_fields(df_train)
    if df_test is not None:
        df_test = split_agent_fields(df_test)

    # Identify numerical and categorical columns
    numerical_cols = df_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df_train.select_dtypes(include=['object']).columns.tolist()

    # Keep the id and target columns out of the numerical features, and the
    # ruleset name / free-text rule descriptions out of the categorical ones.
    non_feature_numeric = {
        'Id', 'num_wins_agent1', 'num_draws_agent1', 'num_losses_agent1', 'utility_agent1'
    }
    non_feature_categorical = {'GameRulesetName', 'EnglishRules', 'LudRules'}
    numerical_cols = [col for col in numerical_cols if col not in non_feature_numeric]
    categorical_cols = [col for col in categorical_cols if col not in non_feature_categorical]

    # Remove numerical columns that contain nothing but NaN/null
    all_nan_mask = df_train[numerical_cols].isna().all()
    all_nan_cols = set(all_nan_mask.index[all_nan_mask])
    numerical_cols = [col for col in numerical_cols if col not in all_nan_cols]

    # Remove constant columns (zero standard deviation); std() is computed once
    # because it scans every remaining numerical column.
    col_std = df_train[numerical_cols].std()
    constant_cols = set(col_std.index[col_std == 0])
    numerical_cols = [col for col in numerical_cols if col not in constant_cols]

    # Apply ordinal encoding to categorical columns. The encoder is fitted on
    # train only; unseen test categories map to -1 rather than raising.
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    df_train[categorical_cols] = encoder.fit_transform(df_train[categorical_cols])
    df_train[categorical_cols] = df_train[categorical_cols].astype(int)
    if df_test is not None:
        df_test[categorical_cols] = encoder.transform(df_test[categorical_cols])
        df_test[categorical_cols] = df_test[categorical_cols].astype(int)

    # Scale the target variable 'utility_agent1' to be between 0 and 1 if scale_utility is True
    if scale_utility:
        min_utility = -1.0
        max_utility = 1.0
        df_train['utility_agent1_scaled'] = (
            (df_train['utility_agent1'] - min_utility) / (max_utility - min_utility)
        )

    if df_test is not None:
        return df_train, df_test, numerical_cols, categorical_cols
    return df_train, numerical_cols, categorical_cols

In [None]:
%%time
# Grab the grouping key for GroupKFold before the frame is preprocessed.
groups = df_train['GameRulesetName']

# Only the training frame is processed here; the target is left unscaled.
prepared = preprocess_data(df_train, scale_utility=False)
df_train, numerical_cols, categorical_cols = prepared

# Report how many feature columns of each kind were retained.
for caption, col_list in (
    ("Numerical Columns:", numerical_cols),
    ("Categorical Columns:", categorical_cols),
):
    print(caption, len(col_list))

In [None]:
%%time
print("Feature Elimination Performing...")

# Knobs for the elimination run.
NUM_FEATURES_TO_DROP = 20   # how many features select_features should remove
STEPS_TO_SELECT = 3         # passed as `steps` to select_features
NUM_FOLD_TO_SELECT = 1      # stop after this many folds
FOLDS = 5                   # number of GroupKFold splits

# Candidate features and target.
feature_names = numerical_cols + categorical_cols
X = df_train[feature_names]
y = df_train["utility_agent1"]
cv = GroupKFold(n_splits=FOLDS)

# Model settings are identical for every fold, so they are built once here.
# They are reused later when the final model is trained.
ctb_params = {
    "iterations": 5000,
    "learning_rate": 0.03,
    "loss_function": 'RMSE',
    "eval_metric": 'RMSE',
    "metric_period": 200,
    "od_type": 'Iter',
    "od_wait": 100,
    "task_type": 'GPU',
    "allow_writing_files": False,
}

banner = "#" * 25
for fi, (train_idx, valid_idx) in enumerate(cv.split(X, y, groups)):
    print(banner)
    print(f"### Fold {fi+1}/{FOLDS} ...")
    print(banner)

    # Train/validation pools for this fold; rows are picked positionally.
    train_pool = Pool(
        X.iloc[train_idx],
        y.iloc[train_idx],
        cat_features=categorical_cols,
    )
    val_pool = Pool(
        X.iloc[valid_idx],
        y.iloc[valid_idx],
        cat_features=categorical_cols,
    )

    ctb_model = CatBoostRegressor(**ctb_params)

    # SHAP-based recursive elimination; `summary` holds the selected and
    # eliminated feature names used by the following cells.
    summary = ctb_model.select_features(
        train_pool,
        eval_set=val_pool,
        features_for_select=feature_names,
        num_features_to_select=len(feature_names) - NUM_FEATURES_TO_DROP,
        steps=STEPS_TO_SELECT,
        algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=False,
        plot=True,
    )

    # Only the first NUM_FOLD_TO_SELECT folds are processed; the indices of
    # the last processed fold stay available for the training cell.
    if fi + 1 == NUM_FOLD_TO_SELECT:
        break

In [None]:
# Report the outcome of the feature-elimination run.
# The selected-feature count is taken from the name list directly, instead of
# materialising a column subset of X just to read its width.
selected_names = summary['selected_features_names']
print('Initial features len:', X.shape[1])
print('Selected features len:', len(selected_names),'\n')
print('Eliminated features names:\n\n', summary['eliminated_features_names'])

In [None]:
%%time
print("Model Training on Selected Features Subset.")

# Categorical features that survived the elimination step.
dropped_names = set(summary['eliminated_features_names'])
cat_features = [name for name in categorical_cols if name not in dropped_names]


def _selected_pool(row_idx):
    """Build a Pool over the given rows, restricted to the selected features."""
    return Pool(
        X.iloc[row_idx][summary['selected_features_names']],
        y.iloc[row_idx],
        cat_features=cat_features,
    )


# Reuse the train/validation row indices left over from the selection fold.
train_pool = _selected_pool(train_idx)
val_pool = _selected_pool(valid_idx)

# Fit a fresh model with the same parameters as the selection run.
ctb_model = CatBoostRegressor(**ctb_params)
ctb_model.fit(train_pool, eval_set=val_pool, use_best_model=True)

In [None]:
# Horizontal bar chart of the first TOP_N rows of the importance table.
TOP_N = 10
feat_importances = ctb_model.get_feature_importance(prettified=True)
top_importances = feat_importances.head(TOP_N)

fig, ax = plt.subplots(figsize=(12, 4))
sb.barplot(data=top_importances, x="Importances", y="Feature Id", ax=ax)
ax.set_title('CatBoost features importance:')
fig.tight_layout()

In [None]:
# Calculate top n feat interactions
TOP_N = 10
feat_interactions = ctb_model.get_feature_importance(type=EFstrType.Interaction, prettified=True)

# Work on an explicit copy: assigning columns on a bare slice of
# `feat_interactions` is chained assignment (the warning is only hidden by the
# blanket warnings filter).
top_interactions = feat_interactions.head(TOP_N).copy()

# Translate feature indices into names using the fitted model's own feature
# order, so the labels stay correct even if `summary` was recomputed after the
# model was trained.
model_feature_names = ctb_model.feature_names_
for index_col in ('First Feature Index', 'Second Feature Index'):
    top_interactions[index_col] = top_interactions[index_col].map(
        lambda idx: model_feature_names[idx]
    )

# The index columns now hold names, so relabel them accordingly.
top_interactions.columns = ['First Feature', 'Second Feature', 'Interaction']
top_interactions