In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GroupKFold

import lightgbm as lgb
print(f"LightGBM version: {lgb.__version__}")

from lofo import LOFOImportance, Dataset, plot_importance

In [None]:
def preprocess_data(
    df_train: pd.DataFrame,
    df_test: pd.DataFrame = None,
    scale_utility: bool = True
):
    """Build model-ready frames and feature lists from the raw tables.

    Steps, in order:
      1. Split the ``agent1`` / ``agent2`` strings on ``'-'`` and store the
         tokens after the first one in four new columns per agent.
      2. Collect numerical (``int64``/``float64``) and categorical
         (``object``) column names from the training frame.
      3. Drop the id/target columns from the numerical list and the
         ruleset-name / rules-text columns from the categorical list.
      4. Drop numerical columns that are entirely NaN or whose standard
         deviation in the training frame is exactly 0.
      5. Ordinal-encode the categorical columns (fitted on train, applied
         to test) and cast the codes to ``int``.
      6. Optionally add ``utility_agent1_scaled``, mapping ``utility_agent1``
         from [-1, 1] to [0, 1].

    The input frames are copied first, so the caller's objects are never
    modified; use the returned frames.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training table; must contain ``agent1`` and ``agent2`` (and
        ``utility_agent1`` when ``scale_utility`` is True).
    df_test : pd.DataFrame, optional
        Test table with the same categorical columns as ``df_train``.
    scale_utility : bool, default True
        Whether to add the ``utility_agent1_scaled`` column to the train frame.

    Returns
    -------
    tuple
        ``(df_train, numerical_cols, categorical_cols)`` when ``df_test`` is
        None, otherwise ``(df_train, df_test, numerical_cols, categorical_cols)``.
    """
    agent_fields = ['selection', 'exploration_const', 'playout', 'score_bounds']
    excluded_numerical = {
        'Id', 'num_wins_agent1', 'num_draws_agent1', 'num_losses_agent1', 'utility_agent1'
    }
    excluded_categorical = {'GameRulesetName', 'EnglishRules', 'LudRules'}

    # Splitting agent1 and agent2 into their subfields
    def split_agent_fields(df):
        for agent in ('agent1', 'agent2'):
            field_cols = [f'{agent}_{field}' for field in agent_fields]
            # The first token of the agent string is skipped (iloc[:, 1:]).
            df[field_cols] = df[agent].str.split('-', expand=True).iloc[:, 1:]
        return df

    # Copy before touching anything: the caller's frames stay as they were.
    df_train = split_agent_fields(df_train.copy())
    if df_test is not None:
        df_test = split_agent_fields(df_test.copy())

    # Identify numerical and categorical columns (training frame is the reference)
    numerical_cols = [
        col for col in df_train.select_dtypes(include=['int64', 'float64']).columns
        if col not in excluded_numerical
    ]
    categorical_cols = [
        col for col in df_train.select_dtypes(include=['object']).columns
        if col not in excluded_categorical
    ]

    # Remove all NaN/null numerical columns
    is_all_nan = df_train[numerical_cols].isna().all()
    all_nan_cols = set(is_all_nan[is_all_nan].index)
    numerical_cols = [col for col in numerical_cols if col not in all_nan_cols]

    # Remove constant columns (std computed once, then filtered)
    col_std = df_train[numerical_cols].std()
    constant_cols = set(col_std[col_std == 0].index)
    numerical_cols = [col for col in numerical_cols if col not in constant_cols]

    # Apply ordinal encoding to categorical columns.
    # Categories that appear only in the test frame are encoded as -1 instead
    # of raising; LightGBM documents negative categorical values as missing.
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df_train[categorical_cols] = encoder.fit_transform(df_train[categorical_cols])
    df_train[categorical_cols] = df_train[categorical_cols].astype(int)
    if df_test is not None:
        df_test[categorical_cols] = encoder.transform(df_test[categorical_cols])
        df_test[categorical_cols] = df_test[categorical_cols].astype(int)

    # Scale the target variable 'utility_agent1' to be between 0 and 1 if scale_utility is True
    if scale_utility:
        min_utility, max_utility = -1.0, 1.0
        df_train['utility_agent1_scaled'] = (
            (df_train['utility_agent1'] - min_utility) / (max_utility - min_utility)
        )

    if df_test is not None:
        return df_train, df_test, numerical_cols, categorical_cols
    return df_train, numerical_cols, categorical_cols


***
### Load and preprocess data

In [None]:
# directory holding the competition input files
path_raw = Path("/kaggle/input/um-game-playing-strength-of-mcts-variants")

# read the training table and display it
df_train = pd.read_csv(path_raw.joinpath("train.csv"))
df_train

In [None]:
# Preprocess the training frame only (no test frame); the target stays unscaled
df_train, numerical_cols, categorical_cols = preprocess_data(df_train, scale_utility=False)

# Report how many features of each kind were kept
print("Numerical Columns:", len(numerical_cols))
print("Categorical Columns:", len(categorical_cols))

In [None]:
# Histogram of the raw target, drawn through the explicit Axes interface
fig, ax = plt.subplots()
ax.hist(df_train['utility_agent1'], bins=100)
ax.set_title('Distribution of Target Variable')
ax.set_xlabel('Utility Agent 1')
ax.set_ylabel('Frequency')
plt.show()



***
### Train model and compute LOFO feature importance


In [None]:
# LOFO input: the preprocessed frame, the raw target and every kept feature.
# auto_group_threshold is forwarded to lofo's Dataset unchanged.
lofo_dset = Dataset(
    df=df_train,
    target="utility_agent1",
    features=numerical_cols + categorical_cols,
    auto_group_threshold=0.95,
)
print("\nNumber of features:", len(lofo_dset.feature_names))

In [None]:
# LightGBM hyper-parameters shared by every LOFO fit
model_params = dict(
    objective="regression",
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    verbose=-1,
    n_estimators=2000,
)

# estimator handed to LOFO
lgbm_model = lgb.LGBMRegressor(**model_params)

In [None]:
# Grouped CV: rows sharing a GameRulesetName stay in the same fold
num_folds = 5
groups_col = 'GameRulesetName'
gkf = GroupKFold(n_splits=num_folds)
# Materialise the (train, validation) index pairs so they can be reused
split_list = [
    (train_idx, valid_idx)
    for train_idx, valid_idx in gkf.split(df_train, groups=df_train[groups_col])
]

In [None]:
# LOFO runner: scores the model on the precomputed folds, single process
lofo_importance = LOFOImportance(
    dataset=lofo_dset,
    scoring='neg_mean_squared_error',
    model=lgbm_model,
    fit_params={"categorical_feature": categorical_cols},
    cv=split_list,
    n_jobs=1,
)

In [None]:
# Compute the importances, persist them as a ';'-separated CSV, then display
importance_df = lofo_importance.get_importance()
importance_df.to_csv(
    "lofo_lightgbm.csv",
    sep=";",
    index=False,
)
importance_df

In [None]:
# importances shown as mean with standard deviation; tall figure, one row per feature
plot_importance(
    importance_df,
    figsize=(12, 80),
)

In [None]:
# same importances, drawn as box plots
plot_importance(
    importance_df,
    figsize=(12, 80),
    kind="box",
)

In [None]:
# the 20 rows with the lowest mean importance (descending sort, then tail)
(
    importance_df
    .sort_values(by="importance_mean", ascending=False)
    .tail(20)
)


In [None]:
# features to drop: names from the 20 rows with the lowest mean importance
features_to_drop = (
    importance_df
    .sort_values(by="importance_mean", ascending=False)
    .tail(20)["feature"]
    .tolist()
)

# An entry may hold several names joined by "&" (presumably lofo's auto-grouped
# features -- TODO confirm); flatten them into individual, trimmed names.
features_to_drop = [
    name.strip()
    for entry in features_to_drop
    for name in entry.split("&")
]
features_to_drop


***