<p style="background-color: #1B1212; font-size: 300%; text-align: center; border-radius: 40px 40px; color: #C9A9A6; font-weight: bold; font-family: 'Cinzel', serif; text-transform: uppercase; border: 4px solid #C9A9A6;">imports</p>

In [None]:
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import polars as pl
import pandas as pd
import plotly.graph_objects as go

In [None]:
# Lift pandas' display limits so wide / long frames are rendered without truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error as mse

<p style="background-color: #1B1212; font-size: 300%; text-align: center; border-radius: 40px 40px; color: #C9A9A6; font-weight: bold; font-family: 'Cinzel', serif; text-transform: uppercase; border: 4px solid #C9A9A6;">configuration class</p>

In [None]:
class CFG:
    """Central place for every tunable value used by the notebook."""

    # --- data -------------------------------------------------------------
    train_path = Path('/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv')
    batch_size = 131072

    # --- cross-validation / feature selection -----------------------------
    early_stop = 500     # early-stopping patience handed to the boosters
    n_features = 200     # number of top-ranked features to keep
    n_splits = 5         # number of cross-validation folds
    color = '#C9A9A6'    # marker colour of the fold-score plot

    # --- LightGBM hyper-parameters ----------------------------------------
    lgb_p = dict(
        objective='regression',
        min_child_samples=24,
        num_iterations=50000,
        learning_rate=0.03,
        extra_trees=True,
        reg_lambda=0.8,
        reg_alpha=0.1,
        num_leaves=64,
        metric='rmse',
        device='cpu',
        max_depth=9,
        max_bin=128,
        verbose=-1,
        seed=42,
    )

    # --- CatBoost hyper-parameters ----------------------------------------
    ctb_p = dict(
        loss_function='RMSE',
        learning_rate=0.03,
        num_trees=50000,
        random_state=42,
        task_type='CPU',
        reg_lambda=0.8,
        depth=8,
    )

<p style="background-color: #1B1212; font-size: 300%; text-align: center; border-radius: 40px 40px; color: #C9A9A6; font-weight: bold; font-family: 'Cinzel', serif; text-transform: uppercase; border: 4px solid #C9A9A6;">feature engineering class</p>

In [None]:
class FE:
    """Feature-engineering helper: load a CSV with polars, prune columns and set dtypes."""

    def __init__(self, batch_size):
        """Store the `batch_size` that `apply_fe` forwards to `pl.read_csv`."""
        self.batch_size = batch_size

    def drop_cols(self, df, bad_cols=None): # bad_cols must be provided when processing the test data
        """Remove columns that are not useful for modelling.

        Three groups are dropped, in this order:
          1. a fixed list of identifier / rule-text / raw outcome-count columns (when present),
          2. columns whose values are all null,
          3. `bad_cols`; when `bad_cols` is None it is computed as the columns that hold
             exactly one unique value.

        Parameters:
            df: polars DataFrame to prune.
            bad_cols: optional list of column names to drop in step 3.

        Returns:
            (pruned DataFrame, bad_cols). A caller-supplied `bad_cols` is returned unchanged.
        """
        # Define redundant columns for model development
        cols = ['Id',
                'LudRules',
                'EnglishRules',
                'num_wins_agent1',
                'num_draws_agent1',
                'num_losses_agent1']

        df = df.drop([col for col in cols if col in df.columns])

        # Select and drop columns with 100% null values
        n_rows = df.height
        df = df.drop([col for col in df.columns if df[col].null_count() == n_rows])

        # Select (if not provided) columns with only one unique value
        if bad_cols is None:
            bad_cols = [col for col in df.columns if df[col].n_unique() == 1]

        # A supplied bad_cols may name columns that are absent from this frame, for example
        # because the all-null step above already removed them. Dropping only the ones that
        # are still present avoids a column-not-found error.
        present = set(df.columns)
        df = df.drop([col for col in bad_cols if col in present])

        return df, bad_cols

    def cast_datatypes(self, df):
        """Cast the three categorical columns to String and every other column to Int16/Float32.

        A non-categorical column becomes Int16 when its first non-null value is a Python
        int, otherwise Float32 (this includes columns with no non-null value).

        NOTE(review): Int16 may be too narrow for integer columns with large values —
        confirm against the data before reusing this on other inputs.
        """
        # Set datatype for categorical columns
        cat_cols = ['GameRulesetName', 'agent1', 'agent2']
        df = df.with_columns([pl.col(col).cast(pl.String) for col in cat_cols])

        # Decide the target dtype of every numeric column first, then apply all casts in one
        # with_columns call instead of rebuilding the frame once per column.
        casts = []
        for col in df.columns:
            if col in cat_cols:
                continue

            # Set datatype for a numeric column as per the datatype of the first non-null item
            val = df.select(pl.col(col).drop_nulls().first()).item()
            casts.append(pl.col(col).cast(pl.Int16 if isinstance(val, int) else pl.Float32))

        return df.with_columns(casts)

    def info(self, df):
        """Print the shape of `df` and its estimated size in MB."""
        print(f'Shape: {df.shape}')
        mem = df.estimated_size() / 1024**2
        print('Memory usage: {:.2f} MB\n'.format(mem))

    def apply_fe(self, path):
        """Read the CSV at `path`, prune columns, cast dtypes and print a summary.

        Returns:
            (DataFrame, bad_cols computed by `drop_cols`, names of the String-typed columns).
        """
        df = pl.read_csv(path, batch_size=self.batch_size)

        df, bad_cols = self.drop_cols(df)
        df = self.cast_datatypes(df)
        self.info(df)

        cat_cols = [col for col in df.columns if df[col].dtype == pl.String]

        return df, bad_cols, cat_cols

In [None]:
# Feature-engineering helper configured with the CSV read batch size
fe = FE(batch_size=CFG.batch_size)

<p style="background-color: #1B1212; font-size: 300%; text-align: center; border-radius: 40px 40px; color: #C9A9A6; font-weight: bold; font-family: 'Cinzel', serif; text-transform: uppercase; border: 4px solid #C9A9A6;">model development class</p>

In [None]:
class MD:
    """Model development: grouped cross-validation and importance-based feature pruning.

    Parameters:
        early_stop: early-stopping patience (rounds) handed to both boosters.
        n_features: number of top-ranked features kept by `feature_importance`.
        n_splits: number of GroupKFold splits.
        lgb_p: keyword arguments for `lgb.LGBMRegressor`.
        ctb_p: keyword arguments for `CatBoostRegressor`.
        color: marker colour used in the fold-score plot.
    """

    # Model families `_train_model` can build; matched against the start of `title`
    _SUPPORTED = ('LightGBM', 'CatBoost')

    def __init__(self, early_stop, n_features, n_splits, lgb_p, ctb_p, color):
        self.early_stop = early_stop
        self.n_features = n_features
        self.n_splits = n_splits
        self.lgb_p = lgb_p
        self.ctb_p = ctb_p
        self.color = color

    def _plot_cv(self, fold_scores, title):
        """Show a plotly figure of the per-fold RMSE scores with their mean as a dashed line."""
        # Mean / std are taken from the raw scores; rounding first would bias both slightly
        mean_score = round(float(np.mean(fold_scores)), 3)
        std_score = round(float(np.std(fold_scores)), 3)
        fold_scores = [round(score, 3) for score in fold_scores]

        fig = go.Figure()

        fig.add_trace(go.Scatter(
            x = list(range(1, len(fold_scores) + 1)),
            y = fold_scores,
            mode = 'markers',
            name = 'Fold Scores',
            marker = dict(size = 24, color=self.color, symbol='diamond'),
            text = [f'{score:.3f}' for score in fold_scores],
            hovertemplate = 'Fold %{x}: %{text}<extra></extra>',
            hoverlabel=dict(font=dict(size=16))
        ))

        fig.add_trace(go.Scatter(
            x = [1, len(fold_scores)],
            y = [mean_score, mean_score],
            mode = 'lines',
            name = f'Mean: {mean_score:.3f}',
            line = dict(dash = 'dash', color = '#FFBF00'),
            hoverinfo = 'none'
        ))

        fig.update_layout(
            title = f'{title} | Cross-Validation RMSE Scores | Variation of CV scores: {mean_score} ± {std_score}',
            xaxis_title = 'Fold',
            yaxis_title = 'RMSE Score',
            plot_bgcolor = 'rgba(0,0,0,0)',
            paper_bgcolor = 'rgba(0,0,0,0)',
            xaxis = dict(
                gridcolor = 'lightgray',
                tickmode = 'linear',
                tick0 = 1,
                dtick = 1,
                range = [0.5, len(fold_scores) + 0.5]
            ),
            yaxis = dict(gridcolor = 'lightgray')
        )

        fig.show()

    def _split_xy(self, data, cat_cols):
        """Return (features, target, groups) without modifying `data`.

        The category casts are applied to the feature frame produced by `drop`, so the
        caller's DataFrame keeps its original dtypes.
        """
        X = data.drop(['utility_agent1'], axis=1)
        for col in cat_cols:
            X[col] = X[col].astype('category')

        return X, data['utility_agent1'], data['GameRulesetName']

    def _train_model(self, data, cat_cols, title):
        """Train one model per GroupKFold split and plot the validation RMSE of each fold.

        Parameters:
            data: pandas DataFrame containing the features, the 'utility_agent1' target and
                the 'GameRulesetName' column used as the CV group.
            cat_cols: names of the columns to treat as categorical.
            title: plot title; its prefix ('LightGBM' or 'CatBoost') selects the model.

        Returns:
            list of fitted models, one per fold.

        Raises:
            ValueError: if `title` does not start with a supported model name.
        """
        # Fail fast: an unknown title would otherwise leave `model` unbound in the fold loop
        if not title.startswith(self._SUPPORTED):
            raise ValueError(
                f'Unsupported model title {title!r}; it must start with one of {self._SUPPORTED}'
            )

        X, y, group = self._split_xy(data, cat_cols)

        cv = GroupKFold(n_splits=self.n_splits)

        models, scores = [], []

        for train_index, valid_index in cv.split(X, y, group):

            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            if title.startswith('LightGBM'):

                model = lgb.LGBMRegressor(**self.lgb_p)

                model.fit(X_train, y_train,
                          eval_set=[(X_valid, y_valid)],
                          eval_metric='rmse',
                          callbacks=[lgb.early_stopping(self.early_stop, verbose=0), lgb.log_evaluation(0)])

            else:  # 'CatBoost' — guaranteed by the prefix check above

                model = CatBoostRegressor(**self.ctb_p, verbose=0, cat_features=cat_cols)

                model.fit(X_train, y_train,
                          eval_set=(X_valid, y_valid),
                          early_stopping_rounds=self.early_stop, verbose=0)

            models.append(model)

            # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
            # deprecated and rejected by newer scikit-learn releases
            valid_preds = model.predict(X_valid)
            scores.append(np.sqrt(mse(y_valid, valid_preds)))

        self._plot_cv(scores, title)

        return models

    def feature_importance(self, data, cat_cols, title):
        """Rank features by fold-averaged importance and return the ones outside the top `n_features`.

        Parameters:
            data: pandas DataFrame passed through to `_train_model`.
            cat_cols: names of the categorical columns.
            title: 'LightGBM...' or 'CatBoost...'; selects the model and labels the plot.

        Returns:
            list of feature names ranked below the `n_features` most important ones.
        """
        models = self._train_model(data, cat_cols, title)

        feature_cols = [col for col in data.columns if col != 'utility_agent1']

        # Average the per-fold importances
        feature_importances = np.zeros(len(feature_cols))
        for model in models:

            if title.startswith('LightGBM'):
                feature_importances += model.feature_importances_ / len(models)

            elif title.startswith('CatBoost'):
                feature_importances += model.get_feature_importance() / len(models)

        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': feature_importances
        })

        feature_importance = feature_importance.sort_values('importance', ascending=False).reset_index(drop=True)

        # After reset_index the labels are 0..n-1, so this label slice keeps rows 0..n_features-1
        drop_features = feature_importance.loc[self.n_features:, 'feature'].tolist()

        return drop_features

In [None]:
# Model-development helper wired to the shared configuration
md = MD(
    early_stop=CFG.early_stop,
    n_features=CFG.n_features,
    n_splits=CFG.n_splits,
    lgb_p=CFG.lgb_p,
    ctb_p=CFG.ctb_p,
    color=CFG.color,
)

<p style="background-color: #1B1212; font-size: 300%; text-align: center; border-radius: 40px 40px; color: #C9A9A6; font-weight: bold; font-family: 'Cinzel', serif; text-transform: uppercase; border: 4px solid #C9A9A6;">feature importance</p>

In [None]:
# Load and preprocess the training CSV; the returned bad_cols list is not used here, hence `_`
train, _, cat_cols = fe.apply_fe(CFG.train_path)
# Rebind `train` to a pandas DataFrame, which is what the model-development class operates on
train = train.to_pandas()
# Preview the first rows
display(train.head())

In [None]:
# For each model family: run grouped CV, then collect the features ranked below the top n_features
drop_lgb_features = md.feature_importance(data=train, cat_cols=cat_cols, title='LightGBM')
drop_ctb_features = md.feature_importance(data=train, cat_cols=cat_cols, title='CatBoost')

In [None]:
# Features that BOTH models rank as unimportant. Sorted because raw set iteration order is
# arbitrary, which would make the exported CSV differ from run to run.
drop_features = sorted(set(drop_lgb_features) & set(drop_ctb_features))

In [None]:
# Persist the list of features to drop as a single-column CSV
importances = pd.DataFrame(data={'drop_features': drop_features})

print('Shape: {}'.format(importances.shape))
importances.to_csv(path_or_buf='importances.csv', index=False)