# **IMPORTS AND INSTALLATIONS**

In [None]:
%%capture

!pip install -q lightautoml --no-index --find-links=/kaggle/input/mcts2024-packages-v2/LAMA

In [None]:
%%writefile -a myimports.py

from warnings import filterwarnings
filterwarnings('ignore')
from gc import collect

from os import path, walk, getpid
from psutil import Process
import re
from collections import Counter
from itertools import product

import ctypes
libc = ctypes.CDLL("libc.so.6")

from IPython.display import display_html, clear_output
from pprint import pprint
from functools import partial
from copy import deepcopy
import pandas as pd, numpy as np, os, joblib
import polars as pl
import polars.selectors as cs
import re

from warnings import filterwarnings
filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from colorama import Fore, Style, init
from warnings import filterwarnings
filterwarnings('ignore')
from tqdm.notebook import tqdm


In [None]:
%%writefile -a myimports.py

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import (PredefinedSplit as PDS, KFold, GroupKFold as GKF)
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

print(f"---> CUDA available = {torch.cuda.is_available()}\n\n")

In [None]:
%%time

# Run the generated import module in the notebook namespace; the context manager
# closes the file handle, which the bare `open(...).read()` form left open.
with open('myimports.py', 'r') as imports_file:
    exec(imports_file.read())

# **CONFIGURATION**

## **KEY NOTE**

This kernel is currently trained on a CPU and uses the parameter **test_req = True**. <br>
This means that the data is sampled into a small subset (1% of the total dataset, grouped by the grouper column) and only the syntax is checked. <br>
For the actual run, please change this to **test_req = False**, and if you use a GPU, please also change the **gpu_id** parameter. It is currently set to None, indicating that this is a **CPU** kernel.

In [None]:
%%time

# Run switches
test_req     = True   # True -> quick syntax check on a small sample, False -> full run
ftre_imp_req = True   # True -> compute and store feature importances after training

# Column roles
target     = 'utility_agent1'
group_col  = 'GameRulesetName'

# Run identifiers, paths and global settings
model_id   = "LAMAV1_1"
op_path    = "/kaggle/working"
ip_path    = "/kaggle/input/um-game-playing-strength-of-mcts-variants"
verbosity  = 2
state      = 42

if test_req:
    # Fraction of rows kept per group for the syntax check (1 %); the previous
    # value of 1000 was not a valid fraction and contradicted the notebook text
    sample_frac = 0.01
    time_budget = 1000
    n_splits    = 5
    algos       = ["linear_l2", "cb", "lgb", "xgb"]
    batch_size  = 32
    gpu_id      = None

else:
    # Full run: keep every row; defined here too so the name exists in both modes
    sample_frac = 1.0
    time_budget = 60 * 60 * 4.5
    n_splits    = 5
    algos       = ["linear_l2", "cb_tuned", "xgb_tuned", "lgb_tuned"]
    batch_size  = 64
    gpu_id      = "0"

In [None]:
%%writefile -a fe.py

class Utility:
    """
    Shared helpers used across the notebook:
    1. PrintColor  - print text in color using colorama
    2. ScoreMetric - compute the competition metric (RMSE)
    3. CleanMemory - run garbage collection and report the process RAM usage
    """

    def PrintColor(self, text:str, color = Fore.BLUE, style = Style.BRIGHT):
        "Prints `text` prefixed with the given colorama style and color, then resets the style"
        print(style + color + text + Style.RESET_ALL)

    def ScoreMetric(self, ytrue, ypred)-> float:
        """
        This method calculates the metric for the competition (root mean squared error)
        Inputs- ytrue, ypred:- input truth and predictions
        Output- float:- competition metric
        """
        # `squared = False` was deprecated in scikit-learn 1.4 and removed in 1.6;
        # taking the square root of the MSE gives the same value for a single
        # target and works on every scikit-learn version
        return float(np.sqrt(mse(ytrue, ypred)))

    def CleanMemory(self):
        "This method cleans the memory off unused objects and returns a message with the cleaned state RAM usage"

        collect()
        # Ask glibc to hand freed heap pages back to the operating system
        libc.malloc_trim(0)
        # First field of psutil's memory_info(), converted from bytes to GiB
        memory_use = Process(getpid()).memory_info()[0] / 2. ** 30
        return f"\nRAM usage = {memory_use :.4} GB"

# Single shared instance used by the rest of the notebook
Utils = Utility()
print();

# **PREPROCESSING**

In [None]:
%%writefile -a fe.py

class Preprocessor:
    """
    This class preprocesses the data and also creates the cv folds for the training data.

    Attributes set in __init__:
    - irrelevant_cols / game_cols / output_cols:- columns removed before modelling
    - dropped_cols:- concatenation of the three lists above
    - agent_cols:- agent description columns that can be split into components
    - target, group_col:- target column and grouping column used for the folds
    - n_splits, op_path:- number of cv folds and output directory
    """

    def __init__(
        self,
        n_splits = 5,
        op_path: str = "/kaggle/working",
    ):
        """
        Inputs-
        n_splits:- number of group k-fold splits assigned to the training data
        op_path:- directory where the processed training data is written
        """

        self.irrelevant_cols = \
        ['Id', 'Properties', 'Format', 'Time', 'Discrete', 'Realtime', 'Turns', 'Alternating', 
         'Simultaneous', 'HiddenInformation', 'Match', 'AsymmetricRules', 
         'AsymmetricPlayRules', 'AsymmetricEndRules', 'AsymmetricSetup',
         'Players', 'NumPlayers', 'Simulation', 'Solitaire', 'TwoPlayer',
         'Multiplayer', 'Coalition', 'Puzzle', 'DeductionPuzzle', 'PlanningPuzzle',
         'Equipment', 'Container', 'Board', 'PrismShape', 'ParallelogramShape', 
         'RectanglePyramidalShape', 'TargetShape', 'BrickTiling', 'CelticTiling', 
         'QuadHexTiling', 'Hints', 'PlayableSites', 'Component', 'DiceD3', 'BiasedDice', 
         'Card', 'Domino', 'Rules', 'SituationalTurnKo', 'SituationalSuperko', 'InitialAmount',
         'InitialPot', 'Play', 'BetDecision', 'BetDecisionFrequency', 'VoteDecisionFrequency', 
         'ChooseTrumpSuitDecision', 'ChooseTrumpSuitDecisionFrequency', 'LeapDecisionToFriend', 
         'LeapDecisionToFriendFrequency', 'HopDecisionEnemyToFriend', 'HopDecisionEnemyToFriendFrequency',
         'HopDecisionFriendToFriend', 'FromToDecisionWithinBoard', 'FromToDecisionBetweenContainers', 
         'BetEffect', 'BetEffectFrequency', 'VoteEffectFrequency', 'SwapPlayersEffectFrequency', 'TakeControl', 
         'TakeControlFrequency', 'PassEffectFrequency', 'SetCost', 'SetCostFrequency', 'SetPhase', 
         'SetPhaseFrequency', 'SetTrumpSuit', 'SetTrumpSuitFrequency', 'StepEffectFrequency', 
         'SlideEffectFrequency', 'LeapEffectFrequency', 'HopEffectFrequency', 'FromToEffectFrequency', 
         'SwapPiecesEffect', 'SwapPiecesEffectFrequency', 'ShootEffect', 'ShootEffectFrequency', 'MaxCapture',
         'OffDiagonalDirection', 'Information', 'HidePieceType', 'HidePieceOwner', 'HidePieceCount',
         'HidePieceRotation', 'HidePieceValue', 'HidePieceState', 'InvisiblePiece', 'End', 'LineDrawFrequency',
         'ConnectionDraw', 'ConnectionDrawFrequency', 'GroupLossFrequency', 'GroupDrawFrequency',
         'LoopLossFrequency', 'LoopDraw', 'LoopDrawFrequency', 'PatternLoss', 'PatternLossFrequency', 
         'PatternDraw', 'PatternDrawFrequency', 'PathExtentEndFrequency', 'PathExtentWinFrequency', 
         'PathExtentLossFrequency', 'PathExtentDraw', 'PathExtentDrawFrequency',
         'TerritoryLoss', 'TerritoryLossFrequency', 'TerritoryDraw', 'TerritoryDrawFrequency', 
         'CheckmateLoss', 'CheckmateLossFrequency', 'CheckmateDraw', 'CheckmateDrawFrequency',
         'NoTargetPieceLoss', 'NoTargetPieceLossFrequency', 'NoTargetPieceDraw', 
         'NoTargetPieceDrawFrequency', 'NoOwnPiecesDraw', 'NoOwnPiecesDrawFrequency',
         'FillLoss', 'FillLossFrequency', 'FillDraw', 'FillDrawFrequency', 'ScoringDrawFrequency',
         'NoProgressWin', 'NoProgressWinFrequency', 'NoProgressLoss', 'NoProgressLossFrequency', 
         'SolvedEnd', 'Behaviour', 'StateRepetition', 'PositionalRepetition', 'SituationalRepetition',
         'Duration', 'Complexity', 'BoardCoverage', 'GameOutcome', 'StateEvaluation', 'Clarity', 
         'Narrowness', 'Variance', 'Decisiveness', 'DecisivenessMoves', 'DecisivenessThreshold',
         'LeadChange', 'Stability', 'Drama', 'DramaAverage', 'DramaMedian', 'DramaMaximum', 'DramaMinimum', 
         'DramaVariance', 'DramaChangeAverage', 'DramaChangeSign', 'DramaChangeLineBestFit', 'DramaChangeNumTimes',
         'DramaMaxIncrease', 'DramaMaxDecrease', 'MoveEvaluation', 'MoveEvaluationAverage', 'MoveEvaluationMedian', 
         'MoveEvaluationMaximum', 'MoveEvaluationMinimum', 'MoveEvaluationVariance', 'MoveEvaluationChangeAverage', 
         'MoveEvaluationChangeSign', 'MoveEvaluationChangeLineBestFit', 'MoveEvaluationChangeNumTimes',
         'MoveEvaluationMaxIncrease', 'MoveEvaluationMaxDecrease', 'StateEvaluationDifference', 
         'StateEvaluationDifferenceAverage', 'StateEvaluationDifferenceMedian', 'StateEvaluationDifferenceMaximum',
         'StateEvaluationDifferenceMinimum', 'StateEvaluationDifferenceVariance', 'StateEvaluationDifferenceChangeAverage', 
         'StateEvaluationDifferenceChangeSign', 'StateEvaluationDifferenceChangeLineBestFit',
         'StateEvaluationDifferenceChangeNumTimes',
         'StateEvaluationDifferenceMaxIncrease', 'StateEvaluationDifferenceMaxDecrease', 
         'BoardSitesOccupied', 'BoardSitesOccupiedMinimum', 'BranchingFactor', 'BranchingFactorMinimum',
         'DecisionFactor', 'DecisionFactorMinimum', 'MoveDistance', 'MoveDistanceMinimum', 'PieceNumber', 
         'PieceNumberMinimum', 'ScoreDifference', 'ScoreDifferenceMinimum', 'ScoreDifferenceChangeNumTimes',
         'Roots', 'Cosine', 'Sine', 'Tangent', 'Exponential', 'Logarithm', 'ExclusiveDisjunction',
         'Float', 'HandComponent', 'SetHidden', 'SetInvisible', 'SetHiddenCount', 'SetHiddenRotation',
         'SetHiddenState', 'SetHiddenValue', 'SetHiddenWhat', 'SetHiddenWho'
        ]

        self.game_cols    = ['EnglishRules', 'LudRules']
        self.output_cols  = ['num_wins_agent1', 'num_draws_agent1', 'num_losses_agent1']
        self.agent_cols   = ['agent1', 'agent2']
        self.dropped_cols = self.output_cols + self.irrelevant_cols + self.game_cols

        self.target       = "utility_agent1"
        self.group_col    = 'GameRulesetName'
        self.n_splits     = n_splits
        self.op_path      = op_path

    def pp_data(
        self,
        df: pl.DataFrame,
        split_agent_ftre: bool = True,
        cat_cols: list = None,
        num_cols: list = None,
        label : str = "Train",
    ):
        """
        This method preprocesses the data provided and returns a pandas dataset for modelling

        Inputs-
        df:- polars dataframe to process
        split_agent_ftre:- if True, each agent column is split on "-" into component columns
        cat_cols, num_cols:- column lists to reuse when label is not "Train"; None means empty.
                             They are recomputed from the data when label is "Train"
        label:- "Train" derives the column lists, assigns cv folds and writes XYtrain.parquet;
                any other value only transforms the data

        Output- tuple (pandas dataframe, cat_cols, num_cols)
        """

        is_train = label == "Train"

        # Work on private copies: a mutable default (or the caller's own list) must
        # never be shared with the lists handed back to the caller
        cat_cols = [] if cat_cols is None else list(cat_cols)
        num_cols = [] if num_cols is None else list(num_cols)

        if is_train:
            # Group labels for the cv split, captured before columns are dropped
            ygrp = df.select(pl.col(self.group_col)).to_numpy().flatten()

        df = df.drop(self.dropped_cols, strict = False)

        if split_agent_ftre:
            for col in self.agent_cols:
                # Split the agent string on "-" into `<col>_<idx>` columns and
                # discard the first component (`<col>_0`)
                df = \
                df.with_columns(pl.col(col).str.split(by = "-").\
                                list.\
                                to_struct(fields= lambda idx: f"{col}_{idx}")
                               ).\
                unnest(col).\
                drop(f"{col}_0")

        if is_train:
            # Every column whose name starts with an agent column name is categorical
            agent_prefixes = tuple(self.agent_cols)
            cat_cols = [col for col in df.columns if col.startswith(agent_prefixes)]

            # Remaining columns, kept in dataframe order so the saved list is
            # identical from run to run (a set difference has no stable order)
            # NOTE(review): this list still contains the group column - confirm
            # whether downstream consumers expect it among the numeric columns
            excluded = set(cat_cols) | {"fold_nb", self.target}
            num_cols = [col for col in df.columns if col not in excluded]

        Utils.PrintColor(f"---> Shape = {df.shape} |  Memory usage = {df.estimated_size('mb'):.3f} Mb")

        df = \
        df.with_columns([pl.col(col).cast(pl.Categorical) for col in cat_cols])
        df = df.select(pl.all().shrink_dtype())
        df = df.to_pandas()

        if is_train:
            Utils.PrintColor(f"---> Determining cv-folds")
            df["fold_nb"] = -1

            # Group k-fold: each row receives the number of the fold in which it is held out
            cv = GKF(self.n_splits)
            for fold_nb, (_, dev_idx) in enumerate(cv.split(df, df[self.target], groups = ygrp)):
                df.loc[dev_idx, "fold_nb"] = fold_nb

            df["fold_nb"] = df["fold_nb"].astype(np.int8)
            df.to_parquet(os.path.join(self.op_path, "XYtrain.parquet"))

        return df, cat_cols, num_cols

In [None]:
%%time

# Run the generated feature-engineering module in the notebook namespace;
# the context manager closes the file handle once the source is read
with open('fe.py', 'r') as fe_file:
    exec(fe_file.read())

train  = pl.read_csv(os.path.join(ip_path, f"train.csv"))
test   = pl.read_csv(os.path.join(ip_path, f"test.csv"))
sub_fl = pd.read_csv(os.path.join(ip_path, "sample_submission.csv"), index_col = "Id")

Utils.PrintColor(f"---> Shapes = {train.shape} | {test.shape}")

# Preprocessing data
pp = Preprocessor(op_path = op_path)
train, cat_cols, num_cols = pp.pp_data(train, label = "Train")
# The fold column is dropped here; the model cell passes only the group column to LightAutoML
train = train.drop("fold_nb", axis=1, errors = "ignore") 

print()
test, _, _ = pp.pp_data(test, label = "Test", cat_cols = cat_cols, num_cols = num_cols)

joblib.dump(cat_cols, os.path.join(op_path, "CatCols.joblib"))
joblib.dump(num_cols, os.path.join(op_path, "NumCols.joblib"))

if test_req:
    # Syntax check: keep 1 % of the rows of every group; seeded with `state`
    # so the same subset is drawn on every run
    train = train.groupby(group_col).sample(frac = 0.01, random_state = state)
    Utils.PrintColor(f"---> Shapes = {train.shape} | {test.shape} - syntax check", 
                     color = Fore.CYAN
                     )
else:
    Utils.PrintColor(f"---> Shapes = {train.shape} | {test.shape} - final", 
                     color = Fore.CYAN
                     )  
 
# Rebuild a contiguous 0..n-1 index after the optional sampling
train.index = range(len(train))
Utils.PrintColor(Utils.CleanMemory())

# **MODEL TRAINING**

**KEY NOTE**

We use the internal Group-KFold CV scheme offered by LightAutoML as below-
- we use group parameter in the roles dictionary to specify the grouper column
- we specify the cv splits in the reader params dict

Since this is a regression problem, the system develops a grouped k-fold cv scheme automatically using this setup

In [None]:
%%time 

# Regression task with MSE as both the loss and the reported metric
task = Task('reg', loss = 'mse', metric = 'mse')

# Settings for the LightAutoML data reader
reader_settings = {
    'n_jobs'        : 16,
    'n_splits'      : n_splits,
    'random_state'  : state,
    'advanced_roles': True,
}

# `algos` is wrapped in an outer list, i.e. passed as a single entry of `use_algos`
general_settings = {"use_algos": [algos]}

model = TabularAutoML(
    task           = task,
    timeout        = time_budget,
    cpu_limit      = 16,
    gpu_ids        = gpu_id,
    general_params = general_settings,
    reader_params  = reader_settings,
)

# Column roles: target to predict and the grouping column
column_roles = {'target': target, "group": group_col}

# Fit on the training data and keep the out-of-fold predictions as a flat array
oof_preds = model.fit_predict(train, roles = column_roles, verbose = verbosity).data.flatten()

# Predictions for the test data as a flat array
preds = model.predict(test).data.flatten()

Utils.PrintColor(Utils.CleanMemory())
print();

In [None]:
%time 

if ftre_imp_req:
    try:
        ftreimp = \
        model.get_feature_scores('fast').\
        set_index("Feature").\
        sort_values("Importance", ascending = False)

        display(
            ftreimp.\
            head(25).\
            style.\
            format(formatter = '{:,.2f}').\
            bar(color = "#b3ecff").\
            set_caption(
                "Feature Importances - top 25 features"
            )
            )
        print()

        ftreimp.to_csv(
            os.path.join(op_path, f"FtreImp_{model_id}.csv")
        )
    except:
        pass

# **CLOSURE**

In [None]:
%%time

# Out-of-fold score on the training data (best effort - a failure is reported, not raised)
try:
    score = \
    Utils.ScoreMetric(train[target].values, oof_preds)
    Utils.PrintColor(f"\n---> OOF score = {score:.6f}", color = Fore.RED)
except Exception as err:
    Utils.PrintColor(f"\n---> OOF score unavailable: {err!r}", color = Fore.RED)

# Fill the submission with the first len(sub_fl) test predictions
sub_fl[target] = preds[0: len(sub_fl)]

Utils.PrintColor(f"\n\n---> Final submission file\n\n")
display(sub_fl.head(10).style.format(precision = 4))

# Store the out-of-fold predictions in a single column named after the model id
pd.DataFrame(oof_preds,
             index = range(len(oof_preds)),
             columns = [model_id]
             ).\
             to_parquet(os.path.join(op_path, f"OOF_Preds_{model_id}.parquet"))

# Persist the fitted AutoML object
joblib.dump(
    model, os.path.join(op_path, f"FittedModels_{model_id}.joblib")
)

print()
# List the current working directory so the generated files are visible in the cell output
!ls

# Final garbage collection and RAM usage report
Utils.PrintColor(Utils.CleanMemory());

# **LIGHTAUTOML REFERENCES**

- https://lightautoml.readthedocs.io/en/latest/ <br>
- https://github.com/sb-ai-lab/LightAutoML <br>
- https://lightautoml.readthedocs.io/en/latest/pages/tutorials/Tutorial_1_basics.html <br>
- https://www.kaggle.com/code/alexryzhkov/lightautoml-competition-baseline <br>
- https://www.kaggle.com/code/alexryzhkov/amex-lightautoml-starter <br>