# Ensemble

In [1]:
#necessary
import os
import sys
from pathlib import Path
import polars as pl#和pandas类似,但是处理大型数据集有更好的性能.
import pandas as pd#导入csv文件的库
import numpy as np#对矩阵进行科学计算的库
#kfold
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold
from sklearn.metrics import mean_squared_error as mse
#model
import lightgbm as lgb
from  lightgbm import LGBMRegressor, log_evaluation, early_stopping
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import dill#对对象进行序列化和反序列化(例如保存和加载树模型)
from sklearn.feature_extraction.text import TfidfVectorizer#将文本数据转换为tfidf特征
import re#用于正则表达式提取的库
import gc#垃圾回收的库
import matplotlib.pyplot as plt#一个强大的绘图库
import plotly.graph_objects as go
import warnings#避免一些可以忽略的报错
warnings.filterwarnings('ignore')#filterwarnings()方法是用于设置警告过滤器的方法，它可以控制警告信息的输出方式和级别。

import random#提供了一些用于生成随机数的函数
sys.path.append("/kaggle/input/um-game-playing-strength-of-mcts-variants")
import kaggle_evaluation.mcts_inference_server#MCTS比赛专用的API

pd.options.display.max_rows = None
pd.options.display.max_columns = None

class APP:
    """Run-wide configuration: run-mode flags and competition file locations.

    Every value is a class attribute computed once, when the class body runs.
    """
    # Development defaults; all three are forced to False on a submission rerun.
    small_iterations = True
    short_dataset = True
    test_full_dataset = False
    # True when the DOCKER_USING environment variable equals "LOCAL".
    local = os.environ.get("DOCKER_USING", "") == "LOCAL"
    # True when KAGGLE_IS_COMPETITION_RERUN is set to any non-empty value.
    submit = os.environ.get('KAGGLE_IS_COMPETITION_RERUN', "") != ""
    # Apply the submission override BEFORE the paths are derived, so that the
    # test / sample-submission file names always agree with test_full_dataset.
    if submit:
        small_iterations = False
        short_dataset = False
        test_full_dataset = False
    path_root = Path('/kaggle/input')
    input_path = path_root / 'um-game-playing-strength-of-mcts-variants'
    train_file = input_path / 'train.csv'
    test_file = input_path / ('test_full.csv' if test_full_dataset else 'test.csv')
    sample_subm_file = input_path / ('sample_subm_full.csv' if test_full_dataset else 'sample_submission.csv')

#设置随机种子,保证模型可以复现
def seed_everything(seed):
    """Seed NumPy's global generator and Python's built-in ``random`` module.

    Args:
        seed: Value handed to both ``np.random.seed`` and ``random.seed``.
    """
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(seed)
# Seed once at import time so the rest of the notebook starts from a fixed state.
seed_everything(seed=2024)

# Model 1: [yunsuxiaozi - mcts-starter v01 0.427](https://www.kaggle.com/code/yunsuxiaozi/mcts-starter?scriptVersionId=201806533)

In [2]:
class model_1:
    # Read the competition train and test CSVs with polars, then convert each to pandas.
    train = pl.read_csv(
        "/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv"
    ).to_pandas()
    print(f"len(train):{len(train)}")
    test = pl.read_csv(
        "/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv"
    ).to_pandas()
    print(f"len(test):{len(test)}")
    # Bare expression: the preview is computed but neither stored nor printed.
    test.head()

    class Preprocessor():
        def __init__(self,seed=2024,target='utility_agent1',train=None,num_folds=10,CV_LB_path="/kaggle/input/mcts-eda-about-cv-and-lb/1018CV_LB.csv"):
            self.seed=seed
            self.target=target
            self.train=train
            self.model_paths=[]#训练和推理的模型
            self.tfidf_paths=[]#字符串的tfidf模型
            self.num_folds=num_folds
            #检查CV和LB情况的CV,LB统计表
            #self.check=pd.read_csv(CV_LB_path)
            
        #清理df的字符串的列
        def clean(self,df,col):
            """Normalise a string column: fill missing values, lowercase, blank out punctuation.

            Args:
                df: DataFrame holding the column; ``df[col]`` is overwritten.
                col: Name of the column to clean. Every non-missing value must
                    be a ``str`` (``.lower()`` is called on each one).

            Returns:
                The same ``df`` object that was passed in.
            """
            # Each of these characters is replaced by a single space, so that
            # e.g. 'MCTS-UCB1-0.6-NST-false' becomes 'mcts ucb1 0 6 nst false'.
            ps='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            to_space=str.maketrans(ps,' '*len(ps))
            # Missing values become the literal string "nan"; lowercasing and the
            # punctuation replacement then happen in one pass over the column
            # instead of one full pass per punctuation character.
            df[col]=df[col].fillna("nan").apply(lambda x:x.lower().translate(to_space))
            return df
        
        #文本的自动可读性指数 旨在衡量文本的可理解性.输出是理解课文所需的美国年级水平的近似表示.
        #https://www.nhooo.com/note/qa0tpe.html
        #初步理解:相同词数的情况下,句子越少,说明句子相对来说会很长,越长越不容易理解.words/sentence就会越大.
        #字符数相同的情况下,词数越多,单词越短,短的单词可能简单,所以就好理解.characters/words变小.
        #数值小就好理解,数值大就不好理解.具体的公式可能用数据做过实验得出?
        def ARI(self,txt):
            """Return the Automated Readability Index of ``txt``.

            Computed as ``4.71*(characters/words) + 0.5*(words/sentence) - 21.43``:

            * ``characters`` is ``len(txt)``;
            * ``words`` is the number of pieces after splitting on space,
              newline, ``.``, ``?``, ``!`` and ``,``;
            * ``sentence`` is the number of pieces after splitting on ``.``,
              ``?`` and ``!``.

            Empty pieces are counted, so both counts are at least 1 and the
            divisions cannot hit zero (an empty string scores -20.93).

            Args:
                txt: The text to score.

            Returns:
                The score as a float.
            """
            characters=len(txt)
            # Raw-string character classes: same delimiters as before, without
            # the invalid '\,' escape the old non-raw pattern relied on.
            words=len(re.split(r'[ \n.?!,]',txt))
            sentence=len(re.split(r'[.?!]',txt))
            ari_score=4.71*(characters/words)+0.5*(words/sentence)-21.43
            return ari_score
        """
        http://www.supermagnus.com/mac/Word_Counter/index.html
        McAlpine EFLAW© Test
            (W + SW) / S
        McAlpine EFLAW© Readability
            Scale:
            1-20: Easy
            21-25: Quite Easy
            26-29: Mildly Difficult
            ≥ 30: Very Confusing
            S:total sentences
            W:total words
        """
        def McAlpine_EFLAW(self,txt):
            """Return this pipeline's McAlpine-EFLAW-style score for ``txt``.

            Computed as ``(W + S*W) / S``, where ``W`` is the number of pieces
            after splitting on space, newline, ``.``, ``?``, ``!`` and ``,``,
            and ``S`` is the number of pieces after splitting on ``.``, ``?``
            and ``!``. Empty pieces are counted, so ``S`` is at least 1.

            Args:
                txt: The text to score.

            Returns:
                The score as a float.
            """
            # Raw-string character classes: same delimiters as before, without
            # the invalid '\,' escape the old non-raw pattern relied on.
            W=len(re.split(r'[ \n.?!,]',txt))
            S=len(re.split(r'[.?!]',txt))
            # NOTE(review): the "SW" term of the referenced formula may denote a
            # short-word count rather than the product S*W used here - confirm
            # before changing, since this value is consumed as a model feature.
            mcalpine_eflaw_score=(W+S*W)/S
            return mcalpine_eflaw_score
        """
        https://readable.com/readability/coleman-liau-readability-index/

        =0.0588*L-0.296*S-15.8
        L是每100个单词有多少个字母,S是平均每100个单词有多少句子.
        """
        def CLRI(self,txt):
            """Return the Coleman-Liau readability index of ``txt``.

            Computed as ``0.0588*L - 0.296*S - 15.8`` with
            ``L = 100*characters/words`` and ``S = 100*sentence/words``:

            * ``characters`` is ``len(txt)``;
            * ``words`` is the number of pieces after splitting on space,
              newline, ``.``, ``?``, ``!`` and ``,``;
            * ``sentence`` is the number of pieces after splitting on ``.``,
              ``?`` and ``!``.

            Empty pieces are counted, so ``words`` is at least 1.

            Args:
                txt: The text to score.

            Returns:
                The score as a float.
            """
            characters=len(txt)
            # Raw-string character classes: same delimiters as before, without
            # the invalid '\,' escape the old non-raw pattern relied on.
            words=len(re.split(r'[ \n.?!,]',txt))
            sentence=len(re.split(r'[.?!]',txt))
            L=100*characters/words
            S=100*sentence/words
            clri_score=0.0588*L-0.296*S-15.8
            return clri_score
            
        #保存训练好的树模型,obj是保存的模型,path是需要保存的路径
        def pickle_dump(self,obj, path):
            """Serialise ``obj`` to ``path`` with dill.

            Args:
                obj: Object to store (for example a trained tree model).
                path: Destination file; opened in binary write mode.
            """
            with open(path, mode="wb") as sink:
                # Protocol version 4 is requested explicitly.
                dill.dump(obj, sink, protocol=4)
        def pickle_load(self,path):
            """Load and return the object stored with dill at ``path``.

            Args:
                path: File to read; opened in binary read mode.

            Returns:
                Whatever object ``dill.load`` reconstructs from the file.
            """
            # NOTE: unpickling can execute arbitrary code; only load files that
            # come from a trusted source.
            with open(path, mode="rb") as source:
                return dill.load(source)
        
        #遍历表格df的所有列修改数据类型减少内存使用
        def reduce_mem_usage(self,df, float16_as32=True):
            """Downcast the non-object, non-category columns of ``df`` and report the saving.

            Columns whose dtype name starts with ``int`` get the first of
            int8/int16/int32/int64 whose range strictly contains the column's
            min and max; if none does, the column is left unchanged. Every
            other processed column is treated as floating point: float32 (or
            float16 when ``float16_as32`` is false) if min/max lie strictly
            inside the float16 range, float32 if strictly inside the float32
            range, float64 otherwise.

            Args:
                df: DataFrame to shrink; its columns are reassigned in place.
                float16_as32: When true, values that would fit float16 are
                    stored as float32 instead, for extra precision.

            Returns:
                The same ``df`` object that was passed in.
            """
            bytes_per_mb = 1024**2
            # memory_usage() gives bytes per column (plus the index); sum and convert to MB.
            start_mem = df.memory_usage().sum() / bytes_per_mb
            print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

            # Integer targets, tried from narrowest to widest.
            int_targets = (np.int8, np.int16, np.int32, np.int64)
            for col in df.columns:
                col_type = df[col].dtype
                # Object and category columns are skipped entirely.
                if col_type == object or str(col_type) == 'category':
                    continue
                c_min, c_max = df[col].min(), df[col].max()

                if str(col_type).startswith('int'):
                    for int_type in int_targets:
                        limits = np.iinfo(int_type)
                        # Strict comparison: a value equal to a type's limit
                        # pushes the column to the next wider type.
                        if c_min > limits.min and c_max < limits.max:
                            df[col] = df[col].astype(int_type)
                            break
                    continue

                # Everything else is handled as floating point.
                half, single = np.finfo(np.float16), np.finfo(np.float32)
                if c_min > half.min and c_max < half.max:
                    float_type = np.float32 if float16_as32 else np.float16
                elif c_min > single.min and c_max < single.max:
                    float_type = np.float32
                else:
                    float_type = np.float64
                df[col] = df[col].astype(float_type)

            end_mem = df.memory_usage().sum() / bytes_per_mb
            print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
            # Relative saving compared with the starting footprint.
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

            return df
            
        def FE(self,df,mode='train'):
            print(f"FE:{mode}")

            print("agent position feature")
            #一个agent是在正方还是反方
            total_agent=['MCTS-ProgressiveHistory-0.1-MAST-false', 'MCTS-ProgressiveHistory-0.1-MAST-true', 'MCTS-ProgressiveHistory-0.1-NST-false', 'MCTS-ProgressiveHistory-0.1-NST-true', 'MCTS-ProgressiveHistory-0.1-Random200-false', 'MCTS-ProgressiveHistory-0.1-Random200-true', 'MCTS-ProgressiveHistory-0.6-MAST-false', 'MCTS-ProgressiveHistory-0.6-MAST-true', 'MCTS-ProgressiveHistory-0.6-NST-false', 'MCTS-ProgressiveHistory-0.6-NST-true', 'MCTS-ProgressiveHistory-0.6-Random200-false', 'MCTS-ProgressiveHistory-0.6-Random200-true', 'MCTS-ProgressiveHistory-1.41421356237-MAST-false', 'MCTS-ProgressiveHistory-1.41421356237-MAST-true', 'MCTS-ProgressiveHistory-1.41421356237-NST-false', 'MCTS-ProgressiveHistory-1.41421356237-NST-true', 'MCTS-ProgressiveHistory-1.41421356237-Random200-false', 'MCTS-ProgressiveHistory-1.41421356237-Random200-true', 'MCTS-UCB1-0.1-MAST-false', 'MCTS-UCB1-0.1-MAST-true', 'MCTS-UCB1-0.1-NST-false', 'MCTS-UCB1-0.1-NST-true', 'MCTS-UCB1-0.1-Random200-false', 'MCTS-UCB1-0.1-Random200-true', 'MCTS-UCB1-0.6-MAST-false', 'MCTS-UCB1-0.6-MAST-true', 'MCTS-UCB1-0.6-NST-false', 'MCTS-UCB1-0.6-NST-true', 'MCTS-UCB1-0.6-Random200-false', 'MCTS-UCB1-0.6-Random200-true', 'MCTS-UCB1-1.41421356237-MAST-false', 'MCTS-UCB1-1.41421356237-MAST-true', 'MCTS-UCB1-1.41421356237-NST-false', 'MCTS-UCB1-1.41421356237-NST-true', 'MCTS-UCB1-1.41421356237-Random200-false', 'MCTS-UCB1-1.41421356237-Random200-true', 'MCTS-UCB1GRAVE-0.1-MAST-false', 'MCTS-UCB1GRAVE-0.1-MAST-true', 'MCTS-UCB1GRAVE-0.1-NST-false', 'MCTS-UCB1GRAVE-0.1-NST-true', 'MCTS-UCB1GRAVE-0.1-Random200-false', 'MCTS-UCB1GRAVE-0.1-Random200-true', 'MCTS-UCB1GRAVE-0.6-MAST-false', 'MCTS-UCB1GRAVE-0.6-MAST-true', 'MCTS-UCB1GRAVE-0.6-NST-false', 'MCTS-UCB1GRAVE-0.6-NST-true', 'MCTS-UCB1GRAVE-0.6-Random200-false', 'MCTS-UCB1GRAVE-0.6-Random200-true', 'MCTS-UCB1GRAVE-1.41421356237-MAST-false', 'MCTS-UCB1GRAVE-1.41421356237-MAST-true', 'MCTS-UCB1GRAVE-1.41421356237-NST-false', 
'MCTS-UCB1GRAVE-1.41421356237-NST-true', 'MCTS-UCB1GRAVE-1.41421356237-Random200-false', 'MCTS-UCB1GRAVE-1.41421356237-Random200-true', 'MCTS-UCB1Tuned-0.1-MAST-false', 'MCTS-UCB1Tuned-0.1-MAST-true', 'MCTS-UCB1Tuned-0.1-NST-false', 'MCTS-UCB1Tuned-0.1-NST-true', 'MCTS-UCB1Tuned-0.1-Random200-false', 'MCTS-UCB1Tuned-0.1-Random200-true', 'MCTS-UCB1Tuned-0.6-MAST-false', 'MCTS-UCB1Tuned-0.6-MAST-true', 'MCTS-UCB1Tuned-0.6-NST-false', 'MCTS-UCB1Tuned-0.6-NST-true', 'MCTS-UCB1Tuned-0.6-Random200-false', 'MCTS-UCB1Tuned-0.6-Random200-true', 'MCTS-UCB1Tuned-1.41421356237-MAST-false', 'MCTS-UCB1Tuned-1.41421356237-MAST-true', 'MCTS-UCB1Tuned-1.41421356237-NST-false', 'MCTS-UCB1Tuned-1.41421356237-NST-true', 'MCTS-UCB1Tuned-1.41421356237-Random200-false', 'MCTS-UCB1Tuned-1.41421356237-Random200-true']
            agent1,agent2=df['agent1'].values,df['agent2'].values
            for i in range(len(total_agent)):
                value=np.zeros(len(df))
                for j in range(len(df)):#第i个agent在第j个数据中的状态
                    if agent1[j]==total_agent[i]:#如果第j个数据的agent1是agent[i]
                        value[j]+=1
                    elif agent2[j]==total_agent[i]:#如果第j个数据的agent1是agent[i]
                        value[j]-=1
                df[f'agent_{total_agent[i]}']=value

            
            df['area']=df['NumRows']*df['NumColumns']
            df['row_equal_col']=(df['NumColumns']==df['NumRows']).astype(np.int8)
            df['Playouts/Moves'] = df['PlayoutsPerSecond'] / (df['MovesPerSecond'] + 1e-15)
            df['EfficiencyPerPlayout'] = df['MovesPerSecond'] / (df['PlayoutsPerSecond'] + 1e-15)
            df['TurnsDurationEfficiency'] = df['DurationActions'] / (df['DurationTurnsStdDev'] + 1e-15)
            df['AdvantageBalanceRatio'] = df['AdvantageP1'] / (df['Balance'] + 1e-15)
            df['ActionTimeEfficiency'] = df['DurationActions'] / (df['MovesPerSecond'] + 1e-15)
            df['StandardizedTurnsEfficiency'] = df['DurationTurnsStdDev'] / (df['DurationActions'] + 1e-15)
            df['AdvantageTimeImpact'] = df['AdvantageP1'] / (df['DurationActions'] + 1e-15)
            df['DurationToComplexityRatio'] = df['DurationActions'] / (df['StateTreeComplexity'] + 1e-15)
            df['NormalizedGameTreeComplexity'] =  df['GameTreeComplexity'] /  (df['StateTreeComplexity'] + 1e-15)
            df['ComplexityBalanceInteraction'] =  df['Balance'] *  df['GameTreeComplexity']
            df['OverallComplexity'] =  df['StateTreeComplexity'] +  df['GameTreeComplexity']
            df['ComplexityPerPlayout'] =  df['GameTreeComplexity'] /  (df['PlayoutsPerSecond'] + 1e-15)
            df['TurnsNotTimeouts/Moves'] = df['DurationTurnsNotTimeouts'] / (df['MovesPerSecond'] + 1e-15)
            df['Timeouts/DurationActions'] = df['Timeouts'] / (df['DurationActions'] + 1e-15)
            df['OutcomeUniformity/AdvantageP1'] = df['OutcomeUniformity'] / (df['AdvantageP1'] + 1e-15)
            df['ComplexDecisionRatio'] = df['StepDecisionToEnemy'] + df['SlideDecisionToEnemy'] + df['HopDecisionMoreThanOne']
            df['AggressiveActionsRatio'] = df['StepDecisionToEnemy'] + df['HopDecisionEnemyToEnemy'] + df['HopDecisionFriendToEnemy'] + df['SlideDecisionToEnemy']
            # THE MOST IMPORTANT FEATURES ARE FROM BELOW
            #new 3 features
            df['ActionToComplexityRatio'] = df['DurationActions'] / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)
            df['PlayoutToAdvantageRatio'] = df['PlayoutsPerSecond'] / (df['AdvantageP1'] + 1e-15)
            df['BalancedEfficiency'] = (df['Balance'] * df['DurationActions']) / (df['GameTreeComplexity'] + 1e-15)
            #new 2 features
            df['ComplexityToMovesRatio'] = (df['GameTreeComplexity'] + df['StateTreeComplexity']) / (df['MovesPerSecond'] + 1e-15)
            df['BalancePerAction'] = df['Balance'] / (df['DurationActions'] + df['MovesPerSecond'] + 1e-15)
            #new 2 features
            df['WeightedActionEfficiency'] = df['DurationActions'] / ((df['GameTreeComplexity'] + df['StateTreeComplexity']) * (df['DurationTurns'] + 1e-15))
            df['NormalizedPlayoutsEfficiency'] = df['PlayoutsPerSecond'] / ((df['AdvantageP1'] * df['Balance']) + df['DurationActions'] + 1e-15)

            df['EfficiencyWeightedComplexity'] = (df['DurationActions'] * df['MovesPerSecond']) / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)
            df['BalancedComplexityAdvantage'] = (df['Balance'] * df['AdvantageP1']) / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)

            # THE BAD FEATURES ARE FROM BELOW

#####################################################################################################
#df['ActionEfficiencyPerArea'] = (df['DurationActions'] / df['MovesPerSecond']) / (df['area'] + 1e-15)
#df['AdvantageToComplexity'] = df['AdvantageP1'] / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)
                        #df['AdvantageWeightedEfficiency'] = (df['AdvantageP1'] * df['MovesPerSecond']) / (df['OverallComplexity'] + 1e-15)
            #df['TimeoutToEfficiencyRatio'] = df['Timeouts'] / (df['MovesPerSecond'] + df['PlayoutsPerSecond'] + 1e-15)
            #df['DynamicBalanceEfficiency'] = (df['Balance'] * df['DurationTurns']) / (df['MovesPerSecond'] + 1e-15)
            #df['AdvantagePerMoveEfficiency'] = df['AdvantageP1'] / (df['MovesPerSecond'] + 1e-15)
            #df['StandardizedActionBalance'] = (df['Balance'] * df['DurationActions']) / (df['OverallComplexity'] + df['MovesPerSecond'] + 1e-15)
            #df['AdjustedGameEfficiency'] = (df['PlayoutsPerSecond'] * df['DurationActions']) / (df['GameTreeComplexity'] + df['AdvantageP1'] + 1e-15)
            #df['TimeoutPenaltyEfficiency'] = df['Timeouts'] / (df['MovesPerSecond'] + df['PlayoutsPerSecond'] + 1e-15)
            #df['OutcomeToComplexityRatio'] = df['OutcomeUniformity'] / (df['OverallComplexity'] + 1e-15)

            #df['StandardizedActionBalance'] = (df['Balance'] * df['DurationActions']) / (df['OverallComplexity'] + df['MovesPerSecond'] + 1e-15)
            #df['AdjustedGameEfficiency'] = (df['PlayoutsPerSecond'] * df['DurationActions']) / (df['GameTreeComplexity'] + df['AdvantageP1'] + 1e-15)

            #df['BalanceImpactOnPlayouts'] = (df['Balance'] * df['PlayoutsPerSecond']) / (df['AdvantageP1'] + 1e-15)
            #df['ActionEfficiencyPerTurn'] = (df['DurationActions'] / (df['DurationTurns'] + 1e-15))

            #df['ComplexityBalanceImpact'] = (df['GameTreeComplexity'] + df['StateTreeComplexity']) / (df['Balance'] + 1e-15)
            #df['TimeWeightedComplexity'] = (df['GameTreeComplexity'] + df['StateTreeComplexity']) * df['DurationTurns']
            #df['EfficiencyToAdvantage'] = (df['DurationActions'] * df['PlayoutsPerSecond']) / (df['AdvantageP1'] + 1e-15)
            #df['AdvantageWeightedBalance'] = df['AdvantageP1'] * df['Balance'] / (df['PlayoutsPerSecond'] + 1e-15)
            #df['TurnWeightedComplexity'] = (df['GameTreeComplexity'] + df['StateTreeComplexity']) / (df['DurationTurns'] + 1e-15)
            #df['ComplexityBalancedImpact'] = df['Balance'] * (df['GameTreeComplexity'] + df['StateTreeComplexity']) / (df['DurationActions'] + 1e-15)
            #df['BalanceComplexityRatio'] = df['Balance'] / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)
            #df['PlayoutsToComplexityRatio'] = df['PlayoutsPerSecond'] / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)

            #df['EfficiencyWeightedAdvantage'] = (df['PlayoutsPerSecond'] * df['DurationActions']) / (df['AdvantageP1'] + df['GameTreeComplexity'] + 1e-15)
            #df['PlayoutsToComplexityRatio'] = df['PlayoutsPerSecond'] / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)

            #df['StabilityPlayoutEfficiency'] = (df['Balance'] * df['PlayoutsPerSecond']) / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)
            #df['NormalizedAdvantageToAction'] = df['AdvantageP1'] / (df['DurationActions'] + df['MovesPerSecond'] + 1e-15)

            #df['OverallEfficiency'] = (df['DurationActions'] / df['MovesPerSecond']) * (df['PlayoutsPerSecond'] / (df['GameTreeComplexity'] + 1e-15))
            #df['ComplexityToMovesRatio'] = (df['GameTreeComplexity'] + df['StateTreeComplexity']) / (df['MovesPerSecond'] + 1e-15)
            #df['BalancePerAction'] = df['Balance'] / (df['DurationActions'] + df['MovesPerSecond'] + 1e-15)
            
            #df['DensityBalanceImpact'] = df['Balance'] / (df['area'] + 1e-15)
            #df['AdvantageToDensity'] = df['AdvantageP1'] / (df['area'] + 1e-15)

            #df['InteractionImpact'] = df['StepDecisionToEnemy'] / (df['OverallComplexity'] + 1e-15)
            #df['StabilityFactor'] = df['OutcomeUniformity'] / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)
            #df['PlayoutsBalanceEfficiency'] = df['PlayoutsPerSecond'] * df['Balance'] / (df['GameTreeComplexity'] + 1e-15)

            #df['EfficiencyVariance'] = df['DurationTurnsStdDev'] / (df['MovesPerSecond'] + 1e-15)
            #df['TimeoutWeightedComplexity'] = df['Timeouts'] / (df['OverallComplexity'] + 1e-15)
            #df['BalancedAggressionEfficiency'] = df['AggressiveActionsRatio'] * df['Balance'] / (df['OutcomeUniformity'] + 1e-15)

#bad features' archiv        
#df['ControlEfficiency'] = (df['Balance'] * df['AdvantageP1']) / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)
#df['BalanceToActionRatio'] = df['Balance'] / (df['DurationActions'] + 1e-15)
#df['AggressionToAdvantageRatio'] = df['InteractionIndex'] / (df['AdvantageP1'] + 1e-15)
#df['DynamicPlayoutEfficiency'] = df['PlayoutsPerSecond'] / (df['DurationTurns'] * (df['GameTreeComplexity'] + 1e-15))
#df['PlayoutToTreeRatio'] = df['PlayoutsPerSecond'] / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)
#df['CumulativeControl'] = (df['Balance'] * df['DurationTurns']) / (df['GameTreeComplexity'] + 1e-15)
#df['TimeToActionEfficiency'] = df['DurationActions'] / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + df['DurationTurns'] + 1e-15)
#df['PlayoutToTreeRatio'] = df['PlayoutsPerSecond'] / (df['GameTreeComplexity'] + df['StateTreeComplexity'] + 1e-15)
#df['NormalizedAdvantage'] = df['AdvantageP1'] / (df['DurationActions'] + 1e-15) 
#df['EfficiencyToUniformity'] = (df['DurationActions'] / (df['MovesPerSecond'] + 1e-15)) / (df['OutcomeUniformity'] + 1e-15)
#df['ComplexityAdvantageEfficiency'] = df['AdvantageP1'] / (df['OverallComplexity'] + 1e-15)
#df['TimeoutToActionRatio'] = df['Timeouts'] / (df['DurationActions'] + df['MovesPerSecond'] + 1e-15)
#df['ComplexityTurnRatio'] = (df['StateTreeComplexity'] + df['GameTreeComplexity']) / (df['DurationTurns'] + 1e-15)
#df['AggressionUniformityImpact'] = df['AggressiveActionsRatio'] / (df['OutcomeUniformity'] + 1e-15)
#df['AdjustedPlayoutAdvantage'] = df['PlayoutsPerSecond'] / (df['AdvantageP1'] * df['Balance'] + 1e-15)
#df['ComplexActionDensity'] = (df['GameTreeComplexity'] + df['StateTreeComplexity']) / (df['DurationActions'] + 1e-15)
#df['RelativeTimeoutImpact'] = df['Timeouts'] / (df['GameTreeComplexity'] + df['DurationActions'] + 1e-15)


            print("deal with outliers")
            #df['PlayoutsPerSecond']=df['PlayoutsPerSecond'].clip(0,25000)
            #df['MovesPerSecond']=df['MovesPerSecond'].clip(0,1000000)
            #df['PlayoutsPerSecond'] = np.log1p(df['PlayoutsPerSecond'])
            #df['MovesPerSecond'] = np.log1p(df['MovesPerSecond'])
            df['PlayoutsPerSecond'] = (df['PlayoutsPerSecond'] - df['PlayoutsPerSecond'].mean()) / df['PlayoutsPerSecond'].std()
            df['MovesPerSecond'] = (df['MovesPerSecond'] - df['MovesPerSecond'].mean()) / df['MovesPerSecond'].std()
            
            print("agent1 agent2 feature")
            cols=['selection','exploration_const','playout','score_bounds']
            for i in range(len(cols)):
                for j in range(2):
                    df[f'{cols[i]}{j+1}']=df[f'agent{j+1}'].apply(lambda x:x.split('-')[i+1])
            

            print(f"one_hot_encoder")
            #训练集上nunique 大于2小于10的cols
            onehot_cols=[['NumOffDiagonalDirections', [0.0, 4.82, 2.0, 5.18, 3.08, 0.06]], ['NumLayers', [1, 0, 4, 5]], ['NumPhasesBoard', [3, 2, 1, 5, 4]], ['NumContainers', [1, 4, 3, 2]], ['NumDice', [0, 2, 1, 4, 6, 3, 5, 7]], ['ProposeDecisionFrequency', [0.0, 0.05, 0.01]], ['PromotionDecisionFrequency', [0.0, 0.01, 0.03, 0.02, 0.11, 0.05, 0.04]], ['SlideDecisionToFriendFrequency', [0.0, 0.19, 0.06]], ['LeapDecisionToEnemyFrequency', [0.0, 0.04, 0.01, 0.02, 0.07, 0.03, 0.14, 0.08]], ['HopDecisionFriendToFriendFrequency', [0.0, 0.13, 0.09]], ['HopDecisionEnemyToEnemyFrequency', [0.0, 0.01, 0.2, 0.03]], ['HopDecisionFriendToEnemyFrequency', [0.0, 0.01, 0.09, 0.25, 0.02]], ['FromToDecisionFrequency', [0.0, 0.38, 1.0, 0.31, 0.94, 0.67]], ['ProposeEffectFrequency', [0.0, 0.01, 0.03]], ['PushEffectFrequency', [0.0, 0.5, 0.96, 0.25]], ['FlipFrequency', [0.0, 0.87, 1.0, 0.96]], ['SetCountFrequency', [0.0, 0.62, 0.54, 0.02]], ['DirectionCaptureFrequency', [0.0, 0.55, 0.54]], ['EncloseCaptureFrequency', [0.0, 0.08, 0.1, 0.07, 0.12, 0.02, 0.09]], ['InterveneCaptureFrequency', [0.0, 0.01, 0.14, 0.04]], ['SurroundCaptureFrequency', [0.0, 0.01, 0.03, 0.02]], ['NumPlayPhase', [1, 2, 3, 4, 5, 6, 7, 8]], ['LineLossFrequency', [0.0, 0.96, 0.87, 0.46, 0.26, 0.88, 0.94]], ['ConnectionEndFrequency', [0.0, 0.19, 1.0, 0.23, 0.94, 0.35, 0.97]], ['ConnectionLossFrequency', [0.0, 0.54, 0.78]], ['GroupEndFrequency', [0.0, 1.0, 0.11, 0.79]], ['GroupWinFrequency', [0.0, 0.11, 1.0]], ['LoopEndFrequency', [0.0, 0.14, 0.66]], ['LoopWinFrequency', [0.0, 0.14, 0.66]], ['PatternEndFrequency', [0.0, 0.63, 0.35]], ['PatternWinFrequency', [0.0, 0.63, 0.35]], ['NoTargetPieceWinFrequency', [0.0, 0.72, 0.77, 0.95, 0.32, 1.0]], ['EliminatePiecesLossFrequency', [0.0, 0.85, 0.96, 0.68]], ['EliminatePiecesDrawFrequency', [0.0, 0.03, 0.91, 1.0, 0.36, 0.86]], ['NoOwnPiecesLossFrequency', [0.0, 1.0, 0.68]], ['FillEndFrequency', [0.0, 1.0, 0.04, 0.01, 0.99, 0.72]], ['FillWinFrequency', [0.0, 1.0, 0.04, 0.01, 
0.99]], ['ReachDrawFrequency', [0.0, 0.9, 0.98]], ['ScoringLossFrequency', [0.0, 0.6, 0.62]], ['NoMovesLossFrequency', [0.0, 1.0, 0.13, 0.06]], ['NoMovesDrawFrequency', [0.0, 0.01, 0.04, 0.03, 0.22]], ['BoardSitesOccupiedChangeNumTimes', [0.0, 0.06, 0.42, 0.12, 0.14, 0.94]], ['BranchingFactorChangeNumTimesn', [0.0, 0.3, 0.02, 0.07, 0.04, 0.13, 0.01, 0.21, 0.03]], ['PieceNumberChangeNumTimes', [0.0, 0.06, 0.42, 0.12, 0.14, 1.0]], ['selection1', ['ProgressiveHistory', 'UCB1', 'UCB1GRAVE', 'UCB1Tuned']], ['selection2', ['ProgressiveHistory', 'UCB1GRAVE', 'UCB1', 'UCB1Tuned']], ['exploration_const1', ['0.1', '0.6', '1.41421356237']], ['exploration_const2', ['0.6', '0.1', '1.41421356237']], ['playout1', ['MAST', 'NST', 'Random200']], ['playout2', ['Random200', 'NST', 'MAST']]]
            for col,unique in onehot_cols:
                for u in unique:
                    df[f'{col}_{u}']=(df[col]==u).astype(np.int8)
                    
                    
            print("deal with LudRules") 
            print("1:drop game")
            #注意到LudRules列开头都是'(game "'
            #双引号到下一个双引号之间是游戏的名称(有的游戏名称和‘GameRulesetName’列不一样)
            #考虑到测试数据有不同的游戏,游戏名称做特征CV会不准,故drop掉游戏名称
            #即:rule去掉'(game "',然后找第一个双引号,并用双引号后面的内容在前面加上'('作为新的rule.
            def drop_gamename(rule):
                """Strip the leading ``(game "<name>"`` header from a rule string.

                The first ``len('(game "')`` characters are discarded without being
                checked, then everything up to and including the next double quote
                is removed.

                Args:
                    rule: Raw rule text (str).

                Returns:
                    The text that follows the first double quote found after the
                    discarded prefix, or ``None`` when there is no such quote.
                """
                # str.partition performs the quote search in one pass and tells us
                # (via the separator slot) whether a quote was actually found.
                _, closing_quote, remainder = rule[len('(game "'):].partition('"')
                return remainder if closing_quote else None
            df['LudRules']=df['LudRules'].apply(lambda x:drop_gamename(x))

            print("2:player")
            def get_player(rule):
                """Return the prefix of ``rule`` ending where brackets first balance.

                Characters are scanned left to right. ``(`` and ``{`` open a level,
                ``)`` and ``}`` close one; the kind of bracket is not matched against
                its partner. The scan stops at the first closing bracket that leaves
                no level open (a closing bracket seen with nothing open also stops it).

                Args:
                    rule: Rule text (str).

                Returns:
                    ``rule`` up to and including that closing bracket, or ``None``
                    when no such closing bracket exists.
                """
                # A depth counter is enough because bracket kinds are never compared;
                # slicing once avoids rebuilding the prefix character by character.
                depth=0
                for end,ch in enumerate(rule,start=1):
                    if ch in '({':
                        depth+=1
                    elif ch in ')}':
                        if depth>0:
                            depth-=1
                        if depth==0:#stop the first time nothing is left open
                            return rule[:end]
                return None
            df['player']=df['LudRules'].apply(lambda rule:get_player(rule))
            df=self.clean(df,'player')
            df['player_len']=df['player'].apply(len)
            df['LudRules']=[rule[len(player):] for player,rule in zip(df['player'],df['LudRules'])]
            df.drop(['player'],axis=1,inplace=True)
            
            print("Rules readable")
            for rule in ['EnglishRules', 'LudRules']:
                df[rule+"_ARI"]=df[rule].apply(lambda x:self.ARI(x))
                df[rule+"CLRI"]=df[rule].apply(lambda x:self.CLRI(x))
                df[rule+"McAlpine_EFLAW"]=df[rule].apply(lambda x:self.McAlpine_EFLAW(x))
                    
            df['PlayoutsPerSecond/MovesPerSecond']=df['PlayoutsPerSecond']/df['MovesPerSecond']
            
            #模型筛选出的无用特征,出现频率少于1%
            drop_cols=['Cooperation', 'Team', 'TriangleShape', 'DiamondShape', 'SpiralShape', 'StarShape', 'SquarePyramidalShape', 'SemiRegularTiling', 'CircleTiling', 'SpiralTiling', 'MancalaThreeRows', 'MancalaSixRows', 'MancalaCircular', 'AlquerqueBoardWithOneTriangle', 'AlquerqueBoardWithTwoTriangles', 'AlquerqueBoardWithFourTriangles', 'AlquerqueBoardWithEightTriangles', 'ThreeMensMorrisBoard', 'ThreeMensMorrisBoardWithTwoTriangles', 'NineMensMorrisBoard', 'StarBoard', 'PachisiBoard', 'Boardless', 'NumColumns', 'NumCorners', 'NumOffDiagonalDirections', 'NumLayers', 'NumCentreSites', 'NumConvexCorners', 'NumPhasesBoard', 'NumContainers', 'Piece', 'PieceValue', 'PieceRotation', 'PieceDirection', 'LargePiece', 'Tile', 'NumComponentsType', 'NumDice', 'OpeningContract', 'SwapOption', 'Repetition', 'TurnKo', 'PositionalSuperko', 'AutoMove', 'InitialRandomPlacement', 'InitialScore', 'InitialCost', 'Moves', 'VoteDecision', 'SwapPlayersDecision', 'SwapPlayersDecisionFrequency', 'ProposeDecision', 'ProposeDecisionFrequency', 'PromotionDecisionFrequency', 'RotationDecision', 'RotationDecisionFrequency', 'StepDecisionToFriend', 'StepDecisionToFriendFrequency', 'StepDecisionToEnemy', 'SlideDecisionToEnemy', 'SlideDecisionToEnemyFrequency', 'SlideDecisionToFriend', 'SlideDecisionToFriendFrequency', 'LeapDecision', 'LeapDecisionFrequency', 'LeapDecisionToEmpty', 'LeapDecisionToEmptyFrequency', 'LeapDecisionToEnemy', 'LeapDecisionToEnemyFrequency', 'HopDecisionFriendToEmpty', 'HopDecisionFriendToEmptyFrequency', 'HopDecisionFriendToFriendFrequency', 'HopDecisionEnemyToEnemy', 'HopDecisionEnemyToEnemyFrequency', 'HopDecisionFriendToEnemy', 'HopDecisionFriendToEnemyFrequency', 'FromToDecisionFrequency', 'FromToDecisionEnemy', 'FromToDecisionEnemyFrequency', 'FromToDecisionFriend', 'SwapPiecesDecision', 'SwapPiecesDecisionFrequency', 'ShootDecision', 'ShootDecisionFrequency', 'VoteEffect', 'SwapPlayersEffect', 'PassEffect', 'ProposeEffect', 'ProposeEffectFrequency', 
'AddEffectFrequency', 'SowFrequency', 'SowCapture', 'SowCaptureFrequency', 'SowRemove', 'SowBacktracking', 'SowBacktrackingFrequency', 'SowProperties', 'SowOriginFirst', 'SowCCW', 'PromotionEffectFrequency', 'PushEffect', 'PushEffectFrequency', 'Flip', 'FlipFrequency', 'SetNextPlayer', 'SetValue', 'SetValueFrequency', 'SetCount', 'SetCountFrequency', 'SetRotation', 'SetRotationFrequency', 'StepEffect', 'SlideEffect', 'LeapEffect', 'ByDieMove', 'MaxDistance', 'ReplacementCaptureFrequency', 'HopCaptureMoreThanOne', 'DirectionCapture', 'DirectionCaptureFrequency', 'EncloseCaptureFrequency', 'CustodialCapture', 'CustodialCaptureFrequency', 'InterveneCapture', 'InterveneCaptureFrequency', 'SurroundCapture', 'SurroundCaptureFrequency', 'CaptureSequence', 'CaptureSequenceFrequency', 'Group', 'Loop', 'Pattern', 'PathExtent', 'Territory', 'Fill', 'CanNotMove', 'Threat', 'CountPiecesMoverComparison', 'ProgressCheck', 'RotationalDirection', 'SameLayerDirection', 'ForwardDirection', 'BackwardDirection', 'BackwardsDirection', 'LeftwardDirection', 'RightwardsDirection', 'LeftwardsDirection', 'ForwardLeftDirection', 'ForwardRightDirection', 'BackwardLeftDirection', 'BackwardRightDirection', 'SameDirection', 'OppositeDirection', 'NumPlayPhase', 'LineLoss', 'LineLossFrequency', 'LineDraw', 'ConnectionEnd', 'ConnectionEndFrequency', 'ConnectionWinFrequency', 'ConnectionLoss', 'ConnectionLossFrequency', 'GroupEnd', 'GroupEndFrequency', 'GroupWin', 'GroupWinFrequency', 'GroupLoss', 'GroupDraw', 'LoopEnd', 'LoopEndFrequency', 'LoopWin', 'LoopWinFrequency', 'LoopLoss', 'PatternEnd', 'PatternEndFrequency', 'PatternWin', 'PatternWinFrequency', 'PathExtentEnd', 'PathExtentWin', 'PathExtentLoss', 'TerritoryEnd', 'TerritoryWin', 'TerritoryWinFrequency', 'Checkmate', 'CheckmateWin', 'NoTargetPieceEndFrequency', 'NoTargetPieceWin', 'NoTargetPieceWinFrequency', 'EliminatePiecesLoss', 'EliminatePiecesLossFrequency', 'EliminatePiecesDraw', 'EliminatePiecesDrawFrequency', 'NoOwnPiecesEnd', 
'NoOwnPiecesWin', 'NoOwnPiecesLoss', 'NoOwnPiecesLossFrequency', 'FillEnd', 'FillEndFrequency', 'FillWin', 'FillWinFrequency', 'ReachWin', 'ReachLoss', 'ReachLossFrequency', 'ReachDraw', 'ReachDrawFrequency', 'ScoringLoss', 'ScoringLossFrequency', 'ScoringDraw', 'NoMovesLoss', 'NoMovesDrawFrequency', 'NoProgressEnd', 'NoProgressEndFrequency', 'NoProgressDraw', 'NoProgressDrawFrequency', 'BoardCoverageFull', 'BoardSitesOccupiedChangeNumTimes', 'BranchingFactorChangeLineBestFit', 'BranchingFactorChangeNumTimesn', 'DecisionFactorChangeNumTimes', 'MoveDistanceChangeSign', 'MoveDistanceChangeLineBestFit', 'PieceNumberChangeNumTimes', 'PieceNumberMaxIncrease', 'ScoreDifferenceMedian', 'ScoreDifferenceVariance', 'ScoreDifferenceChangeAverage', 'ScoreDifferenceChangeSign', 'ScoreDifferenceChangeLineBestFit', 'Math', 'Division', 'Modulo', 'Absolute', 'Exponentiation', 'Minimum', 'Maximum', 'Even', 'Odd', 'Visual', 'GraphStyle', 'MancalaStyle', 'PenAndPaperStyle', 'ShibumiStyle', 'BackgammonStyle', 'JanggiStyle', 'XiangqiStyle', 'ShogiStyle', 'TableStyle', 'SurakartaStyle', 'NoBoard', 'ChessComponent', 'KingComponent', 'QueenComponent', 'KnightComponent', 'RookComponent', 'BishopComponent', 'PawnComponent', 'FairyChessComponent', 'PloyComponent', 'ShogiComponent', 'XiangqiComponent', 'StrategoComponent', 'JanggiComponent', 'TaflComponent', 'StackType', 'Stack', 'ShowPieceValue', 'ShowPieceState', 'Implementation', 'StateType', 'StackState', 'VisitedSites', 'InternalCounter', 'SetInternalCounter', 'Efficiency', 'NumOffDiagonalDirections_0.0', 'NumOffDiagonalDirections_4.82', 'NumOffDiagonalDirections_2.0', 'NumOffDiagonalDirections_5.18', 'NumOffDiagonalDirections_3.08', 'NumOffDiagonalDirections_0.06', 'NumLayers_1', 'NumLayers_0', 'NumLayers_4', 'NumLayers_5', 'NumPhasesBoard_1', 'NumPhasesBoard_5', 'NumDice_0', 'NumDice_2', 'NumDice_6', 'NumDice_3', 'NumDice_5', 'NumDice_7', 'ProposeDecisionFrequency_0.0', 'ProposeDecisionFrequency_0.05', 'ProposeDecisionFrequency_0.01', 
'PromotionDecisionFrequency_0.0', 'PromotionDecisionFrequency_0.01', 'PromotionDecisionFrequency_0.03', 'PromotionDecisionFrequency_0.02', 'PromotionDecisionFrequency_0.11', 'PromotionDecisionFrequency_0.05', 'PromotionDecisionFrequency_0.04', 'SlideDecisionToFriendFrequency_0.0', 'SlideDecisionToFriendFrequency_0.19', 'SlideDecisionToFriendFrequency_0.06', 'LeapDecisionToEnemyFrequency_0.0', 'LeapDecisionToEnemyFrequency_0.04', 'LeapDecisionToEnemyFrequency_0.01', 'LeapDecisionToEnemyFrequency_0.02', 'LeapDecisionToEnemyFrequency_0.07', 'LeapDecisionToEnemyFrequency_0.03', 'LeapDecisionToEnemyFrequency_0.14', 'LeapDecisionToEnemyFrequency_0.08', 'HopDecisionFriendToFriendFrequency_0.0', 'HopDecisionFriendToFriendFrequency_0.13', 'HopDecisionFriendToFriendFrequency_0.09', 'HopDecisionEnemyToEnemyFrequency_0.0', 'HopDecisionEnemyToEnemyFrequency_0.01', 'HopDecisionEnemyToEnemyFrequency_0.2', 'HopDecisionEnemyToEnemyFrequency_0.03', 'HopDecisionFriendToEnemyFrequency_0.0', 'HopDecisionFriendToEnemyFrequency_0.01', 'HopDecisionFriendToEnemyFrequency_0.09', 'HopDecisionFriendToEnemyFrequency_0.25', 'HopDecisionFriendToEnemyFrequency_0.02', 'FromToDecisionFrequency_0.0', 'FromToDecisionFrequency_0.38', 'FromToDecisionFrequency_1.0', 'FromToDecisionFrequency_0.31', 'FromToDecisionFrequency_0.94', 'FromToDecisionFrequency_0.67', 'ProposeEffectFrequency_0.0', 'ProposeEffectFrequency_0.01', 'ProposeEffectFrequency_0.03', 'PushEffectFrequency_0.0', 'PushEffectFrequency_0.5', 'PushEffectFrequency_0.96', 'PushEffectFrequency_0.25', 'FlipFrequency_0.0', 'FlipFrequency_0.87', 'FlipFrequency_1.0', 'FlipFrequency_0.96', 'SetCountFrequency_0.0', 'SetCountFrequency_0.62', 'SetCountFrequency_0.54', 'SetCountFrequency_0.02', 'DirectionCaptureFrequency_0.0', 'DirectionCaptureFrequency_0.55', 'DirectionCaptureFrequency_0.54', 'EncloseCaptureFrequency_0.0', 'EncloseCaptureFrequency_0.08', 'EncloseCaptureFrequency_0.1', 'EncloseCaptureFrequency_0.07', 'EncloseCaptureFrequency_0.12', 
'EncloseCaptureFrequency_0.02', 'EncloseCaptureFrequency_0.09', 'InterveneCaptureFrequency_0.0', 'InterveneCaptureFrequency_0.01', 'InterveneCaptureFrequency_0.14', 'InterveneCaptureFrequency_0.04', 'SurroundCaptureFrequency_0.0', 'SurroundCaptureFrequency_0.01', 'SurroundCaptureFrequency_0.03', 'SurroundCaptureFrequency_0.02', 'NumPlayPhase_3', 'NumPlayPhase_4', 'NumPlayPhase_5', 'NumPlayPhase_6', 'NumPlayPhase_7', 'NumPlayPhase_8', 'LineLossFrequency_0.0', 'LineLossFrequency_0.96', 'LineLossFrequency_0.87', 'LineLossFrequency_0.46', 'LineLossFrequency_0.26', 'LineLossFrequency_0.88', 'LineLossFrequency_0.94', 'ConnectionEndFrequency_0.0', 'ConnectionEndFrequency_0.19', 'ConnectionEndFrequency_1.0', 'ConnectionEndFrequency_0.23', 'ConnectionEndFrequency_0.94', 'ConnectionEndFrequency_0.35', 'ConnectionEndFrequency_0.97', 'ConnectionLossFrequency_0.0', 'ConnectionLossFrequency_0.54', 'ConnectionLossFrequency_0.78', 'GroupEndFrequency_0.0', 'GroupEndFrequency_1.0', 'GroupEndFrequency_0.11', 'GroupEndFrequency_0.79', 'GroupWinFrequency_0.0', 'GroupWinFrequency_0.11', 'GroupWinFrequency_1.0', 'LoopEndFrequency_0.0', 'LoopEndFrequency_0.14', 'LoopEndFrequency_0.66', 'LoopWinFrequency_0.0', 'LoopWinFrequency_0.14', 'LoopWinFrequency_0.66', 'PatternEndFrequency_0.0', 'PatternEndFrequency_0.63', 'PatternEndFrequency_0.35', 'PatternWinFrequency_0.0', 'PatternWinFrequency_0.63', 'PatternWinFrequency_0.35', 'NoTargetPieceWinFrequency_0.0', 'NoTargetPieceWinFrequency_0.72', 'NoTargetPieceWinFrequency_0.77', 'NoTargetPieceWinFrequency_0.95', 'NoTargetPieceWinFrequency_0.32', 'NoTargetPieceWinFrequency_1.0', 'EliminatePiecesLossFrequency_0.0', 'EliminatePiecesLossFrequency_0.85', 'EliminatePiecesLossFrequency_0.96', 'EliminatePiecesLossFrequency_0.68', 'EliminatePiecesDrawFrequency_0.0', 'EliminatePiecesDrawFrequency_0.03', 'EliminatePiecesDrawFrequency_0.91', 'EliminatePiecesDrawFrequency_1.0', 'EliminatePiecesDrawFrequency_0.36', 'EliminatePiecesDrawFrequency_0.86', 
'NoOwnPiecesLossFrequency_0.0', 'NoOwnPiecesLossFrequency_1.0', 'NoOwnPiecesLossFrequency_0.68', 'FillEndFrequency_0.0', 'FillEndFrequency_1.0', 'FillEndFrequency_0.04', 'FillEndFrequency_0.01', 'FillEndFrequency_0.99', 'FillEndFrequency_0.72', 'FillWinFrequency_0.0', 'FillWinFrequency_1.0', 'FillWinFrequency_0.04', 'FillWinFrequency_0.01', 'FillWinFrequency_0.99', 'ReachDrawFrequency_0.0', 'ReachDrawFrequency_0.9', 'ReachDrawFrequency_0.98', 'ScoringLossFrequency_0.0', 'ScoringLossFrequency_0.6', 'ScoringLossFrequency_0.62', 'NoMovesLossFrequency_0.0', 'NoMovesLossFrequency_1.0', 'NoMovesLossFrequency_0.13', 'NoMovesLossFrequency_0.06', 'NoMovesDrawFrequency_0.0', 'NoMovesDrawFrequency_0.01', 'NoMovesDrawFrequency_0.04', 'NoMovesDrawFrequency_0.03', 'NoMovesDrawFrequency_0.22', 'BoardSitesOccupiedChangeNumTimes_0.0', 'BoardSitesOccupiedChangeNumTimes_0.06', 'BoardSitesOccupiedChangeNumTimes_0.42', 'BoardSitesOccupiedChangeNumTimes_0.12', 'BoardSitesOccupiedChangeNumTimes_0.14', 'BoardSitesOccupiedChangeNumTimes_0.94', 'BranchingFactorChangeNumTimesn_0.0', 'BranchingFactorChangeNumTimesn_0.3', 'BranchingFactorChangeNumTimesn_0.02', 'BranchingFactorChangeNumTimesn_0.07', 'BranchingFactorChangeNumTimesn_0.04', 'BranchingFactorChangeNumTimesn_0.13', 'BranchingFactorChangeNumTimesn_0.01', 'BranchingFactorChangeNumTimesn_0.21', 'BranchingFactorChangeNumTimesn_0.03', 'PieceNumberChangeNumTimes_0.0', 'PieceNumberChangeNumTimes_0.06', 'PieceNumberChangeNumTimes_0.42', 'PieceNumberChangeNumTimes_0.12', 'PieceNumberChangeNumTimes_0.14', 'PieceNumberChangeNumTimes_1.0', 'KintsBoard', 'FortyStonesWithFourGapsBoard', 'Roll', 'SumDice', 'CheckmateFrequency', 'NumDice_4']
            
            df.drop(['Id',#Id没有什么实际的意义,就是行号
            #在train里面nunique=1的列
            'Properties', 'Format', 'Time', 'Discrete', 'Realtime', 'Turns', 'Alternating', 'Simultaneous', 'HiddenInformation', 'Match', 'AsymmetricRules', 'AsymmetricPlayRules', 'AsymmetricEndRules', 'AsymmetricSetup', 'Players', 'NumPlayers', 'Simulation', 'Solitaire', 'TwoPlayer', 'Multiplayer', 'Coalition', 'Puzzle', 'DeductionPuzzle', 'PlanningPuzzle', 'Equipment', 'Container', 'Board', 'PrismShape', 'ParallelogramShape', 'RectanglePyramidalShape', 'TargetShape', 'BrickTiling', 'CelticTiling', 'QuadHexTiling', 'Hints', 'PlayableSites', 'Component', 'DiceD3', 'BiasedDice', 'Card', 'Domino', 'Rules', 'SituationalTurnKo', 'SituationalSuperko', 'InitialAmount', 'InitialPot', 'Play', 'BetDecision', 'BetDecisionFrequency', 'VoteDecisionFrequency', 'ChooseTrumpSuitDecision', 'ChooseTrumpSuitDecisionFrequency', 'LeapDecisionToFriend', 'LeapDecisionToFriendFrequency', 'HopDecisionEnemyToFriend', 'HopDecisionEnemyToFriendFrequency', 'HopDecisionFriendToFriend', 'FromToDecisionWithinBoard', 'FromToDecisionBetweenContainers', 'BetEffect', 'BetEffectFrequency', 'VoteEffectFrequency', 'SwapPlayersEffectFrequency', 'TakeControl', 'TakeControlFrequency', 'PassEffectFrequency', 'SetCost', 'SetCostFrequency', 'SetPhase', 'SetPhaseFrequency', 'SetTrumpSuit', 'SetTrumpSuitFrequency', 'StepEffectFrequency', 'SlideEffectFrequency', 'LeapEffectFrequency', 'HopEffectFrequency', 'FromToEffectFrequency', 'SwapPiecesEffect', 'SwapPiecesEffectFrequency', 'ShootEffect', 'ShootEffectFrequency', 'MaxCapture', 'OffDiagonalDirection', 'Information', 'HidePieceType', 'HidePieceOwner', 'HidePieceCount', 'HidePieceRotation', 'HidePieceValue', 'HidePieceState', 'InvisiblePiece', 'End', 'LineDrawFrequency', 'ConnectionDraw', 'ConnectionDrawFrequency', 'GroupLossFrequency', 'GroupDrawFrequency', 'LoopLossFrequency', 'LoopDraw', 'LoopDrawFrequency', 'PatternLoss', 'PatternLossFrequency', 'PatternDraw', 'PatternDrawFrequency', 'PathExtentEndFrequency', 'PathExtentWinFrequency', 
'PathExtentLossFrequency', 'PathExtentDraw', 'PathExtentDrawFrequency', 'TerritoryLoss', 'TerritoryLossFrequency', 'TerritoryDraw', 'TerritoryDrawFrequency', 'CheckmateLoss', 'CheckmateLossFrequency', 'CheckmateDraw', 'CheckmateDrawFrequency', 'NoTargetPieceLoss', 'NoTargetPieceLossFrequency', 'NoTargetPieceDraw', 'NoTargetPieceDrawFrequency', 'NoOwnPiecesDraw', 'NoOwnPiecesDrawFrequency', 'FillLoss', 'FillLossFrequency', 'FillDraw', 'FillDrawFrequency', 'ScoringDrawFrequency', 'NoProgressWin', 'NoProgressWinFrequency', 'NoProgressLoss', 'NoProgressLossFrequency', 'SolvedEnd', 'Behaviour', 'StateRepetition', 'PositionalRepetition', 'SituationalRepetition', 'Duration', 'Complexity', 'BoardCoverage', 'GameOutcome', 'StateEvaluation', 'Clarity', 'Narrowness', 'Variance', 'Decisiveness', 'DecisivenessMoves', 'DecisivenessThreshold', 'LeadChange', 'Stability', 'Drama', 'DramaAverage', 'DramaMedian', 'DramaMaximum', 'DramaMinimum', 'DramaVariance', 'DramaChangeAverage', 'DramaChangeSign', 'DramaChangeLineBestFit', 'DramaChangeNumTimes', 'DramaMaxIncrease', 'DramaMaxDecrease', 'MoveEvaluation', 'MoveEvaluationAverage', 'MoveEvaluationMedian', 'MoveEvaluationMaximum', 'MoveEvaluationMinimum', 'MoveEvaluationVariance', 'MoveEvaluationChangeAverage', 'MoveEvaluationChangeSign', 'MoveEvaluationChangeLineBestFit', 'MoveEvaluationChangeNumTimes', 'MoveEvaluationMaxIncrease', 'MoveEvaluationMaxDecrease', 'StateEvaluationDifference', 'StateEvaluationDifferenceAverage', 'StateEvaluationDifferenceMedian', 'StateEvaluationDifferenceMaximum', 'StateEvaluationDifferenceMinimum', 'StateEvaluationDifferenceVariance', 'StateEvaluationDifferenceChangeAverage', 'StateEvaluationDifferenceChangeSign', 'StateEvaluationDifferenceChangeLineBestFit', 'StateEvaluationDifferenceChangeNumTimes', 'StateEvaluationDifferenceMaxIncrease', 'StateEvaluationDifferenceMaxDecrease', 'BoardSitesOccupied', 'BoardSitesOccupiedMinimum', 'BranchingFactor', 'BranchingFactorMinimum', 'DecisionFactor', 
'DecisionFactorMinimum', 'MoveDistance', 'MoveDistanceMinimum', 'PieceNumber', 'PieceNumberMinimum', 'ScoreDifference', 'ScoreDifferenceMinimum', 'ScoreDifferenceChangeNumTimes', 'Roots', 'Cosine', 'Sine', 'Tangent', 'Exponential', 'Logarithm', 'ExclusiveDisjunction', 'Float', 'HandComponent', 'SetHidden', 'SetInvisible', 'SetHiddenCount', 'SetHiddenRotation', 'SetHiddenState', 'SetHiddenValue', 'SetHiddenWhat', 'SetHiddenWho',
            #训练集里有但是测试集里没有的列
            'num_wins_agent1', 'num_draws_agent1', 'num_losses_agent1',
            #object列
            'Behaviour', 'StateRepetition', 'Duration', 'Complexity', 'BoardCoverage', 'GameOutcome', 'StateEvaluation', 'Clarity', 'Decisiveness', 'Drama', 'MoveEvaluation', 'StateEvaluationDifference', 'BoardSitesOccupied', 'BranchingFactor', 'DecisionFactor', 'MoveDistance', 'PieceNumber', 'ScoreDifference','selection1', 'selection2', 'exploration_const1', 'exploration_const2', 'playout1', 'playout2', 'score_bounds1', 'score_bounds2',
            ]+drop_cols,axis=1,inplace=True,errors='ignore')#对于测试集中没有的列可以直接忽略 
            
            df=self.reduce_mem_usage(df)
            print(f"feature_count:{len(df.columns)}")
            print("-"*30)
            return df

        def CV_feats(self,df,mode='',model_name='',fold=0):
            """Add per-fold text features and drop the raw text/agent columns.

            For each of ``EnglishRules`` and ``LudRules`` the column is passed
            through ``self.clean``, a ``<col>_len`` length feature is added, and
            TF-IDF features named ``<col>_tfidf_<i>`` are appended:

            * ``mode == 'train'``: a new ``TfidfVectorizer`` (500 features,
              2-3 grams) is fitted on the column, pickled as
              ``<model_name>_<fold>_<col>tfidf.model`` and registered in
              ``self.tfidf_paths``.
            * any other mode: the vectorizer registered for
              ``(model_name, fold, col)`` is loaded and applied; when none is
              registered no TF-IDF columns are added for that column.

            Args:
                df: pandas DataFrame holding the text columns plus ``agent1`` and
                    ``agent2``.
                mode: ``'train'`` to fit vectorizers, anything else to reuse them.
                model_name: Model identifier used in the vectorizer file name.
                fold: Fold number used in the vectorizer file name.

            Returns:
                A DataFrame with the new features and without ``EnglishRules``,
                ``LudRules``, ``agent1`` and ``agent2``. Use the returned frame;
                the frame passed in may be partially modified.
            """
            str_cols=['EnglishRules', 'LudRules']#'agent1','agent2',
            for col in str_cols:
                df=self.clean(df,col)
                df[f'{col}_len']=df[col].apply(len)
                key=(model_name,fold,col)
                if mode=='train':
                    tfidf = TfidfVectorizer(max_features=500,ngram_range=(2,3))
                    tfidf_feats=tfidf.fit_transform(df[col]).toarray()
                elif key in self.tfidf_paths:
                    tfidf=self.pickle_load(f'{model_name}_{fold}_{col}tfidf.model')
                    tfidf_feats=tfidf.transform(df[col]).toarray()
                else:
                    # no vectorizer was saved for this (model, fold, column)
                    continue
                # Attach all TF-IDF columns in one concat: inserting hundreds of
                # columns one by one fragments the frame's internal blocks.
                tfidf_df=pd.DataFrame(
                    tfidf_feats,
                    index=df.index,
                    columns=[f"{col}_tfidf_{i}" for i in range(tfidf_feats.shape[1])],
                )
                df=pd.concat([df,tfidf_df],axis=1)
                if mode=='train':
                    self.pickle_dump(tfidf,f'{model_name}_{fold}_{col}tfidf.model')
                    self.tfidf_paths.append(key)
            df.drop(str_cols+['agent1','agent2'],axis=1,inplace=True)
            return df 
        
        def RMSE(self,y_true,y_pred):
            """Root-mean-squared error between ``y_true`` and ``y_pred``."""
            residual=y_true-y_pred
            mean_sq=np.mean(residual**2)
            return np.sqrt(mean_sq)
        
        def train_model(self,):
            """Fit both CatBoost configurations with grouped, stratified CV.

            Steps:
              * run ``self.FE`` on ``self.train``; when ``APP.short_dataset`` is
                set only the first 1000 rows are kept afterwards;
              * for each model, split with ``StratifiedGroupKFold`` using
                ``GameRulesetName`` as the group and ``round(target*15)`` as the
                stratification label;
              * per fold, build features with ``self.CV_feats``, fit with the
                validation fold as ``eval_set``, pickle the fitted model as
                ``<model_name>_<fold>.model`` and record ``(model_name, fold)``
                in ``self.model_paths``;
              * save the out-of-fold predictions, multiplied by 1.1 and clipped
                to [-0.985, 0.985], to ``<model_name>_oof.npy`` and print their
                RMSE.
            """
            self.train=self.FE(self.train,mode='train')

            #https://www.kaggle.com/code/ravi20076/mcts2024-mlmodels-v1/notebook
            n_iterations=100 if APP.small_iterations else 3096
            # settings common to both CatBoost variants
            shared_params=dict(
                task_type="GPU",
                eval_metric="RMSE",
                iterations=n_iterations,
                learning_rate=0.08,
                max_depth=12,
                l2_leaf_reg=1.25,
                min_data_in_leaf=24,
                verbose=0,
            )
            cat_params1=dict(shared_params,bagging_temperature=0.50,random_strength=0.25)
            cat_params2=dict(shared_params,bagging_temperature=0.60,random_strength=0.20,max_bin=2048)
            models=[
                (CatBoostRegressor(**cat_params1),'cat1'),
                (CatBoostRegressor(**cat_params2),'cat2'),
            ]

            if APP.short_dataset:
                self.train = self.train[:1000]

            for model,model_name in models:
                print("start training")
                groups=self.train['GameRulesetName']
                y=self.train[self.target]
                X=self.train.drop([self.target,'GameRulesetName'],axis=1)
                oof_preds=np.zeros(len(X))

                # discretised target, used only as the stratification label
                y_int=round(y*15)

                splitter=StratifiedGroupKFold(n_splits=self.num_folds,random_state=2024,shuffle=True)
                for fold,(train_index,valid_index) in enumerate(splitter.split(X,y_int,groups)):
                    print(f"fold:{fold}")

                    X_train=self.CV_feats(X.iloc[train_index],mode='train',model_name=model_name,fold=fold)
                    X_valid=self.CV_feats(X.iloc[valid_index],mode='test',model_name=model_name,fold=fold)
                    y_train=y.iloc[train_index]
                    y_valid=y.iloc[valid_index]

                    model.fit(
                        X_train,y_train,
                        eval_set=(X_valid,y_valid),
                        early_stopping_rounds=100,
                        verbose=100,
                    )
                    oof_preds[valid_index]=model.predict(X_valid)

                    self.pickle_dump(model,f'{model_name}_{fold}.model')
                    self.model_paths.append((model_name,fold))

                    del X_train,X_valid,y_train,y_valid
                    gc.collect()

                # same post-processing as applied to test predictions in infer_model
                oof_scaled=np.clip(oof_preds*1.1,-0.985,0.985)
                np.save(f"{model_name}_oof.npy",oof_scaled)

                print(f"RMSE:{self.RMSE(y.values,oof_scaled)}")
                
        def infer_model(self,test):
            """Predict for ``test`` by averaging every saved fold model.

            ``test`` goes through ``self.FE`` (mode ``'test'``) and loses its
            ``GameRulesetName`` column. For each ``(model_name, fold)`` in
            ``self.model_paths`` a copy is passed through ``self.CV_feats``, the
            pickled model is loaded, and its prediction is multiplied by 1.1 and
            clipped to [-0.985, 0.985].

            Returns:
                The element-wise mean of all per-model predictions.
            """
            features=self.FE(test,mode='test')
            features.drop(['GameRulesetName'],axis=1,inplace=True)
            per_model_preds=[]
            for model_name,fold in self.model_paths:
                # CV_feats gets a copy so every model starts from the same frame
                fold_input=self.CV_feats(features.copy(),mode='test',model_name=model_name,fold=fold)
                fitted=self.pickle_load(f'{model_name}_{fold}.model')
                raw_pred=fitted.predict(fold_input)
                per_model_preds.append(np.clip(raw_pred*1.1,-0.985,0.985))
            return np.mean(per_model_preds,axis=0)
        
    # Shared Preprocessor for model 1, configured for 5 CV folds over `train`.
    preprocessor=Preprocessor(num_folds=5,train=train)
    # Number of predict() calls so far; predict() trains the models while this is still 0.
    counter = 0
    def predict(test, submission):
        """Return model 1 predictions for ``test``, training on the first call.

        Args:
            test: Object exposing ``to_pandas()``; the converted frame is passed
                to ``preprocessor.infer_model``.
            submission: Accepted but not used here.

        Returns:
            Whatever ``preprocessor.infer_model`` returns for the converted frame.
        """
        state = model_1
        # train lazily: only the very first call pays the training cost
        if state.counter == 0:
            state.preprocessor.train_model()
        state.counter += 1
        return state.preprocessor.infer_model(test.to_pandas())

len(train):233234
len(test):3


# Model 2: [yukiZ - mcts-oof-predictions-as-features-hp-tune-427 v02 0.427](https://www.kaggle.com/code/hideyukizushi/mcts-oof-predictions-as-features-hp-tune-427?scriptVersionId=202448545)

In [3]:
class model_2:
    class CFG:
        """Static configuration for model 2 (paths, CV settings, GBDT parameters)."""

        # --- input files ---
        importances_path = Path('/kaggle/input/mcts-gbdt-select-210-features/importances.csv')
        train_path = Path('/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv')

        # --- loading / CV / plotting ---
        batch_size = 65536  # forwarded to FE, which passes it to pl.read_csv
        early_stop = 500
        n_splits = 5
        color = '#C9A9A6'

        # --- LightGBM ---
        # NOTE(review): lgb_w and ctb_w look like blend weights and sum to 1.10 - confirm this is intended.
        lgb_w = 0.80
        lgb_p = dict(
            objective='regression',
            min_child_samples=24,
            # short run when APP.small_iterations is set
            num_iterations=200 if APP.small_iterations else 20000,
            learning_rate=0.07,
            extra_trees=True,
            reg_lambda=0.8,
            reg_alpha=0.1,
            num_leaves=64,
            metric='rmse',
            device='cpu',
            max_depth=24,
            max_bin=128,
            verbose=-1,
            seed=42,
        )

        # --- CatBoost ---
        ctb_w = 0.30
        ctb_p = dict(
            loss_function='RMSE',
            learning_rate=0.03,
            # short run when APP.small_iterations is set
            num_trees=200 if APP.small_iterations else 20000,
            random_state=42,
            task_type='CPU',
            reg_lambda=0.8,
            depth=8,
        )

    class FE:
        """Polars-based loading, column pruning and dtype casting for model 2."""

        def __init__(self, batch_size):
            # Passed to pl.read_csv in apply_fe.
            self.batch_size = batch_size
            
        def drop_cols(self, df, bad_cols=None): # bad_cols must be provided when processing the test data
            """Drop redundant, all-null and single-valued columns.

            Args:
                df: polars DataFrame.
                bad_cols: Columns to drop as single-valued. When ``None`` they are
                    detected on ``df`` (every column with exactly one unique value).

            Returns:
                ``(df, bad_cols)``: the pruned frame and the single-valued column
                list that was applied.
            """
            # Redundant columns for model development
            redundant = ('Id',
                         'LudRules',
                         'EnglishRules',
                         'num_wins_agent1',
                         'num_draws_agent1',
                         'num_losses_agent1')
            present = [name for name in redundant if name in df.columns]
            df = df.drop(present)

            # Columns whose null count equals the row count hold no data at all
            all_null = [name for name in df.columns
                        if df.select(pl.col(name).null_count()).item() == df.height]
            df = df.drop(all_null)

            # Detect single-valued columns only when the caller did not supply them
            if bad_cols is None:
                bad_cols = [name for name in df.columns
                            if df.select(pl.col(name).n_unique()).item() == 1]
            df = df.drop(bad_cols)
            return df, bad_cols
        
        def cast_datatypes(self, df):
            """Cast the categorical columns to String and all others to Int16/Float32.

            A non-categorical column becomes ``Int16`` when its first non-null
            value is a Python ``int``, otherwise ``Float32``.
            """
            cat_cols = ['GameRulesetName', 'agent1', 'agent2']
            df = df.with_columns([pl.col(name).cast(pl.String) for name in cat_cols])   

            for name in df.columns:
                if name in cat_cols:
                    continue
                # The first non-null item decides the numeric dtype of the column
                first_val = df.select(pl.col(name).drop_nulls().first()).item()
                target_dtype = pl.Int16 if isinstance(first_val, int) else pl.Float32
                df = df.with_columns(pl.col(name).cast(target_dtype))
            return df    
        
        def info(self, df):
            """Print the frame's shape and its estimated size in MB."""
            print(f'Shape: {df.shape}')   
            size_mb = df.estimated_size() / 1024**2
            print('Memory usage: {:.2f} MB\n'.format(size_mb))
            
        def apply_fe(self, path):            
            """Read the CSV at ``path``, prune columns, cast dtypes and report.

            Returns:
                ``(df, bad_cols, cat_cols)``: the processed frame, the
                single-valued columns that were dropped, and the names of the
                String-typed columns.
            """
            frame = pl.read_csv(path, batch_size=self.batch_size)
            frame, bad_cols = self.drop_cols(frame)
            frame = self.cast_datatypes(frame)
            self.info(frame)
            cat_cols = [name for name, dtype in zip(frame.columns, frame.dtypes)
                        if dtype == pl.String]
            return frame, bad_cols, cat_cols

    # Class-level FE instance for model 2, built with the batch size from CFG.
    fe = FE(CFG.batch_size)

    class MD:
        """Train, score and blend the LightGBM and CatBoost regressors.

        The constructor only stores its arguments:

        * ``importances_path`` - CSV read with pandas; its ``drop_features``
          column lists the columns removed before fitting and predicting.
        * ``early_stop`` - early-stopping rounds handed to both libraries.
        * ``n_splits`` - number of ``GroupKFold`` folds.
        * ``lgb_w`` / ``ctb_w`` - weights of the LightGBM / CatBoost
          predictions in the final blend returned by ``inference``.
        * ``lgb_p`` / ``ctb_p`` - keyword arguments for ``LGBMRegressor`` /
          ``CatBoostRegressor``.
        * ``color`` - marker colour of the fold scores drawn by ``plot_cv``.
        """

        def __init__(self, 
                    importances_path, 
                    early_stop, 
                    n_splits, 
                    lgb_w, 
                    lgb_p, 
                    ctb_w, 
                    ctb_p, 
                    color):
            self.importances_path = importances_path
            self.early_stop = early_stop
            self.n_splits = n_splits
            self.lgb_w = lgb_w
            self.lgb_p = lgb_p
            self.ctb_w = ctb_w
            self.ctb_p = ctb_p
            self.color = color

        def plot_cv(self, fold_scores, title):
            """Show a plotly figure of the per-fold RMSE scores and their mean.

            Args:
                fold_scores: one RMSE value per fold, in fold order.
                title: prefix of the figure title.
            """
            # Aggregate the raw scores first: rounding every fold before
            # averaging would leak rounding error into the mean and std.
            mean_score = round(np.mean(fold_scores), 3)
            std_score = round(np.std(fold_scores), 3)
            fold_scores = [round(score, 3) for score in fold_scores]

            fig = go.Figure()

            # One diamond marker per fold.
            fig.add_trace(go.Scatter(
                x = list(range(1, len(fold_scores) + 1)),
                y = fold_scores,
                mode = 'markers', 
                name = 'Fold Scores',
                marker = dict(size = 24, color=self.color, symbol='diamond'),
                text = [f'{score:.3f}' for score in fold_scores],
                hovertemplate = 'Fold %{x}: %{text}<extra></extra>',
                hoverlabel=dict(font=dict(size=16))  
            ))

            # Dashed horizontal line at the mean score.
            fig.add_trace(go.Scatter(
                x = [1, len(fold_scores)],
                y = [mean_score, mean_score],
                mode = 'lines',
                name = f'Mean: {mean_score:.3f}',
                line = dict(dash = 'dash', color = '#FFBF00'),
                hoverinfo = 'none'
            ))

            fig.update_layout(
                title = f'{title} | Cross-Validation RMSE Scores | Variation of CV scores: {mean_score} ± {std_score}',
                xaxis_title = 'Fold',
                yaxis_title = 'RMSE Score',
                plot_bgcolor = 'rgba(0,0,0,0)',
                paper_bgcolor = 'rgba(0,0,0,0)',
                xaxis = dict(
                    gridcolor = 'lightgray',
                    tickmode = 'linear',
                    tick0 = 1,
                    dtick = 1,
                    range = [0.5, len(fold_scores) + 0.5]
                ),
                yaxis = dict(gridcolor = 'lightgray')
            )
            fig.show() 

        def train_model(self, data, cat_cols, title):
            """Fit one model per ``GroupKFold`` fold and collect out-of-fold predictions.

            The model family is chosen from the prefix of ``title``
            (``'LightGBM'`` or ``'CatBoost'``). Folds are grouped by
            ``GameRulesetName`` and the target is ``utility_agent1``.

            Note: the ``cat_cols`` columns of ``data`` are cast to the pandas
            ``category`` dtype in place, so the caller's frame is modified.

            Args:
                data: pandas DataFrame holding the features, the target and
                    the grouping column.
                cat_cols: names of the categorical columns.
                title: model title; also used as the plot title.

            Returns:
                ``(models, oof_preds)`` - the fitted models in fold order and
                an array with the out-of-fold prediction for every row.

            Raises:
                ValueError: if ``title`` starts with neither supported prefix.
            """
            # Fail fast: previously an unknown title left `model` unbound (or
            # silently re-used the model of an earlier fold).
            use_lightgbm = title.startswith('LightGBM')
            if not use_lightgbm and not title.startswith('CatBoost'):
                raise ValueError(
                    f"Unsupported model title {title!r}: expected it to start with 'LightGBM' or 'CatBoost'")

            importances = pd.read_csv(self.importances_path)
            # The drop list does not depend on the fold, so read it once.
            drop_features = importances['drop_features'].tolist()

            for col in cat_cols:
                data[col] = data[col].astype('category')

            # Define features (X), label (y) and grouping column (group) for CV
            y = data['utility_agent1']
            group = data['GameRulesetName']
            X = data.drop(['utility_agent1'], axis=1).drop(drop_features, axis=1)
            cv = GroupKFold(n_splits=self.n_splits)
            models, scores = [], []

            # Initialize out-of-fold predictions array
            oof_preds = np.zeros(len(X))

            for fold, (train_index, valid_index) in enumerate(cv.split(X, y, group)):
                X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
                y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
                print(f'Fold {fold+1} | {X_train.shape[0]:,} train rows | {X_valid.shape[0]:,} valid rows | {X_train.shape[1]} features')

                if use_lightgbm:
                    model = lgb.LGBMRegressor(**self.lgb_p)

                    model.fit(X_train, y_train,
                            eval_set=[(X_valid, y_valid)],
                            eval_metric='rmse',
                            callbacks=[lgb.early_stopping(self.early_stop, verbose=0), lgb.log_evaluation(0)])

                else:
                    model = CatBoostRegressor(**self.ctb_p, verbose=0, cat_features=cat_cols)

                    model.fit(X_train, y_train,
                            eval_set=(X_valid, y_valid),
                            early_stopping_rounds=self.early_stop, verbose=0)

                models.append(model)

                # Store out-of-fold predictions for this fold
                fold_preds = model.predict(X_valid)
                oof_preds[valid_index] = fold_preds
                # RMSE as sqrt(MSE): the `squared=False` argument of
                # mean_squared_error is gone from recent scikit-learn releases.
                scores.append(float(np.sqrt(mse(y_valid, fold_preds))))

            self.plot_cv(scores, title)
            return models, oof_preds

        def inference(self, data, cat_cols, lgb_models, ctb_models, lgb_models_oof, ctb_models_oof):
            """Predict with the two-stage ensemble and return the weighted blend.

            Stage one averages the base LightGBM and CatBoost models; those
            averages are attached as ``lgb_oof_preds`` / ``ctb_oof_preds``.
            Stage two averages the ``*_oof`` models on the extended frame.

            Args:
                data: pandas DataFrame of test features (not modified; the
                    drop below returns a new frame).
                cat_cols: names of the categorical columns.
                lgb_models, ctb_models: base models.
                lgb_models_oof, ctb_models_oof: models trained with the two
                    extra prediction columns.

            Returns:
                ``lgb_preds * lgb_w + ctb_preds * ctb_w``.
            """
            importances = pd.read_csv(self.importances_path)

            drop_features = importances['drop_features'].tolist()
            data = data.drop(drop_features, axis=1)

            for col in cat_cols:
                data[col] = data[col].astype('category')

            # Run BOTH base families before attaching any prediction column, so
            # every base model sees exactly the columns it was trained on (the
            # CatBoost base models used to receive an extra `lgb_oof_preds`).
            lgb_base_preds = np.mean([model.predict(data) for model in lgb_models], axis=0)
            ctb_base_preds = np.mean([model.predict(data) for model in ctb_models], axis=0)
            data['lgb_oof_preds'] = lgb_base_preds
            data['ctb_oof_preds'] = ctb_base_preds

            lgb_preds = np.mean([model.predict(data) for model in lgb_models_oof], axis=0)
            ctb_preds = np.mean([model.predict(data) for model in ctb_models_oof], axis=0)

            return lgb_preds * self.lgb_w + ctb_preds * self.ctb_w
        
    # Shared trainer / predictor, configured entirely from CFG.
    md = MD(importances_path=CFG.importances_path,
            early_stop=CFG.early_stop,
            n_splits=CFG.n_splits,
            lgb_w=CFG.lgb_w,
            lgb_p=CFG.lgb_p,
            ctb_w=CFG.ctb_w,
            ctb_p=CFG.ctb_p,
            color=CFG.color)

    # Placeholders; train_model() below assigns all six.
    bad_cols = cat_cols = None
    lgb_models = ctb_models = None
    lgb_models_oof = ctb_models_oof = None

    def train_model():
        """Train both stages of the ensemble and store the models on ``model_2``.

        Stage one fits LightGBM and CatBoost on the processed training frame.
        Their out-of-fold predictions are then attached as the
        ``lgb_oof_preds`` / ``ctb_oof_preds`` columns, and stage two fits the
        'w/ OOF' models on the extended frame. When ``APP.short_dataset`` is
        set, only the first 1000 rows are used.
        """
        frame, model_2.bad_cols, model_2.cat_cols = model_2.fe.apply_fe(model_2.CFG.train_path)
        frame = frame.to_pandas()
        if APP.short_dataset:
            frame = frame[:1000]

        fit = model_2.md.train_model

        # Stage one: base models and their out-of-fold predictions.
        model_2.lgb_models, lgb_oof_preds = fit(frame, model_2.cat_cols, title='LightGBM')
        model_2.ctb_models, ctb_oof_preds = fit(frame, model_2.cat_cols, title='CatBoost')
        frame['lgb_oof_preds'], frame['ctb_oof_preds'] = lgb_oof_preds, ctb_oof_preds

        # Stage two: models that also see the stage-one predictions.
        model_2.lgb_models_oof = fit(frame, model_2.cat_cols, title='LightGBM w/ OOF')[0]
        model_2.ctb_models_oof = fit(frame, model_2.cat_cols, title='CatBoost w/ OOF')[0]

    # Number of predict() calls so far; training runs while it is still 0.
    counter = 0

    def predict(test, submission):
        """Return the ensemble predictions for one test batch.

        The models are trained on the first call. ``submission`` is accepted
        but not used here.
        """
        needs_training = model_2.counter == 0
        if needs_training:
            model_2.train_model()
        model_2.counter += 1

        # Same preprocessing as for training, reusing the stored bad_cols.
        batch, _ = model_2.fe.drop_cols(test, model_2.bad_cols)
        batch = model_2.fe.cast_datatypes(batch).to_pandas()
        return model_2.md.inference(
            batch,
            model_2.cat_cols,
            model_2.lgb_models,
            model_2.ctb_models,
            model_2.lgb_models_oof,
            model_2.ctb_models_oof,
        )

In [4]:
def predict(test, submission):
    """Blend the predictions of both models into the submission frame.

    Returns ``submission`` with its ``utility_agent1`` column set to the
    weighted sum of the two model outputs.
    """
    # NOTE(review): the weights add up to 1.10 rather than 1 - confirm that
    # this overall scaling of the blend is intentional.
    weight_1, weight_2 = 0.72, 0.38
    result_1 = model_1.predict(test, submission)
    result_2 = model_2.predict(test, submission)
    blended = result_1 * weight_1 + result_2 * weight_2
    return submission.with_columns(pl.Series('utility_agent1', blended))

# Entry point: go through the competition gateway unless this is a local,
# non-submission run, in which case predict() is called directly on the files.
if APP.submit or not APP.local:
    # Call the gateway server
    inference_server = kaggle_evaluation.mcts_inference_server.MCTSInferenceServer(predict)
    competition_rerun = os.getenv("KAGGLE_IS_COMPETITION_RERUN")
    if not competition_rerun:
        inference_server.run_local_gateway((APP.test_file, APP.sample_subm_file))
    else:
        inference_server.serve()
else:
    test = pl.read_csv(APP.test_file)
    submission = pl.read_csv(APP.sample_subm_file)
    result = predict(test, submission)


FE:train
agent position feature
deal with outliers
agent1 agent2 feature
one_hot_encoder
deal with LudRules
1:drop game
2:player
Rules readable
Memory usage of dataframe is 756.04 MB
Memory usage after optimization is: 253.12 MB
Decreased by 66.5%
feature_count:452
------------------------------
start training
fold:0
0:	learn: 0.3271379	test: 0.7948494	best: 0.7948494 (0)	total: 9.4s	remaining: 15m 31s
99:	learn: 0.0781368	test: 0.7808308	best: 0.7802094 (54)	total: 11.6s	remaining: 0us
bestTest = 0.7802093566
bestIteration = 54
Shrink model to first 55 iterations.
fold:1
0:	learn: 0.4482654	test: 0.6354035	best: 0.6354035 (0)	total: 22.3ms	remaining: 2.2s
99:	learn: 0.0780276	test: 0.4563187	best: 0.4563187 (99)	total: 2.13s	remaining: 0us
bestTest = 0.4563187012
bestIteration = 99
fold:2
0:	learn: 0.4806532	test: 0.4005700	best: 0.4005700 (0)	total: 23.7ms	remaining: 2.35s
99:	learn: 0.0658512	test: 0.3592692	best: 0.3592382 (98)	total: 2.17s	remaining: 0us
bestTest = 0.3592382254
be

Fold 1 | 802 train rows | 198 valid rows | 302 features
Fold 2 | 824 train rows | 176 valid rows | 302 features
Fold 3 | 782 train rows | 218 valid rows | 302 features
Fold 4 | 766 train rows | 234 valid rows | 302 features
Fold 5 | 826 train rows | 174 valid rows | 302 features


Fold 1 | 802 train rows | 198 valid rows | 304 features
Fold 2 | 824 train rows | 176 valid rows | 304 features
Fold 3 | 782 train rows | 218 valid rows | 304 features
Fold 4 | 766 train rows | 234 valid rows | 304 features
Fold 5 | 826 train rows | 174 valid rows | 304 features


Fold 1 | 802 train rows | 198 valid rows | 304 features
Fold 2 | 824 train rows | 176 valid rows | 304 features
Fold 3 | 782 train rows | 218 valid rows | 304 features
Fold 4 | 766 train rows | 234 valid rows | 304 features
Fold 5 | 826 train rows | 174 valid rows | 304 features
