In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/concepts.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/mcts_gateway.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/__init__.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/mcts_inference_server.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/templates.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/base_gateway.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/relay.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evalua

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [3]:
train = import_data('/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv')
test = import_data('/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv')
concepts = import_data('/kaggle/input/um-game-playing-strength-of-mcts-variants/concepts.csv')

  df = pd.read_csv(file, parse_dates=True, keep_date_col=True)


Memory usage of dataframe is 1448.46 MB
Memory usage after optimization is: 259.71 MB
Decreased by 82.1%
Memory usage of dataframe is 0.02 MB


  df = pd.read_csv(file, parse_dates=True, keep_date_col=True)


Memory usage after optimization is: 0.00 MB
Decreased by 79.1%
Memory usage of dataframe is 0.06 MB
Memory usage after optimization is: 0.12 MB
Decreased by -121.2%


  df = pd.read_csv(file, parse_dates=True, keep_date_col=True)


In [4]:

train.info()  # Get a summary of the dataset including column types and non-null counts


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233234 entries, 0 to 233233
Columns: 814 entries, Id to utility_agent1
dtypes: category(5), float16(170), float32(13), float64(18), int16(12), int32(1), int8(595)
memory usage: 259.7 MB


In [5]:
missing_percentage = train.isnull().mean() * 100
# Drop columns with more than 50% missing values
train = train.drop(columns=missing_percentage[missing_percentage > 50].index)
train.head()

Unnamed: 0,Id,GameRulesetName,agent1,agent2,Properties,Format,Time,Discrete,Realtime,Turns,...,DoLudeme,Trigger,PlayoutsPerSecond,MovesPerSecond,EnglishRules,LudRules,num_wins_agent1,num_draws_agent1,num_losses_agent1,utility_agent1
0,0,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-ProgressiveHistory-0.6-Random200-false,1,1,1,1,0,1,...,0,1,298.070007,18877.169922,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",4,0,11,-0.466553
1,1,00Y,MCTS-ProgressiveHistory-0.1-MAST-false,MCTS-UCB1GRAVE-0.6-NST-true,1,1,1,1,0,1,...,0,1,298.070007,18877.169922,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333252
2,2,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.1-NST-false,1,1,1,1,0,1,...,0,1,298.070007,18877.169922,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",7,0,8,-0.06665
3,3,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1-0.6-NST-false,1,1,1,1,0,1,...,0,1,298.070007,18877.169922,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333252
4,4,00Y,MCTS-ProgressiveHistory-0.1-MAST-true,MCTS-UCB1GRAVE-1.41421356237-NST-false,1,1,1,1,0,1,...,0,1,298.070007,18877.169922,Goal: Connect all three edge colors with a sin...,"(game ""00'Y'"" (players 2) (equipment { (board ...",5,0,10,-0.333252


In [6]:
y = train[['utility_agent1']]
X = train.drop('utility_agent1', axis=1)

In [7]:
# Number of unique values in each column
unique_counts = X.nunique()
print(unique_counts)
# Drop columns with only one unique value
X = X.drop(columns=unique_counts[unique_counts == 1].index)
test = test.drop(columns=unique_counts[unique_counts == 1].index)

Id                   233234
GameRulesetName        1377
agent1                   72
agent2                   72
Properties                1
                      ...  
EnglishRules           1328
LudRules               1373
num_wins_agent1          33
num_draws_agent1         31
num_losses_agent1        34
Length: 795, dtype: int64


In [8]:
# Split 'agent1' into 5 parts and assign to new columns
X[['agent1_part1', 'agent1_part2', 'agent1_part3', 'agent1_part4', 'agent1_part5']] = X['agent1'].str.split('-', expand=True)

# Split 'agent2' into 5 parts and assign to new columns
X[['agent2_part1', 'agent2_part2', 'agent2_part3', 'agent2_part4', 'agent2_part5']] = X['agent2'].str.split('-', expand=True)

test[['agent1_part1', 'agent1_part2', 'agent1_part3', 'agent1_part4', 'agent1_part5']] = test['agent1'].str.split('-', expand=True)

# Split 'agent2' into 5 parts and assign to new columns
test[['agent2_part1', 'agent2_part2', 'agent2_part3', 'agent2_part4', 'agent2_part5']] = test['agent2'].str.split('-', expand=True)

X = X.drop(['Id','agent1','agent2'], axis=1)

test = test.drop(['Id','agent1','agent2'], axis=1)

X.head()




  X[['agent1_part1', 'agent1_part2', 'agent1_part3', 'agent1_part4', 'agent1_part5']] = X['agent1'].str.split('-', expand=True)
  X[['agent1_part1', 'agent1_part2', 'agent1_part3', 'agent1_part4', 'agent1_part5']] = X['agent1'].str.split('-', expand=True)
  X[['agent1_part1', 'agent1_part2', 'agent1_part3', 'agent1_part4', 'agent1_part5']] = X['agent1'].str.split('-', expand=True)
  X[['agent1_part1', 'agent1_part2', 'agent1_part3', 'agent1_part4', 'agent1_part5']] = X['agent1'].str.split('-', expand=True)
  X[['agent1_part1', 'agent1_part2', 'agent1_part3', 'agent1_part4', 'agent1_part5']] = X['agent1'].str.split('-', expand=True)
  X[['agent2_part1', 'agent2_part2', 'agent2_part3', 'agent2_part4', 'agent2_part5']] = X['agent2'].str.split('-', expand=True)
  X[['agent2_part1', 'agent2_part2', 'agent2_part3', 'agent2_part4', 'agent2_part5']] = X['agent2'].str.split('-', expand=True)
  X[['agent2_part1', 'agent2_part2', 'agent2_part3', 'agent2_part4', 'agent2_part5']] = X['agent2'].str.

Unnamed: 0,GameRulesetName,Stochastic,Asymmetric,AsymmetricForces,AsymmetricPiecesType,PlayersWithDirections,Cooperation,Team,Shape,SquareShape,...,agent1_part1,agent1_part2,agent1_part3,agent1_part4,agent1_part5,agent2_part1,agent2_part2,agent2_part3,agent2_part4,agent2_part5
0,00Y,0,0,0,0,0,0,0,1,0,...,MCTS,ProgressiveHistory,0.1,MAST,False,MCTS,ProgressiveHistory,0.6,Random200,False
1,00Y,0,0,0,0,0,0,0,1,0,...,MCTS,ProgressiveHistory,0.1,MAST,False,MCTS,UCB1GRAVE,0.6,NST,True
2,00Y,0,0,0,0,0,0,0,1,0,...,MCTS,ProgressiveHistory,0.1,MAST,True,MCTS,UCB1,0.1,NST,False
3,00Y,0,0,0,0,0,0,0,1,0,...,MCTS,ProgressiveHistory,0.1,MAST,True,MCTS,UCB1,0.6,NST,False
4,00Y,0,0,0,0,0,0,0,1,0,...,MCTS,ProgressiveHistory,0.1,MAST,True,MCTS,UCB1GRAVE,1.41421356237,NST,False


In [9]:
# Select only categorical features and the target
categorical_features = X.select_dtypes(include=['object', 'category']).columns

num_feat = X.select_dtypes('number').columns
feature_names = X.columns

In [10]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
cat_encoder = OrdinalEncoder()
std = StandardScaler()
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('cat', cat_encoder, categorical_features),('std', StandardScaler(), num_feat)])

X = ct.fit_transform(X)


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
import xgboost as xgb
xgb = xgb.XGBRegressor(random_state=42)
xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], eval_metric='rmse')



[0]	validation_0-rmse:0.43793
[1]	validation_0-rmse:0.30775
[2]	validation_0-rmse:0.21633
[3]	validation_0-rmse:0.15183
[4]	validation_0-rmse:0.10663
[5]	validation_0-rmse:0.07512
[6]	validation_0-rmse:0.05307
[7]	validation_0-rmse:0.03748
[8]	validation_0-rmse:0.02666
[9]	validation_0-rmse:0.01912
[10]	validation_0-rmse:0.01384
[11]	validation_0-rmse:0.01021
[12]	validation_0-rmse:0.00783
[13]	validation_0-rmse:0.00611
[14]	validation_0-rmse:0.00502
[15]	validation_0-rmse:0.00422
[16]	validation_0-rmse:0.00373
[17]	validation_0-rmse:0.00339
[18]	validation_0-rmse:0.00318
[19]	validation_0-rmse:0.00294
[20]	validation_0-rmse:0.00277
[21]	validation_0-rmse:0.00270
[22]	validation_0-rmse:0.00251
[23]	validation_0-rmse:0.00244
[24]	validation_0-rmse:0.00241
[25]	validation_0-rmse:0.00233
[26]	validation_0-rmse:0.00230
[27]	validation_0-rmse:0.00226
[28]	validation_0-rmse:0.00223
[29]	validation_0-rmse:0.00221
[30]	validation_0-rmse:0.00219
[31]	validation_0-rmse:0.00218
[32]	validation_0-

In [13]:
xgb.score(X_test, y_test)

0.9999904171341677

In [14]:
for col, val in sorted(zip(feature_names, xgb.feature_importances_), key=lambda x: x[1],reverse=True)[:10]:
    print(f'{col:10}{val:10.3f}')

agent2_part3     0.630
agent2_part5     0.368
agent2_part4     0.001
NumConcaveCorners     0.000
MoveDistanceAverage     0.000
NumAdjacentDirections     0.000
Drawishness     0.000
SemiRegularTiling     0.000
ScoreDifferenceChangeAverage     0.000
LesserThan     0.000


In [15]:
import lightgbm as lgb
lgr = lgb.LGBMRegressor(random_state=42)
lgr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.116985 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18557
[LightGBM] [Info] Number of data points in the train set: 186587, number of used features: 602
[LightGBM] [Info] Start training from score 0.043666


In [16]:
lgr.score(X_test, y_test)

0.999989111190584

In [17]:
for col, val in sorted(zip(feature_names, lgr.feature_importances_), key=lambda x: x[1],reverse=True)[:100]:
    print(f'{col:10}{val:10.3f}')

agent2_part3  1203.000
agent2_part5  1052.000
agent2_part4   654.000
Stochastic    11.000
BoardSitesOccupiedMaxIncrease     6.000
BoardSitesOccupiedChangeLineBestFit     4.000
MovesNonDecision     3.000
Timeouts       3.000
DecisionFactorChangeNumTimes     3.000
PieceNumberMedian     3.000
Cooperation     2.000
ProposeDecision     2.000
LeapDecisionToEnemyFrequency     2.000
HopEffect      2.000
IsPieceAt      2.000
BoardSitesOccupiedChangeSign     2.000
DecisionFactorMedian     2.000
DecisionFactorVariance     2.000
MoveDistanceVariance     2.000
ScoreDifferenceMaximum     2.000
ScoreDifferenceChangeAverage     2.000
ScoreDifferenceChangeSign     2.000
ScoreDifferenceMaxIncrease     2.000
agent2_part2     2.000
GameRulesetName     1.000
PlayersWithDirections     1.000
DiamondShape     1.000
SquarePyramidalShape     1.000
MancalaThreeRows     1.000
NumCells       1.000
NumLeftSites     1.000
HopDecisionFriendToEnemyFrequency     1.000
SwapPlayersEffect     1.000
InterveneCaptureFrequen

In [18]:
from sklearn.feature_selection import VarianceThreshold

# Remove features with low variance
selector = VarianceThreshold(threshold=0.01)
X_train = selector.fit_transform(X_train)
X_test = selector.transform(X_test)

In [19]:
lgr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.863919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18557
[LightGBM] [Info] Number of data points in the train set: 186587, number of used features: 602
[LightGBM] [Info] Start training from score 0.043666


In [20]:
lgr.score(X_test, y_test)

0.999989111190584