<span style="color:#E0BFB8; font-size: 50px; font-weight: bold;">Imports</span>

In [2]:
# System operations
import gc
import os
import sys
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Data preprocessing
import numpy as np
import polars as pl
import pandas as pd
from pathlib import Path

In [4]:
# Evaluation API
import kaggle_evaluation.mcts_inference_server

ModuleNotFoundError: No module named 'kaggle_evaluation'

In [5]:
# Lift the column cap so wide frames are rendered in full
pd.set_option('display.max_columns', None)

In [6]:
# Exploratory data analysis
import plotly.colors as pc
import plotly.express as px
import plotly.graph_objects as go

In [7]:
# Model development
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse

<span style="color:#E0BFB8; font-size: 50px; font-weight: bold;">Configuration</span>

In [8]:
class CFG:
    """Single namespace holding every setting the notebook reads."""

    # Locations of the competition files
    train_path = Path('../data/train.csv')
    test_path = Path('../data/test.csv')
    subm_path = Path('../data/sample_submission.csv')

    # Arguments handed to the feature-engineering (FE) class
    batch_size = 16384
    low_memory = True

    # Accent colour shared by the EDA and MD plots
    color = '#E0BFB8'

    # Arguments handed to the model-development (MD) class
    early_stop = 50
    n_splits = 5

    # Keyword arguments used to build each LightGBM regressor
    lgb_p = dict(
        objective='regression',
        num_iterations=400,
        learning_rate=0.03,
        extra_trees=True,
        reg_lambda=0.8,
        num_leaves=64,
        metric='rmse',
        device='cpu',
        max_depth=4,
        max_bin=128,
        verbose=-1,
        seed=42,
    )

<span style="color:#E0BFB8; font-size: 50px; font-weight: bold;">Feature Engineering</span>

In [10]:
class FE:
    """Load a CSV with polars and prepare it for EDA or model training.

    Attributes:
        batch_size: Number of lines to read into the buffer at once.
        low_memory: Whether to reduce memory pressure while reading.
    """

    def __init__(self, batch_size, low_memory):
        self.batch_size = batch_size # Number of lines to read into the buffer at once
        self.low_memory = low_memory # Reduce memory pressure

    def clean_data(self, df):
        """Return ``df`` without the redundant columns.

        Columns that are absent from ``df`` are ignored, so the same method
        can be applied to frames that carry only some of them.
        """

        # Define columns to drop
        drop_cols = [
            'Id',
            'num_wins_agent1',
            'num_draws_agent1',
            'num_losses_agent1',
        ]

        # Drop every present column in a single call instead of one by one
        present = [col for col in drop_cols if col in df.columns]

        return df.drop(present) if present else df

    def set_datatypes(self, df):
        """Cast the known categorical columns to Categorical and the rest to Float32."""

        # Define categorical columns
        cat_cols = {
            'GameRulesetName',
            'agent1',
            'agent2',
            'Behaviour',
            'StateRepetition',
            'Duration',
            'Complexity',
            'BoardCoverage',
            'GameOutcome',
            'StateEvaluation',
            'Clarity',
            'Decisiveness',
            'Drama',
            'MoveEvaluation',
            'StateEvaluationDifference',
            'BoardSitesOccupied',
            'BranchingFactor',
            'DecisionFactor',
            'MoveDistance',
            'PieceNumber',
            'ScoreDifference',
            'EnglishRules',
            'LudRules'
        }

        # Every column gets exactly one cast: Categorical when it is listed
        # above, Float32 otherwise
        casts = [
            pl.col(col).cast(pl.Categorical if col in cat_cols else pl.Float32)
            for col in df.columns
        ]

        return df.with_columns(casts)

    def extract_cat_cols(self, df):
        """Return the names of the columns whose dtype is ``pl.Categorical``."""

        return [
            col
            for col, dtype in zip(df.columns, df.dtypes)
            if dtype == pl.Categorical
        ]

    def display_info(self, df, for_eda):
        """Print the shape and memory footprint of ``df``.

        Args:
            df: pandas DataFrame when ``for_eda`` is true, polars DataFrame otherwise.
            for_eda: When true, the first rows are also rendered through
                ``display`` (expected to be provided by the notebook environment).
        """

        # Display the shape of the DataFrame
        print(f'Shape: {df.shape}')

        # The two libraries expose the memory footprint differently
        if for_eda:
            mem = df.memory_usage().sum() / 1024**2
        else:
            mem = df.estimated_size() / 1024**2
        print('Memory usage: {:.2f} MB\n'.format(mem))

        # Display first rows of the DataFrame (EDA only)
        if for_eda:
            display(df.head())

    def process_data(self, path, for_eda=True): # Determines whether to convert to pandas for EDA or keep as polars for processing
        """Read, clean and type the CSV at ``path``.

        Args:
            path: Location of the CSV file.
            for_eda: Convert the result to pandas when true, keep polars otherwise.

        Returns:
            Tuple ``(df, cat_cols)`` with the processed frame and the list of
            its categorical column names.
        """

        # Load data as polars DataFrame
        df = pl.read_csv(path, low_memory=self.low_memory, batch_size=self.batch_size)

        # Drop redundant columns
        df = self.clean_data(df)

        # Set datatypes for each column
        df = self.set_datatypes(df)

        # Extract categorical columns (must happen while the frame is still polars)
        cat_cols = self.extract_cat_cols(df)

        # Convert Polars to Pandas DataFrame
        if for_eda:
            df = df.to_pandas()

        # Show the shape and, for EDA, the first few rows of the DataFrame
        self.display_info(df, for_eda)

        return df, cat_cols

In [11]:
# Build the feature-engineering helper from the shared configuration
fe = FE(batch_size=CFG.batch_size, low_memory=CFG.low_memory)

In [12]:
# Read the train file as a pandas frame for EDA; the categorical column list is not needed here
train_data, _ = fe.process_data(CFG.train_path, for_eda=True)

FileNotFoundError: No such file or directory (os error 2): /kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv

<span style="color:#E0BFB8; font-size: 50px; font-weight: bold;">Exploratory Data Analysis</span>

In [11]:
class EDA:
    """Plotly-based exploratory plots for one DataFrame.

    Attributes:
        df: Frame to explore; ``target_distribution`` reads its
            ``utility_agent1`` column.
        color: Colour used for the histogram bars.
    """

    def __init__(self, df, color):
        self.df = df
        self.color = color

    def template(self, fig, title):
        """Apply the shared layout (title, transparent background, size) to ``fig`` and return it."""

        # Set plot background and layout to match the user's theme
        fig.update_layout(
            title=title,
            title_x=0.5,
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(color='#7f7f7f'),
            margin=dict(l=90, r=90, t=90, b=90),
            height=900
        )

        return fig

    def target_distribution(self):
        """Show a 50-bin histogram of the target column ``utility_agent1``."""

        # One title shared by the figure constructor and the layout template
        title = 'Distribution of Agent 1 Utility'

        # Create a histogram for the target distribution
        fig = px.histogram(
            self.df,
            x='utility_agent1',
            nbins=50,  # Granularity of the histogram
            title=title,
            color_discrete_sequence=[self.color]
        )

        # Customize the histogram layout
        fig.update_layout(
            xaxis_title='Utility of Agent 1',
            yaxis_title='Count',
            bargap=0.1
        )

        # Customize hover text: round numbers to 3 decimal places, format large numbers with commas
        fig.update_traces(
            hovertemplate='Utility: %{x:.3f}<br>Count: %{y:,}'
        )

        # Apply the template to the histogram
        fig = self.template(fig, title)

        # Display the histogram
        fig.show()

In [12]:
# Build the Exploratory Data Analysis (EDA) helper around the train frame
eda = EDA(df=train_data, color=CFG.color)

In [13]:
# Plot the histogram of the target column (utility_agent1)
eda.target_distribution()

In [14]:
# Delete every reference to the train data: the EDA helper keeps its own
# handle in `eda.df`, so it has to go too before the frame can be collected
del eda, train_data
gc.collect()

760

<span style="color:#E0BFB8; font-size: 50px; font-weight: bold;">Model Development</span>

In [15]:
class MD:
    """Cross-validated LightGBM training, score plotting and ensemble inference.

    Attributes:
        early_stop: Number of rounds passed to ``lgb.early_stopping``.
        n_splits: Number of K-fold splits.
        color: Marker colour of the per-fold scores in the CV plot.
        lgb_p: Keyword arguments used to build each ``lgb.LGBMRegressor``.
    """

    def __init__(self,
                 early_stop,
                 n_splits,
                 color,
                 lgb_p):

        self.early_stop = early_stop
        self.n_splits = n_splits
        self.color = color
        self.lgb_p = lgb_p

    def plot_cv(self, fold_scores, model_name):
        """Plot the per-fold RMSE scores together with their mean.

        Args:
            fold_scores: Sequence with one RMSE value per fold.
            model_name: Label placed at the start of the plot title.
        """

        # Aggregate the raw scores first so that per-fold rounding error does
        # not leak into the reported mean and standard deviation
        mean_score = round(np.mean(fold_scores), 3)
        std_score = round(np.std(fold_scores), 3)

        # Round the fold scores to 3 decimal places for display
        fold_scores = [round(score, 3) for score in fold_scores]

        # Create a new figure for plotting
        fig = go.Figure()

        # Add scatter plot for individual fold scores
        fig.add_trace(go.Scatter(
            x = list(range(1, len(fold_scores) + 1)),
            y = fold_scores,
            mode = 'markers',
            name = 'Fold Scores',
            marker = dict(size = 24, color=self.color, symbol='diamond'), # Diamond shape marker
            text = [f'{score:.3f}' for score in fold_scores],
            hovertemplate = 'Fold %{x}: %{text}<extra></extra>',
            hoverlabel=dict(font=dict(size=16))  # Adjust the font size here
        ))

        # Add a horizontal line for the mean score
        fig.add_trace(go.Scatter(
            x = [1, len(fold_scores)],
            y = [mean_score, mean_score],
            mode = 'lines',
            name = f'Mean: {mean_score:.3f}',
            line = dict(dash = 'dash', color = '#FFBF00'), # Colored Amber
            hoverinfo = 'none'
        ))

        # Update the layout of the plot
        fig.update_layout(
            title = f'{model_name} | Cross-Validation RMSE Scores | Variation of CV scores: {mean_score} ± {std_score}',
            xaxis_title = 'Fold',
            yaxis_title = 'RMSE Score',
            plot_bgcolor = 'rgba(0,0,0,0)',
            paper_bgcolor = 'rgba(0,0,0,0)',
            xaxis = dict(
                gridcolor = 'lightgray',
                tickmode = 'linear',
                tick0 = 1,
                dtick = 1,
                range = [0.5, len(fold_scores) + 0.5]
            ),
            yaxis = dict(gridcolor = 'lightgray')
        )

        # Display the plot
        fig.show()

    def train_lgb(self, data, cat_cols, title):
        """Train one LightGBM regressor per fold and plot the CV scores.

        Args:
            data: Frame exposing ``to_pandas()`` and containing the
                ``utility_agent1`` target column.
            cat_cols: Names of the feature columns to treat as categorical.
            title: Model name shown in the CV plot title.

        Returns:
            Tuple ``(models, oof_preds)``: the fitted regressors (one per fold)
            and the out-of-fold predictions aligned with the rows of ``data``.
        """

        # Convert data for pandas for training
        data = data.to_pandas()

        # Extract features columns and label
        X = data.drop(columns=['utility_agent1'])
        y = data['utility_agent1']

        # Convert categorical columns to category dtype
        X = X.astype({col: 'category' for col in cat_cols})

        # Initialize cross-validation strategy
        cv = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)

        # Initialize lists to store models, CV scores, and OOF predictions
        models, scores = [], []
        oof_preds = np.zeros(len(X))

        # Perform cross-validation
        for fold, (train_index, valid_index) in enumerate(cv.split(X, y)):

            # Split the data into training and validation sets for the current fold
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            # Train the model
            model = lgb.LGBMRegressor(**self.lgb_p)
            model.fit(X_train, y_train,
                      eval_set=[(X_valid, y_valid)],
                      eval_metric='rmse',
                      callbacks=[lgb.early_stopping(self.early_stop, verbose=0),
                                 lgb.log_evaluation(0)])

            # Append the trained model to the list
            models.append(model)

            # Make predictions on the validation set
            oof_preds[valid_index] = model.predict(X_valid)

            # Calculate and store the RMSE score for the current fold.
            # The square root is taken explicitly because the `squared`
            # argument of mean_squared_error was deprecated and later removed
            # from scikit-learn.
            score = np.sqrt(mse(y_valid, oof_preds[valid_index]))
            scores.append(score)

        # Plot the cross-validation results
        self.plot_cv(scores, title)

        return models, oof_preds

    def infer_lgb(self, data, cat_cols, models):
        """Return the element-wise mean of the predictions of ``models``.

        Args:
            data: Frame exposing ``to_pandas()`` that holds the feature columns.
            cat_cols: Names of the columns to cast to the category dtype.
            models: Fitted models, each exposing ``predict``.
        """

        # Convert data for pandas for inference
        frame = data.to_pandas()

        # Convert categorical columns to category dtype
        frame = frame.astype({col: 'category' for col in cat_cols})

        # Return the averaged predictions of LightGBM models
        return np.mean([model.predict(frame) for model in models], axis=0)

In [16]:
# Build the model-development helper from the shared configuration
md = MD(
    early_stop=CFG.early_stop,
    n_splits=CFG.n_splits,
    color=CFG.color,
    lgb_p=CFG.lgb_p,
)

<span style="color:#E0BFB8; font-size: 30px; font-weight: bold;">Define the trainer function</span>

In [17]:
def train_model():
    """Fit the LightGBM fold models and publish them through module globals.

    Sets the globals ``cat_cols`` (categorical column names of the train
    frame) and ``lgb_models`` (the fitted models returned by ``md.train_lgb``).
    """

    global cat_cols, lgb_models

    # Keep the train frame in polars form and remember its categorical columns
    processed = fe.process_data(CFG.train_path, for_eda=False)
    train, cat_cols = processed

    # Only the fitted models are kept; the out-of-fold predictions are discarded
    fitted, _ = md.train_lgb(train, cat_cols, 'LightGBM')
    lgb_models = fitted

<span style="color:#E0BFB8; font-size: 30px; font-weight: bold;">Define the predict call</span>

In [18]:
# Number of prediction calls served so far
counter = 0

# Prediction entry point handed to the API
def predict(test, submission):
    """Return ``submission`` with the ``utility_agent1`` column filled in.

    The models are trained during the first call; every later call reuses
    them through the globals set by ``train_model``.
    """

    global counter

    # Train LightGBM models only while no call has completed training yet
    if counter == 0:
        train_model()

    # Count the call so later invocations skip the training step
    counter += 1

    # Drop redundant columns, then set the datatypes of the remaining ones
    test = fe.set_datatypes(fe.clean_data(test))

    # Average the fold models' predictions for the test rows
    preds = md.infer_lgb(test, cat_cols, lgb_models)

    # Attach the predictions to the submission DataFrame
    return submission.with_columns(pl.Series('utility_agent1', preds))

<span style="color:#E0BFB8; font-size: 30px; font-weight: bold;">Call the gateway server</span>

In [19]:
# Hand the predict function to the competition's inference server
inference_server = kaggle_evaluation.mcts_inference_server.MCTSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # NOTE(review): presumably set by Kaggle during the scoring rerun - confirm
    inference_server.serve()
else:
    # Otherwise run the local gateway on the same files the rest of the
    # notebook uses, taken from CFG instead of hard-coded /kaggle/input paths
    inference_server.run_local_gateway(
        (
            str(CFG.test_path),
            str(CFG.subm_path),
        )
    )

Shape: (233234, 810)
Memory usage: 725.89 MB

