# <p style="background-color:#E3F6FC; font-family:'Caveat', cursive; color:#4A4E69; font-size:140%; text-align:center; border: 3px dashed #A0CED9; border-radius:25px; padding: 15px; box-shadow: 3px 3px 15px rgba(74, 78, 105, 0.4); font-weight: bold; letter-spacing: 1.5px; text-transform: uppercase;">WSDM GETTING STARTED</p>

In [None]:
%%time

import numpy as np
import polars as pl
import pandas as pd

from sklearn.base import clone
import optuna
import os

from tqdm import tqdm
import category_encoders as ce
from IPython.display import clear_output
from scipy.sparse import hstack

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
import string

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

import lightgbm as lgb
from lightgbm import LGBMRegressor, early_stopping, LGBMClassifier
from lightgbm import early_stopping  
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import *
from sklearn.metrics import *

# <p style="background-color:#E3F6FC; font-family:'Caveat', cursive; color:#4A4E69; font-size:140%; text-align:center; border: 3px dashed #A0CED9; border-radius:25px; padding: 15px; box-shadow: 3px 3px 15px rgba(74, 78, 105, 0.4); font-weight: bold; letter-spacing: 1.5px; text-transform: uppercase;">Load Data</p>

In [None]:
%%time

# Read the competition files: train/test as parquet, the submission template as CSV
train = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet')
test = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet')
sample = pd.read_csv('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv')

# Store the 'id' column as a pandas categorical in both frames
train = train.assign(id=train['id'].astype('category'))
test = test.assign(id=test['id'].astype('category'))

# Encode the target as an integer: model_a -> 0, model_b -> 1
# (any other value becomes NaN, as Series.map does for unmapped keys)
winner_codes = {"model_a": 0, "model_b": 1}
train['winner'] = train['winner'].map(winner_codes)

# Drop these columns from both frames; errors='ignore' skips the ones a frame lacks
cols_to_drops = ['model_a', 'model_b', 'language', 'scored']
train, test = (
    frame.drop(columns=cols_to_drops, errors='ignore')
    for frame in (train, test)
)

# <p style="background-color:#E3F6FC; font-family:'Caveat', cursive; color:#4A4E69; font-size:140%; text-align:center; border: 3px dashed #A0CED9; border-radius:25px; padding: 15px; box-shadow: 3px 3px 15px rgba(74, 78, 105, 0.4); font-weight: bold; letter-spacing: 1.5px; text-transform: uppercase;">Preprocessing Text Columns</p>

In [None]:
%%time

# Load the set of English stopwords from NLTK.
# The corpus is separate from the nltk package, so a fresh environment raises
# LookupError here; in that case download it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Function to compute various text statistics for specified columns in a DataFrame
def text_stat(df, text_columns):
    """
    Add per-row text statistics for each listed text column.

    For every column ``col`` in ``text_columns`` the following columns are
    written to ``df`` (all prefixed with ``{col}_``): ``length``,
    ``word_count``, ``char_count``, ``avg_word_length``, ``punctuation_count``,
    ``capitalized_count``, ``special_char_count``, ``stopwords_count``,
    ``unique_word_count``, ``lexical_diversity``, ``word_length_mean``,
    ``word_length_median``, ``word_length_max`` and ``word_length_min``.

    Words are whitespace-separated tokens (``str.split()``). Missing values
    (None/NaN) are treated as the empty string for the purpose of the
    statistics; the source column itself is left untouched. For a text with no
    words the ratio/mean/median features are NaN and the max/min features are 0.

    Args:
    df (pd.DataFrame): The DataFrame to process. It is modified in place.
    text_columns (list): List of text column names to compute statistics for.

    Returns:
    pd.DataFrame: The same DataFrame object, with the statistic columns added.
    """
    # Set membership is O(1); string.punctuation itself is a plain str
    punctuation_chars = frozenset(string.punctuation)

    # Loop through each text column in the provided list
    for col in tqdm(text_columns, desc="Processing text columns"):
        # Guard against missing texts, then tokenise each row exactly once and
        # reuse the tokens (and their lengths) for every word-level statistic
        texts = df[col].fillna('')
        words = texts.map(str.split)
        word_lengths = words.map(lambda tokens: [len(token) for token in tokens])

        # Length of each text entry in characters (spaces included)
        df[f'{col}_length'] = texts.map(len)

        # Number of whitespace-separated words
        df[f'{col}_word_count'] = words.map(len)

        # Total characters across all words (whitespace excluded)
        df[f'{col}_char_count'] = word_lengths.map(sum)

        # Average word length; 0/0 yields NaN for texts without words
        df[f'{col}_avg_word_length'] = df[f'{col}_char_count'] / df[f'{col}_word_count']

        # Number of ASCII punctuation characters
        df[f'{col}_punctuation_count'] = texts.map(
            lambda text: sum(char in punctuation_chars for char in text)
        )

        # Number of fully upper-case words
        df[f'{col}_capitalized_count'] = words.map(
            lambda tokens: sum(token.isupper() for token in tokens)
        )

        # Number of characters that are neither alphanumeric nor whitespace
        df[f'{col}_special_char_count'] = texts.map(
            lambda text: sum(not char.isalnum() and not char.isspace() for char in text)
        )

        # Number of words whose lower-cased form is an English stopword
        df[f'{col}_stopwords_count'] = words.map(
            lambda tokens: sum(token.lower() in stop_words for token in tokens)
        )

        # Number of distinct words (case-sensitive)
        df[f'{col}_unique_word_count'] = words.map(lambda tokens: len(set(tokens)))

        # Lexical diversity: distinct words divided by total words (NaN when empty)
        df[f'{col}_lexical_diversity'] = df[f'{col}_unique_word_count'] / df[f'{col}_word_count']

        # Mean and median word length; NaN when there are no words
        df[f'{col}_word_length_mean'] = word_lengths.map(
            lambda lengths: np.mean(lengths) if lengths else np.nan
        )
        df[f'{col}_word_length_median'] = word_lengths.map(
            lambda lengths: np.median(lengths) if lengths else np.nan
        )

        # Longest and shortest word length; 0 when there are no words
        df[f'{col}_word_length_max'] = word_lengths.map(lambda lengths: max(lengths, default=0))
        df[f'{col}_word_length_min'] = word_lengths.map(lambda lengths: min(lengths, default=0))

    return df

# Text columns that receive the statistic features
text_columns = ['prompt', 'response_a', 'response_b']

# Add the statistic columns to the train frame first, then to the test frame
train, test = (text_stat(frame, text_columns) for frame in (train, test))

In [None]:
%%time

# Function to process text columns using TF-IDF vectorization and add the features to the original dataset
def TF_IDF_Features_Text_columns(train, test, text_columns, max_features=3000, analyzer='char_wb'):
    """
    Append dense TF-IDF features for each text column to the train and test frames.

    A separate TfidfVectorizer is fitted per column on the train data only and
    then applied to the matching test column. The resulting features are named
    ``tfidf_{col}_{i}`` and are concatenated to the right of the existing
    columns. The input frames are not modified; their indexes are preserved.

    Note: the sparse TF-IDF matrices are converted to dense arrays, so memory
    use grows with rows x max_features x len(text_columns).

    Args:
    train (pd.DataFrame): Training dataset.
    test (pd.DataFrame): Test dataset.
    text_columns (list): List of text column names to process.
    max_features (int): Maximum number of features to extract for each column.
    analyzer (str): Tokenization mode for TF-IDF ('word', 'char', or 'char_wb').

    Returns:
    pd.DataFrame, pd.DataFrame: New train and test datasets with added TF-IDF features.
    """

    train_features = []  # Per-column TF-IDF frames for the train dataset
    test_features = []   # Per-column TF-IDF frames for the test dataset

    # Loop through each text column to process it
    for col in tqdm(text_columns, desc="Processing text columns", unit="col"):
        # Initialize TF-IDF vectorizer with specified parameters
        vectorizer = TfidfVectorizer(analyzer=analyzer, max_features=max_features)

        # Fit on train only, then apply the same vocabulary to test
        train_matrix = vectorizer.fit_transform(train[col])
        test_matrix = vectorizer.transform(test[col])

        # Both matrices share the fitted vocabulary, hence the same column names
        feature_names = [f"tfidf_{col}_{i}" for i in range(train_matrix.shape[1])]

        # Reuse the source frame's index: pd.concat(axis=1) aligns on index
        # labels, so a fresh RangeIndex would misalign (and pad with NaN rows)
        # whenever the input frame does not carry a default 0..n-1 index.
        train_features.append(
            pd.DataFrame(train_matrix.toarray(), index=train.index, columns=feature_names)
        )
        test_features.append(
            pd.DataFrame(test_matrix.toarray(), index=test.index, columns=feature_names)
        )

    # Concatenate the original datasets with the newly generated TF-IDF features
    train_with_tfidf = pd.concat([train, *train_features], axis=1)
    test_with_tfidf = pd.concat([test, *test_features], axis=1)

    return train_with_tfidf, test_with_tfidf

# Text columns to vectorise
text_columns = ['prompt', 'response_a', 'response_b']

# Append the TF-IDF features to both frames (the vectorizers are fitted on train only)
train, test = TF_IDF_Features_Text_columns(train=train, test=test, text_columns=text_columns)

# The raw text columns are removed now that their TF-IDF features exist;
# errors='ignore' tolerates columns that are already absent
train = train.drop(text_columns, axis=1, errors='ignore')
test = test.drop(text_columns, axis=1, errors='ignore')

# <p style="background-color:#E3F6FC; font-family:'Caveat', cursive; color:#4A4E69; font-size:140%; text-align:center; border: 3px dashed #A0CED9; border-radius:25px; padding: 15px; box-shadow: 3px 3px 15px rgba(74, 78, 105, 0.4); font-weight: bold; letter-spacing: 1.5px; text-transform: uppercase;">Modeling</p>

In [None]:
%%time

# Define constants for reproducibility and splitting
SEED = 42
n_splits = 5

# Separate features and target variable from the training dataset.
# 'id' is the row identifier that the load cell casts to a categorical; it is
# excluded so the model cannot split on it. (A missing 'winner' column still
# fails loudly on the next line.)
X = train.drop(columns=['winner', 'id'], errors='ignore')  # Features
y = train['winner']                                        # Target variable

def TrainML(model, X_test_data):
    """
    Cross-validate a classifier with StratifiedKFold and average its test predictions.

    Reads the module-level ``X`` (features), ``y`` (target), ``SEED`` and
    ``n_splits``. For every fold the model is fitted with LightGBM early
    stopping (40 rounds) monitored on that fold's validation split, accuracy
    is measured on the train and validation splits, and the positive-class
    probability is predicted for the test data.

    Notes:
    - ``model`` is refitted in place on every fold, so after the call it holds
      the fit from the last fold.
    - The validation split is also the early-stopping set, so the reported
      validation accuracy is measured on data used to pick the stopping round.

    Args:
    model: Classifier whose ``fit`` accepts ``eval_set`` and ``callbacks`` and
        which provides ``predict`` and ``predict_proba``.
    X_test_data (pd.DataFrame): Test dataset for generating predictions. It must
        contain every column of ``X``; extra columns are ignored.

    Returns:
    mean_test_preds (np.array): Mean positive-class probability on the test
        data across all folds.

    Raises:
    ValueError: If ``X_test_data`` lacks one or more training feature columns.
    """
    # Score the test data on exactly the training features, in training order
    missing_columns = X.columns.difference(X_test_data.columns)
    if len(missing_columns) > 0:
        raise ValueError(
            f"X_test_data is missing {len(missing_columns)} training feature column(s), "
            f"e.g. {list(missing_columns[:10])}"
        )
    X_test_features = X_test_data[X.columns]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    # Lists to store accuracy scores and predictions for each fold
    train_accuracy_scores = []
    val_accuracy_scores = []
    test_preds_list = []

    # Iterate through each fold
    for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X, y),
                                                     desc="Training Folds", total=n_splits)):
        # Split data into training and validation sets for the current fold
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Stop adding trees once the validation metric has not improved for 40 rounds
        callbacks = [early_stopping(stopping_rounds=40, verbose=False)]

        # Train the model on the training set
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)

        # Predict on the training and validation sets
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Calculate accuracy scores for training and validation
        train_accuracy = accuracy_score(y_train, y_train_pred)
        val_accuracy = accuracy_score(y_val, y_val_pred)

        # Append accuracy scores to the respective lists
        train_accuracy_scores.append(train_accuracy)
        val_accuracy_scores.append(val_accuracy)

        # Predict probabilities on the test data and save them
        test_preds = model.predict_proba(X_test_features)[:, 1]  # Probability of the positive class
        test_preds_list.append(test_preds)

        # Print fold-specific accuracy scores
        print(f"Fold {fold+1} - Train Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}")
        clear_output(wait=True)  # Clear the output to keep the console clean

    # Calculate the mean predictions on the test data across all folds
    mean_test_preds = np.mean(test_preds_list, axis=0)

    # Print summary of scores
    print("\n--- Final Mean Scores ---")
    print(f"Mean Train Accuracy: {np.mean(train_accuracy_scores):.4f}")
    print(f"Mean Validation Accuracy: {np.mean(val_accuracy_scores):.4f}")

    # Create a DataFrame to summarize fold-wise accuracy scores
    results_df = pd.DataFrame({
        'Fold': np.arange(1, n_splits + 1),
        'Train Accuracy': train_accuracy_scores,
        'Validation Accuracy': val_accuracy_scores
    })

    # Display fold-wise scores
    print("\n=== Fold-wise Accuracy Scores ===")
    print(results_df)

    return mean_test_preds  # Return the mean test predictions

In [None]:
%%time

# Fixed LightGBM hyperparameters, one per line for readability
# NOTE(review): per the LightGBM docs, 'subsample' (bagging_fraction) only takes
# effect when 'subsample_freq' (bagging_freq) is > 0; none is set here, so it may
# currently be a no-op — confirm before relying on it.
LightParams = {
    'n_estimators': 2860,
    'learning_rate': 0.022544116997360492,
    'max_depth': 11,
    'num_leaves': 31,
    'min_child_samples': 42,
    'subsample': 0.8085392166316496,
    'colsample_bytree': 0.6281848449949525,
    'lambda_l1': 4.02155452669029,
    'lambda_l2': 0.14096175149815865,
    'min_gain_to_split': 0.2960660809801552,
    'n_jobs': -1,
}

# Silent, seeded classifier built from the parameters above
Light_Model = LGBMClassifier(random_state=SEED, verbose=-1, **LightParams)

# Cross-validate and collect the fold-averaged test probabilities
mean_preds = TrainML(Light_Model, test)

# <p style="background-color:#E3F6FC; font-family:'Caveat', cursive; color:#4A4E69; font-size:140%; text-align:center; border: 3px dashed #A0CED9; border-radius:25px; padding: 15px; box-shadow: 3px 3px 15px rgba(74, 78, 105, 0.4); font-weight: bold; letter-spacing: 1.5px; text-transform: uppercase;">Submission</p>

In [None]:
%%time

# Round the fold-averaged probabilities to 0/1, then translate the codes back
# to the label strings expected in the submission file.
# Assumes the rows of `sample` are in the same order as the test frame that
# produced `mean_preds` — TODO confirm.
predicted_codes = np.round(mean_preds).astype('int')
sample['winner'] = pd.Series(predicted_codes, index=sample.index).map({0: 'model_a', 1: 'model_b'})

# Write the submission file, then preview its first rows as the cell output
sample.to_csv('submission.csv', index=False)
sample.head()