**Version 3**: Ancient Baseline (CV 0.6062 | LB 0.61).

**Version 4**: Adds "modern" features (a custom-trained BPE tokenizer followed by TF-IDF) on top of the Ancient Baseline (CV 0.6106 | LB 0.60).

**Version 6**: Same as Version 4, but replaces the single logistic regression with a soft-voting ensemble of logistic regression + FTRL.

In [None]:
import sys
import scipy
import warnings
from tqdm import tqdm
import numpy as np, pandas as pd
from sklearn.base import BaseEstimator
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

sys.path.append('/kaggle/input/libftrl-python')
from ftrl import FtrlProximal  # FTRL (Follow The Regularized Leader)

from datasets import Dataset
from transformers import PreTrainedTokenizerFast
from tokenizers import models, trainers, normalizers, pre_tokenizers, Tokenizer

warnings.filterwarnings("ignore")

In [None]:
# Competition data directory; both splits are stored as parquet files in it.
path = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena/"
train = pd.read_parquet(f"{path}train.parquet")
test = pd.read_parquet(f"{path}test.parquet")

In [None]:
%%time
def tokenize_text(col: str):
    """Train a byte-level BPE tokenizer on one text column and tokenize both splits.

    The tokenizer is trained only on the module-level ``train`` frame and is
    then applied to the same column of ``train`` and of the module-level
    ``test`` frame.

    Args:
        col: Name of the text column to process; it must exist in both
            ``train`` and ``test``.

    Returns:
        A ``(tokenized_texts_train, tokenized_texts_test)`` tuple. Each element
        is a list with one entry per row, holding the output of
        ``tokenizer.tokenize`` for that row's text.
    """
    LOWERCASE = False
    VOCAB_SIZE = 30522

    # Creating Byte-Pair Encoding tokenizer
    raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

    # Adding normalization and pre_tokenizer.
    # NFC is always applied; lowercasing is optional. The list is built step by
    # step because `[NFC()] + [Lowercase()] if LOWERCASE else []` parses as
    # `([NFC()] + [Lowercase()]) if LOWERCASE else []`, which drops NFC
    # whenever LOWERCASE is False.
    normalizer_steps = [normalizers.NFC()]
    if LOWERCASE:
        normalizer_steps.append(normalizers.Lowercase())
    raw_tokenizer.normalizer = normalizers.Sequence(normalizer_steps)
    raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

    # Adding special tokens and creating trainer instance
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

    # Creating huggingface dataset object
    dataset = Dataset.from_pandas(train[[col]])

    def train_corp_iter():
        """Yield the column's texts from ``dataset`` in chunks of 1000 rows."""
        for i in range(0, len(dataset), 1000):
            yield dataset[i : i + 1000][col]

    # Training from iterator; the tokenizer only ever sees the train set.
    raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=raw_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

    # Tokenize train and test sets with the new tokenizer (tqdm shows progress).
    tokenized_texts_train = [
        tokenizer.tokenize(text) for text in tqdm(train[col].tolist())
    ]
    tokenized_texts_test = [
        tokenizer.tokenize(text) for text in tqdm(test[col].tolist())
    ]

    return tokenized_texts_train, tokenized_texts_test

# Each column gets its own tokenizer: tokenize_text trains a fresh one per call.
(tokenized_texts_train_prompt,
 tokenized_texts_test_prompt) = tokenize_text('prompt')
(tokenized_texts_train_response_a,
 tokenized_texts_test_response_a) = tokenize_text('response_a')
(tokenized_texts_train_response_b,
 tokenized_texts_test_response_b) = tokenize_text('response_b')

In [None]:
%%time
def dummy(text):
    """Return ``text`` unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that already-tokenized input is used as-is.
    """
    return text

def vectorize_text(tokenized_texts_train, tokenized_texts_test):
    """Build TF-IDF features (uni- and bi-grams of tokens) for train and test.

    A first vectorizer, capped at 80,000 features, is fitted on the train
    tokens only to obtain the vocabulary. A second vectorizer restricted to
    that vocabulary is then fitted on train and applied to test.

    Args:
        tokenized_texts_train: Pre-tokenized train documents (one token
            sequence per document).
        tokenized_texts_test: Pre-tokenized test documents.

    Returns:
        A ``(train_feats, test_feats)`` tuple holding the outputs of
        ``fit_transform`` on train and ``transform`` on test.
    """
    # Fitting TfidfVectorizer on train set to select the vocabulary.
    # `dummy` is used as tokenizer/preprocessor because the input is already
    # tokenized; token_pattern=None because no regex tokenization is used.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                 max_features=80_000,
                                 lowercase=False,
                                 sublinear_tf=True,
                                 tokenizer=dummy,
                                 preprocessor=dummy,
                                 token_pattern=None,
                                 )
    vectorizer.fit(tokenized_texts_train)
    # Getting vocab
    vocab = vectorizer.vocabulary_

    # Fitting vectorizer on train set again, this time with the fixed
    # vocabulary obtained from the first fit.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                 lowercase=False,
                                 sublinear_tf=True,
                                 vocabulary=vocab,
                                 tokenizer=dummy,
                                 preprocessor=dummy,
                                 token_pattern=None,
                                 )
    train_feats = vectorizer.fit_transform(tokenized_texts_train)
    test_feats = vectorizer.transform(tokenized_texts_test)

    print("train_feats shape:",train_feats.shape)
    # Report the test matrix here (the train shape used to be printed twice).
    print("test_feats shape:",test_feats.shape)
    return train_feats, test_feats

# One TF-IDF matrix pair (train, test) per text column.
train_feats_prompt, test_feats_prompt = vectorize_text(
    tokenized_texts_train=tokenized_texts_train_prompt,
    tokenized_texts_test=tokenized_texts_test_prompt,
)
train_feats_response_a, test_feats_response_a = vectorize_text(
    tokenized_texts_train=tokenized_texts_train_response_a,
    tokenized_texts_test=tokenized_texts_test_response_a,
)
train_feats_response_b, test_feats_response_b = vectorize_text(
    tokenized_texts_train=tokenized_texts_train_response_b,
    tokenized_texts_test=tokenized_texts_test_response_b,
)

# Column-wise concatenation in the order prompt, response_a, response_b.
train_feats_modern = scipy.sparse.hstack(
    [train_feats_prompt, train_feats_response_a, train_feats_response_b]
)
test_feats_modern = scipy.sparse.hstack(
    [test_feats_prompt, test_feats_response_a, test_feats_response_b]
)
print("\ntrain_feats_modern shape:",train_feats_modern.shape)
print("test_feats_modern shape:",test_feats_modern.shape)

In [None]:
%%time
# "Ancient" features: character 1-2-gram TF-IDF (capped at 100,000 features)
# and word TF-IDF (min_df=3), computed for each of the three text columns.
vectorizer_char = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
    max_features=100_000,
    sublinear_tf=True,
)
vectorizer_word = TfidfVectorizer(analyzer='word', min_df=3, sublinear_tf=True)

# The same two vectorizer objects are referenced by all three FeatureUnions.
preprocessor = ColumnTransformer(
    transformers=[
        (
            f'{col}_feats',
            FeatureUnion([
                (f'{col}_char', vectorizer_char),
                (f'{col}_word', vectorizer_word),
            ]),
            col,
        )
        for col in ('prompt', 'response_a', 'response_b')
    ]
)

# Fit on train only, then apply the fitted transformer to test.
train_feats_ancient = preprocessor.fit_transform(train)
test_feats_ancient = preprocessor.transform(test)
print("train_feats_ancient shape:",train_feats_ancient.shape)
print("test_feats_ancient shape:",test_feats_ancient.shape)

In [None]:
# Final feature matrices: "modern" columns first, "ancient" columns after.
train_feats_combined = scipy.sparse.hstack(
    [train_feats_modern, train_feats_ancient]
)
test_feats_combined = scipy.sparse.hstack(
    [test_feats_modern, test_feats_ancient]
)
print("train_feats_combined shape:",train_feats_combined.shape)
print("test_feats_combined shape:",test_feats_combined.shape)

In [None]:
class Ftrl_sklearn(BaseEstimator):
    """Wrapper exposing ``FtrlProximal`` through ``fit`` / ``predict`` / ``predict_proba``.

    The wrapped model's hyper-parameters are fixed in ``__init__``; the
    constructor takes no arguments.
    """

    def __init__(self):
        # Marks the wrapper as a classifier for scikit-learn utilities.
        self._estimator_type = "classifier"
        self._estimator = FtrlProximal(alpha=0.01, beta=1.0, l1=75.0, l2=0.0, model_type='classification')

    def fit(self, X, y):
        """Train the wrapped FTRL model with 4 passes over the data; return ``self``."""
        self._estimator.fit(X, y, num_passes=4)
        return self

    def predict_proba(self, X):
        """Return one row per sample: a leading 0 followed by the model output, clipped at 0.

        NOTE(review): the rows are not normalised to sum to 1 and the first
        column is always 0. Confirm what ``FtrlProximal.predict`` returns
        (probability vs. raw score) before relying on these values as
        probabilities.
        """
        outputs = self._estimator.predict(X)
        rows = []
        for value in outputs:
            # Prepend a 0 column, then floor negatives at 0.
            rows.append(np.clip(np.append(0, value), 0, None))
        return np.array(rows)

    def predict(self, X):
        """Return the index of the largest column of ``predict_proba(X)`` per sample."""
        return self.predict_proba(X).argmax(axis=1)

# The two base models that are cross-validated individually and then ensembled.
log_model = LogisticRegression(
    solver='liblinear',
    dual=True,
    C=0.1,
    random_state=42,
)
ftrl_model = Ftrl_sklearn()

In [None]:
%%time
# Encode the string labels in train['winner'] as integers; the fitted encoder
# is kept so predictions can be mapped back to the original labels later.
encoder = LabelEncoder()
train['winner_encoded'] = encoder.fit_transform(train['winner'])

# 3-fold cross-validated accuracy of the FTRL model on its own.
print('ftrl_model CV performing...')
accuracy_scores = cross_val_score(
    estimator=ftrl_model,
    X=train_feats_combined,
    y=train['winner_encoded'].to_numpy(),
    scoring='accuracy',
    cv=3,
)
print(f'Accuracy scores for each fold: {accuracy_scores}')
print(f'Mean accuracy: {accuracy_scores.mean():.4f}\n')

# Same evaluation for the logistic regression model.
print('log_model CV performing...')
accuracy_scores = cross_val_score(
    estimator=log_model,
    X=train_feats_combined,
    y=train['winner_encoded'].to_numpy(),
    scoring='accuracy',
    cv=3,
)
print(f'Accuracy scores for each fold: {accuracy_scores}')
print(f'Mean accuracy: {accuracy_scores.mean():.4f}')

In [None]:
%%time
# Soft-voting ensemble of the two base models with equal weights.
ensemble = VotingClassifier(
    estimators=[('ftrl', ftrl_model), ('log', log_model)],
    voting='soft',
    weights=[0.50, 0.50],
)

# Fit on the full train set; this fitted ensemble is what predicts on test.
ensemble.fit(train_feats_combined, train['winner_encoded'].to_numpy())

# 3-fold cross-validated accuracy of the ensemble.
print('ensemble CV performing...')
accuracy_scores = cross_val_score(
    estimator=ensemble,
    X=train_feats_combined,
    y=train['winner_encoded'].to_numpy(),
    scoring='accuracy',
    cv=3,
)
print(f'Accuracy scores for each fold: {accuracy_scores}')
print(f'Mean accuracy: {accuracy_scores.mean():.4f}')

In [None]:
# Pick the highest-scoring class index per test row from the ensemble output.
test['winner_'] = np.argmax(ensemble.predict_proba(test_feats_combined), axis=1)
# Map the integer class indices back to the original label strings.
test['winner'] = encoder.inverse_transform(test['winner_'])
# Write the submission file, then display the same columns as the cell output.
test[['id','winner']].to_csv('submission.csv', index=False)
test[['id','winner']]