# Notebook Versions

- Version 1 (11/23/2024)
   * Baseline modeling 1.0


- Version 2 (11/23/2024)
   * Fixing bug.
 

- Version 3 (11/23/2024)
   * Fixing bug. 
 
     
# Loading Libraries

In [None]:
%%time
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import gc
import difflib

import warnings
warnings.filterwarnings('ignore')

from tqdm.notebook import tqdm

import re

from functools import partial
from scipy.stats import kurtosis, skew, gmean, mode

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, FunctionTransformer, PowerTransformer, PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import KNNImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, RepeatedKFold
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, cohen_kappa_score, log_loss, f1_score, r2_score, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.calibration import CalibrationDisplay, CalibratedClassifierCV
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Ridge, RidgeCV
from collections import Counter
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC, LinearSVR

from category_encoders import TargetEncoder

import ydf
from ydf import RandomForestLearner

import xgboost as xgb

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool

from sklearn.neural_network import MLPClassifier

import optuna

# Reading Data

In [None]:
%%time
# Load both competition splits; the generator reads train first, then test.
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../input/wsdm-cup-multilingual-chatbot-arena/train.parquet',
        '../input/wsdm-cup-multilingual-chatbot-arena/test.parquet',
    )
)

# Report (rows, columns) for each split as a quick sanity check.
print('The dimension of the train dataset is:', train.shape)
print('The dimension of the test dataset is:', test.shape)

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
train.head()

In [None]:
# Display the test frame (rendered as the cell output).
test

# Baseline Modeling 1.0

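First, we engineer a few basic features: character and word counts for the prompt and both responses, a flag for whether the prompt contains a question mark, and the prompt-to-response length ratios.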

In [None]:
def fe(df: pd.DataFrame, train: bool = False) -> pd.DataFrame:
    """Add basic length-based features to ``df`` and return it.

    The frame is modified in place (new columns are assigned on ``df``) and
    the same object is returned so the call can be chained or re-assigned.

    Args:
        df: Frame with string columns ``prompt``, ``response_a`` and
            ``response_b``; when ``train`` is true it must also contain
            ``winner``.
        train: When true, encode ``winner`` as ``model_a`` -> 0 and
            ``model_b`` -> 1. Any other value becomes NaN, so calling this
            twice with ``train=True`` on the same frame wipes the labels.

    Returns:
        ``df`` with these columns added: ``prompt_len``, ``prompt_words``,
        ``prompt_question``, ``response_{a,b}_len``, ``response_{a,b}_words``
        and ``prompt_response_{a,b}_len_ratio``.
    """
    prompt = df['prompt']

    df['prompt_len'] = prompt.str.len()
    df['prompt_words'] = prompt.str.split().str.len()
    # Literal substring match: '?' is a regex metacharacter, and the former
    # '\?' spelling is an invalid escape sequence in a non-raw string.
    # na=False keeps a missing prompt from turning into NaN and breaking the
    # integer cast below.
    df['prompt_question'] = prompt.str.contains('?', regex=False, na=False).astype(int)

    df['response_a_len'] = df['response_a'].str.len()
    df['response_b_len'] = df['response_b'].str.len()

    df['response_a_words'] = df['response_a'].str.split().str.len()
    df['response_b_words'] = df['response_b'].str.split().str.len()

    # NOTE: an empty response has length 0, so its ratio is inf (or NaN when
    # the prompt is empty too). Left as-is to keep the feature values stable.
    df['prompt_response_a_len_ratio'] = df['prompt_len'] / df['response_a_len']
    df['prompt_response_b_len_ratio'] = df['prompt_len'] / df['response_b_len']

    if train:
        df['winner'] = df['winner'].map({'model_a': 0, 'model_b': 1})

    return df

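Next, we run a 10-fold stratified cross-validation experiment with a YDF random forest, scoring each fold by accuracy and collecting test-set predictions from every fold.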

In [None]:
%%time
# Build the engineered features on both splits; `winner` is encoded on train only.
train = fe(train, train=True)
test = fe(test)

# Single definition of the model inputs so the train and test column
# selections cannot drift apart.
FEATURES = [
    'prompt_len',
    'prompt_words',
    'prompt_question',
    'response_a_len',
    'response_b_len',
    'response_a_words',
    'response_b_words',
    'prompt_response_a_len_ratio',
    'prompt_response_b_len_ratio',
]

X = train[FEATURES]
y = train['winner']

test_cv = test[FEATURES]

SEED = 42
ydf.verbose(-1)
# Per-fold validation accuracy, and per-fold predictions on the test split.
scores, ydf_test_preds = [], []
skf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=SEED)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):

    # `split` yields positional indices, so both X and y are indexed with
    # `.iloc`; label-based `y[...]` only agrees on a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # The learner takes one frame holding features and label together.
    train_data = pd.concat([X_train, y_train], axis=1)
    test_data = pd.concat([X_test, y_test], axis=1)

    ydf_md = RandomForestLearner(label='winner',
                                 num_threads=10,
                                 num_trees=1000,
                                 max_depth=15).train(train_data)
    ydf_pred = ydf_md.predict(test_data)

    # Threshold the predictions at 0.5 to get hard 0/1 labels for accuracy.
    score = accuracy_score(y_test, np.where(ydf_pred>0.5, 1, 0))
    print('Fold:', i, 'accuracy:', score)
    scores.append(score)

    # Keep this fold's test predictions; they are averaged across folds later.
    ydf_test_preds.append(ydf_md.predict(test_cv))

print('The 10 fold average out-of-fold accuracy is:', np.mean(scores))

In [None]:
%%time
# Start from the sample submission so the file layout matches what is expected.
submission = pd.read_csv('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv')

# Average the per-fold test predictions, threshold at 0.5, and write the
# decoded label directly: above 0.5 -> 'model_b', otherwise 'model_a'.
submission['winner'] = np.where(
    np.mean(ydf_test_preds, axis=0) > 0.5,
    'model_b',
    'model_a',
)
print(submission.head())

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)