## This notebook presents a very simple idea for a possible baseline for the competition.

#### The solution highlights how particular aspects of the prompt and the responses can help achieve better results than the strategy of always choosing model_a (or always model_b).

#### The notebook can be extended and improved: the model could be tuned and optimized, and additional models could be tested.

#### I want this notebook to be an easy starting point for those who want to use features of the text for their modelling.

#### Feel free to share your opinions in the comments, or fork the notebook and try out additional features!

# Imports

In [None]:
# Install polyglot, PyICU and pycld2 from the wheel files stored under
# /kaggle/input/chardet/ (polyglot.detect is imported further down).
!pip install /kaggle/input/chardet/polyglot-16.7.4-py2.py3-none-any.whl
!pip install /kaggle/input/chardet/PyICU-2.14-cp310-cp310-linux_x86_64.whl
!pip install /kaggle/input/chardet/pycld2-0.41-cp310-cp310-linux_x86_64.whl

In [None]:
#!pip install PyICU
#!pip install pycld2

In [None]:
#!mv /root/.cache/pip/wheels/78/6e/76/17c73021179c06c29d9b108896b9248da0de4f2af93f63d405/PyICU-2.14-cp310-cp310-linux_x86_64.whl /kaggle/working/PyICU-2.14-cp310-cp310-linux_x86_64.whl
#!mv /root/.cache/pip/wheels/be/81/31/240c89c845e008a93d98542325270007de595bfd356eb0b06c/pycld2-0.41-cp310-cp310-linux_x86_64.whl /kaggle/working/pycld2-0.41-cp310-cp310-linux_x86_64.whl

In [None]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from lightgbm import early_stopping,log_evaluation,LGBMClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from polyglot.detect import Detector
from sklearn.preprocessing import LabelEncoder

# Data Loading

In [None]:
# Directory holding the competition files; the trailing slash lets the file
# names be appended directly.
path = "/kaggle/input/wsdm-cup-multilingual-chatbot-arena/"

# Labelled training rows, unlabelled test rows and the submission template.
train = pd.read_parquet(f"{path}train.parquet")
test = pd.read_parquet(f"{path}test.parquet")
sub = pd.read_csv(f"{path}sample_submission.csv")

In [None]:
def getLanguageAndConfidence(text):
    """Detect the top-ranked language of ``text`` with polyglot's ``Detector``.

    Args:
        text: The text to analyse.

    Returns:
        A ``(name, confidence)`` tuple taken from the first entry of
        ``Detector(text, quiet=True).languages``, or ``("Unknown", 0)`` when
        detection raises an exception.
    """
    try:
        language = Detector(text, quiet=True).languages[0]
        return (language.name, language.confidence)
    except Exception:
        # Detection is best-effort: any detector failure maps to the
        # "Unknown" fallback. Catching Exception (rather than a bare except)
        # still lets KeyboardInterrupt / SystemExit stop a long-running
        # DataFrame.apply over the whole dataset.
        return ("Unknown", 0)

In [None]:
# Smoke-test the language detector on the first training prompt only.
for prompt in train["prompt"].head(1):
    print(getLanguageAndConfidence(prompt))

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

# Data split and feature calculation

In [None]:
# Hold out 20% of the rows as a validation split, stratified on the winner
# label; change test_size to use a different fraction.
train, valid = train_test_split(
    train,
    test_size=0.2,
    stratify=train["winner"],
    random_state=161194,
)

# Mirror the training rows: swapping the two responses and flipping the
# winner label yields twice as many training examples. The validation split
# is taken before mirroring, so it is not duplicated.
train_inv = train.copy()
original_a = train_inv["response_a"]
original_b = train_inv["response_b"]
train_inv["response_a"] = original_b
train_inv["response_b"] = original_a
train_inv["winner"] = train_inv["winner"].map(
    lambda label: "model_a" if "b" in label else "model_b"
)

In [None]:
# Show the detected (language, confidence) pair for the first few prompts.
train["prompt"].head().apply(getLanguageAndConfidence)

In [None]:
# Here I compute some features
def compute_feats(df):
    """Add hand-crafted text features for the prompt and both responses.

    For each of ``response_a``, ``response_b`` and ``prompt`` the function
    adds columns named ``<column>_<feature>``: the text length, occurrence
    counts for several character classes, character-class ratios relative to
    the length, bracket balance, "json"/"yaml" mention counts, and the
    detected language with its confidence.

    The columns are created in a fixed order and ``df`` is modified in place.

    Args:
        df: DataFrame with string columns ``response_a``, ``response_b`` and
            ``prompt``.

    Returns:
        The same ``df`` object, with the feature columns added.
    """
    # (feature suffix, regex) pairs for the plain occurrence counts, in the
    # order the columns are created. Patterns containing backslashes are raw
    # strings so they reach the regex engine unchanged instead of relying on
    # Python's deprecated handling of unknown escapes such as "\s" or "\(".
    count_patterns = (
        ("spaces", r"\s"),
        ("punct", r",|\.|!"),
        ("question_mark", r"\?"),
        ("quot", "'|\""),
        ("formatting_chars", r"\*|\_"),
        ("math_chars", r"\-|\+|\="),
        ("curly_open", r"\{"),
        ("curly_close", "}"),
        ("round_open", r"\("),
        ("round_close", r"\)"),
        ("accent_chars", "è|ò|à|ù|é|ì"),
        ("special_chars", r"\W"),
    )

    for col in ["response_a", "response_b", "prompt"]:
        text = df[col]

        # Response length is a key factor when choosing between two responses.
        df[f"{col}_len"] = text.str.len()
        length = df[f"{col}_len"]

        # Character-class occurrence counts.
        for suffix, pattern in count_patterns:
            df[f"{col}_{suffix}"] = text.str.count(pattern)

        # Character-class ratios relative to the text length. The division is
        # not guarded, so a zero-length text does not produce a finite ratio.
        df[f"{col}_digits"] = text.str.count(r"\d") / length
        df[f"{col}_lower"] = text.str.count("[a-z]").astype("float32") / length
        df[f"{col}_upper"] = text.str.count("[A-Z]").astype("float32") / length
        # The trailing "+" makes this count runs of consecutive CJK
        # characters, not individual characters.
        df[f"{col}_chinese"] = (
            text.str.count(r'[\u4e00-\u9fff]+').astype("float32") / length
        )

        # Boolean flag: does the text contain at least one tilde?
        df[f"{col}_tild"] = text.str.count("~") > 0

        # How balanced the round and curly brackets are (0 means balanced).
        df[f"{col}_round_balance"] = df[f"{col}_round_open"] - df[f"{col}_round_close"]
        df[f"{col}_curly_balance"] = df[f"{col}_curly_open"] - df[f"{col}_curly_close"]

        # Case-insensitive mentions of "json" / "yaml" (e.g. a request for a
        # JSON response).
        lowered = text.str.lower()
        df[f"{col}_json"] = lowered.str.count("json")
        df[f"{col}_yaml"] = lowered.str.count("yaml")

        # Detected language name and detector confidence, one tuple per row.
        detected = text.apply(getLanguageAndConfidence)
        df[f"{col}_language"] = detected.str[0]
        df[f"{col}_confidence"] = detected.str[1]

    return df
# Featurise the original and the mirrored training rows, then stack them
# into a single training frame.
train, train_inv = compute_feats(train), compute_feats(train_inv)
train = pd.concat([train, train_inv])

# The validation and test frames get the same feature columns.
valid, test = compute_feats(valid), compute_feats(test)

# Ancient Baseline from notebook [WSDM: Ancient Baseline](https://www.kaggle.com/code/yekenot/wsdm-ancient-baseline) 
Please go and like the notebook

In [None]:
# Character-level (uni/bi-gram) and word-level TF-IDF templates, shared by
# the three text columns.
vectorizer_char = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(1, 2),
    max_features=100_000,
)
vectorizer_word = TfidfVectorizer(sublinear_tf=True, analyzer='word', min_df=3)

# One char+word FeatureUnion per text column, in the order
# prompt, response_a, response_b.
preprocessor = ColumnTransformer(
    transformers=[
        (
            f"{text_col}_feats",
            FeatureUnion([
                (f"{text_col}_char", vectorizer_char),
                (f"{text_col}_word", vectorizer_word),
            ]),
            text_col,
        )
        for text_col in ("prompt", "response_a", "response_b")
    ]
)

# Fit the vocabulary on the training rows only, then reuse it for the
# test and validation rows.
text_columns = ["response_a", "response_b", "prompt"]
train_feats = preprocessor.fit_transform(train[text_columns])
test_feats = preprocessor.transform(test[text_columns])
valid_feats = preprocessor.transform(valid[text_columns])

In [None]:
# First-stage model: logistic regression on the TF-IDF features, trained on
# the string winner labels.
model = LogisticRegression(
    C=0.1,
    solver='liblinear',
    dual=True,
    random_state=42,
)
model.fit(train_feats, train["winner"])

In [None]:
# Keep column 1 of predict_proba, i.e. the probability of the second entry in
# model.classes_ (presumably "model_b" -- verify against model.classes_).
# NOTE(review): the training scores come from the same rows the model was
# fitted on, so they may look more confident than the validation/test scores.
score_train, score_valid, score_test = (
    model.predict_proba(features)[:, 1]
    for features in (train_feats, valid_feats, test_feats)
)

In [None]:
# Attach the first-stage probability to each frame as the "score" feature.
for frame, scores in (
    (train, score_train),
    (valid, score_valid),
    (test, score_test),
):
    frame["score"] = scores

# Prepare Data for training

In [None]:
# List the column names; the next cell selects features by position in this list.
train.columns

In [None]:
# Every column from position 8 onwards is used as a model feature.
# NOTE(review): this assumes the first eight columns of train are the raw,
# non-feature columns -- confirm against train.columns.
feats = train.columns[8:].tolist()

# Binary target: 1 when model_a wins, 0 otherwise.
for frame in (train, valid):
    frame["winner"] = frame["winner"].eq("model_a").astype("int")

In [None]:
# Cast the detected-language columns to the pandas "category" dtype in every
# frame.
for frame in (train, valid, test):
    for c in ["response_a_language", "response_b_language", "prompt_language"]:
        frame[c] = frame[c].astype("category")

In [None]:
# Feature matrix and binary target for training and for validation.
X, y = train[feats], train["winner"]
X_val, y_val = valid[feats], valid["winner"]

# Model training

In [None]:
def get_callbacks():
    """Return the LightGBM callbacks: early stopping (100 rounds) and per-iteration logging."""
    stopper = early_stopping(stopping_rounds=100)
    reporter = log_evaluation(period=1)
    return [stopper, reporter]


# Second-stage model; this rebinds `model`, which previously held the
# logistic regression.
model = LGBMClassifier(n_estimators=1000, learning_rate=0.2)

In [None]:
# Train on the stacked training rows, evaluating on the held-out validation
# split with the callbacks from get_callbacks().
history = model.fit(
    X,
    y,
    eval_set=(X_val, y_val),
    eval_metric="binary_error",
    callbacks=get_callbacks(),
)

# Prediction

In [None]:
# Test feature matrix, using the same feature columns as training.
X_test = test.loc[:, feats]

In [None]:
# Store the predicted class (1 = model_a wins, 0 = model_b wins, matching the encoding of the training target).
test["winner"]=model.predict(X_test)

In [None]:
# Map the binary predictions back to the label strings.
test["winner"] = [
    "model_a" if pred == 1 else "model_b" for pred in test["winner"]
]

# The submission keeps only the row id and the predicted winner.
sub = test[["id", "winner"]]

In [None]:
# Preview the first rows of the submission frame.
sub.head()

# Submission

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv("submission.csv",index=False)