# Libraries

In [None]:
!pip install  /kaggle/input/ftfy-dependeces/ftfy-6.2.0-py3-none-any.whl
!pip install  /kaggle/input/textstat-dependencies/textstat-0.7.4-py3-none-any.whl

In [None]:
import pandas as pd
import textstat as ts
import json
from ftfy import fix_encoding
from catboost import CatBoostClassifier,Pool
from lightgbm import LGBMClassifier
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from tqdm.notebook import tqdm
tqdm.pandas()

# Data

In [None]:
# Load the competition's train, test and sample-submission CSV files.
train_df, test_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/lmsys-chatbot-arena/train.csv",
        "/kaggle/input/lmsys-chatbot-arena/test.csv",
        "/kaggle/input/lmsys-chatbot-arena/sample_submission.csv",
    )
)

In [None]:
# Load the two precomputed per-response tables from the `responses-textstat` input.
# NOTE(review): presumably these hold textstat features for the training responses,
# one table per side (A / B) — confirm against the notebook that produced them.
response_a, response_b = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/responses-textstat/response_a.csv",
        "/kaggle/input/responses-textstat/response_b.csv",
    )
)

## Build training features and labels

In [None]:
# Columns dropped from the per-response tables before they are merged into the
# feature matrix.
unused_columns = [
    # model names
    'model_a',
    'model_b',
    # raw text columns
    'prompt',
    'response_a',
    'response_b',
    # winner flags (used separately to build the labels)
    'winner_model_a',
    'winner_model_b',
    'winner_tie',
    # textstat's text_standard column, excluded from the features
    'text_standard',
]

In [None]:
# Join the A-side and B-side feature tables on `id`. Feature columns that exist
# in both tables get pandas' default `_x` (left / A) and `_y` (right / B) suffixes.
features_a = response_a.drop(columns=unused_columns)
features_b = response_b.drop(columns=unused_columns)
textstat_train = pd.merge(features_a, features_b, on='id')

In [None]:
# Collapse the merged table to a single row per `id` by averaging every column.
textstat_train = textstat_train.groupby('id').mean()
# After the groupby, `id` lives in the index rather than in a column, so the index
# has to be written too; with index=False the saved rows would lose their id.
textstat_train.to_csv('textstat.csv', index=True)

In [None]:
# Feature matrix for training: an independent (deep) copy of the aggregated table,
# indexed by `id`.
X = textstat_train.copy(deep=True)

In [None]:
# Target per `id`: take the maximum of each winner flag over the rows of that id,
# then use the position of the largest flag as the class
# (0 = winner_model_a, 1 = winner_model_b, 2 = winner_tie).
winner_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
winner_flags = response_a[['id'] + winner_columns].groupby('id').max()
y = winner_flags.apply(pd.Series.argmax, axis=1)
y.head()

In [None]:
# Stratified K-fold cross-validation. One CatBoost model is trained per fold and kept
# in `clfs` (they are averaged at prediction time); the log loss of each model on its
# held-out fold is stored in `scores`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2024)
n_folds = skf.get_n_splits()
clfs = []
scores = []
for train_index, test_index in tqdm(skf.split(X, y), total=n_folds):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Pool is CatBoost's dedicated dataset class; it is used here to speed up training.
    train_dataset = Pool(data=X_train, label=y_train)
    eval_dataset = Pool(data=X_test, label=y_test)

    clf = CatBoostClassifier(
        depth=6,
        iterations=10000,
        learning_rate=0.06,
        # NOTE: early stopping / best-iteration selection is driven by AUC,
        # while the score recorded below is log loss.
        eval_metric='AUC',

        # Regularization and speed-up
        l2_leaf_reg=10,
        min_data_in_leaf=50,
        max_bin=70,
        random_strength=1,

        # Speed-up parameters
        task_type="CPU",
        thread_count=-1,
        bootstrap_type="Bernoulli",

        # Important: fixed seed for reproducibility
        random_seed=2024,
        early_stopping_rounds=200)

    clf.fit(
        train_dataset,
        eval_set=eval_dataset,
        verbose=200,
        use_best_model=True,
        plot=False)

    clfs.append(clf)

    # The previous score averaged the validation metrics whose name contains "Recall",
    # but no Recall metric is configured above, so the list was empty and every fold
    # score came out as NaN. Compute the held-out log loss explicitly instead.
    fold_proba = clf.predict_proba(eval_dataset)
    scores.append(log_loss(y_test, fold_proba, labels=clf.classes_))

assert len(clfs) == n_folds
print(f"Log loss per fold: {np.round(scores, 5)}; mean: {np.mean(scores):.5f}")

## Generate test

In [None]:
# Read the raw test CSV that the test-time features are generated from.
df_test = pd.read_csv(filepath_or_buffer="/kaggle/input/lmsys-chatbot-arena/test.csv")

In [None]:
def get_exploded(df: pd.DataFrame) -> pd.DataFrame:
    """Parse the JSON text columns of ``df`` and explode them into separate rows.

    Each cell of ``prompt``, ``response_a`` and ``response_b`` is passed through
    ``fix_encoding`` and then ``json.loads``. The three parsed columns are
    exploded together, so elements at the same position of the three parsed
    values land on the same output row.

    Assumes every cell holds a JSON array and that the three arrays of a row
    have equal length (required by the multi-column ``explode``) — TODO confirm
    for new data.

    Args:
        df: Frame containing the three text columns. It is not modified.

    Returns:
        A new frame with the parsed columns exploded; other columns are repeated.
    """
    list_columns = ['prompt', 'response_a', 'response_b']
    exploded = df.copy()
    for name in list_columns:
        exploded[name] = exploded[name].progress_apply(
            lambda raw: json.loads(fix_encoding(raw))
        )
    return exploded.explode(list_columns)

In [None]:
def get_features(df, column):
    """Return a copy of ``df`` with one textstat feature column per metric.

    Every value of ``df[column]`` is converted with ``str()`` and passed to each
    textstat function listed below; the result is stored in a new column named
    after the function. The input frame is left untouched.

    Args:
        df: Source DataFrame.
        column: Name of the column whose values are scored.

    Returns:
        A new DataFrame with the added feature columns, in the order listed.
    """
    # `gulpease_index` and `osman` are intentionally not computed.
    metric_names = (
        'flesch_reading_ease',
        'flesch_kincaid_grade',
        'smog_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'difficult_words',
        'linsear_write_formula',
        'gunning_fog',
        'text_standard',
        'fernandez_huerta',
        'szigriszt_pazos',
        'gutierrez_polini',
        'crawford',
    )
    featured = df.copy()
    for metric_name in metric_names:
        metric = getattr(ts, metric_name)
        # Bind `metric` as a default argument so each lambda uses its own function.
        featured[metric_name] = featured[column].progress_apply(
            lambda value, metric=metric: metric(str(value))
        )
    return featured

In [None]:
# Explode the test frame once, then compute the textstat features separately for
# the `response_a` and the `response_b` column.
test_expl = get_exploded(df_test)
test_a, test_b = (
    get_features(test_expl, side) for side in ('response_a', 'response_b')
)

In [None]:
# Build the test feature matrix the same way as the training one: drop the
# non-feature columns, join the A and B tables on `id`, and average per `id`.
# Note that this rebinds `unused_columns` to a shorter list than the one used for training.
unused_columns = ['prompt', 'response_a', 'response_b', 'text_standard']
test_features_a = test_a.drop(columns=unused_columns)
test_features_b = test_b.drop(columns=unused_columns)
textstat_test = pd.merge(test_features_a, test_features_b, on='id')
test = textstat_test.groupby('id').mean()
test

In [None]:
# Collect the class-probability predictions of every fold model for the test rows.
y_pred = [clf.predict_proba(test) for clf in tqdm(clfs, total=len(clfs))]

In [None]:
# Average the per-model probability arrays element-wise into a single prediction.
# This rebinds `y_pred` from a list of arrays to one array.
n_models = len(y_pred)
y_pred = sum(y_pred) / n_models

# Submission

In [None]:
# Read a fresh copy of the sample submission to use as the output template.
sample_df = pd.read_csv(filepath_or_buffer='/kaggle/input/lmsys-chatbot-arena/sample_submission.csv')

In [None]:
# Write the averaged probabilities into the submission template.
# The rows of `y_pred` follow the index of `test`, which is sorted by id because it
# comes from `groupby('id')`, while `sample_df` keeps the row order of its CSV file.
# Align on id explicitly instead of assigning by position; `.loc` raises a KeyError
# if any submission id has no prediction.
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
pred_by_id = pd.DataFrame(y_pred, index=test.index, columns=target_columns)
sample_df[target_columns] = pred_by_id.loc[sample_df['id']].to_numpy()

In [None]:
# Preview the first five rows of the filled-in submission.
sample_df.head(n=5)

In [None]:
# Write the submission file; the DataFrame index is not written.
sample_df.to_csv(path_or_buf='submission.csv', index=False)