# Libraries

In [None]:
!pip install /kaggle/input/ftfy-dependeces/ftfy-6.2.0-py3-none-any.whl

In [None]:
import pandas as pd
# from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from ftfy import fix_encoding
import re
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
from torch.utils.data import Dataset
import torch
import json
import optuna
import numpy as np

from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
# Load the `list_of_stopwords` column of the stop-word CSV into a plain Python list.
# NOTE(review): `stop_words` is not referenced anywhere else in the visible notebook — confirm it is still needed.
stop_words = pd.read_csv("/kaggle/input/nltk-english-stopwords/nltk_eng_stopwords.csv")["list_of_stopwords"].tolist()

# Load Data

In [None]:
# Read the competition tables: training data, test data and the sample submission
# (the latter is filled with predictions and written out at the end of the notebook).
train_df = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/train.csv")
test_df = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/test.csv")
sample_df = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/sample_submission.csv")

In [None]:
# if test_df.shape[0] < 10:
#     train_df = train_df[:100]

In [None]:
def get_exploded(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with its JSON text columns parsed and exploded.

    The ``prompt``, ``response_a`` and ``response_b`` cells are each passed
    through ``fix_encoding`` and then ``json.loads``. The three parsed columns
    are exploded together, so every element of the parsed values gets its own
    row. The input frame itself is left untouched.
    """
    json_columns = ['prompt', 'response_a', 'response_b']

    def parse_cell(raw):
        # Repair the text encoding first, then decode the JSON payload.
        return json.loads(fix_encoding(raw))

    exploded = df.copy()
    for column in json_columns:
        exploded[column] = exploded[column].progress_apply(parse_cell)

    return exploded.explode(json_columns)

In [None]:
# Parsed + exploded copies of the raw tables; used further down to recover
# the `id` (and, for train, the winner_* labels) of every exploded row.
tmp_train = get_exploded(df=train_df)
tmp_test = get_exploded(df=test_df)

In [None]:
class MyDataset(Dataset):
    """Question/context pairs built from the ``prompt`` column and one response column.

    Every item is a dict ``{'question': ..., 'context': ...}`` where the
    question comes from ``prompt`` and the context from ``col``. Both texts are
    cut to ``_MAX_CHARS`` characters, and missing or empty texts are replaced
    by ``_EMPTY_PLACEHOLDER``.
    """

    # Text substituted for a missing or empty cell.
    _EMPTY_PLACEHOLDER = 'empty_text' * 10
    # Limit applied to both fields; it counts characters, not tokens.
    _MAX_CHARS = 510

    def __init__(self, df, col):
        """Parse and explode ``prompt`` and ``col`` of a copy of ``df``.

        Args:
            df: Frame holding JSON-encoded text in ``prompt`` and ``col``.
            col: Name of the response column used as the context.
        """
        self.col = col
        self.df = df.copy()

        self.df["prompt"] = self.df["prompt"].progress_apply(self.fix_encode)
        self.df[col] = self.df[col].progress_apply(self.fix_encode)

        # One row per element of the parsed values.
        self.df = self.df.explode(['prompt', col])

    def fix_encode(self, x):
        """Repair the encoding of ``x`` and decode it as JSON."""
        return json.loads(fix_encoding(x))

    @staticmethod
    def _is_missing(value):
        """Return True for ``None``, ``pd.NA`` and float NaN."""
        return (
            value is None
            or value is pd.NA
            or (isinstance(value, float) and value != value)
        )

    @classmethod
    def _clean_text(cls, value):
        """Convert a cell to the text handed to the QA pipeline.

        Missing values are treated like empty strings; previously ``str()``
        turned them into the literal texts ``'None'`` / ``'nan'``, which
        skipped the placeholder fallback.
        """
        text = '' if cls._is_missing(value) else str(value)
        return (text or cls._EMPTY_PLACEHOLDER)[:cls._MAX_CHARS]

    def __len__(self):
        """Number of exploded rows."""
        return len(self.df)

    def __getitem__(self, i):
        """Return the ``{'question', 'context'}`` dict for positional row ``i``.

        Raises:
            IndexError: if ``i`` is out of range (raised by ``iloc``); plain
                ``for`` iteration over the dataset relies on this to stop.
        """
        row = self.df.iloc[i]  # fetch the row once instead of once per field
        return {
            'question': self._clean_text(row["prompt"]),
            'context': self._clean_text(row[self.col]),
        }


# One QA dataset per response column: first for the training table ...
dataset_a = MyDataset(df=train_df, col='response_a')
dataset_b = MyDataset(df=train_df, col='response_b')

# ... then the same pair for the test table.
dataset_a_test = MyDataset(df=test_df, col='response_a')
dataset_b_test = MyDataset(df=test_df, col='response_b')

In [None]:
# Notebook output: number of exploded rows in the training dataset for response_a.
len(dataset_a)

In [None]:
# Empty score containers keyed by dataset name, one dict for train and one for test.
# NOTE(review): neither dict is read again in the visible notebook (scores are
# collected in `outs_train` / `outs_test` instead) — confirm they are still needed.
outs_dict = dict(dataset_a=[], dataset_b=[])
outs_dict_test = dict(dataset_a=[], dataset_b=[])

# Models

## Generate Score

In [None]:
# Identifiers of the question-answering checkpoints to score with
# (they look like Hugging Face hub repo ids).
# Every entry, including the commented-out candidates, ends with a comma:
# without it, re-enabling a candidate would make Python silently concatenate
# two adjacent string literals into a single bogus model name.
model_list = [
    "deepset/roberta-base-squad2",
    "deepset/deberta-v3-base-squad2",
    "distilbert/distilbert-base-cased-distilled-squad",
    # "Palak/microsoft_deberta-large_squad",
    # 'distilbert/distilbert-base-cased-distilled-squad',
    # 'deepset/bert-large-uncased-whole-word-masking-squad2',
]

# The same checkpoints as paths below the Kaggle input directory.
model_list_kaggle = [
    '/kaggle/input/deberta-v3-base/' + model_name for model_name in model_list
]
model_list_kaggle

In [None]:
# Build one question-answering pipeline per local checkpoint path, in the
# same order as `model_list_kaggle`.
nlp_list = []

for model_name in model_list_kaggle:
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    # NOTE(review): padding/truncation are normally arguments of the tokenizer
    # call rather than of from_pretrained — confirm they have any effect here.
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True)
    
    # device='cuda' is hard-coded, so this cell needs a GPU runtime.
    # NOTE(review): torch_dtype is passed together with an already-instantiated
    # model — confirm the model really ends up in float16.
    nlp = pipeline('question-answering', model=model, tokenizer=tokenizer, device='cuda',torch_dtype=torch.float16)
    
    nlp_list.append(nlp)

In [None]:
# Save every loaded pipeline under a path equal to its model identifier.
# The loop variable is deliberately not called `pipeline`: that name is the
# factory function imported from `transformers`, and rebinding it here would
# break any later re-run of the cell that builds `nlp_list`.
for model_name, qa_pipeline in tqdm(zip(model_list, nlp_list), total=len(nlp_list)):
    qa_pipeline.save_pretrained(model_name)

In [None]:
def get_outs(model_list):
    """Create the empty score container for a set of models.

    Returns a dict with two keys per model name, ``'<name>-a'`` followed by
    ``'<name>-b'``, each mapped to its own fresh empty list. Keys appear in the
    order of ``model_list``.
    """
    return {
        f'{name}-{suffix}': []
        for name in model_list
        for suffix in ('a', 'b')
    }

In [None]:
# Independent, initially empty score containers for the train and test data.
outs_train = get_outs(model_list=model_list)
outs_test = get_outs(model_list=model_list)

In [None]:
# tokenizer_kwargs = {"truncation": True, 'max_length': 512, 'padding': True}

In [None]:
def get_score(nlp, dataset, doc_stride=47) -> list:
    """Run ``nlp`` on every sample of ``dataset`` and collect the scores.

    Args:
        nlp: Callable invoked as ``nlp(sample, doc_stride=...)``; its result is
            indexed with ``['score']``.
        dataset: Iterable of samples that also supports ``len()``.
        doc_stride: Value forwarded to ``nlp``. Defaults to 47, the value that
            used to be hard-coded.

    Returns:
        One score per sample, in iteration order. A sample whose ``nlp`` call
        raises contributes a score of 0.
    """
    scores = []
    for sample in tqdm(dataset, total=len(dataset)):
        try:
            out = nlp(sample, doc_stride=doc_stride)
        except Exception as exc:
            # Best effort: keep going with a neutral score, but report what
            # failed. Catching Exception (not a bare except) keeps Ctrl-C
            # and SystemExit able to stop a long scoring run.
            print(f'omom: {exc!r}')
            out = {'score': 0}
        scores.append(out['score'])
    return scores

In [None]:
# Score both response datasets, for train and for test, with every pipeline.
# Results are stored under the `<model_name>-a` / `<model_name>-b` keys.
for model_name, nlp in tqdm(zip(model_list, nlp_list), total=len(model_list)):
    outs_train[f'{model_name}-a'] = get_score(nlp, dataset_a)
    outs_train[f'{model_name}-b'] = get_score(nlp, dataset_b)
    
    outs_test[f'{model_name}-a'] = get_score(nlp, dataset_a_test)
    outs_test[f'{model_name}-b'] = get_score(nlp, dataset_b_test)
    
    # NOTE(review): this only unbinds the loop variable; the pipeline is still
    # referenced from `nlp_list`, so no memory is released here.
    del nlp

In [None]:
# Turn the score dicts into tables: one column per `<model_name>-a/-b` key,
# one row per scored sample. The names are rebound from dict to DataFrame.
outs_train = pd.DataFrame(data=outs_train)
outs_test = pd.DataFrame(data=outs_test)

In [None]:
# Notebook output: preview the first rows of the train score table.
outs_train.head()

In [None]:
# Put the scores side by side with the id (and, for train, the winner_* labels)
# of each exploded row. Rows are matched purely by position: the exploded
# frames get a fresh 0..n-1 index so that they line up with the score tables.
# NOTE(review): this relies on `MyDataset` and `get_exploded` producing rows in
# the same order — verify.
df_bert = pd.concat(
    [
        outs_train,
        tmp_train[['id', 'winner_model_a', 'winner_model_b', 'winner_tie']].reset_index(drop=True),
    ],
    axis=1,
)
df_bert_test = pd.concat(
    [outs_test, tmp_test['id'].reset_index(drop=True)],
    axis=1,
)
df_bert.head()

In [None]:
# Names of the score columns (one per `<model_name>-a/-b` key).
# NOTE(review): `model_dataset` is not used again in the visible notebook.
model_dataset = outs_train.columns
model_dataset

In [None]:
# Collapse the exploded rows back to one row per `id` by averaging every
# column; `id` becomes the index of the result. For the train table the
# winner_* label columns are averaged as well.
df_bert_train = df_bert.groupby('id').mean()

# Same aggregation for the test scores (rebinds `df_bert_test`).
df_bert_test = df_bert_test.groupby('id').mean()

In [None]:
# Save the aggregated tables.
# NOTE(review): after the groupby `id` lives in the index, so index=False
# writes both files without any id column — confirm that is intended.
df_bert_train.to_csv('df_bert_train.csv', index=False)
df_bert_test.to_csv('df_bert_test.csv', index=False)

In [None]:
# Notebook output: preview the aggregated train table.
df_bert_train.head()

## Collect in one column

In [None]:
# Collapse the three winner_* columns into a single integer label: the
# position of the row-wise maximum in the listed column order
# (0 = winner_model_a, 1 = winner_model_b, 2 = winner_tie).
df_bert_train['winner'] = df_bert_train[['winner_model_a', 'winner_model_b', 'winner_tie']].apply(np.argmax, axis=1)
df_bert_train.head(2)

In [None]:
# The separate winner_* columns are now represented by 'winner'; remove them in place.
df_bert_train.drop(columns=['winner_model_a', 'winner_model_b', 'winner_tie'], inplace=True)

In [None]:
# Notebook output: check the table after dropping the winner_* columns.
df_bert_train.head(2)

In [None]:
# Save the score columns together with the 'winner' label.
# The id index is not written because index=False.
df_bert_train.to_csv('deberts.csv', index=False)

## Train model

In [None]:
# Classifier trained on the aggregated QA scores, with a fixed random_state and verbose output disabled.
# Note: this rebinds `model`, which earlier in the notebook held a question-answering model.
model = CatBoostClassifier(verbose=False, random_state=2024)

In [None]:
# Name of the label column in `df_bert_train`.
target = 'winner'

In [None]:
# Notebook output: final look at the training table before splitting features and label.
df_bert_train.head()

In [None]:
# Features are every column except the target; the target column is the label vector.
X_train = df_bert_train.drop(columns=target)
y_train = df_bert_train[target]

In [None]:
# Fit the classifier on the full training table (no hold-out split in this cell).
model.fit(X_train, y_train)

In [None]:
# Predict class probabilities for the test conversations.
# `df_bert_test` is indexed by id in the order produced by the groupby, while
# the submission step assigns `y_pred` to `sample_df` row by row. Selecting the
# feature rows in `sample_df`'s id order keeps each prediction on the right
# submission row; `.loc` raises KeyError if an id has no feature row instead of
# silently misaligning the predictions.
y_pred = model.predict_proba(df_bert_test.loc[sample_df['id']])
y_pred

# Submit

In [None]:
# Write the three probability columns into the submission template.
# The assignment is positional: row k of `y_pred` goes to row k of `sample_df`.
# NOTE(review): relies on `y_pred` rows being in the same order as `sample_df`,
# and on the probability columns being ordered model_a, model_b, tie — verify.
sample_df[['winner_model_a', 'winner_model_b', 'winner_tie']] = y_pred

In [None]:
# Notebook output: show the filled-in submission table.
sample_df

In [None]:
# Write the submission file without the DataFrame index.
sample_df.to_csv('submission.csv', index=False)