In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found beneath it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read train.csv from the lmsys-chatbot-arena input dataset.
train_csv_path = '/kaggle/input/lmsys-chatbot-arena/train.csv'
train_data = pd.read_csv(train_csv_path)

In [None]:
# Read test.csv from the lmsys-chatbot-arena input dataset.
test_csv_path = '/kaggle/input/lmsys-chatbot-arena/test.csv'
test_data = pd.read_csv(test_csv_path)

In [None]:
# Preview the first five rows of the raw training frame.
train_data.head(n=5)

In [None]:
# Build the working frames without the `id` column. `drop` returns new frames,
# so the raw `train_data` / `test_data` (and their ids) stay untouched.
train_data_processed, test_data_processed = (
    frame.drop(columns='id') for frame in (train_data, test_data)
)

In [None]:
# Remove the two model-name columns from the training frame only;
# the test frame is not modified here.
model_name_columns = ['model_a', 'model_b']
train_data_processed = train_data_processed.drop(columns=model_name_columns)

In [None]:
# Collapse the three indicator columns into one integer label:
# winner_model_a -> 0, winner_model_b -> 1, winner_tie -> 2.
winner_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
label_by_column = {column: label for label, column in enumerate(winner_columns)}

# idxmax picks, per row, the name of the column holding the largest value.
winning_column = train_data_processed[winner_columns].idxmax(axis=1)
train_data_processed['winner'] = winning_column.apply(label_by_column.__getitem__)

# The indicator columns are now redundant with `winner`.
train_data_processed.drop(columns=winner_columns, inplace=True)

In [None]:
# Show only the first rows; rendering the whole frame adds nothing to a sanity check.
train_data_processed.head()

In [None]:
!pip install transformers
!pip install torch

In [None]:
import torch
from tqdm import tqdm

In [None]:
# Prefer CUDA when torch reports it as available; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
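# Kept for reference only (not executed): fetch bert-base-uncased by name and save
# a local copy. The next cell loads the tokenizer and model from /kaggle/input instead.
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')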
# model = BertModel.from_pretrained('bert-base-uncased')

# tokenizer.save_pretrained('./bert-base-uncased')
# model.save_pretrained('./bert-base-uncased')

In [None]:
# The tokenizer and the encoder are both loaded from the same input directory.
bert_dir = '/kaggle/input/bert-base-uncased/pytorch/uncased/1'
tokenizer = BertTokenizer.from_pretrained(bert_dir)
model = BertModel.from_pretrained(bert_dir)

# Move the encoder onto the selected device (GPU when available).
model.to(device)

In [None]:
def get_bert_embeddings_batch(text_list, batch_size=32):
    """Embed each text as the last-layer hidden state of its first token.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. These are
    looked up when the function is called, so it must run while ``model`` is
    still bound to the BERT encoder.

    Args:
        text_list: Sequence of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
            Must be at least 1.

    Returns:
        A 2-D ``np.ndarray`` with one row per input text, in input order, and
        one column per hidden unit of the encoder. An empty ``text_list``
        yields an array with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    # np.vstack rejects an empty list, so return a correctly shaped empty result.
    # Assumes float32 output, the usual dtype for a default-loaded encoder.
    if len(text_list) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in tqdm(range(0, len(text_list), batch_size)):
        batch_texts = text_list[i:i+batch_size]
        # Pad to the longest text in the batch; anything past 512 tokens is cut off.
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to GPU if available
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of every sequence (the [CLS] token for BERT tokenizers) serves
        # as the text's embedding; bring it back to the CPU as a NumPy array.
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

In [None]:
# Embed the three text columns of the training frame, one after another:
# prompt, then response_a, then response_b.
train_prompt_embeddings, train_response_a_embeddings, train_response_b_embeddings = (
    get_bert_embeddings_batch(train_data_processed[text_column].tolist())
    for text_column in ('prompt', 'response_a', 'response_b')
)

In [None]:
# Embed the same three text columns of the test frame, in the same order
# as for the training frame.
test_prompt_embeddings, test_response_a_embeddings, test_response_b_embeddings = (
    get_bert_embeddings_batch(test_data_processed[text_column].tolist())
    for text_column in ('prompt', 'response_a', 'response_b')
)

In [None]:
# Place the prompt, response_a and response_b embeddings side by side so each
# row becomes one feature vector (prompt | response_a | response_b).
train_parts = (train_prompt_embeddings, train_response_a_embeddings, train_response_b_embeddings)
test_parts = (test_prompt_embeddings, test_response_a_embeddings, test_response_b_embeddings)

train_embeddings = np.hstack(train_parts)
test_embeddings = np.hstack(test_parts)

In [None]:
# Store each third of the stacked feature matrix as a per-row array column.
# The matrix is split into equal-width parts instead of slicing at fixed
# 768-wide offsets, so the columns stay correct if the encoder's hidden size
# changes; np.split raises if the width is not divisible by three.
embedding_columns = ['prompt_embedding', 'response_a_embedding', 'response_b_embedding']

for frame, matrix in (
    (train_data_processed, train_embeddings),
    (test_data_processed, test_embeddings),
):
    parts = np.split(matrix, len(embedding_columns), axis=1)
    for column, part in zip(embedding_columns, parts):
        # list(part) yields one 1-D array per row, i.e. one embedding per cell.
        frame[column] = list(part)

In [None]:
# Preview the first rows, now including the per-row embedding columns.
train_data_processed.head()

In [None]:
# Features: the stacked embedding matrix. Target: the integer `winner` label.
X, y = train_embeddings, train_data_processed['winner']

In [None]:
# Preview the first few feature rows rather than echoing the whole matrix.
X[:5]

In [None]:
# Preview the first few labels rather than echoing the whole Series.
y.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [None]:
pip install catboost

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [None]:
# NOTE: this rebinds `model`, which until now referred to the BERT encoder.
# get_bert_embeddings_batch reads the global `model`, so it can no longer be
# called after this cell without reloading the encoder.
model = CatBoostClassifier(
    iterations = 1460,
    learning_rate = 0.01,
    depth = 7,
    loss_function = 'MultiClass',  # three-way target: model_a / model_b / tie
    eval_metric = 'Accuracy',
    random_seed = 0,
    # Train on the GPU only when one is present, mirroring the CUDA check used
    # for `device`; an unconditional 'GPU' fails on CPU-only sessions.
    task_type = 'GPU' if torch.cuda.is_available() else 'CPU',
    verbose = 100  # report progress every 100 iterations
)

In [None]:
# Fit on the training split; the held-out split is passed as the evaluation set.
validation_set = (X_val, y_val)
model.fit(X_train, y_train, eval_set=validation_set)

In [None]:
# With its default prediction type, CatBoost's predict() returns the predicted
# class labels themselves (for a multiclass model, as a single-column 2-D array),
# not per-class scores. Taking argmax over that one column would yield 0 for
# every row, so flatten the labels instead.
val_preds = model.predict(X_val)
val_preds_class = np.asarray(val_preds).reshape(-1).astype(int)

accuracy = accuracy_score(y_val, val_preds_class)
print(f'Validation Accuracy: {accuracy:.4f}')

In [None]:
# Test features: the stacked test embeddings, laid out like the training matrix X.
X_test = test_embeddings

In [None]:
# Per-class probabilities for every test row; the submission below reads
# columns 0, 1 and 2 as model_a, model_b and tie respectively.
test_preds_prob = model.predict_proba(X_test)

In [None]:
# Assemble the submission: the original test ids followed by one probability
# column per class, in label order (0 = model_a, 1 = model_b, 2 = tie).
probability_columns = ('prob_winner_model_a', 'prob_winner_model_b', 'prob_winner_tie')

submission_data = {'id': test_data['id']}
for class_index, column_name in enumerate(probability_columns):
    submission_data[column_name] = test_preds_prob[:, class_index]

submission = pd.DataFrame(submission_data)

# Write without the DataFrame index so the file contains only the four columns.
submission.to_csv('submission.csv', index=False)