In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [None]:
# Root folder of the competition data. Kept WITHOUT a trailing slash because
# the CSV paths in this notebook are built as f"{INPUT_DIR}/<file>"; a trailing
# slash here would produce ".../lmsys-chatbot-arena//train.csv".
INPUT_DIR = "/kaggle/input/lmsys-chatbot-arena"
# Training split, read with pandas' default CSV settings.
train_df = pd.read_csv(f"{INPUT_DIR}/train.csv")


In [None]:
# Display the freshly loaded training frame for a quick visual check.
train_df

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train_df.info()


In [None]:
# Hugging Face checkpoint name shared by the tokenizer and the encoder.
MODEL_ID = "bert-base-uncased"
# Tokenizer and model are loaded from the same checkpoint so that the token
# ids produced by one are the ids the other was trained on.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID)

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)

In [None]:
# Move the model's parameters onto the selected device (GPU when available,
# otherwise CPU). As the last expression of the cell, the returned module is
# also displayed.
model.to(device)


In [None]:
# The tokenizer is called with padding=True later on, which requires a padding
# token. Checkpoints that ship without one get '[PAD]' registered here.
if tokenizer.pad_token is None:
    # add_special_tokens registers the token, links it to tokenizer.pad_token
    # and returns how many new vocabulary entries were created.
    num_added_tokens = tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    if num_added_tokens > 0:
        # A brand-new token id has no row in the model's embedding matrix yet;
        # grow the matrix so looking that id up cannot go out of range.
        model.resize_token_embeddings(len(tokenizer))


In [None]:
def tokenize_text(text):
    """Encode ``text`` with the module-level tokenizer and move it to ``device``.

    The tokenizer is called with padding and truncation enabled and is asked
    for PyTorch tensors; the resulting encoding is returned after
    ``.to(device)``.
    """
    encoding = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoding.to(device)

In [None]:
# Tokenize every raw text column row by row and store the encodings in the
# matching "*_tokens" column.
for source_col, token_col in (
    ('prompt', 'prompt_tokens'),
    ('response_a', 'response_a_tokens'),
    ('response_b', 'response_b_tokens'),
):
    train_df[token_col] = train_df[source_col].apply(tokenize_text)


In [None]:
# Display the frame again to inspect the newly added *_tokens columns.
train_df

In [None]:
def get_embeddings(text_tokens):
    """Return a mean-pooled embedding for an already tokenized input.

    Runs the module-level ``model`` on ``text_tokens`` without tracking
    gradients and averages ``last_hidden_state`` over the sequence axis.

    When ``text_tokens`` carries an ``attention_mask``, only positions with a
    non-zero mask contribute to the average, so padding added for a batch of
    unequal-length texts does not dilute the embedding. For a single,
    unpadded sequence the mask is all ones and the result is the plain mean.

    Parameters
    ----------
    text_tokens : mapping of str to torch.Tensor
        Keyword inputs for ``model`` (e.g. the output of ``tokenize_text``).

    Returns
    -------
    numpy.ndarray
        The pooled embedding, squeezed of size-1 dimensions and moved to CPU.
    """
    with torch.no_grad():
        outputs = model(**text_tokens)

    hidden_states = outputs.last_hidden_state
    if "attention_mask" in text_tokens:
        # Broadcast the mask over the hidden dimension and take a weighted mean.
        weights = text_tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        # clamp guards against division by zero for an all-masked row.
        pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
    else:
        # No mask supplied: fall back to an unweighted mean over all positions.
        pooled = hidden_states.mean(dim=1)

    return pooled.squeeze().cpu().numpy()


In [None]:
# Embed each tokenized column row by row and store the vectors in the
# matching "*_embeddings" column.
for token_col, embedding_col in (
    ('prompt_tokens', 'prompt_embeddings'),
    ('response_a_tokens', 'response_a_embeddings'),
    ('response_b_tokens', 'response_b_embeddings'),
):
    train_df[embedding_col] = train_df[token_col].apply(get_embeddings)


In [None]:
# Feature matrix: prompt, response_a and response_b embeddings placed side by
# side, one row per training example.
# ignore_index=True renumbers the concatenated columns 0..N-1; without it each
# block keeps its own 0..d-1 labels and the frame ends up with every column
# label repeated three times.
X = pd.concat(
    [
        pd.DataFrame(train_df['prompt_embeddings'].tolist()),
        pd.DataFrame(train_df['response_a_embeddings'].tolist()),
        pd.DataFrame(train_df['response_b_embeddings'].tolist()),
    ],
    axis=1,
    ignore_index=True,
)
# Integer class per row: position of the largest value among the three winner
# columns (0 -> winner_model_a, 1 -> winner_model_b, 2 -> winner_tie).
y = train_df[['winner_model_a', 'winner_model_b', 'winner_tie']].values.argmax(axis=1)


In [None]:
# Input features: concatenated prompt / response_a / response_b embeddings.
X

In [None]:
# Target labels: argmax over the winner_model_a / winner_model_b / winner_tie columns.
y

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest with 100 trees, seeded for repeatable results.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# Fit on the training portion only; the fitted estimator is the cell output.
clf.fit(X_train, y_train)


In [None]:
# Predict hard class labels for the held-out rows.
y_pred = clf.predict(X_val)
# Fraction of validation rows whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')


In [None]:
# Test split, read from the same input folder as the training data.
test_df = pd.read_csv(f"{INPUT_DIR}/test.csv")

In [None]:
# Process Test Data: mirror the training pipeline (tokenize, then embed) for
# each text column of the test split.
test_text_columns = ('prompt', 'response_a', 'response_b')

# All *_tokens columns are added before any *_embeddings column so the frame's
# column order matches the one produced for the training data.
for text_col in test_text_columns:
    test_df[f'{text_col}_tokens'] = test_df[text_col].apply(tokenize_text)

for text_col in test_text_columns:
    test_df[f'{text_col}_embeddings'] = test_df[f'{text_col}_tokens'].apply(get_embeddings)

# Prepare Test Features: embeddings side by side in the same prompt /
# response_a / response_b order used for training. ignore_index=True renumbers
# the columns 0..N-1 so the frame does not carry duplicate column labels.
X_test = pd.concat(
    [pd.DataFrame(test_df[f'{text_col}_embeddings'].tolist()) for text_col in test_text_columns],
    axis=1,
    ignore_index=True,
)

In [None]:
# Display the test feature matrix for a quick visual check.
X_test

In [None]:
# Class probabilities for every test row; column j belongs to clf.classes_[j].
y_test_pred_prob = clf.predict_proba(X_test)

# Submission columns in the order used to build the integer labels
# (0 -> winner_model_a, 1 -> winner_model_b, 2 -> winner_tie).
winner_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
# Name each probability column through clf.classes_ instead of by position,
# then reindex so a class that never appeared in the training split still gets
# a (zero-probability) column rather than causing a shape mismatch.
submission_df = pd.DataFrame(
    y_test_pred_prob,
    columns=[winner_columns[int(label)] for label in clf.classes_],
).reindex(columns=winner_columns, fill_value=0.0)
# Attach ids positionally (to_numpy) so the result does not depend on
# test_df's index lining up with the fresh RangeIndex of submission_df.
submission_df.insert(0, 'id', test_df['id'].to_numpy())


In [None]:
# Display the submission frame (id plus the three probability columns).
submission_df

In [None]:
# Write the submission CSV; index=False keeps the pandas row index out of the file.
submission_df.to_csv('/kaggle/working/submission.csv', index=False)

# Confirmation message, printed once to_csv has returned.
print("Submission file created.")