In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lmsys-chatbot-arena/train.csv',
        '/kaggle/input/lmsys-chatbot-arena/test.csv',
    )
)

In [None]:
# Show which columns each frame provides before any preprocessing
for heading, frame in (("Train Data Columns:", train_df), ("\nTest Data Columns:", test_df)):
    print(heading, frame.columns)

In [None]:
# Use a smaller subset of the training data to keep tokenization and training fast.
# .copy() makes the subset an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning (head() alone returns a slice).
N_TRAIN_ROWS = 2000
train_df = train_df.head(N_TRAIN_ROWS).copy()

# The test frame is intentionally left untouched: the submission built from it must
# contain one prediction per test id, so truncating it would drop rows from the output.

In [None]:
# Collapse the three winner indicator columns into one integer class label, taken as
# the position of the largest value in each row: 0 = model A, 1 = model B, 2 = tie
winner_flags = train_df[['winner_model_a', 'winner_model_b', 'winner_tie']].to_numpy()
train_df['winner'] = winner_flags.argmax(axis=1)

In [None]:
# Stack every text field of both frames into one Series (train fields first, then test)
# so the tokenizer fitted on it sees the full corpus
text_columns = ('prompt', 'response_a', 'response_b')
all_text = pd.concat(
    [frame[column] for frame in (train_df, test_df) for column in text_columns]
)

In [None]:
# Build the vocabulary by fitting a Keras Tokenizer on the combined text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_text)

# Size of the embedding input: one slot per word in word_index plus one extra
# (the Keras Tokenizer never assigns index 0 to a word)
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
def tokenize_and_pad(text_series, tokenizer, max_len):
    """Turn a collection of texts into a fixed-width array of token ids.

    Args:
        text_series: The texts to encode (e.g. a DataFrame column).
        tokenizer: An already fitted tokenizer exposing ``texts_to_sequences``.
        max_len: Value forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` applied to the token-id sequences, using
        its default padding and truncation settings.
    """
    token_ids = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(token_ids, maxlen=max_len)

In [None]:
# Define maximum sequence length
# Passed as `max_len` to tokenize_and_pad for each text field (prompt, response_a,
# response_b) of both the training and the test data; because the three fields are
# concatenated afterwards, the model input is three times this wide.
max_len = 100

In [None]:
# Encode each training text field separately, then join the three encoded blocks
# column-wise (prompt | response_a | response_b) into a single feature matrix
x_prompt, x_response_a, x_response_b = (
    tokenize_and_pad(train_df[field], tokenizer, max_len)
    for field in ('prompt', 'response_a', 'response_b')
)
x_train = np.concatenate((x_prompt, x_response_a, x_response_b), axis=1)

In [None]:
# One-hot encode the integer winner label (0 = model A, 1 = model B, 2 = tie).
# num_classes is pinned to 3 because to_categorical otherwise infers the width from
# the largest label present; a subset without any tie would then yield only two
# columns and no longer match the model's 3-unit softmax output.
y_train = to_categorical(train_df['winner'], num_classes=3)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split
# repeatable.
# NOTE: the results are written back to x_train / y_train, so re-running this cell on
# its own splits the already-reduced training set again.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Assemble the network: token embedding -> two stacked LSTMs with dropout between
# them -> 3-way softmax (one output per winner class)
model = Sequential([
    Embedding(vocab_size, 128),
    LSTM(64, return_sequences=True),  # emit the full sequence for the next LSTM
    Dropout(0.5),
    LSTM(64),
    Dense(3, activation='softmax'),
])

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy on the one-hot
# targets, and accuracy reported alongside the loss
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fit for 10 epochs in mini-batches of 32, evaluating on the held-out validation
# split after each epoch; the returned History object feeds the plots below
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=10,
)

In [None]:
# Draw the learning curves side by side: loss on the left, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))

# One entry per panel: axes, train key, validation key, title, y-label, legend position
panels = (
    (ax1, 'loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),
    (ax2, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy', 'upper left'),
)
for axis, train_key, val_key, title, y_label, legend_loc in panels:
    axis.plot(history.history[train_key], label='Train')
    axis.plot(history.history[val_key], label='Validation')
    axis.set_title(title)
    axis.set_xlabel('Epoch')
    axis.set_ylabel(y_label)
    axis.legend(loc=legend_loc)

# Keep the two panels from overlapping, then render the figure
plt.tight_layout()
plt.show()

In [None]:
# Encode the test text fields exactly like the training ones and join them in the
# same column order (prompt | response_a | response_b)
x_test_prompt, x_test_response_a, x_test_response_b = (
    tokenize_and_pad(test_df[field], tokenizer, max_len)
    for field in ('prompt', 'response_a', 'response_b')
)
x_test = np.concatenate((x_test_prompt, x_test_response_a, x_test_response_b), axis=1)

In [None]:
# Run the model on the test features, then take the index of the highest-scoring
# output per row as the predicted class label
y_test_pred = model.predict(x_test)
y_test_pred_labels = y_test_pred.argmax(axis=1)

In [None]:
# Store the predicted class probabilities in the submission columns.
# The model's softmax outputs are written as-is instead of hard 0/1 indicators: the
# competition is scored with log loss on predicted probabilities, and a 0/1 prediction
# that turns out wrong is penalised at the clipping limit.
# Column order follows the label encoding used for training:
# 0 = model A wins, 1 = model B wins, 2 = tie.
test_df['winner_model_a'] = y_test_pred[:, 0]
test_df['winner_model_b'] = y_test_pred[:, 1]
test_df['winner_tie'] = y_test_pred[:, 2]

In [None]:
# Keep only the id and the three prediction columns, then write them to
# submission.csv without the DataFrame index
submission_columns = ['id', 'winner_model_a', 'winner_model_b', 'winner_tie']
submission_df = test_df[submission_columns]
submission_df.to_csv('submission.csv', index=False)