In [None]:
# Reduce the training data to a random 50% sample

import pandas as pd

# Load the competition's train and test CSV files, then keep a random subset
# of the training rows for the rest of the notebook.
train_data_path = '/kaggle/input/lmsys-chatbot-arena/train.csv'  # Update with the correct path
train_data = pd.read_csv(train_data_path)

test_data_path = '/kaggle/input/lmsys-chatbot-arena/test.csv'
test_data = pd.read_csv(test_data_path)

# Fraction of training rows to keep, and the seed that makes the draw repeatable.
SAMPLE_FRACTION = 0.5
SAMPLE_SEED = 42

# Randomly sample 50% of the training data
sampled_train_data = train_data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

# Save the sampled data if needed (the row index is not written to the file)
sampled_train_data_path = '/kaggle/working/sample_train.csv'
sampled_train_data.to_csv(sampled_train_data_path, index=False)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Optional upper bound on the padded sequence length. None pads to the longest
# training sequence (the original behaviour); set an int to cap it, since the
# padded arrays grow with rows x max_length.
MAX_SEQUENCE_LENGTH = None


def combine_prompt_and_response(frame, response_column):
    """Return ``prompt + " " + response`` for every row of ``frame``.

    Missing values are replaced with empty strings first: a NaN in either
    column would otherwise make the combined value NaN, which is not text
    the tokenizer can process.

    Args:
        frame: DataFrame holding a 'prompt' column and ``response_column``.
        response_column: Name of the response column to append to the prompt.

    Returns:
        A Series of combined strings aligned with ``frame``'s index.
    """
    prompt_text = frame['prompt'].fillna('').astype(str)
    response_text = frame[response_column].fillna('').astype(str)
    return prompt_text + " " + response_text


# Combine prompts and responses for feature extraction
sampled_train_data['text_a'] = combine_prompt_and_response(sampled_train_data, 'response_a')
sampled_train_data['text_b'] = combine_prompt_and_response(sampled_train_data, 'response_b')
test_data['text_a'] = combine_prompt_and_response(test_data, 'response_a')
test_data['text_b'] = combine_prompt_and_response(test_data, 'response_b')

# Initialize the tokenizer and build its vocabulary from every combined text
# (train and test, both responses)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([
    sampled_train_data['text_a'],
    sampled_train_data['text_b'],
    test_data['text_a'],
    test_data['text_b'],
]))

# Convert texts to sequences of integer word indices
X_train_a = tokenizer.texts_to_sequences(sampled_train_data['text_a'])
X_train_b = tokenizer.texts_to_sequences(sampled_train_data['text_b'])
X_test_a = tokenizer.texts_to_sequences(test_data['text_a'])
X_test_b = tokenizer.texts_to_sequences(test_data['text_b'])

# Pad sequences to ensure equal length: use the longest training sequence,
# optionally capped by MAX_SEQUENCE_LENGTH. Test sequences use the same length.
longest_train_sequence = max(max(len(seq) for seq in X_train_a), max(len(seq) for seq in X_train_b))
if MAX_SEQUENCE_LENGTH is None:
    max_length = longest_train_sequence
else:
    max_length = min(longest_train_sequence, MAX_SEQUENCE_LENGTH)
X_train_a = pad_sequences(X_train_a, maxlen=max_length, padding='post')
X_train_b = pad_sequences(X_train_b, maxlen=max_length, padding='post')
X_test_a = pad_sequences(X_test_a, maxlen=max_length, padding='post')
X_test_b = pad_sequences(X_test_b, maxlen=max_length, padding='post')

# Extract targets: one binary label column per model
y_train_a = sampled_train_data['winner_model_a']
y_train_b = sampled_train_data['winner_model_b']

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the LSTM model
def create_lstm_model(input_length, vocab_size=None):
    """Build and compile a stacked-LSTM binary classifier over token-id sequences.

    Layers, in order: Embedding (128-dim) -> LSTM(128, returning the full
    sequence) -> Dropout(0.5) -> LSTM(128) -> Dense(1, sigmoid). The model is
    compiled with the Adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    Args:
        input_length: Number of token ids in each padded input sequence.
        vocab_size: Number of rows in the embedding table, including index 0.
            When omitted it is derived from the notebook-level ``tokenizer``
            as ``len(tokenizer.word_index) + 1``.

    Returns:
        The compiled (untrained) Keras ``Sequential`` model.
    """
    if vocab_size is None:
        # +1 leaves room for index 0, which the padded sequences use as filler.
        vocab_size = len(tokenizer.word_index) + 1

    model = Sequential()
    # mask_zero=True marks id 0 as padding so the downstream LSTMs skip the
    # padded timesteps instead of folding them into their final state.
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=input_length, mask_zero=True))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(128))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# All padded sequences share one length; read it from the training matrix.
input_length = X_train_a.shape[1]

# Train model for text_a
# validation_split is the fraction of rows held out for validation, so 0.3
# trains on 70% of the sample (the previous 0.7 trained on only 30%).
model_a = create_lstm_model(input_length)
model_a.fit(X_train_a, y_train_a, epochs=5, batch_size=64, validation_split=0.3)

In [None]:
# Train model for text_b
# validation_split is the fraction of rows held out for validation, so 0.3
# trains on 70% of the sample (the previous 0.7 trained on only 30%).
model_b = create_lstm_model(input_length)
model_b.fit(X_train_b, y_train_b, epochs=5, batch_size=64, validation_split=0.3)

In [None]:
import numpy as np

# Make predictions on the test set: each model emits one sigmoid score per row
raw_pred_a = model_a.predict(X_test_a).flatten()
raw_pred_b = model_b.predict(X_test_b).flatten()

# Score for a tie (assuming a constant 1/3 for simplicity)
raw_pred_tie = np.full(raw_pred_a.shape, 1/3)

# The two models are trained independently, so the three scores of a row do
# not sum to 1 on their own. Rescale each row into a probability distribution.
# The row total is always >= 1/3, so the division is safe.
row_totals = raw_pred_a + raw_pred_b + raw_pred_tie
test_pred_a = raw_pred_a / row_totals
test_pred_b = raw_pred_b / row_totals
test_pred_tie = raw_pred_tie / row_totals

In [None]:
# Where the submission CSV is written
submission_path = '/kaggle/working/submission.csv'

# Assemble the submission table: the test row id followed by one column per
# outcome, in that order
submission_columns = {}
submission_columns['id'] = test_data['id']
submission_columns['winner_model_a'] = test_pred_a
submission_columns['winner_model_b'] = test_pred_b
submission_columns['winner_tie'] = test_pred_tie
submission = pd.DataFrame(submission_columns)

# Write the table without the DataFrame index and report the location
submission.to_csv(submission_path, index=False)
print(f"Submission file saved to {submission_path}")