In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the input directory,
# in the order os.walk visits them, and print one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
# Hide all CUDA devices from this process so the notebook runs on CPU only.
# NOTE: this assignment is deliberately placed before `import tensorflow`
# further down; keep that order when rearranging the imports.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # Disable GPU

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

# Load the dataset
train_data = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
test_data = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

# Build one document per (prompt, response) pair.
# Missing cells are replaced with '' first: concatenating a string with NaN
# yields NaN, and the vectorizer cannot process a NaN document.
for side in ('a', 'b'):
    train_data[f'text_{side}'] = (
        train_data['prompt'].fillna('') + ' ' + train_data[f'response_{side}'].fillna('')
    )

# Target variable (one column per outcome: A wins, B wins, tie)
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
y = train_data[target_columns].values

# Split the rows BEFORE fitting any transformer, so the validation rows never
# influence the vocabulary or IDF weights. Same test_size and random_state as
# before, so the row partition is unchanged.
train_rows, val_rows = train_test_split(train_data, test_size=0.2, random_state=42)
y_train = train_rows[target_columns].values
y_val = val_rows[target_columns].values

# Vectorize the text data using TF-IDF.
# The vectorizer is fitted on BOTH responses of the training rows, so the
# vocabulary and IDF weights reflect text_b as well as text_a.
# float32 halves the size of the dense matrices built below.
tfidf = TfidfVectorizer(max_features=5000, dtype=np.float32)
tfidf.fit(pd.concat([train_rows['text_a'], train_rows['text_b']], ignore_index=True))


def _tfidf_pair_features(rows):
    """Return the dense ``[tfidf(text_a) | tfidf(text_b)]`` matrix for ``rows``.

    ``rows`` must have ``text_a`` and ``text_b`` columns. The result has one
    row per input row and twice the vectorizer's vocabulary size in columns.
    """
    return np.hstack([
        tfidf.transform(rows['text_a']).toarray(),
        tfidf.transform(rows['text_b']).toarray(),
    ])


# Scale the features; the scaler statistics come from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(_tfidf_pair_features(train_rows))
X_val = scaler.transform(_tfidf_pair_features(val_rows))

# Define the model using Functional API

# Seed the Python, NumPy and TensorFlow random generators so the random draws
# used for weight initialisation, dropout and batch shuffling start from a
# fixed state instead of differing on every run.
tf.keras.utils.set_random_seed(42)

input_layer = Input(shape=(X_train.shape[1],))

# Three Dense(relu) -> Dropout(0.5) stages of decreasing width.
x = input_layer
for units in (512, 256, 128):
    x = Dense(units, activation='relu')(x)
    x = Dropout(0.5)(x)

# Three-way softmax: one probability per target column.
output_layer = Dense(3, activation='softmax')(x)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the network, monitoring the held-out split after every epoch.
fit_options = dict(
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
)
history = model.fit(X_train, y_train, **fit_options)

# Score the trained network on the validation split with log loss.
val_predictions = model.predict(X_val)
loss = log_loss(y_val, val_predictions)
print(f'Validation Log Loss: {loss}')

# Prepare the test data
# Documents are built the same way as for training. Missing cells are replaced
# with '' first: concatenating a string with NaN yields NaN, and the
# vectorizer cannot process a NaN document.
for side in ('a', 'b'):
    test_data[f'text_{side}'] = (
        test_data['prompt'].fillna('') + ' ' + test_data[f'response_{side}'].fillna('')
    )

# Encode both responses with the already-fitted vectorizer, place them side by
# side, and apply the already-fitted scaler.
X_test_a = tfidf.transform(test_data['text_a']).toarray()
X_test_b = tfidf.transform(test_data['text_b']).toarray()
X_test = np.hstack([X_test_a, X_test_b])
X_test = scaler.transform(X_test)

# Predict on the test set
test_predictions = model.predict(X_test)

# Assemble the submission table in one step: the row id followed by one
# probability column per outcome, then write it without the index column.
submission = pd.DataFrame({
    'id': test_data['id'],
    'winner_model_a': test_predictions[:, 0],
    'winner_model_b': test_predictions[:, 1],
    'winner_tie': test_predictions[:, 2],
})
submission.to_csv('submission.csv', index=False)
