In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test CSV files from the lmsys-chatbot-arena input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lmsys-chatbot-arena/train.csv',
        '/kaggle/input/lmsys-chatbot-arena/test.csv',
    )
)

In [None]:
import tensorflow as tf
# TODO: detect and init the TPU (this cell currently contains no TPU setup code)



## Import libraries

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import transformers 
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


In [None]:
# Load the BERT tokenizer from the files under /kaggle/input/bert-base-uncased.
bert_checkpoint_dir = '/kaggle/input/bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint_dir)

In [None]:
from transformers import TFBertModel

# Load the TensorFlow BERT encoder from the same local directory the tokenizer uses.
bert_checkpoint_dir = '/kaggle/input/bert-base-uncased'
bert_model = TFBertModel.from_pretrained(bert_checkpoint_dir)


In [None]:
# Collapse the one-hot outcome columns into a single integer class label per row:
#   0 -> `winner_model_a` is 1
#   1 -> `winner_model_b` is 1
#   2 -> neither flag is set
# np.select checks the conditions in order, so a row with both flags set still maps
# to 0, matching the priority of an if/elif chain. The whole column is computed in
# one vectorized pass instead of one `.loc` read/write per row, which is very slow
# on a large frame. The resulting labels are integers.
train['winner'] = np.select(
    [
        (train['winner_model_a'] == 1).to_numpy(),
        (train['winner_model_b'] == 1).to_numpy(),
    ],
    [0, 1],
    default=2,
)

In [None]:
# Columns kept for modelling: the three text fields plus the integer `winner` label.
features = [
    'prompt',
    'response_a',
    'response_b',
    'winner',
]

In [None]:
# Restrict the training frame to the columns listed in `features`.
train_data = train[features]

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# Both frames still contain the `winner` label column alongside the text columns.
X_train, X_val = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Tokenize function
def tokenize_function(df, max_length=128, text_tokenizer=None):
    """Tokenize the prompt and both responses of every row in ``df``.

    Each of the three text columns is tokenized independently with identical
    settings: padded to ``max_length``, truncated to ``max_length``, and
    returned as TensorFlow tensors.

    Args:
        df: DataFrame with ``'prompt'``, ``'response_a'`` and ``'response_b'``
            columns.
        max_length: Sequence length every example is padded/truncated to.
            Defaults to 128, the length previously hard-coded here.
        text_tokenizer: Callable used to encode a list of texts. When ``None``
            (the default) the notebook-level ``tokenizer`` is used.

    Returns:
        A 3-tuple ``(prompt_encodings, response_a_encodings,
        response_b_encodings)`` holding the tokenizer output for each column,
        in that order.
    """
    # Resolve the tokenizer lazily so the global is only needed when no
    # explicit tokenizer is passed in.
    encode = text_tokenizer if text_tokenizer is not None else tokenizer
    return tuple(
        encode(
            df[column].tolist(),
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='tf',
        )
        for column in ('prompt', 'response_a', 'response_b')
    )


# Tokenize the training split, then the validation split, with the same settings.
(
    train_prompt_encodings,
    train_response_a_encodings,
    train_response_b_encodings,
) = tokenize_function(X_train)
(
    val_prompt_encodings,
    val_response_a_encodings,
    val_response_b_encodings,
) = tokenize_function(X_val)


In [None]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical 

# One-hot encode the integer `winner` labels into 3-class target vectors.
train_labels = to_categorical(X_train['winner'].tolist(), num_classes=3)
val_labels = to_categorical(X_val['winner'].tolist(), num_classes=3)


def _model_features(prompt_enc, response_a_enc, response_b_enc):
    """Arrange three tokenizer outputs under the feature keys the model's inputs are named after."""
    return {
        'input_ids_prompt': prompt_enc['input_ids'],
        'attention_mask_prompt': prompt_enc['attention_mask'],
        'input_ids_response_a': response_a_enc['input_ids'],
        'attention_mask_response_a': response_a_enc['attention_mask'],
        'input_ids_response_b': response_b_enc['input_ids'],
        'attention_mask_response_b': response_b_enc['attention_mask'],
    }


# Training pipeline: (features, label) pairs, shuffled through a 1000-element
# buffer and emitted one example per batch.
train_features = _model_features(
    train_prompt_encodings,
    train_response_a_encodings,
    train_response_b_encodings,
)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
train_dataset = train_dataset.shuffle(1000).batch(1)

# Validation pipeline: same structure, kept in order (no shuffling).
val_features = _model_features(
    val_prompt_encodings,
    val_response_a_encodings,
    val_response_b_encodings,
)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))
val_dataset = val_dataset.batch(1)


In [None]:

def _token_input(name):
    """Create a length-128 int32 Keras Input placeholder with the given layer name."""
    return tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name=name)


# One (input_ids, attention_mask) pair of placeholders per text field. The layer
# names match the keys of the feature dictionaries fed to the model.
input_ids_prompt = _token_input("input_ids_prompt")
attention_mask_prompt = _token_input("attention_mask_prompt")

input_ids_response_a = _token_input("input_ids_response_a")
attention_mask_response_a = _token_input("attention_mask_response_a")

input_ids_response_b = _token_input("input_ids_response_b")
attention_mask_response_b = _token_input("attention_mask_response_b")


In [None]:
import tensorflow as tf
from transformers import TFBertModel

class BertEmbeddingLayer(tf.keras.layers.Layer):
    """Keras layer that runs the shared BERT encoder and returns one vector per sequence.

    The layer wraps the notebook-level ``bert_model`` so the same encoder can be
    applied to several inputs inside a functional Keras model.
    """

    def __init__(self, bert_model_name='bert-base-uncased', **kwargs):
        """Set up the layer.

        Args:
            bert_model_name: Accepted for interface compatibility but not used;
                the encoder is always the already-loaded global ``bert_model``.
            **kwargs: Forwarded unchanged to ``tf.keras.layers.Layer``.
        """
        super().__init__(**kwargs)
        self.bert = bert_model

    def call(self, inputs):
        """Encode one batch.

        Args:
            inputs: A two-element sequence ``(input_ids, attention_mask)``.

        Returns:
            The encoder's last hidden state at sequence position 0 for each
            example (the CLS token embedding).
        """
        token_ids, mask = inputs
        encoded = self.bert(input_ids=token_ids, attention_mask=mask)
        hidden_states = encoded.last_hidden_state
        return hidden_states[:, 0, :]


# Single layer instance, reused for the prompt and both responses.
bert_layer = BertEmbeddingLayer()


In [None]:

# Run the shared BERT layer over each (input_ids, attention_mask) pair, in the
# order prompt, response A, response B.
prompt_embeddings, response_a_embeddings, response_b_embeddings = (
    bert_layer([token_ids, mask])
    for token_ids, mask in (
        (input_ids_prompt, attention_mask_prompt),
        (input_ids_response_a, attention_mask_response_a),
        (input_ids_response_b, attention_mask_response_b),
    )
)

# Join the three embeddings into one feature vector per example.
embedding_parts = [prompt_embeddings, response_a_embeddings, response_b_embeddings]
combined_embeddings = tf.keras.layers.Concatenate()(embedding_parts)


In [None]:

# Classification head: 256-unit ReLU layer -> 20% dropout -> 3-way softmax.
dense_layer = tf.keras.layers.Dense(units=256, activation='relu')(combined_embeddings)
dropout_layer = tf.keras.layers.Dropout(rate=0.2)(dense_layer)
output_layer = tf.keras.layers.Dense(units=3, activation='softmax')(dropout_layer)

# Assemble the model from the six token inputs and the softmax output.
model_inputs = [
    input_ids_prompt,
    attention_mask_prompt,
    input_ids_response_a,
    attention_mask_response_a,
    input_ids_response_b,
    attention_mask_response_b,
]
model = tf.keras.Model(inputs=model_inputs, outputs=output_layer)

# Adam with a 2e-5 learning rate; categorical cross-entropy pairs with the
# one-hot labels produced by `to_categorical`.
adam = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:

# Train for a single epoch, evaluating on the validation dataset at the end of it.
history = model.fit(train_dataset, epochs=1, validation_data=val_dataset)


In [None]:
def encode_text(texts, max_length):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Args:
        texts: The texts to encode, passed straight to the tokenizer.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        The tokenizer's output, with TensorFlow tensors.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf',
    )
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Preview only the first rows of the test frame instead of rendering all of it.
test.head()

In [None]:
# Sequence length used for the test set; matches the 128-token model inputs.
max_length = 128

# NOTE(review): these three names rebind the Keras Input placeholders defined when
# the model was built, and each now holds a full tokenizer output (input ids AND
# attention mask), not just input ids. Re-running the model-building cells after
# this one would pick up the wrong objects.
input_ids_prompt, input_ids_response_a, input_ids_response_b = (
    encode_text(test[column].tolist(), max_length)
    for column in ('prompt', 'response_a', 'response_b')
)

In [None]:
# Feed the test encodings to the model under the same keys as its named inputs.
test_inputs = {
    'input_ids_prompt': input_ids_prompt['input_ids'],
    'attention_mask_prompt': input_ids_prompt['attention_mask'],
    'input_ids_response_a': input_ids_response_a['input_ids'],
    'attention_mask_response_a': input_ids_response_a['attention_mask'],
    'input_ids_response_b': input_ids_response_b['input_ids'],
    'attention_mask_response_b': input_ids_response_b['attention_mask'],
}
predictions = model.predict(test_inputs)

In [None]:
# Progress marker: reaching this line means the prediction cell above has finished.
print("done till here")

In [None]:
# Build the submission table: one row per test example, one probability column per class.
#
# `model.predict` yields one row per example and one column per class (the model
# ends in a 3-unit softmax), so each output column must be sliced along axis 1.
# Indexing with `predictions[k]` would instead pick the k-th *example's* three
# probabilities, which gives transposed values when there are exactly three test
# rows and a length-mismatch error otherwise.
#
# Column order follows the label encoding: 0 = model A, 1 = model B, 2 = tie.
class_probabilities = np.asarray(predictions)
results = pd.DataFrame({
    # Key column named `id`, as in test.csv.
    # NOTE(review): confirm against the competition's sample_submission.csv.
    'id': test['id'],
    'winner_model_a': class_probabilities[:, 0],
    'winner_model_b': class_probabilities[:, 1],
    'winner_tie': class_probabilities[:, 2],
})

In [None]:
# Write the submission file to the Kaggle working directory without the index column.
submission_path = '/kaggle/working/submission.csv'
results.to_csv(submission_path, index=False)