In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import libraries

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from functools import partial
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModel
from transformers import DebertaV2Tokenizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from keras.preprocessing import sequence as sq

In [None]:
# Read the training CSV into `train`; the trailing expression previews its first five rows.
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
train.head(5)

In [None]:
# Read the test CSV into `test`; the trailing expression previews its first five rows.
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')
test.head(5)

## Preprocessing

In [None]:
def combine_text(df):
    """Flatten the list-encoded text columns of ``df`` into plain strings, in place.

    The ``prompt``, ``response_a`` and ``response_b`` cells are expected to be
    strings shaped like a JSON array of turns, e.g. ``'["hi","there"]'``. Each
    cell is replaced by its turns joined with single spaces.

    Cells are decoded with ``json`` so that escape sequences (``\\n``, ``\\"``,
    ``\\uXXXX``) become the characters they stand for and ``null`` turns become
    empty strings. Cells that are not a valid JSON array fall back to the
    original bracket/quote stripping.

    Args:
        df: DataFrame with ``prompt``, ``response_a`` and ``response_b``
            columns. It is modified in place.

    Returns:
        None.
    """
    import json  # local import keeps this cell self-contained

    def legacy_flatten(input_str):
        # Best-effort parsing for cells that are not valid JSON arrays.
        stripped_str = input_str.strip('[]')
        sentences = [s.strip('"') for s in stripped_str.split('","')]
        return ' '.join(sentences)

    def process(input_str):
        if not isinstance(input_str, str):
            # Missing cell (NaN / None): there is nothing to flatten.
            return ''
        try:
            turns = json.loads(input_str)
        except ValueError:
            return legacy_flatten(input_str)
        if not isinstance(turns, list):
            return legacy_flatten(input_str)
        flat = ' '.join('' if turn is None else str(turn) for turn in turns)
        # Decoded \uD8xx escapes can be lone surrogates, which cannot be
        # encoded as UTF-8 downstream; replace them rather than crash later.
        return flat.encode('utf-8', 'replace').decode('utf-8')

    # Flatten the arrays to single strings
    for column in ('prompt', 'response_a', 'response_b'):
        df[column] = df[column].apply(process)

    # Combine text data (disabled)
    # df['combined_text'] = '[PROMPT] ' + df['prompt'] + ' [RESPONSE_A] ' + df['response_a'] + ' [RESPONSE_B] ' + df['response_b']

In [None]:
# Flatten the prompt/response columns of `train`; combine_text assigns the
# processed columns back onto the frame it is given and returns nothing.
combine_text(train)
# print(train['combined_text'][69])

In [None]:
# Create labels
def create_label(df):
    """Add an integer ``label`` column to ``df`` in place.

    The winner columns are checked in a fixed priority order and the first one
    equal to 1 decides the class:

        winner_model_a -> 0, winner_model_b -> 1, winner_tie -> 2

    Rows where none of the three columns equals 1 receive no class (``None``).

    Args:
        df: DataFrame holding the three winner indicator columns.

    Returns:
        None.
    """
    outcome_classes = (
        ('winner_model_a', 0),
        ('winner_model_b', 1),
        ('winner_tie', 2),
    )

    def pick_class(row):
        for column, class_id in outcome_classes:
            if row[column] == 1:
                return class_id
        return None

    df['label'] = df.apply(pick_class, axis=1)

In [None]:
# Add the `label` column to `train` (create_label writes onto the frame it is
# given), then print the label at index 69 as a spot check.
create_label(train)
print(train['label'][69])

In [None]:
# Report the frame's dimensions and preview its first rows after preprocessing.
print("train.shape", train.shape)
train.head()

## Tokenizer

In [None]:
# Fixed sequence length in tokens: tokenize_df pads/truncates every row to this
# length and the models use it as their input shape.
max_length = 1024

In [None]:
# Load the tokenizer stored at `model_name` through AutoTokenizer
# (currently the Qwen2-7B-Instruct path; the DeBERTa path is kept as an alternative).
model_name = "/kaggle/input/qwen2/transformers/qwen2-7b-instruct/1"
# model_name = "/kaggle/input/deberta-v3/pytorch/large/1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Optionally cap the tokenizer's maximum sequence length (disabled)
# tokenizer.model_max_length = max_length
# Register the marker tokens that tokenize_df looks up by name when the
# tokenizer has no CLS/SEP/PAD id of its own.
tokenizer.add_tokens(['[CLS]', '[SEP]', '[PAD]'], special_tokens=True)

In [None]:
def tokenize_df(df, tokenizer, seq_len=None):
    """Tokenize prompt/response triplets into fixed-length model inputs, in place.

    Every row is encoded as ``[CLS] prompt [SEP] response_a [SEP] response_b``
    and right-padded with the pad token to exactly ``seq_len`` ids. Three
    columns are written onto ``df``, each cell holding a list of ``seq_len``
    ints:

    * ``input_ids``      - token ids.
    * ``token_type_ids`` - 0 for ``[CLS]``, the prompt, the first ``[SEP]`` and
      padding; 1 for response A and the second ``[SEP]``; 2 for response B.
    * ``attention_mask`` - 1 for real tokens, 0 for padding.

    Truncation budget (``seq_len`` minus the three special tokens): the prompt
    may use up to a quarter of it, response A up to half of what the prompt
    left over, and response B everything that is still free.

    Args:
        df: DataFrame with ``prompt``, ``response_a`` and ``response_b`` string
            columns. It is modified in place.
        tokenizer: Hugging Face style tokenizer. If its ``cls_token_id``,
            ``sep_token_id`` or ``pad_token_id`` is ``None`` it is set from the
            ``[CLS]`` / ``[SEP]`` / ``[PAD]`` tokens.
        seq_len: Total output length. Defaults to the notebook-level
            ``max_length`` when omitted.

    Returns:
        None.
    """
    # Fall back to the notebook-wide sequence length when no override is given.
    total_len = max_length if seq_len is None else seq_len

    # Check and set special tokens if they are not present
    if tokenizer.cls_token_id is None:
        tokenizer.cls_token_id = tokenizer.convert_tokens_to_ids('[CLS]')
    if tokenizer.sep_token_id is None:
        tokenizer.sep_token_id = tokenizer.convert_tokens_to_ids('[SEP]')
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('[PAD]')

    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    pad_id = tokenizer.pad_token_id

    def encode(text, limit):
        # A non-positive limit means the budget is exhausted: keep nothing.
        if limit <= 0:
            return []
        # Special tokens are inserted manually below, so the tokenizer must
        # not add its own on top of them.
        return tokenizer(text, truncation=True, max_length=limit,
                         add_special_tokens=False)['input_ids']

    def process(prompt, response_a, response_b):
        budget = total_len - 3  # [CLS] plus two [SEP] separators

        prompt_tokens = encode(prompt, budget // 4)
        remaining = budget - len(prompt_tokens)

        response_a_tokens = encode(response_a, remaining // 2)
        remaining -= len(response_a_tokens)

        # Response B gets all of the space that is left, not just half of it.
        response_b_tokens = encode(response_b, remaining)

        input_ids = ([cls_id] + prompt_tokens + [sep_id]
                     + response_a_tokens + [sep_id] + response_b_tokens)
        token_type_ids = ([0] * (len(prompt_tokens) + 2)
                          + [1] * (len(response_a_tokens) + 1)
                          + [2] * len(response_b_tokens))
        attention_mask = [1] * len(input_ids)

        # Add padding
        padding_length = total_len - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + [pad_id] * padding_length
            token_type_ids = token_type_ids + [0] * padding_length
            attention_mask = attention_mask + [0] * padding_length

        # Only trims when total_len is too small to hold the special tokens.
        return (input_ids[:total_len],
                token_type_ids[:total_len],
                attention_mask[:total_len])

    encoded = [
        process(prompt, response_a, response_b)
        for prompt, response_a, response_b
        in zip(df['prompt'], df['response_a'], df['response_b'])
    ]
    for position, column in enumerate(('input_ids', 'token_type_ids', 'attention_mask')):
        # An object Series keeps each per-row list as a single cell.
        df[column] = pd.Series([row[position] for row in encoded],
                               index=df.index, dtype=object)

In [None]:
# Convert labels to categorical
# One-hot encode the integer `label` column into 3 columns, matching the
# 3-unit softmax output and categorical_crossentropy loss the models are compiled with.
labels = tf.keras.utils.to_categorical(train['label'], num_classes=3)

In [None]:
# Adds the `input_ids`, `token_type_ids` and `attention_mask` columns to `train`.
tokenize_df(train, tokenizer)

In [None]:
# Prepare data for training
input_ids = train['input_ids']
attention_mask = train['attention_mask']

# tokenize_df already pads/truncates every row to max_length, so pad_sequences
# presumably only stacks the per-row lists into 2-D arrays here — TODO confirm.
X_train = sq.pad_sequences(input_ids, maxlen=max_length)
X_train_attention_mask = sq.pad_sequences(attention_mask, maxlen=max_length)

# One-hot targets built earlier from train['label'].
y_train = labels

## Model

In [None]:
from keras.layers import concatenate, Dropout, BatchNormalization, LSTM, Conv1D, Masking

# Define the CNN model
def create_cnn_model(vocab_size, embedding_dim, max_length):
    """Build and compile a 1-D convolutional classifier over token ids.

    Layer stack: integer input of length ``max_length`` -> Embedding ->
    Conv1D(256 filters, kernel 5, ReLU) -> global max pooling -> Dense(128, ReLU)
    -> Dropout(0.5) -> Dense(3, softmax).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_length: Number of token ids per input sequence.

    Returns:
        The model, compiled with Adam, categorical cross-entropy and accuracy.
    """
    model = Sequential()
    model.add(Input(shape=(max_length,), dtype=tf.int32, name='input_ids'))
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    model.add(Conv1D(filters=256, kernel_size=5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define the LSTM model
def create_lstm_model(vocab_size, embedding_dim, max_length):
    """Build and compile an LSTM classifier over token ids.

    Layer stack: integer input of length ``max_length`` -> Embedding ->
    LSTM(256, returning the full sequence) -> global max pooling ->
    Dense(128, ReLU) -> Dropout(0.5) -> Dense(3, softmax).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_length: Number of token ids per input sequence.

    Returns:
        The model, compiled with Adam, categorical cross-entropy and accuracy.
    """
    model = Sequential()
    model.add(Input(shape=(max_length,), dtype=tf.int32, name='input_ids'))
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    model.add(LSTM(256, return_sequences=True))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define the CNN LSTM model
def create_cnn_lstm_model(vocab_size, embedding_dim, max_length):
    """Build and compile a Conv1D + LSTM classifier over token ids.

    Layer stack: integer input of length ``max_length`` -> Embedding ->
    Conv1D(128 filters, kernel 5, ReLU) -> LSTM(128, returning the full
    sequence) -> global max pooling -> Dense(64, ReLU) -> Dropout(0.5) ->
    Dense(3, softmax).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_length: Number of token ids per input sequence.

    Returns:
        The model, compiled with Adam, categorical cross-entropy and accuracy.
    """
    model = Sequential()
    model.add(Input(shape=(max_length,), dtype=tf.int32, name='input_ids'))
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(LSTM(128, return_sequences=True))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Parameters
# The embedding table must cover every id the tokenizer can emit. len(tokenizer)
# includes tokens registered with add_tokens ([CLS]/[SEP]/[PAD] above), whereas
# tokenizer.vocab_size counts only the base vocabulary, so sizing the table
# with vocab_size would leave those ids out of range.
vocab_size = len(tokenizer)
# vocab_size = max_length
embedding_dim = 100
max_features = vocab_size
# max_features = max_length * 2
# Aliases and hyperparameters below are only referenced by the disabled
# experiment cells further down; kept so those cells still run if re-enabled.
max_len = max_length
maxlen = max_len
batch_size = 16
embedding_dims = 100
nb_filter = 150
filter_length = 3
hidden_dims = 100
nb_epoch = 100
# Create the model
# model = create_lstm_model(vocab_size, embedding_dim, max_length)
model = create_cnn_lstm_model(vocab_size, embedding_dim, max_length)
model.summary()

In [None]:
from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Lambda
from keras.layers import Embedding
from keras.layers import Convolution1D, LSTM
from keras.datasets import imdb
from keras import backend as K
from keras.optimizers import Adadelta,Adamax
from keras.preprocessing import sequence as sq

from keras.layers import Dense, Dropout, Activation, Lambda,Input,TimeDistributed,Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint

In [None]:
# from tensorflow.keras.layers import Layer
# from keras.layers import concatenate, Dropout, BatchNormalization, LSTM, Conv1D
# from keras.layers import  GlobalMaxPooling1D
# import tensorflow as tf

# class ApplyAttentionMask(Layer):
#     def call(self, inputs):
#         embeddings, attention_mask = inputs
#         return embeddings * tf.expand_dims(attention_mask, -1)

# input_layer = Input(shape=(max_length,),dtype='int32', name='main_input')
# attention_masks = Input(shape=(max_length,), dtype='float32', name="attention_masks")

# emb_layer = Embedding(max_features,
#                       embedding_dims,
#                       input_length=max_len
#                       )(input_layer)

# masked_embeddings = ApplyAttentionMask(name='apply_attention_mask')([emb_layer, attention_masks])

# # LSTM branch
# lstm_out = LSTM(128, return_sequences=True)(masked_embeddings)
# lstm_out = LSTM(64, return_sequences=True)(lstm_out)
# lstm_out = LSTM(32)(lstm_out)
# lstm_out = BatchNormalization()(lstm_out)
# lstm_out = Dropout(0.5)(lstm_out)
# lstm_out = GlobalMaxPooling1D()(lstm_out)

# # CNN layer branch
# cnn_out = Conv1D(128, 5, activation='relu')(masked_embeddings)
# cnn_out = Conv1D(64, 5, activation='relu')(cnn_out)
# cnn_out = Conv1D(32, 5, activation='relu')(cnn_out)
# cnn_out = BatchNormalization()(cnn_out)
# cnn_out = Dropout(0.5)(cnn_out)
# cnn_out = GlobalMaxPooling1D()(cnn_out)


# # Concatenate LSTM and CNN outputs
# merged = concatenate([lstm_out, cnn_out])
# merged = Dense(32, activation='sigmoid')(merged)
# merged = BatchNormalization()(merged)
# merged = Dropout(0.5)(merged)
# pred = Dense(3, activation='softmax')(merged)


# # Build model
# model = Model(inputs=[input_layer, attention_masks], outputs=[pred])
# adadelta = Adadelta(learning_rate=1.0, rho=0.75, epsilon=1e-06)
# adamax = Adamax(learning_rate=0.001)
# model.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['accuracy'])
# model.summary()

In [None]:
# import tensorflow as tf
# from tensorflow.keras.layers import Input, Conv1D, LSTM, GRU, Dense, Masking
# from tensorflow.keras.models import Model
# from transformers import DebertaTokenizer, AutoModel

# def create_cnn_lstm_hybrid_model(base_model_name, cnn_output_channels, cnn_kernel_size, hidden_dim, num_classes):
#     # Load pre-trained BERT model
#     model = AutoModel.from_pretrained(base_model_name)
    
#     # Define inputs
#     input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
#     attention_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

#     # Get BERT outputs
#     outputs = model(input_ids, attention_mask=attention_mask)
#     seq_output = outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size)

#     # Apply CNN
#     cnn_output = Conv1D(filters=cnn_output_channels, kernel_size=cnn_kernel_size, padding='same', activation='relu')(bert_seq_output)

#     # Apply Masking to handle padded sequences
#     masked_cnn_output = Masking()(cnn_output)
    
#     # Apply LSTM
#     rnn_output = LSTM(hidden_dim)(masked_cnn_output)

#     # Define the classifier layer
#     logits = Dense(num_classes, activation='softmax')(rnn_output)

#     # Create the model
#     model = Model(inputs=[input_ids, attention_mask], outputs=logits)

#     return model

In [None]:
# Define hyperparameters
# bert_model_name = '/kaggle/input/deberta_v3/keras/deberta_v3_large_en/2'
# bert_model_name = '/kaggle/input/deberta-v3/pytorch/large/1'
# bert_model_name = 'deberta_v3_large_en'
# model_name = '/kaggle/input/qwen2/transformers/qwen2-7b-instruct/1'
# cnn_output_channels = 128
# cnn_kernel_size = 5
# hidden_dim = 256
# num_classes = 3

# # Initialize the model
# model = create_cnn_lstm_hybrid_model(model_name, cnn_output_channels, cnn_kernel_size, hidden_dim, num_classes)
# model.summary()

In [None]:
from keras.callbacks import EarlyStopping

# Train the model
# Stop once validation loss has gone 8 consecutive epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=1)

# `model` is a Sequential network with a single `input_ids` input, so it is fed
# the token-id matrix alone; passing [X_train, X_train_attention_mask] would
# hand two inputs to a one-input model and fail.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2,
                    callbacks=[early_stopping])

## Test

In [None]:
# Encode test data
# Same flattening and tokenization as applied to `train`; both helpers write
# their results onto `test` itself.
combine_text(test)
tokenize_df(test, tokenizer)

# NOTE: rebinds the notebook-level `input_ids` / `attention_mask` names that
# previously pointed at the training columns.
input_ids = test['input_ids']
attention_mask = test['attention_mask']

X_test = sq.pad_sequences(input_ids, maxlen=max_length)
X_test_attention_mask = sq.pad_sequences(attention_mask, maxlen=max_length)

In [None]:
# `model` has a single `input_ids` input, so only the token-id matrix is passed;
# a two-element list would be treated as two inputs and rejected.
predictions = model.predict(X_test)
predictions

In [None]:
# Build the submission: one probability column per outcome next to the test ids.
winner = pd.DataFrame(
    data=predictions,
    columns=['winner_model_a', 'winner_model_b', 'winner_tie'],
)
result = pd.concat(objs=[test['id'], winner], axis='columns')
result.to_csv(path_or_buf='submission.csv', index=False)
result