# Import libs 

In [None]:
import os
import gc
import re
from time import time
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import transformers
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification
import torch.nn.functional as F

# Tokenizer

In [None]:
from transformers import BertTokenizer
# Load the BERT tokenizer from the local Kaggle input directory.
tokenizer = BertTokenizer.from_pretrained('/kaggle/input/bert/tensorflow2/bert-en-uncased-l-10-h-128-a-2/2')

# BERT tokenizers ship with their own pad token and define no EOS token, so
# copying eos_token unconditionally would overwrite the pad token with None.
# Only reuse EOS as padding when an EOS token actually exists.
if tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
# NOTE(review): add_eos_token looks like a Llama-tokenizer option; it
# presumably has no effect on BertTokenizer - confirm.
tokenizer.add_eos_token = True
# save tokenizer to load offline during inference
tokenizer.save_pretrained('tokenizer')

In [None]:
# Utility function giving token length
def get_token_lengths(texts):
    """Return how many token ids the global ``tokenizer`` produces per text.

    Args:
        texts: Iterable of strings (for example a pandas Series).

    Returns:
        list[int]: One token count per input text, in input order. An empty
        input yields an empty list.
    """
    text_list = list(texts)
    if not text_list:
        return []
    # No return_tensors: the texts are tokenized without padding, so the id
    # sequences differ in length. Plain Python lists are enough to measure
    # them and avoid building a ragged array.
    input_ids = tokenizer(text_list)['input_ids']
    # return length of input_ids for each text
    return [len(ids) for ids in input_ids]

# Prepare train


In [None]:
# Load the training split from the Kaggle input directory
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
def process(input_str):
    """Flatten a serialized list of quoted strings into one line of text.

    Leading and trailing square brackets are removed, the remainder is split
    on the ``","`` separator, surrounding double quotes are stripped from
    each piece, and the pieces are joined with single spaces.
    """
    inner = input_str.strip('[]')
    pieces = []
    for chunk in inner.split('","'):
        pieces.append(chunk.strip('"'))
    return ' '.join(pieces)

# Flatten the serialized prompt/response lists into plain text
for text_column in ('prompt', 'response_a', 'response_b'):
    train.loc[:, text_column] = train[text_column].apply(process)

# Drop rows where both responses are the literal string 'null'
both_null = (train.response_a == 'null') & (train.response_b == 'null')
indexes = train[both_null].index
train.drop(indexes, inplace=True)
train.reset_index(inplace=True, drop=True)

print(f"Total {len(indexes)} Null response rows dropped")
print('Total train samples: ', len(train))

In [None]:
# Preview the first five rows of the training frame
train.head(5)

In [None]:
# Assemble the single text per row: the prompt followed by both responses
train['text'] = (
    'User prompt: ' + train['prompt']
    + '\n\nModel A :\n' + train['response_a']
    + '\n\n--------\n\nModel B:\n' + train['response_b']
)
# Show one assembled example
print(train['text'][4])

In [None]:
# Fraction of the prepared rows to train on; 1 keeps the full dataset.
TRAIN_FRACTION = 1
# Copy the slice so the column assignments below act on an independent frame
# rather than a slice of the original (avoids pandas' SettingWithCopyWarning).
train = train[:int(len(train) * TRAIN_FRACTION)].copy()

# Token count of each assembled text
train.loc[:, 'token_count'] = get_token_lengths(train['text'])

# prepare label for model: position of the largest of the three winner
# columns (0 = model A, 1 = model B, 2 = tie)
train.loc[:, 'label'] = np.argmax(train[['winner_model_a','winner_model_b','winner_tie']].values, axis=1)

# Display data
display(train.head())

In [None]:
# Number of rows per label value
train.label.value_counts()

In [None]:
# Summary statistics of the per-row token counts, cast to int for display
display(train['token_count'].describe().to_frame().astype(int))

In [None]:
# 90th percentile of the token counts, i.e. the length that covers 90% of the
# rows. The tokenization in this notebook still uses a fixed limit of 1024.
np.percentile(train['token_count'], 90)

# Tokenize

In [None]:
# Targets: the three winner indicator columns of each row
LABELS = train[['winner_model_a','winner_model_b','winner_tie']].values

# Tokenize every training text, truncated to at most 1024 tokens.
# No padding is requested here, so sequences keep their own lengths.
train_texts = train['text'].tolist()
tokens = tokenizer(train_texts, max_length=1024, truncation=True, return_tensors='np')

# Token ids and the matching attention masks
INPUT_IDS = tokens['input_ids']
ATTENTION_MASKS = tokens['attention_mask']

print(f'INPUT_IDS shape: {INPUT_IDS.shape}, ATTENTION_MASKS shape: {ATTENTION_MASKS.shape}')
print(f'LABELS shape: {LABELS.shape}')

In [None]:
# Number of rows in the embedding tables. It must exceed the largest token id
# the tokenizer can emit; a smaller table leaves some ids without an embedding
# row. The previous hard-coded 21540 is kept as a lower bound.
max_features = max(21540, len(tokenizer))
# Fixed sequence length every example is padded to
maxlen = 1024
# Mini-batch size
batch_size = 16
# Width of each token embedding vector
embedding_dims = 200
# Number of filters in each convolution
nb_filter = 150
# NOTE(review): filter_length is not referenced in the visible notebook; the
# convolution kernel sizes are written inline in the model cell.
filter_length = 3
# Base width of the dense layer (the model cell uses hidden_dims * 2)
hidden_dims = 100
# Number of training epochs
nb_epoch = 14

In [None]:
from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Lambda
from keras.layers import Embedding
from keras.layers import Convolution1D, LSTM
from keras.datasets import imdb
from keras import backend as K
from keras.optimizers import Adadelta
from keras.preprocessing import sequence as sq

from keras.layers import Dense, Dropout, Activation, Lambda,Input,TimeDistributed,Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint

from tensorflow.python.keras.backend import set_session as K
# Labels are used as-is
y_train = LABELS
# Bring every id sequence to exactly maxlen entries; padding and truncation
# both act on the front of the sequence (the Keras defaults, spelled out)
X_train = sq.pad_sequences(INPUT_IDS, maxlen=maxlen, padding='pre', truncating='pre')

In [None]:
# Materialise inputs and labels as NumPy arrays for model.fit
X_train, y_train = np.array(X_train), np.array(y_train)

# Define Model

In [None]:
# Start of the classifier graph: a shared token-id input and the embedding
# that feeds the convolutional layers defined further down this cell.
from keras.layers import Concatenate
from keras.layers import  GlobalMaxPooling1D

# One row of maxlen token ids per example
input_layer = Input(shape=(maxlen,),dtype='int64', name='main_input')
# Embedding lookup: max_features rows, embedding_dims values per token
emb_layer = Embedding(max_features,
                      embedding_dims,
                      input_length=maxlen
                      )(input_layer)
def max_1d(X):
    """Return ``K.max(X, axis=1)``, the maximum of ``X`` along axis 1.

    NOTE(review): an import earlier in this file rebinds ``K`` to
    ``set_session`` (``from tensorflow.python.keras.backend import
    set_session as K``), replacing the Keras backend alias, so calling this
    helper would presumably fail on ``K.max``. No call to it is visible in
    this file - confirm before relying on it.
    """
    return K.max(X, axis=1)

# Convolution branch 1: nb_filter filters over windows of 3 tokens,
# followed by a global max over the sequence.

con3_layer = Convolution1D(filters=nb_filter,
                    padding='valid',
                    activation='relu',
                    kernel_size =3,
                    strides=1)(emb_layer)

pool_con3_layer = GlobalMaxPooling1D()(con3_layer)


# Convolution branch 2: nb_filter filters over windows of 5 tokens.
# (The variable is named con4_layer, but the kernel size is 5.)

con4_layer = Convolution1D(filters=nb_filter,
                    kernel_size=5,
                    padding='valid',
                    activation='relu',
                    strides=1)(emb_layer)

pool_con4_layer = GlobalMaxPooling1D()(con4_layer)


# Convolution branch 3: nb_filter filters over windows of 7 tokens.
# (The variable is named con5_layer, but the kernel size is 7.)

con5_layer = Convolution1D(filters=nb_filter,
                    kernel_size=7,
                    padding='valid',
                    activation='relu',
                    strides=1)(emb_layer)

pool_con5_layer = GlobalMaxPooling1D()(con5_layer)


# Join the pooled convolution features; the order is kernel 3, kernel 7, kernel 5.
cnn_layer =Concatenate()([pool_con3_layer, pool_con5_layer, pool_con4_layer])


# LSTM branch: reads the same input through its own, separate Embedding layer


x = Embedding(max_features, embedding_dims, input_length=maxlen)(input_layer)
lstm_layer = LSTM(128)(x)

# Combine the LSTM output with the convolution features
cnn_lstm_layer = Concatenate()([lstm_layer, cnn_layer])

# Classification head: sigmoid dense layer, dropout, then a 3-way softmax
dense_layer = Dense(hidden_dims*2, activation='sigmoid')(cnn_lstm_layer)
output_layer= Dropout(0.2)(dense_layer)
output_layer = Dense(3, trainable=True,activation='softmax')(output_layer)




model = Model(inputs=[input_layer], outputs=[output_layer])
# NOTE(review): this Adadelta instance is not passed to compile() below,
# which selects the optimizer by the string "adamax".
adadelta = Adadelta(learning_rate=1.0, rho=0.95, epsilon=1e-06)

model.compile(loss='categorical_crossentropy',
              optimizer="adamax",
              metrics=['accuracy'])
model.summary()


# Training

In [None]:
# Fraction of the training rows held out for validation. 0.0 keeps the
# original behaviour of fitting on every row.
validation_split = 0.0

# The model is compiled with the 'accuracy' metric, which Keras logs as
# 'accuracy' during training and as 'val_accuracy' only when validation data
# is supplied. Monitoring a key that is never logged means the checkpoint is
# never written, so pick the key that will actually exist.
checkpoint_monitor = 'val_accuracy' if validation_split > 0 else 'accuracy'
checkpoint = ModelCheckpoint('CNN-LSTM-weights/weights.keras',
                             monitor=checkpoint_monitor, verbose=0,
                             save_best_only=True, mode='max')

# Keep the History object so the cell does not end on a bare expression
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=nb_epoch,
                    validation_split=validation_split,
                    callbacks=[checkpoint])

In [None]:
model.save('model_LSTM_mix_CNN.keras')  # Save the entire model

In [None]:
# Run the model over the training inputs and show the predicted class probabilities
model.predict(X_train)

# Test Model


In [None]:
# Load the test split from the Kaggle input directory
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

# Flatten the serialized prompt/response lists into plain text
test.loc[:, 'prompt'] = test['prompt'].apply(process)
test.loc[:, 'response_a'] = test['response_a'].apply(process)
test.loc[:, 'response_b'] = test['response_b'].apply(process)

# Every test id needs a row in the submission, so rows with two 'null'
# responses are only counted here and are NOT dropped.
indexes = test[(test.response_a == 'null') & (test.response_b == 'null')].index

print(f"Total {len(indexes)} Null response rows found (kept for prediction)")
print('Total test samples: ', len(test))

In [None]:
# Preview the first rows of the test frame
test.head()

In [None]:
# Assemble the model input in the same format as the training text. All three
# parts must come from `test`; taking response_b from `train` would pair each
# test prompt with an unrelated training response.
test['text'] = 'User prompt: ' + test['prompt'] +  '\n\nModel A :\n' + test['response_a'] +'\n\n--------\n\nModel B:\n'  + test['response_b']
print(test['text'])

In [None]:
# Tokenize the test texts, truncated to at most 1024 tokens
test_texts = test['text'].tolist()
tokens_test = tokenizer(test_texts, max_length=1024, truncation=True, return_tensors='np')

# Token ids and the matching attention masks
INPUT_test = tokens_test['input_ids']
ATTENTION_MASKS2 = tokens_test['attention_mask']

print(f'INPUT_IDS shape: {INPUT_test.shape}, ATTENTION_MASKS shape: {ATTENTION_MASKS2.shape}')

In [None]:
# Bring every test id sequence to exactly maxlen entries; padding and
# truncation both act on the front (the Keras defaults, spelled out)
X_test = sq.pad_sequences(INPUT_test, maxlen=maxlen, padding='pre', truncating='pre')

In [None]:
# Display the test frame
test

In [None]:
# Predict the three class probabilities for every padded test sequence
y_predict = model.predict(X_test)
# Show the raw prediction array
y_predict

In [None]:
# Name the three predicted probability columns
prediction_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
winner_df = pd.DataFrame(data=y_predict, columns=prediction_columns)
# Put the test ids next to the predictions, aligned on the row index
result_df = pd.concat(objs=[test['id'], winner_df], axis=1)

In [None]:
# Write the submission file without the index column
result_df.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame
result_df

# Conclusion 

There is still a lot of room to speed up and optimize training! Try more data, different batch sizes, different learning rates, and so on. All the best!