In [None]:
import os
import tensorflow as tf
from datasets import load_dataset, DatasetDict
from transformers import BertTokenizer, BertTokenizerFast, TFBertModel, DataCollatorWithPadding, TFAutoModel
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D, Lambda, Layer, Input, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Model
import shutil
import pandas as pd
from tqdm.keras import TqdmCallback
import re
import math
import matplotlib.pyplot as plt
import multiprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.callbacks import LearningRateScheduler
import numpy as np
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

In [None]:
# Look up the physical GPUs and, when any are present, switch each one to
# memory-growth mode before reporting the physical/logical device counts.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        physical_count = len(gpus)
        logical_count = len(logical_gpus)
        print(physical_count, "Physical GPUs,", logical_count, "Logical GPUs")
    except RuntimeError as e:
        # Report the configuration error instead of aborting the notebook.
        print(e)

In [None]:
# Detect hardware and build the matching tf.distribute strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raises ValueError when no TPU can be found.
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Prefer the stable TPUStrategy; fall back to the deprecated experimental
    # alias only on TensorFlow versions that do not provide the stable name.
    tpu_strategy_cls = getattr(tf.distribute, 'TPUStrategy', None)
    if tpu_strategy_cls is None:
        tpu_strategy_cls = tf.distribute.experimental.TPUStrategy
    strategy = tpu_strategy_cls(tpu)
else:
    # No TPU: MirroredStrategy replicates over the GPUs visible on this
    # machine and also works on CPU or a single GPU.
    strategy = tf.distribute.MirroredStrategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Locations of the competition CSV files
train_path = '/kaggle/input/lmsys-chatbot-arena/train.csv'
test_path = '/kaggle/input/lmsys-chatbot-arena/test.csv'

# Read each CSV through the Hugging Face `csv` loader and pick its only split
train_splits = load_dataset('csv', data_files={'train': train_path})
train_dataset = train_splits['train']
test_splits = load_dataset('csv', data_files={'test': test_path})
test_dataset = test_splits['test']

# Keep the test ids so predictions can be matched to rows in the submission
test_ids = test_dataset['id']

In [None]:
# Give the test set empty-string placeholders for every column it lacks,
# so both datasets expose the same column names.
for col in ['model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie']:
    if col in test_dataset.column_names:
        continue
    test_dataset = test_dataset.add_column(col, [""] * len(test_dataset))

# Cast the three winner columns to int; missing values (None in train,
# the "" placeholder in test) become 0.
for col in ['winner_model_a', 'winner_model_b', 'winner_tie']:
    train_dataset = train_dataset.map(lambda x: {col: 0 if x[col] is None else int(x[col])})
    test_dataset = test_dataset.map(lambda x: {col: 0 if x[col] == "" else int(x[col])})

In [None]:
# Stage the bert-base-cased files from the read-only Kaggle input directory
# into the working directory so they can be loaded locally.
source_dir = '/kaggle/input/huggingface-bert/bert-base-cased'

model_dir = '/kaggle/working/bert-base-cased'
os.makedirs(model_dir, exist_ok=True)

# Copied one by one, in this order; a missing file raises immediately.
for filename in (
    'config.json',
    'pytorch_model.bin',
    'tf_model.h5',
    'tokenizer.json',
    'vocab.txt',
    'modelcard.json',
):
    shutil.copy(os.path.join(source_dir, filename), model_dir)

In [None]:
stopwords_path = '/kaggle/input/stopwords/stopwords/english'

# Function for loading stop words from a file
def load_stopwords(stopwords_path):
    """Read a stop-word file (one word per line) into a set.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding. Surrounding whitespace is stripped
    and blank lines are skipped, because an empty or padded entry can never
    equal a token produced by ``str.split()``.

    Args:
        stopwords_path: Path to the text file containing the stop words.

    Returns:
        A set of non-empty stop-word strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(stopwords_path, 'r', encoding='utf-8') as file:
        lines = file.read().splitlines()
    return {word.strip() for word in lines if word.strip()}

In [None]:
# Read the English stop-word set from the local file
stopwords = load_stopwords(stopwords_path)
# Build the fast BERT tokenizer from the locally staged model directory
tokenizer = BertTokenizerFast.from_pretrained(model_dir)
# Function for text cleaning
def clean_text(text):
    """Normalize one text field before tokenization.

    Steps, in order: lowercase, remove URLs, remove @mentions and '#'
    characters, remove every character that is not an ASCII letter, digit or
    whitespace, then drop words found in the module-level ``stopwords`` set.

    Args:
        text: The raw text. ``None`` (e.g. an empty CSV cell) yields an empty
            string; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    if text is None:
        # Missing cell: nothing to clean, and .lower() would raise on None.
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stopwords])  # Remove stop words
    return text


In [None]:
def tokenize_function(examples):
    """Clean and tokenize a batch of (prompt, response_a, response_b) rows.

    The tokenizer call takes at most two text segments positionally (text and
    text pair). The previous version passed ``response_b`` as a third
    positional argument, which the tokenizer does not treat as a third
    segment, so ``response_b`` never reached ``input_ids``. Both responses are
    now joined with the tokenizer's separator token and passed together as
    the second segment.

    Args:
        examples: Batch mapping with the keys 'prompt', 'response_a' and
            'response_b', each holding a list of texts of equal length.

    Returns:
        The tokenizer output for the batch, padded/truncated to 512 tokens.
    """
    # Clean each text field
    cleaned_prompts = [clean_text(text) for text in examples['prompt']]
    cleaned_responses_a = [clean_text(text) for text in examples['response_a']]
    cleaned_responses_b = [clean_text(text) for text in examples['response_b']]

    # Join after cleaning: clean_text would otherwise strip the brackets of
    # the separator token.
    separator = f" {tokenizer.sep_token} "
    paired_responses = [
        response_a + separator + response_b
        for response_a, response_b in zip(cleaned_responses_a, cleaned_responses_b)
    ]

    # Tokenize prompt as segment one and both responses as segment two
    return tokenizer(cleaned_prompts,
                     paired_responses,
                     padding="max_length",
                     truncation=True,
                     max_length=512)

In [None]:
# Example usage: run the tokenization function on a one-row batch
examples = dict(
    prompt=["This is a sample prompt."],
    response_a=["This is a sample response A."],
    response_b=["This is a sample response B."],
)

tokenized_output = tokenize_function(examples)
print(tokenized_output)

In [None]:
# Number of CPU cores. It is NOT passed to Dataset.map below, so the
# mapping runs in a single process; the name is kept for later cells.
num_proc = multiprocessing.cpu_count()

# Apply cleaning + tokenization to both datasets in batches. The error is
# reported and then re-raised: swallowing it would leave
# `tokenized_datasets` undefined and only surface as a NameError later on.
try:
    tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
    test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
except Exception as e:
    print(f"Error during tokenization: {e}")
    raise

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Sanity checks after tokenization. The emptiness checks run first:
# indexing [0] on an empty dataset would fail before they could report it.
if len(tokenized_datasets) == 0:
    raise ValueError("The tokenized training dataset is empty.")
if len(test_tokenized_datasets) == 0:
    raise ValueError("The tokenized test dataset is empty.")
print("Sample tokenized train dataset entry:")
print(tokenized_datasets[0])
# Print column names for debugging
print(f"Tokenized training dataset columns: {tokenized_datasets.column_names}")
print(f"Tokenized test dataset columns: {test_tokenized_datasets.column_names}")

In [None]:
# convert to tf.data.Dataset with the correct shape
def convert_to_tf_dataset(dataset, label_col=None, for_inference=False, batch_size=None):
    """Turn a tokenized Hugging Face dataset into a batched tf.data.Dataset.

    Only the tokenizer's model input columns are kept, plus ``label_col``
    when labels are requested.

    Args:
        dataset: Tokenized dataset exposing ``column_names``,
            ``remove_columns``, ``map`` and ``to_tf_dataset``.
        label_col: Name of the integer label column, or None for no labels.
        for_inference: When True the result carries no labels and is not
            shuffled, even if ``label_col`` is given.
        batch_size: Optional batch size. Defaults to the previous hard-coded
            values: 16 for inference, 450 for training.

    Returns:
        The dataset produced by ``dataset.to_tf_dataset``.
    """
    input_columns = tokenizer.model_input_names
    # Labels are used only for training. The previous version still mapped
    # over the label column in inference mode after it had been removed,
    # which raised a KeyError.
    use_labels = bool(label_col) and not for_inference

    columns_to_keep = set(input_columns)
    if use_labels:
        columns_to_keep.add(label_col)
    dataset = dataset.remove_columns(
        [col for col in dataset.column_names if col not in columns_to_keep]
    )

    # ensure labels are not sequences
    if use_labels:
        dataset = dataset.map(lambda x: {label_col: int(x[label_col])})

    if batch_size is None:
        batch_size = 16 if for_inference else 450

    tf_dataset = dataset.to_tf_dataset(
        columns=input_columns,
        label_cols=[label_col] if use_labels else None,
        shuffle=not for_inference,
        batch_size=batch_size,
        # Request NumPy batches explicitly instead of relying on the
        # collator's default tensor type.
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="np"),
    )

    return tf_dataset

In [None]:
# run the conversion
try:
    # The model has three softmax outputs that the submission cell reads as
    # a / b / tie (indices 0 / 1 / 2). Collapse the three winner columns into
    # one class id in that order, instead of training on the binary
    # `winner_model_a` column alone.
    labeled_train_dataset = tokenized_datasets.map(
        lambda a, b, tie: {'winner_class': 0 if a else (1 if b else 2)},
        input_columns=['winner_model_a', 'winner_model_b', 'winner_tie'],
    )
    train_tf_dataset = convert_to_tf_dataset(labeled_train_dataset, 'winner_class')
    # The test pipeline must come from the tokenized TEST data, unshuffled and
    # without labels, so that predictions line up with `test_ids`.
    test_tf_dataset = convert_to_tf_dataset(test_tokenized_datasets, for_inference=True)
except Exception as e:
    print(f"Error during dataset conversion: {e}")
    # Re-raise: continuing would leave the tf datasets undefined.
    raise

In [None]:
# Pull a single batch from the training pipeline and report its tensor shapes
print("Sample from converted train tf.data.Dataset:")
first_batch_only = train_tf_dataset.take(1)
for batch in first_batch_only:
    # Each element is an (inputs, labels) pair
    inputs, labels = batch
    print(f'Input IDs shape: {inputs["input_ids"].shape}')
    print(f'Attention mask shape: {inputs["attention_mask"].shape}')
    print(f'Labels shape: {labels.shape}')


In [None]:
# building a custom model
class BertLayer(Layer):
    """Keras layer wrapping the pretrained BERT encoder stored in `model_dir`."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # from_pt=True asks transformers to load the PyTorch checkpoint.
        self.bert = TFBertModel.from_pretrained(model_dir, from_pt=True)

    def call(self, inputs):
        """Run BERT on an (input_ids, attention_mask) pair.

        Returns the encoder's ``last_hidden_state``.
        """
        token_ids, mask = inputs
        return self.bert(token_ids, attention_mask=mask).last_hidden_state

def create_keras_model():
    """Build the classifier: BERT -> average over the sequence axis -> 3-way softmax.

    Returns:
        A Keras ``Model`` with the inputs 'input_ids' and 'attention_mask'
        (each of length 512, int32) and a 3-unit softmax output.
    """
    model_inputs = [
        Input(shape=(512,), dtype=tf.int32, name=input_name)
        for input_name in ('input_ids', 'attention_mask')
    ]

    sequence_output = BertLayer()(model_inputs)
    mean_pooled = GlobalAveragePooling1D()(sequence_output)
    class_probabilities = Dense(3, activation='softmax')(mean_pooled)

    return Model(inputs=model_inputs, outputs=class_probabilities)

In [None]:
# Build, compile and train the model under the selected distribution strategy
with strategy.scope():
    model = create_keras_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    progress_callback = TqdmCallback(verbose=1)
    model.fit(train_tf_dataset, epochs=3, callbacks=[progress_callback])

In [None]:
# getting prediction
predictions = model.predict(test_tf_dataset)

# check lengths
print(f"Length of test_ids: {len(test_ids)}")
print(f"Shape of predictions: {predictions.shape}")
if len(test_ids) != predictions.shape[0]:
    # Truncation is kept as a best-effort fallback, but a mismatch means the
    # predictions may not correspond to the test rows, so say so loudly.
    print("WARNING: number of predictions differs from number of test ids; "
          "truncating predictions to len(test_ids).")
    predictions = predictions[:len(test_ids)]

# creating DataFrame
# The third column is named 'winner_tie', matching the label column name used
# for the data earlier in this notebook (it was 'winner_model_tie' before).
submission = pd.DataFrame({
    'id': test_ids,
    'winner_model_a': predictions[:, 0],
    'winner_model_b': predictions[:, 1],
    'winner_tie': predictions[:, 2]
})

# saving DataFrame
submission.to_csv('submission.csv', index=False)
