# Import Libraries 

In [None]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # or "jax" or "torch"
import re

import keras_nlp
import keras
import tensorflow as tf

import numpy as np 
import pandas as pd
from tqdm import tqdm
import json

import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.express as px

# Num GPUs Available

In [None]:
# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# Distribution strategy later used as the scope for building the model.
strategy = tf.distribute.MirroredStrategy()
print(f'Number of devices: {strategy.num_replicas_in_sync}')

# Configuration

In [None]:
class CFG:
    """Notebook-wide configuration constants."""

    seed = 42  # Random seed
    preset = "deberta_v3_extra_small_en"  # Preset name handed to the KerasNLP preprocessor/backbone
    sequence_length = 512  # Sequence length handed to the preprocessor
    epochs = 6  # Number of training epochs
    batch_size = 16  # Batch size for the datasets
    scheduler = 'cosine'  # Learning rate scheduler

    # Integer class label <-> target column name, in both directions.
    label2name = dict(enumerate(('winner_model_a', 'winner_model_b', 'winner_tie')))
    name2label = dict(zip(label2name.values(), label2name.keys()))

    # Labels and names as parallel lists (same order as label2name).
    class_labels = [*label2name]
    class_names = [*label2name.values()]

# Reproducibility 
Set the random seed so that each run produces similar results.

In [None]:
# Seed the random number generators through Keras with the configured seed.
keras.utils.set_random_seed(CFG.seed)

#  Mixed Precision

In this notebook, we will use mixed precision instead of float32 precision for training and inference to reduce GPU memory usage. This will ultimately allow us to use larger batch sizes, thus reducing our training and inference time.

In [None]:
# Make "mixed_float16" the global dtype policy for layers created from here on.
keras.mixed_precision.set_global_policy("mixed_float16")

# Dataset Path 

In [None]:
# Directory holding the competition's train.csv / test.csv.
BASE_PATH = '/kaggle/input/lmsys-chatbot-arena'

# Meta Data 
## Files

### `train.csv`
- `id`: Unique identifier for each row.
- `model_[a/b]`: Model identity, present in train.csv but not in test.csv.
- `prompt`: Input prompt given to both models.
- `response_[a/b]`: Model_[a/b]'s response to the prompt.
- `winner_model_[a/b/tie]`: Binary columns indicating the judge's selection (ground truth target).

### `test.csv`
- `id`: Unique identifier for each row.
- `prompt`: Input prompt given to both models.
- `response_[a/b]`: Model_[a/b]'s response to the prompt.

> Note that each interaction may have multiple prompts and responses, but this notebook will use only **one prompt per interaction**. You can choose to use all prompts and responses. Additionally, prompts and responses in the dataframe are provided as string-formatted lists, so they need to be converted to literal lists using `eval()`.


In [None]:
# Load Train Data: the competition file plus two external datasets, stacked row-wise
df = pd.read_csv(f'{BASE_PATH}/train.csv')
ultrachat_df = pd.read_csv('/kaggle/input/ultrachat-train/ultrachat_s42_a0.5.csv')
lmsys_33k_deduplicated = pd.read_csv('/kaggle/input/lmsys-33k-deduplicated/lmsys-33k-deduplicated.csv')
df = pd.concat([df, ultrachat_df, lmsys_33k_deduplicated], axis=0)
# Optional extra dataset (disabled):
# ultrafeedback_lmsysformat = pd.read_parquet('/kaggle/input/ultrafeedback-lmsysformat/ultrafeedback_lmsysformat.parquet', engine='pyarrow')
# ultrafeedback_lmsysformat['prompt'] = ultrafeedback_lmsysformat['prompt'].apply(lambda x: f'["{x}"]')
# df = pd.concat([df, ultrafeedback_lmsysformat], axis=0)

# Load Test Data
test_df = pd.read_csv(f'{BASE_PATH}/test.csv')

# display(ultrafeedback_lmsysformat.head())
display(df.head())

In [None]:
def _parse_list_cell(raw, null_to_none=False):
    """Decode one string-formatted list cell into a Python list.

    The cell is parsed as JSON first, which maps ``null`` to ``None`` without
    touching the word "null" inside the text itself. Cells that are not valid
    JSON fall back to the previous ``eval``-based parsing.

    Args:
        raw: The cell content as read from the CSV.
        null_to_none: In the fallback path only, replace the substring
            ``null`` with ``None`` before evaluating (the legacy handling used
            for the response columns).

    Returns:
        The decoded list.
    """
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        # NOTE(review): eval executes whatever the file contains; it is kept
        # only so rows that are not valid JSON parse exactly as before.
        return eval(raw.replace("null", "None") if null_to_none else raw)


# The id column is dropped before de-duplicating the merged training rows.
df = df.drop("id", axis=1)
df = df.drop_duplicates(keep="first", ignore_index=True)

# Turn the string-formatted lists into real lists, for train and test alike.
for frame in (df, test_df):
    frame["prompt"] = frame["prompt"].apply(_parse_list_cell)
    for col in ("response_a", "response_b"):
        frame[col] = frame[col].apply(_parse_list_cell, null_to_none=True)

# Sample data
# df = df.sample(frac=0.01)

# Label conversion: name of the winning column, then its integer label
df["class_name"] = df[["winner_model_a", "winner_model_b" , "winner_tie"]].idxmax(axis=1)
df["class_label"] = df.class_name.map(CFG.name2label)

# Show Sample
display(df.head())
# Show Sample
display(test_df.head())

## Contextualize Response with Prompt

In our approach, we will contextualize each response with the prompt instead of using a single prompt for all responses. This means that for each response, we will provide the model with the same set of prompts combined with their respective response (e.g., `(P + R_A)`, `(P + R_B)`, etc.). This approach is similar to the multiple-choice question task in NLP.

> Note that some prompts and responses may not be encoded with `utf-8`, resulting in errors when creating the dataloader. In such cases, we will replace them with an empty string.


In [None]:
def _keep_ascii(text):
    """Return *text* with every character of code point >= 128 removed."""
    return "".join(ch for ch in text if ord(ch) < 128)


def make_pairs(row):
    """Build the two "prompt + response" option strings for one row.

    For each response (A, then B) every turn is rendered as
    ``"Prompt: <prompt>\\n\\nResponse: <response>"``, the turns are
    concatenated, and non-ASCII characters are stripped.

    Args:
        row: Mapping-like row (dict or pandas Series) holding the
            ``prompt``, ``response_a`` and ``response_b`` lists.

    Returns:
        The same ``row`` with two keys added: ``options`` (the two strings,
        A first and B second) and ``encode_fail`` (bool). On any failure
        ``encode_fail`` is True and ``options`` is two empty strings, so every
        row keeps the same shape.
    """
    options = ["", ""]  # placeholder used when the row cannot be processed
    encode_fail = False

    try:
        # A missing key raises KeyError, which is handled below.
        prompts = row['prompt']
        responses_a = row['response_a']
        responses_b = row['response_b']

        # Every prompt needs exactly one response from each model.
        if not (len(prompts) == len(responses_a) == len(responses_b)):
            raise ValueError("The lists 'prompt', 'response_a', and 'response_b' must be of the same length.")

        response_a_str = "".join(
            f"Prompt: {prompt}\n\nResponse: {response}"
            for prompt, response in zip(prompts, responses_a)
        )
        response_b_str = "".join(
            f"Prompt: {prompt}\n\nResponse: {response}"
            for prompt, response in zip(prompts, responses_b)
        )

        # Clean each text separately: A is option 0 and B is option 1.
        options = [_keep_ascii(response_a_str), _keep_ascii(response_b_str)]

    except KeyError as e:
        print(f"Missing key in row: {e}")
        encode_fail = True
    except ValueError as e:
        print(e)
        encode_fail = True
    except Exception as e:
        # Any other failure is reported and flagged rather than raised.
        print(f"An unexpected error occurred: {e}")
        encode_fail = True

    row['options'] = options
    row["encode_fail"] = encode_fail
    return row

In [None]:
# Add the `options` and `encode_fail` columns to every training row.
df = df.apply(make_pairs, axis=1)
display(df.head(2))

# Same transformation for the test rows.
test_df = test_df.apply(make_pairs, axis=1)
display(test_df.head(2))

## Encoding Fail Statistics

Let's examine how many samples have encoding issues. From the code below, we can see that only $1\%$ of the samples failed to be encoded, while $99\%$ of the samples don't have any issues. A similar pattern can be expected for the test data as well. Thus, considering empty strings for this small portion of the data will not have much impact on our training and inference.

In [None]:
# Count rows per encode_fail value (normalize=False gives raw counts, not fractions).
df.encode_fail.value_counts(normalize=False)

# EDA

In [None]:
class DataFrameStatsProcessor:
    def __init__(self, df):
        self.df = df

    def _is_empty(self, string: str) -> bool:
        return bool(re.match("^\s*$", string))

    def _len(self, string: str) -> int:
        if string is None:
            return 0
        return len(string)

    def _add_len_stats(self, col: str) -> pd.DataFrame:
        if col == "prompt":
            col_prefix = "p_len"
        elif col == "response_a":
            col_prefix = "res_a_len"
        elif col == "response_b":
            col_prefix = "res_b_len"
        
        self.df[f"{col_prefix}_sum"] = self.df[col].apply(lambda x: sum(self._len(s) for s in x))
        self.df[f"{col_prefix}_mean"] =  self.df[col].apply(lambda x: np.mean(list(self._len(s) for s in x)))
        self.df[f"{col_prefix}_max"] = self.df[col].apply(lambda x: max(self._len(s) for s in x))
        self.df[f"{col_prefix}_sum_log"] = np.log1p(self.df[f"{col_prefix}_sum"])
        self.df[f"{col_prefix}_mean_log"] =  np.log1p(self.df[f"{col_prefix}_mean"])
        self.df[f"{col_prefix}_max_log"] = np.log1p(self.df[f"{col_prefix}_max"])
        
        return self.df
    
    def z_score_normalize(self, columns):
        """
        对指定的列进行Z得分归一化。
        参数:
            columns (list): 需要进行Z得分归一化的列名列表。
        """
        for col in columns:
            self.df[col] = (self.df[col] - self.df[col].mean()) / self.df[col].std()
    
    def process_dataframe(self):
        self.df["n_prompts"] = self.df["prompt"].apply(lambda x: len(x))
        self.df["n_res_a"] = self.df["response_a"].apply(lambda x: len(x))
        self.df["n_res_b"] = self.df["response_b"].apply(lambda x: len(x))
        assert ((self.df["n_prompts"] == self.df["n_res_a"]) & (self.df["n_prompts"] == self.df["n_res_b"])).all()

        self.df["n_na_prompts"] = self.df["prompt"].apply(lambda ps: sum(1 if p is None else 0 for p in ps))
        self.df["n_empty_prompts"] = self.df["prompt"].apply(lambda ps: sum(1 if p is not None and self._is_empty(p) else 0 for p in ps))
        self.df["n_na_res_a"] = self.df["response_a"].apply(lambda ps: sum(1 if p is None else 0 for p in ps))
        self.df["n_empty_res_a"] = self.df["response_a"].apply(lambda ps: sum(1 if p is not None and self._is_empty(p) else 0 for p in ps))
        self.df["n_na_res_b"] = self.df["response_b"].apply(lambda ps: sum(1 if p is None else 0 for p in ps))
        self.df["n_empty_res_b"] = self.df["response_b"].apply(lambda ps: sum(1 if p is not None and self._is_empty(p) else 0 for p in ps))

        self.df["n_miss_res_a"] = self.df["n_na_res_a"] + self.df["n_empty_res_a"]
        self.df["n_miss_res_b"] = self.df["n_na_res_b"] + self.df["n_empty_res_b"]

        self.df["n_eff_res_a"] = self.df["n_res_a"] - self.df["n_miss_res_a"]
        self.df["n_eff_res_b"] = self.df["n_res_b"] - self.df["n_miss_res_b"]

        self._add_len_stats("prompt")
        self._add_len_stats("response_a")
        self._add_len_stats("response_b")

        self.df["res_len_mean_diff"] = self.df["res_a_len_mean"] - self.df["res_b_len_mean"]
        self.df["res_len_mean_diff_clip"] = self.df["res_len_mean_diff"].clip(-6000, 6000)

        self.df["n_miss_prompts"] = self.df["n_na_prompts"] + self.df["n_empty_prompts"]
        self.df["n_eff_prompts"] = self.df["n_prompts"] - self.df["n_miss_prompts"]

        self.df["na_prompt_ratio"] = self.df["n_na_prompts"] / self.df["n_prompts"]
        self.df["empty_prompt_ratio"] = self.df["n_empty_prompts"] / self.df["n_prompts"]
        self.df["miss_prompt_ratio"] = self.df["n_miss_prompts"] / self.df["n_prompts"]

        self.df["na_res_a_ratio"] = self.df["n_na_res_a"] / self.df["n_res_a"]
        self.df["empty_res_a_ratio"] = self.df["n_empty_res_a"] / self.df["n_res_a"]
        self.df["miss_res_a_ratio"] = self.df["n_miss_res_a"] / self.df["n_res_a"]
        self.df["na_res_b_ratio"] = self.df["n_na_res_b"] / self.df["n_res_b"]
        self.df["empty_res_b_ratio"] = self.df["n_empty_res_b"] / self.df["n_res_b"]
        self.df["miss_res_b_ratio"] = self.df["n_miss_res_b"] / self.df["n_res_b"]

        for col, col_prefix in zip(["prompt", "response_a", "response_b"], ["p_len", "res_a_len", "res_b_len"]):
            self.df[f"{col_prefix}_med"] = self.df[col].apply(lambda x: np.median(list(self._len(s) for s in x)))
            self.df[f"{col_prefix}_std"] = self.df[col].apply(lambda x: np.std(list(self._len(s) for s in x)))

        self.df["p_len_eff_mean"] = self.df["p_len_sum"] / self.df["n_eff_prompts"]
        self.df["res_a_len_eff_mean"] = self.df["res_a_len_sum"] / self.df["n_eff_res_a"]
        self.df["res_b_len_eff_mean"] = self.df["res_b_len_sum"] / self.df["n_eff_res_b"]

        for stats in ["sum", "mean", "max", "med", "eff_mean"]:
            self.df[f"p_a_{stats}_diff"] = self.df[f"p_len_{stats}"] - self.df[f"res_a_len_{stats}"]
            self.df[f"p_b_{stats}_diff"] = self.df[f"p_len_{stats}"] - self.df[f"res_b_len_{stats}"]
            self.df[f"a_b_{stats}_diff"] = self.df[f"res_a_len_{stats}"] - self.df[f"res_b_len_{stats}"]
            
        len_feature_a_col = ["res_a_len_sum","res_a_len_mean","res_a_len_max","res_a_len_sum_log","res_a_len_mean_log","res_a_len_max_log",
                     "res_a_len_med","res_a_len_std","res_a_len_eff_mean","p_a_sum_diff","p_a_mean_diff","p_a_max_diff","p_a_med_diff",
                     "p_a_eff_mean_diff"]
        
        len_feature_b_col = ["res_b_len_sum","res_b_len_mean","res_b_len_max","res_b_len_sum_log","res_b_len_mean_log","res_b_len_max_log",
                             "res_b_len_med","res_b_len_std","res_b_len_eff_mean","p_b_sum_diff","p_b_mean_diff","p_b_max_diff","p_b_med_diff",
                             "p_b_eff_mean_diff"]
        
        numerical_feature_columns = ["res_a_len_sum","res_a_len_mean","res_a_len_max","res_a_len_sum_log","res_a_len_mean_log","res_a_len_max_log",
                                     "res_a_len_med","res_a_len_std","res_a_len_eff_mean","p_a_sum_diff","p_a_mean_diff","p_a_max_diff","p_a_med_diff",
                                     "p_a_eff_mean_diff", "res_b_len_sum","res_b_len_mean","res_b_len_max","res_b_len_sum_log","res_b_len_mean_log","res_b_len_max_log",
                                     "res_b_len_med","res_b_len_std","res_b_len_eff_mean","p_b_sum_diff","p_b_mean_diff","p_b_max_diff","p_b_med_diff",
                                     "p_b_eff_mean_diff"]
        # 确保不除以零进行归一化
        for col in numerical_feature_columns:
            if self.df[col].std() == 0:
                print(f"Warning: Standard deviation is zero for column {col}. Skipping normalization.")
            else:
                self.z_score_normalize([col])
                
        self.df = self.df.fillna(0)
        
        # 选择这些列并将它们转换为列表
        len_features_a = self.df[len_feature_a_col].values.tolist()
        len_features_b = self.df[len_feature_b_col].values.tolist()

        return len_features_a, len_features_b

# Data Split

In the code snippet provided below, we will divide the existing data into training and validation using a stratification of `class_label` column.

In [None]:
from sklearn.model_selection import train_test_split  # Import package

# Stratified 80/20 split on class_label. random_state is pinned to the
# configured seed so re-running this cell yields the same split.
train_df, valid_df = train_test_split(
    df, test_size=0.2, stratify=df["class_label"], random_state=CFG.seed
)

# Preprocessing

**What it does:** The preprocessor takes input strings and transforms them into a dictionary (`token_ids`, `padding_mask`) containing preprocessed tensors. This process starts with tokenization, where input strings are converted into sequences of token IDs.

**Why it's important:** Initially, raw text data is complex and challenging for modeling due to its high dimensionality. By converting text into a compact set of tokens, such as transforming `"The quick brown fox"` into `["the", "qu", "##ick", "br", "##own", "fox"]`, we simplify the data. Many models rely on special tokens and additional tensors to understand input. These tokens help divide input and identify padding, among other tasks. Making all sequences the same length through padding boosts computational efficiency, making subsequent steps smoother.

Explore the following pages to access the available preprocessing and tokenizer layers in **KerasNLP**:
- [Preprocessing](https://keras.io/api/keras_nlp/preprocessing_layers/)
- [Tokenizers](https://keras.io/api/keras_nlp/tokenizers/)

In [None]:
# Preprocessor for the configured preset, using the configured sequence length.
preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
    preset=CFG.preset, 
    sequence_length=CFG.sequence_length, 
)

We'll use the `preprocess_fn` function to transform each text option using the `dataset.map(preprocess_fn)` method.

In [None]:
def preprocess_fn(text, label=None, features_a=None, features_b=None):
    """Run the preprocessor on `text` and attach any numeric features.

    Args:
        text: Raw text passed to the module-level `preprocessor`.
        label: Optional target; when given it is returned next to the inputs.
        features_a: Optional features stored under the "features_a" key.
        features_b: Optional features stored under the "features_b" key.

    Returns:
        `(inputs, label)` when a label is given, otherwise `inputs` alone.
    """
    encoded = preprocessor(text)
    if features_a is not None:
        encoded['features_a'] = features_a
    if features_b is not None:
        encoded['features_b'] = features_b
    if label is None:
        return encoded
    return encoded, label

# FGM

In [None]:
# # 添加 FGM 扰动函数
# def fgm_perturb(features, epsilon=1.0):
#     # 计算扰动量，epsilon 为扰动比例
#     perturbation = np.random.uniform(-1, 1, features.shape) * epsilon
#     # 应用扰动
#     return features + perturbation

In [None]:
# # 修改数据预处理函数以包含 FGM 扰动
# def preprocess_fn(text, label=None, features_a=None, features_b=None, is_fgm=False, epsilon=1.0):
#     # 预处理文本
#     text = preprocessor(text)
#     if features_a is not None:
#         if is_fgm:
#             # 如果是 FGM，应用扰动
#             features_a = fgm_perturb(features_a, epsilon)
#         text['features_a'] = features_a
#     if features_b is not None:
#         if is_fgm:
#             # 如果是 FGM，应用扰动
#             features_b = fgm_perturb(features_b, epsilon)
#         text['features_b'] = features_b
#     return (text, label) if label is not None else text

# AWP

In [None]:
# AWP perturbation function
def awp_perturb(model, epsilon=1e-4):
    """Add zero-mean Gaussian noise (stddev `epsilon`) to each layer kernel.

    Only the layers listed in `model.layers` that expose a `kernel` attribute
    are touched. The noise is added in place and is not undone here.

    NOTE(review): the noise is random rather than gradient-based, so this is
    weight noise and not adversarial weight perturbation in the usual sense.
    """
    kernels = (layer.kernel for layer in model.layers if hasattr(layer, 'kernel'))
    for kernel in kernels:
        # Draw noise shaped like the kernel and add it to the weights.
        kernel.assign_add(tf.random.normal(kernel.shape, stddev=epsilon))

# AWP callback
class AWPCallback(keras.callbacks.Callback):
    """Keras callback that calls `awp_perturb` on the model before each batch."""

    def __init__(self, epsilon):
        super().__init__()
        # Standard deviation of the noise passed to awp_perturb.
        self.epsilon = epsilon

    def on_batch_begin(self, batch, logs=None):
        # Apply the perturbation at the start of every batch.
        awp_perturb(model=self.model, epsilon=self.epsilon)

# 🍚 | DataLoader

The code below sets up a robust data flow pipeline using `tf.data.Dataset` for data processing. Notable aspects of `tf.data` include its ability to simplify pipeline construction and represent components in sequences.

To learn more about `tf.data`, refer to this [documentation](https://www.tensorflow.org/guide/data).

In [None]:
def build_dataset_with_features(texts, labels=None, features_a=None, features_b=None, batch_size=32, is_fgm=False,  epsilon=1.0,
                                cache=True, shuffle=1024):
    """Build a batched `tf.data.Dataset` of preprocessed text and features.

    Args:
        texts: Per-sample text options fed to `preprocess_fn`.
        labels: Optional integer class labels; one-hot encoded to 3 classes.
        features_a: Optional numeric features for response A.
        features_b: Optional numeric features for response B. Features are
            used only when both `features_a` and `features_b` are given.
        batch_size: Batch size.
        is_fgm: Unused; kept for backward compatibility.
        epsilon: Unused; kept for backward compatibility.
        cache: Cache the raw slices before preprocessing.
        shuffle: Shuffle buffer size; a falsy value disables shuffling and
            `True` selects the default buffer of 1024.

    Returns:
        The batched, prefetched dataset.
    """
    AUTO = tf.data.AUTOTUNE
    one_hot = None if labels is None else keras.utils.to_categorical(labels, num_classes=3)
    if (features_a is not None) and (features_b is not None):
        slices = (texts, one_hot, features_a, features_b)  # Create slices
    else:
        slices = (texts,) if one_hot is None else (texts, one_hot)  # Create slices
    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.cache() if cache else ds
    ds = ds.map(preprocess_fn, num_parallel_calls=AUTO)
    opt = tf.data.Options()
    if shuffle:
        # A literal True would be used as a buffer size of 1, which leaves
        # the order unchanged, so map it to the default buffer size instead.
        buffer_size = 1024 if shuffle is True else shuffle
        ds = ds.shuffle(buffer_size, seed=CFG.seed)
        opt.experimental_deterministic = False
    ds = ds.with_options(opt)
    ds = ds.batch(batch_size, drop_remainder=False)
    ds = ds.prefetch(AUTO)

    return ds

## Build Train/Valid Dataloader

In [None]:
# Length features for each split. Each processor works on its own copy, so the
# statistic columns are not added to train_df / valid_df, and the z-score
# statistics are computed separately per split.
train_features_processor = DataFrameStatsProcessor(train_df.copy())
train_features_a, train_features_b = train_features_processor.process_dataframe()
valid_features_processor = DataFrameStatsProcessor(valid_df.copy())
valid_features_a, valid_features_b = valid_features_processor.process_dataframe()

In [None]:
# # Train
train_texts = train_df.options.tolist()  
train_labels = train_df.class_label.tolist() 
train_ds = build_dataset_with_features(train_texts, train_labels, train_features_a, train_features_b, 
                         batch_size=CFG.batch_size,
                         shuffle=True)
# # Valid
valid_texts = valid_df.options.tolist()  
valid_labels = valid_df.class_label.tolist() 
valid_ds = build_dataset_with_features(valid_texts, valid_labels, valid_features_a, valid_features_b, 
                         batch_size=CFG.batch_size,
                         shuffle=False)
print(train_ds)

# LR Schedule

Implementing a learning rate scheduler is crucial for transfer learning. The learning rate initiates at `lr_start` and gradually tapers down to `lr_min` using various techniques, including:
- `step`: Lowering the learning rate in step-wise manner resembling stairs.
- `cos`: Utilizing a cosine curve to gradually reduce the learning rate.
- `exp`: Exponentially decreasing the learning rate.

**Importance:** A well-structured learning rate schedule is essential for efficient model training, ensuring optimal convergence and avoiding issues such as overshooting or stagnation.

In [None]:
import math

def get_lr_callback(batch_size=8, mode='cos', epochs=10, plot=False):
    """Build a `LearningRateScheduler` with linear warm-up followed by decay.

    The rate ramps linearly from `lr_start` to `lr_max` (which scales with
    `batch_size`) over the first two epochs, then decays according to `mode`.

    Args:
        batch_size: Batch size used to scale the peak learning rate.
        mode: Decay type: 'exp', 'step' or 'cos' ('cosine' is accepted as an
            alias of 'cos').
        epochs: Total number of epochs; used by the cosine decay and the plot.
        plot: Plot the per-epoch learning rate when True.

    Returns:
        A `keras.callbacks.LearningRateScheduler` wrapping the schedule.

    Raises:
        ValueError: If `mode` is not a supported decay type.
    """
    if mode == 'cosine':
        mode = 'cos'
    # Fail here rather than mid-training, where an unknown mode would only
    # surface once the warm-up epochs are over.
    if mode not in ('exp', 'step', 'cos'):
        raise ValueError(f"Unknown mode {mode!r}; expected 'exp', 'step' or 'cos' (alias 'cosine').")

    lr_start, lr_max, lr_min = 1.0e-6, 0.6e-6 * batch_size, 1e-6
    lr_ramp_ep, lr_sus_ep, lr_decay = 2, 0, 0.8
    decay_start = lr_ramp_ep + lr_sus_ep  # first epoch of the decay phase

    def lrfn(epoch):  # Learning rate update function
        if epoch < lr_ramp_ep:  # linear warm-up
            return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        if epoch < decay_start:  # sustain at the peak
            return lr_max
        decay_epoch_index = epoch - decay_start
        if mode == 'exp':
            return (lr_max - lr_min) * lr_decay**decay_epoch_index + lr_min
        if mode == 'step':
            return lr_max * lr_decay**(decay_epoch_index // 2)
        # mode == 'cos': the +3 stretches the half-cosine beyond the last epoch,
        # so the rate does not reach lr_min within `epochs`.
        decay_total_epochs = epochs - decay_start + 3
        phase = math.pi * decay_epoch_index / decay_total_epochs
        return (lr_max - lr_min) * 0.5 * (1 + math.cos(phase)) + lr_min

    if plot:  # Plot lr curve if plot is True
        plt.figure(figsize=(10, 5))
        plt.plot(np.arange(epochs), [lrfn(epoch) for epoch in np.arange(epochs)], marker='o')
        plt.xlabel('epoch'); plt.ylabel('lr')
        plt.title('LR Scheduler')
        plt.show()

    return keras.callbacks.LearningRateScheduler(lrfn, verbose=False)  # Create lr callback

In [None]:
# Learning-rate callback for training; `mode` is left at its default and the schedule is plotted.
lr_cb = get_lr_callback(CFG.batch_size, epochs=CFG.epochs, plot=True)

# Model Checkpointing

The following code will create a callback that will save the best checkpoint of the model during training, which we will use for inference in the submission.

In [None]:
# Checkpoint callback: write weights only, and only when the monitored
# `val_log_loss` improves (lower is better).
ckpt_cb = keras.callbacks.ModelCheckpoint(
    'best_model.weights.h5',
    monitor='val_log_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Metric

The metric for this competition is **Log Loss**. This metric can be expressed mathematically as,

$$
\text{Log Loss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{c=1}^{C} y_{i,c} \log(p_{i,c})
$$

where $N$ is the number of samples, $C = 3$ is the number of classes (`winner_model_a`, `winner_model_b`, `winner_tie`), $y_{i,c}$ is 1 if sample $i$ belongs to class $c$ and 0 otherwise, and $p_{i,c}$ is the predicted probability that sample $i$ belongs to class $c$.

Note that this metric is similar to categorical cross entropy widely used in classification tasks. Thus, we don't need to implement the loss from scratch. As the Keras library already has an implementation of this metric, we will simply use the metric to monitor performance of our model.


In [None]:
# Monitoring metric meant to track the competition log loss. Label smoothing is
# a training-time regulariser and stays in the loss only: smoothing the targets
# here would report (and checkpoint on) a value that is not the log loss.
log_loss = keras.metrics.CategoricalCrossentropy(name="log_loss", label_smoothing=0.0, from_logits=False)

# Modeling

In [None]:
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dropout

with strategy.scope():

    # Gather every model input in a single dict. Axis 1 of the token inputs
    # indexes the two options (0 = response A, 1 = response B).
    inputs = {
        "token_ids": keras.layers.Input(shape=(2, None), dtype=tf.int32, name="token_ids"),
        "padding_mask": keras.layers.Input(shape=(2, None), dtype=tf.int32, name="padding_mask"),
        "features_a": keras.layers.Input(shape=(14,), name="features_a", dtype=tf.float32),
        "features_b": keras.layers.Input(shape=(14,), name="features_b", dtype=tf.float32),
    }
    
    # Create the DebertaV3 backbone; the same instance encodes both options
    backbone = keras_nlp.models.DebertaV3Backbone.from_preset(
        CFG.preset,
    )

    # Slice out each option together with its padding mask and encode it
    response_a = {
        "token_ids": inputs["token_ids"][:, 0, :],
        "padding_mask": inputs["padding_mask"][:, 0, :]
    }
    embed_a = backbone(response_a)

    response_b = {
        "token_ids": inputs["token_ids"][:, 1, :],
        "padding_mask": inputs["padding_mask"][:, 1, :]
    }
    embed_b = backbone(response_b)
    
    # Embed the numeric length features
    len_features_a_embedding = keras.layers.Dense(512, activation='relu')(inputs["features_a"])
    len_features_b_embedding = keras.layers.Dense(512, activation='relu')(inputs["features_b"])
    
    # Flatten the numeric feature embeddings to 2-D tensors
    flattened_len_features_a = keras.layers.Flatten()(len_features_a_embedding)
    flattened_len_features_b = keras.layers.Flatten()(len_features_b_embedding)
    
    # Pool the token embeddings and join them with the numeric features, per option
    embed_a = keras.layers.GlobalAveragePooling1D()(embed_a)
    embed_b = keras.layers.GlobalAveragePooling1D()(embed_b)
    embeds_text_features_a = keras.layers.Concatenate(axis=-1)([embed_a, flattened_len_features_a])
    embeds_text_features_b = keras.layers.Concatenate(axis=-1)([embed_b, flattened_len_features_b])
    
    # Merge the two options. Both A and B must be included here, otherwise
    # response B never reaches the classifier.
    combined_embeds = keras.layers.Concatenate(axis=-1)([embeds_text_features_a, embeds_text_features_b])
    
    # Hidden layer with L2 regularization, followed by Dropout
    combined_embeds = keras.layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(1e-5))(combined_embeds)  # L2 regularization
    combined_embeds = Dropout(0.5)(combined_embeds)  # Dropout layer with rate 0.5
    
    # Define the temperature_scale function
    def temperature_scale(logits, T=1.0):
        return logits / T
    
    # Temperature parameter T
    T = 0.85
    # Apply temperature scaling.
    # NOTE(review): this divides the hidden activations that feed the
    # classifier layer, not the classifier's logits.
    scaled_logits = temperature_scale(combined_embeds, T)
    outputs = keras.layers.Dense(3, activation="softmax", name="classifier")(scaled_logits)
    
    model = keras.Model(inputs,  outputs)
    
    # Compile the model with optimizer, loss, and metrics
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-6, clipnorm=1.0),
        loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.1, from_logits=False),
        metrics=[
            log_loss,
            keras.metrics.CategoricalAccuracy(name="accuracy"),
        ],
    )
    
    # AWP callback used during training; adjust epsilon as needed
    awp_cb = AWPCallback(epsilon=1e-4)

### Model Summary

In [None]:
# Print the summary of the assembled model.
model.summary()

# Training

In [None]:
# Train with the LR schedule, best-checkpoint saving and the AWP callback.
# (Baseline without AWP: pass callbacks=[lr_cb, ckpt_cb] instead.)
# NOTE(review): an InvalidArgumentError is only printed, not re-raised; in that
# case `history` stays undefined and the cells below still run.
try:
    history = model.fit(
        train_ds,
        epochs=CFG.epochs,
        validation_data=valid_ds,
        callbacks=[lr_cb, ckpt_cb, awp_cb]  # AWP callback added to the training callbacks
    )
except tf.errors.InvalidArgumentError as e:
    print(f"出现无效参数错误：{e}")

## Load Best Model

After training, let's load the weight with best result to get the best performance.

In [None]:
# Load the checkpointed weights back into the model.
# NOTE(review): the checkpoint callback writes to the relative path
# 'best_model.weights.h5'; this absolute path matches only when the working
# directory is /kaggle/working. Confirm.
model.load_weights('/kaggle/working/best_model.weights.h5')

# Prediction

In [None]:
# # 使用 FGM 扰动的数据集评估模型
# fgm_ds = build_dataset_with_features(train_texts, train_labels, train_features_a, train_features_b,
#                                      is_fgm=True, epsilon=1.0)
# evaluation_results = model.evaluate(fgm_ds)

# print(f"Evaluation results on FGM perturbed dataset: {evaluation_results}")

In [None]:
# Length features for the test rows. The processor receives test_df itself
# (not a copy), so the statistic columns are also added to test_df in place.
test_df_features_processor = DataFrameStatsProcessor(test_df)
test_df_features_a, test_df_features_b = test_df_features_processor.process_dataframe()

In [None]:
# Unlabelled, unshuffled test dataset; the batch size is capped at the number of test rows.
test_texts = test_df.options.tolist()
test_ds = build_dataset_with_features(test_texts, features_a=test_df_features_a, features_b=test_df_features_b,
                         batch_size=min(len(test_df), CFG.batch_size),
                         shuffle=False)
print(test_ds)

In [None]:
# Run the model over the test dataset to get its predictions.
test_preds = model.predict(test_ds, verbose=1)

# Submission

The following code prepares the submission file.

In [None]:
# Start from the test ids, then add one prediction column per class name.
sub_df = test_df[["id"]].copy()
sub_df[CFG.class_names] = test_preds.tolist()
# Write the submission file without the index column and preview it.
sub_df.to_csv("submission.csv", index=False)
sub_df.head()