<a href="https://www.kaggle.com/code/stardewcvalley/llm-class-ft-7-optimized-py?scriptVersionId=231454760" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import os
import gc
os.environ["KERAS_BACKEND"] = "jax"
os.environ["XLA_FLAGS"] = "--xla_gpu_strict_conv_algorithm_picker=false"

import keras_nlp
import keras
import tensorflow as tf
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.metrics import Precision, Recall

In [None]:
# Report the versions of the three frameworks this notebook depends on.
for _label, _module in (("TensorFlow", tf), ("Keras", keras), ("KerasNLP", keras_nlp)):
    print(f"{_label}:", _module.__version__)

In [None]:
class CFG:
    """Notebook-wide settings: seed, backbone preset, training schedule, labels."""

    # Reproducibility and backbone selection
    seed = 42
    preset = "deberta_v3_extra_small_en"
    max_seq_length = 512

    # Training schedule
    epochs = 3
    batch_size = 8  # Reduced from 16 to save memory
    # Multiplied with batch_size to get the effective batch size used for
    # learning-rate scaling.
    gradient_accumulation_steps = 2
    scheduler = 'cosine'

    # Label vocabulary: integer class ids and the matching target column names,
    # plus lookup tables in both directions.
    class_labels = [0, 1, 2]
    class_names = ['winner_model_a', 'winner_model_b', 'winner_tie']
    label2name = dict(zip(class_labels, class_names))
    name2label = dict(zip(class_names, class_labels))

In [None]:
# Clear any existing sessions for memory cleanup.
# Both the standalone Keras backend and the tf.keras backend are reset, then a
# garbage-collection pass is forced so released objects are reclaimed now.
keras.backend.clear_session()
tf.keras.backend.clear_session()
gc.collect()

In [None]:
# Grab the currently active tf.distribute strategy; build_model() later opens
# `strategy.scope()` around model construction and compilation.
strategy = tf.distribute.get_strategy()
print("Running on CPU/GPU")

In [None]:
# Apply the configured seed through Keras' global seeding helper.
keras.utils.set_random_seed(CFG.seed)

In [None]:
# Use mixed precision for memory efficiency: install "mixed_float16" as the
# global Keras dtype policy.
# NOTE(review): presumably this has to run before any layer is created for the
# policy to apply to it - confirm against the Keras mixed-precision docs.
keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Directory holding the competition files; train.csv and test.csv are read
# from here.
BASE_PATH = '/kaggle/input/llm-classification-finetuning'

In [None]:
def _first_turn(raw):
    """Return the first element of a serialized list of conversation turns.

    ``raw`` is the text of one CSV cell holding a list (normally JSON, e.g.
    ``'["hello", "follow-up"]'``). A ``null`` first element or an empty list
    yields ``""`` so downstream string formatting always receives text.
    """
    import ast
    import json

    try:
        turns = json.loads(raw)
    except ValueError:
        # Not valid JSON (e.g. a Python-style literal using single quotes).
        # literal_eval only accepts literals, so no code from the CSV is run.
        turns = ast.literal_eval(raw)
    if not turns:
        return ""
    first = turns[0]
    return "" if first is None else first


def load_data(path):
    """Read a competition CSV and keep only the first turn of each conversation.

    The ``prompt``, ``response_a`` and ``response_b`` columns are stored as
    serialized lists; each is replaced by its first element.

    The cells are parsed as data (JSON, with a literal-only fallback) rather
    than passed to ``eval``: evaluating file content executes arbitrary code,
    and the former ``replace("null", "''")`` workaround also rewrote the word
    "null" wherever it appeared inside the text itself.

    Args:
        path: Location of the CSV file, as accepted by ``pandas.read_csv``.

    Returns:
        The loaded DataFrame with the three text columns reduced to strings.
    """
    df = pd.read_csv(path)
    for column in ("prompt", "response_a", "response_b"):
        df[column] = df[column].map(_first_turn)
    return df

In [None]:
# Load the labelled training rows and the unlabelled test rows from BASE_PATH.
df = load_data(f'{BASE_PATH}/train.csv')
test_df = load_data(f'{BASE_PATH}/test.csv')

In [None]:
# Label conversion.
# class_name: for each row, the name of whichever of the three winner_* columns
# holds the largest value.
# class_label: that name translated to its integer id through CFG.name2label.
df["class_name"] = df[["winner_model_a", "winner_model_b", "winner_tie"]].idxmax(axis=1)
df["class_label"] = df.class_name.map(CFG.name2label)

In [None]:
def swap_responses(row):
    """Randomly exchange the two responses of a row (position augmentation).

    A draw from NumPy's global RNG decides whether to swap (when it exceeds
    0.5). On a swap, ``response_a``/``response_b`` trade places, a
    "winner_model_a"/"winner_model_b" class name is flipped to match (ties are
    unchanged), and the pre-built ``options`` pair is reversed when the row has
    one. ``class_label`` is always recomputed from the final ``class_name``.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) with ``response_a``,
            ``response_b`` and ``class_name`` entries, and optionally
            ``options``.

    Returns:
        The same row object, updated in place.
    """
    if np.random.rand() > 0.5:
        row["response_a"], row["response_b"] = row["response_b"], row["response_a"]
        # The datasets are built from `options`, not from the response columns.
        # Without reversing it too, the label would be flipped while the model
        # input stayed in the old order, i.e. the example would be mislabelled.
        if "options" in row:
            row["options"] = row["options"][::-1]
        if row["class_name"] == "winner_model_a":
            row["class_name"] = "winner_model_b"
        elif row["class_name"] == "winner_model_b":
            row["class_name"] = "winner_model_a"
    row["class_label"] = CFG.name2label[row["class_name"]]
    return row

In [None]:
# Preprocessor loaded from the same preset as the backbone, configured with
# sequence_length=CFG.max_seq_length.
# NOTE(review): presumably inputs are padded/truncated to that length - confirm
# against the KerasNLP preprocessor docs.
preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
    preset=CFG.preset,
    sequence_length=CFG.max_seq_length,
)

In [None]:
def preprocess_fn(text, label=None):
    """Run ``text`` through the module-level ``preprocessor``.

    Returns the preprocessed features alone when ``label`` is None, otherwise
    a ``(features, label)`` tuple with the label passed through untouched.
    """
    features = preprocessor(text)
    if label is None:
        return features
    return features, label

In [None]:
def _clean_utf8(value):
    """Round-trip ``value`` through UTF-8; return ``(text, failed)``.

    Text that cannot be encoded (e.g. lone surrogates) and non-string values
    without an ``encode`` method (e.g. None or NaN) come back as
    ``("", True)``. Only those two failure modes are caught, so unrelated
    errors are no longer swallowed.
    """
    try:
        return value.encode("utf-8").decode("utf-8"), False
    except (UnicodeError, AttributeError):
        return "", True


def make_pairs(row):
    """Build the two "prompt + response" texts the model compares.

    Adds two entries to ``row``:
      * ``encode_fail``: True when the prompt or either response could not be
        round-tripped through UTF-8 (that field is replaced by "").
      * ``options``: a two-element list, prompt with response A first and
        prompt with response B second.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) with ``prompt``,
            ``response_a`` and ``response_b`` entries.

    Returns:
        The same row object with the two entries added.
    """
    prompt, prompt_failed = _clean_utf8(row["prompt"])
    response_a, a_failed = _clean_utf8(row["response_a"])
    response_b, b_failed = _clean_utf8(row["response_b"])

    row["encode_fail"] = prompt_failed or a_failed or b_failed
    row['options'] = [
        f"Prompt: {prompt}\n\nResponse: {response_a}",
        f"Prompt: {prompt}\n\nResponse: {response_b}"
    ]
    return row

In [None]:
# Add the `options` text pair and the `encode_fail` flag to every train and
# test row (row-wise apply).
df = df.apply(make_pairs, axis=1)
test_df = test_df.apply(make_pairs, axis=1)

In [None]:
def build_dataset(texts, labels=None, batch_size=32, cache=True, shuffle=1024):
    """Build a batched, prefetched ``tf.data.Dataset`` of preprocessed inputs.

    Args:
        texts: Sequence of text pairs (the ``options`` lists).
        labels: Optional sequence of integer class ids; they are one-hot
            encoded over 3 classes. When omitted the dataset yields inputs
            only.
        batch_size: Number of examples per batch; the last partial batch is
            kept.
        cache: Whether to cache the raw slices before preprocessing.
        shuffle: Shuffle buffer size; a falsy value disables shuffling. It is
            ignored for unlabelled data (see below).

    Returns:
        The dataset, yielding ``features`` or ``(features, one_hot_label)``.
    """
    autotune = tf.data.AUTOTUNE
    if labels is None:
        slices = (texts,)
        # Unlabelled data is only used for inference, where predictions are
        # matched back to the input rows by position; shuffling (the default)
        # would silently scramble that alignment.
        shuffle = 0
    else:
        slices = (texts, keras.utils.to_categorical(labels, num_classes=3))

    ds = tf.data.Dataset.from_tensor_slices(slices)
    if cache:
        ds = ds.cache()
    ds = ds.map(preprocess_fn, num_parallel_calls=autotune)

    options = tf.data.Options()
    if shuffle:
        ds = ds.shuffle(shuffle, seed=CFG.seed)
        # Order does not matter once shuffled, so let the pipeline trade
        # determinism for throughput. `deterministic` replaces the deprecated
        # `experimental_deterministic` option.
        options.deterministic = False
    ds = ds.with_options(options)
    ds = ds.batch(batch_size, drop_remainder=False)
    return ds.prefetch(autotune)

In [None]:
def get_lr_callback(batch_size=8, mode='cosine', epochs=10, plot=False):
    """Create a per-epoch learning-rate scheduler callback.

    The schedule is a linear warm-up from the starting rate to the peak rate,
    an optional hold at the peak, then a decay towards the minimum rate. The
    peak is scaled linearly with the effective batch size
    (``batch_size * CFG.gradient_accumulation_steps``) relative to 8.

    Args:
        batch_size: Per-step batch size used for the peak-rate scaling.
        mode: 'cosine' or 'linear' decay; any other value keeps the peak rate
            for the whole decay phase.
        epochs: Total number of training epochs the schedule spans.
        plot: When True, draw the schedule with matplotlib.

    Returns:
        A ``keras.callbacks.LearningRateScheduler`` wrapping the schedule.
    """
    import math

    # Linear scaling rule: the peak rate grows with the effective batch size.
    effective_bs = batch_size * CFG.gradient_accumulation_steps
    start_lr = 1.0e-6
    peak_lr = 1.0e-6 * (effective_bs / 8)
    floor_lr = 1.0e-7

    warmup_epochs = 2  # epochs spent ramping up
    hold_epochs = 0    # epochs spent at the peak before decaying

    def lrfn(epoch):
        # Warm-up: straight line from start_lr towards peak_lr.
        if epoch < warmup_epochs:
            return (peak_lr - start_lr) / warmup_epochs * epoch + start_lr
        # Hold at the peak.
        if epoch < warmup_epochs + hold_epochs:
            return peak_lr

        # Decay; the span is clamped to at least 1 to avoid dividing by zero.
        span = max(1, epochs - warmup_epochs - hold_epochs)
        progress = (epoch - warmup_epochs - hold_epochs) / span
        if mode == 'cosine':
            return floor_lr + 0.5 * (peak_lr - floor_lr) * (1 + math.cos(math.pi * progress))
        if mode == 'linear':
            return peak_lr - (peak_lr - floor_lr) * progress
        return peak_lr  # constant or any other mode

    if plot:
        epoch_axis = np.arange(epochs)
        plt.figure(figsize=(10, 5))
        plt.plot(epoch_axis, [lrfn(step) for step in epoch_axis])
        plt.title(f'Learning Rate Schedule ({mode})')
        plt.xlabel('Epoch')
        plt.ylabel('Learning Rate')
        plt.grid(True)
        plt.show()

    return keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

In [None]:
def build_model():
    """Build and compile the two-response comparison classifier.

    Each example carries two tokenized texts (axis 1 of the inputs). Both are
    passed through the same DeBERTa-v3 backbone, the two output sequences are
    concatenated along the feature axis, average-pooled, regularized with
    dropout, and classified into the three outcome classes.

    Returns:
        A compiled ``keras.Model`` taking ``token_ids`` and ``padding_mask``
        inputs of shape ``(batch, 2, seq_len)`` and returning class
        probabilities of shape ``(batch, 3)``.
    """
    # Clear memory before building model
    gc.collect()

    # The sequence axis is left unspecified (None) in the input signature.
    inputs = {
        "token_ids": keras.Input(shape=(2, None), dtype=tf.int32, name="token_ids"),
        "padding_mask": keras.Input(shape=(2, None), dtype=tf.int32, name="padding_mask"),
    }

    # Create backbone outside strategy scope
    backbone = keras_nlp.models.DebertaV3Backbone.from_preset(CFG.preset)

    with strategy.scope():
        # Shared weights for both responses: the same backbone instance encodes
        # slot 0 and slot 1.
        response_a = {k: v[:, 0, :] for k, v in inputs.items()}
        embed_a = backbone(response_a)
        response_b = {k: v[:, 1, :] for k, v in inputs.items()}
        embed_b = backbone(response_b)

        embeds = keras.layers.Concatenate(axis=-1)([embed_a, embed_b])
        embeds = keras.layers.GlobalAveragePooling1D()(embeds)
        embeds = keras.layers.Dropout(0.3)(embeds)  # Keep dropout for regularization

        # The global policy is mixed_float16. The softmax head is pinned to
        # float32 so the probabilities (and the loss computed from them) are
        # not subject to float16 overflow/underflow.
        outputs = keras.layers.Dense(
            3,
            activation="softmax",
            kernel_regularizer=keras.regularizers.l2(1e-4),
            dtype="float32",
        )(embeds)

        model = keras.Model(inputs, outputs)
        # NOTE(review): when training uses a LearningRateScheduler callback the
        # initial rate given here is presumably overridden each epoch - confirm.
        model.compile(
            optimizer=keras.optimizers.Adam(5e-6),
            loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.02),
            metrics=[keras.metrics.CategoricalAccuracy(name="accuracy")]
        )
    return model

In [None]:
# Free up memory before splitting data
gc.collect()

# Stratified 80/20 split. The split is seeded explicitly so the same rows land
# in train/validation no matter how much of the global NumPy RNG stream has
# already been consumed (e.g. when cells are re-run).
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["class_label"],
    random_state=CFG.seed,
)
# Response-order augmentation is applied to the training rows only; validation
# rows keep their original order.
train_df = train_df.apply(swap_responses, axis=1)

In [None]:
# Create datasets with smaller batch size.
# Inputs are the `options` text pairs and targets the integer class labels;
# both datasets use build_dataset's default cache/shuffle settings.
train_ds = build_dataset(train_df.options.tolist(), train_df.class_label.tolist(), batch_size=CFG.batch_size)
valid_ds = build_dataset(valid_df.options.tolist(), valid_df.class_label.tolist(), batch_size=CFG.batch_size)

# Release dataframes from memory if no longer needed (the datasets were built
# from list copies, so the frames are not referenced past this point).
del train_df, valid_df
gc.collect()

In [None]:
# Build model
model = build_model()
# Per-epoch learning-rate schedule; plot=True also draws the schedule.
lr_cb = get_lr_callback(CFG.batch_size, mode=CFG.scheduler, epochs=CFG.epochs, plot=True)  
# Checkpoint: weights only, written only when val_loss improves, so the file
# always holds the best epoch by validation loss.
ckpt_cb = keras.callbacks.ModelCheckpoint(
    'best_model.weights.h5', 
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True
)

In [None]:
# Train model for CFG.epochs epochs, validating on valid_ds, with the
# learning-rate schedule and best-weights checkpoint callbacks attached.
# `history` keeps the object returned by fit().
history = model.fit( 
    train_ds,
    validation_data=valid_ds,
    epochs=CFG.epochs,
    callbacks=[lr_cb, ckpt_cb]
)

In [None]:
# Clear memory before prediction
gc.collect()

# Create test dataset with small batch size. Shuffling must be off here:
# the predictions are paired with test_df["id"] by position, so the rows have
# to come back in their original order (build_dataset shuffles by default).
test_ds = build_dataset(test_df.options.tolist(), batch_size=CFG.batch_size, shuffle=False)

# Load best weights (the checkpoint with the lowest validation loss)
model.load_weights('best_model.weights.h5')

# Make predictions
final_preds = model.predict(test_ds)

In [None]:
# Create submission: one row per test id plus one predicted-probability column
# per class, in CFG.class_names order.
sub_df = test_df[["id"]].copy()
sub_df[CFG.class_names] = final_preds
# NOTE(review): this head() result is discarded because it is not the last
# expression of the cell; it has no effect on the written file.
sub_df.head()
sub_df.to_csv("submission.csv", index=False)