In [1]:
import kagglehub

# Download latest version
path = kagglehub.model_download("keras/deberta_v3/keras/deberta_v3_extra_small_en")

print("Path to model files:", path)


Path to model files: /kaggle/input/deberta_v3/keras/deberta_v3_extra_small_en/2


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deberta_v3/keras/deberta_v3_extra_small_en/2/config.json
/kaggle/input/deberta_v3/keras/deberta_v3_extra_small_en/2/tokenizer.json
/kaggle/input/deberta_v3/keras/deberta_v3_extra_small_en/2/metadata.json
/kaggle/input/deberta_v3/keras/deberta_v3_extra_small_en/2/model.weights.h5
/kaggle/input/deberta_v3/keras/deberta_v3_extra_small_en/2/assets/tokenizer/vocabulary.spm
/kaggle/input/deberta_v3/keras/deberta_v3_small_en/2/config.json
/kaggle/input/deberta_v3/keras/deberta_v3_small_en/2/tokenizer.json
/kaggle/input/deberta_v3/keras/deberta_v3_small_en/2/metadata.json
/kaggle/input/deberta_v3/keras/deberta_v3_small_en/2/model.weights.h5
/kaggle/input/deberta_v3/keras/deberta_v3_small_en/2/assets/tokenizer/vocabulary.spm
/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv


In [17]:
import os 
os.environ['KERAS_BACKGROUND'] = 'tensorflow'

import keras_nlp
import keras
import tensorflow as tf
import numpy as np
import pandas as pd 
from tqdm import tqdm
import matplotlib.pyplot as plt 
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn')

  plt.style.use('seaborn')


In [14]:
class ModelConfig:
    # Model architecture settings
    backbone_type = "deberta_v3_small_en"
    max_seq_length = 128  # Reduced from 256
    training_batch = 64   # Increased from 32
    training_cycles = 2   # Reduced from 3
    
    # Training parameters
    initial_learning_rate = 1e-4  # Increased for faster convergence
    min_learning_rate = 1e-5
    dropout_rate = 0.1
    attention_heads = 8
    dense_dim = 512      # Reduced from 768
    
    # Performance settings
    mixed_precision = True
    num_folds = 2       # Reduced from 3
    
    target_classes = ["winner_model_a", "winner_model_b", "winner_tie"]
    

In [5]:
class DataProcessor:
    @staticmethod
    def load_datasets(base_path="/kaggle/input/llm-classification-finetuning/"):
        """Load and perform initial data processing"""
        train_data = pd.read_csv(f"{base_path}train.csv")
        test_data = pd.read_csv(f"{base_path}test.csv")
        return train_data, test_data
   
    @staticmethod
    def process_text_pair(row):
        """Process prompt and response pairs with error handling"""
        try:
            clean_prompt = row.prompt.encode("utf-8").decode("utf-8")
            clean_resp_a = row.response_a.encode("utf-8").decode("utf-8")
            clean_resp_b = row.response_b.encode("utf-8").decode("utf-8")
            
            row['text_pairs'] = [
                f"Question: {clean_prompt}\nAnswer: {clean_resp_a}",
                f"Question: {clean_prompt}\nAnswer: {clean_resp_b}"
            ]
            row['processing_error'] = False
        except:
            row['text_pairs'] = ["", ""]
            row['processing_error'] = True
        return row

# Load and process data
train_df, test_df = DataProcessor.load_datasets()
train_df = train_df.apply(DataProcessor.process_text_pair, axis=1)
test_df = test_df.apply(DataProcessor.process_text_pair, axis=1)

# Remove failed processing rows
train_df = train_df[~train_df['processing_error']]

In [6]:
class LabelProcessor:
    @staticmethod
    def create_label_encoding(row):
        """Convert multi-column labels to single class label"""
        if row['winner_model_b'] == 1:
            return 1
        elif row['winner_tie'] == 1:
            return 2
        return 0  # Default: model_a wins

class DataAugmenter:
    @staticmethod
    def swap_responses(df, swap_probability=0.5):
        """Augment data by swapping responses with probability"""
        augmented = df.copy()
        swap_mask = np.random.rand(len(df)) < swap_probability
        
        # Swap responses and adjust labels
        augmented.loc[swap_mask, ['response_a', 'response_b']] = \
            augmented.loc[swap_mask, ['response_b', 'response_a']].values
        
        # Update labels for swapped entries (only for binary outcomes)
        binary_mask = augmented['class_label'].isin([0, 1])
        augmented.loc[swap_mask & binary_mask, 'class_label'] = \
            1 - augmented.loc[swap_mask & binary_mask, 'class_label']
        
        return pd.concat([df, augmented], ignore_index=True)

# Process labels and augment data
train_df['class_label'] = train_df.apply(LabelProcessor.create_label_encoding, axis=1)
train_df = DataAugmenter.swap_responses(train_df)

In [7]:
class TextPreprocessor:
    def __init__(self, config):
        self.preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
            preset=config.backbone_type,
            sequence_length=config.max_seq_length
        )
    
    def __call__(self, text, label=None):
        """Process text and optionally pair with label"""
        processed_text = self.preprocessor(text)
        return (processed_text, label) if label is not None else processed_text

class DatasetBuilder:
    def __init__(self, config, preprocessor):
        self.config = config
        self.preprocessor = preprocessor
        
    def build(self, texts, labels=None, shuffle=True, cache=True):
        """Build TensorFlow dataset with preprocessing"""
        AUTO = tf.data.AUTOTUNE
        
        # Prepare data slices
        if labels is not None:
            labels = keras.utils.to_categorical(labels, num_classes=3)
            slices = (texts, labels)
        else:
            slices = (texts,)
            
        # Create and configure dataset
        dataset = tf.data.Dataset.from_tensor_slices(slices)
        
        # Enable parallel processing
        options = tf.data.Options()
        options.experimental_distribute.auto_shard_policy = \
            tf.data.experimental.AutoShardPolicy.DATA
        dataset = dataset.with_options(options)
        
        if cache:
            dataset = dataset.cache()
        dataset = dataset.map(self.preprocessor, 
                            num_parallel_calls=AUTO)
        if shuffle:
            dataset = dataset.shuffle(buffer_size=1000)
        return dataset.batch(self.config.training_batch).prefetch(AUTO)

# Initialize preprocessor and dataset builder
preprocessor = TextPreprocessor(ModelConfig)
dataset_builder = DatasetBuilder(ModelConfig, preprocessor)

In [8]:
class LLMClassifier:
    def __init__(self, config):
        self.config = config
        
    def build(self):
        # Input layers with correct naming
        inputs = {
            "token_ids": keras.Input(shape=(2, None), dtype=tf.int32, name="token_ids"),
            "padding_mask": keras.Input(shape=(2, None), dtype=tf.int32, name="padding_mask")
        }
        
        # Initialize backbone with reduced parameters
        backbone = keras_nlp.models.DebertaV3Backbone.from_preset(
            preset=self.config.backbone_type,
            max_sequence_length=self.config.max_seq_length
        )
        
        # Process both responses
        response_embeddings = []
        for i in range(2):
            response = {k: v[:,i,:] for k,v in inputs.items()}
            response_embeddings.append(backbone(response))
        
        # Simplified architecture
        combined = keras.layers.Concatenate(axis=-1)(response_embeddings)
        x = keras.layers.GlobalAveragePooling1D()(combined)
        
        # Reduced dense layers
        x = keras.layers.Dense(self.config.dense_dim, activation="relu")(x)
        x = keras.layers.Dropout(self.config.dropout_rate)(x)
        x = keras.layers.Dense(self.config.dense_dim // 2, activation="relu")(x)
        x = keras.layers.Dropout(self.config.dropout_rate)(x)
        
        outputs = keras.layers.Dense(3, activation="softmax")(x)
        return keras.Model(inputs, outputs)

In [9]:
class FocalLoss(keras.losses.Loss):
    def __init__(self, gamma=2.0, alpha=0.25):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
    
    def call(self, y_true, y_pred):
        epsilon = 1e-9
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
        
        # Calculate focal loss
        cross_entropy = -y_true * tf.math.log(y_pred)
        weight = tf.pow(1. - y_pred, self.gamma) * y_true
        focal = self.alpha * weight * cross_entropy
        
        return tf.reduce_mean(tf.reduce_sum(focal, axis=-1))

class LearningRateScheduler:
    @staticmethod
    def cosine_decay_with_warmup(epoch, config):
        """Cosine decay schedule with warmup"""
        cycle_length = 2
        cycle = np.floor(1 + epoch / cycle_length)
        x = np.abs(epoch / cycle_length - cycle)
        return config.min_learning_rate + \
               (config.initial_learning_rate - config.min_learning_rate) * \
               max(0, (1 - x))

In [10]:
class ModelTrainer:
    def __init__(self, config):
        self.config = config
        self.kfold = StratifiedKFold(
            n_splits=config.num_folds, 
            shuffle=True, 
            random_state=42
        )
        
    def train_fold(self, fold, train_data, val_data):
        # Initialize model
        model = LLMClassifier(self.config).build()
        
        # Use AMP (Automatic Mixed Precision)
        if self.config.mixed_precision:
            tf.keras.mixed_precision.set_global_policy("mixed_float16")
        
        # Compile with optimized settings
        model.compile(
            optimizer=keras.optimizers.AdamW(
                learning_rate=self.config.initial_learning_rate,
                weight_decay=0.01
            ),
            loss=FocalLoss(alpha=0.25, gamma=2.0),
            metrics=['accuracy'],
            jit_compile=True  # Enable XLA compilation
        )
        
        # Streamlined callbacks
        callbacks = [
            keras.callbacks.ModelCheckpoint(
                f'model_fold_{fold}.weights.h5',
                monitor='val_loss',
                save_best_only=True,
                save_weights_only=True,
                verbose=0
            ),
            keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=1,
                restore_best_weights=True,
                verbose=1
            )
        ]
        
        # Train with reduced verbosity
        history = model.fit(
            train_data,
            epochs=self.config.training_cycles,
            validation_data=val_data,
            callbacks=callbacks,
            verbose=1
        )
        
        return model, history

# Training execution
trainer = ModelTrainer(ModelConfig)
models = []

# Use smaller subset for training if needed
train_sample = train_df.sample(frac=0.8, random_state=42)  # Use 80% of data

for fold, (train_idx, val_idx) in enumerate(trainer.kfold.split(
    train_sample, train_sample['class_label'])):
    print(f"\nFold {fold + 1}/{ModelConfig.num_folds}")
    
    # Prepare fold data
    train_fold = dataset_builder.build(
        train_sample.iloc[train_idx]['text_pairs'].tolist(),
        train_sample.iloc[train_idx]['class_label'].tolist()
    )
    val_fold = dataset_builder.build(
        train_sample.iloc[val_idx]['text_pairs'].tolist(),
        train_sample.iloc[val_idx]['class_label'].tolist()
    )
    
    model, _ = trainer.train_fold(fold, train_fold, val_fold)
    models.append(model)


Fold 1/2
Epoch 1/2


I0000 00:00:1730399112.349469     113 service.cc:145] XLA service 0x7cf574008440 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1730399112.349538     113 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1730399112.349544     113 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


I0000 00:00:1730399253.004110     113 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m718/719[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m2s[0m 3s/step - accuracy: 0.3561 - loss: 0.1299






[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.3561 - loss: 0.1299




[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2630s[0m 3s/step - accuracy: 0.3561 - loss: 0.1299 - val_accuracy: 0.3853 - val_loss: 0.1204
Epoch 2/2
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2302s[0m 3s/step - accuracy: 0.3919 - loss: 0.1203 - val_accuracy: 0.4228 - val_loss: 0.1172
Restoring model weights from the end of the best epoch: 2.

Fold 2/2
Epoch 1/2





[1m718/719[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 861ms/step - accuracy: 0.3510 - loss: 0.1286







[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.3510 - loss: 0.1286   






[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1081s[0m 1s/step - accuracy: 0.3510 - loss: 0.1286 - val_accuracy: 0.3767 - val_loss: 0.1214
Epoch 2/2
[1m719/719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m739s[0m 1s/step - accuracy: 0.3759 - loss: 0.1214 - val_accuracy: 0.3827 - val_loss: 0.1221
Epoch 2: early stopping
Restoring model weights from the end of the best epoch: 1.


In [19]:
class Predictor:
    def __init__(self, config):
        self.config = config
        self.target_classes = config.target_classes
        
    def analyze_confidence_levels(self, predictions, submission):
        """Analyze predictions at different confidence thresholds"""
        confidence_thresholds = [0.33, 0.4, 0.45, 0.5]
        
        print("\n=== Confidence Analysis ===")
        print("\nConfidence Distribution:")
        
        # Analyze different confidence thresholds
        for threshold in confidence_thresholds:
            print(f"\nPredictions with confidence > {threshold:.2f}:")
            for cls in self.target_classes:
                confident_preds = (submission[cls] > threshold).sum()
                percentage = confident_preds/len(submission)*100
                print(f"{cls}: {confident_preds} predictions ({percentage:.2f}%)")
        
        # Find maximum prediction for each sample
        max_confidences = np.max(predictions, axis=1)
        
        print("\nConfidence Statistics:")
        print(f"Mean confidence: {max_confidences.mean():.3f}")
        print(f"Median confidence: {np.median(max_confidences):.3f}")
        print(f"Max confidence: {max_confidences.max():.3f}")
        print(f"Min confidence: {max_confidences.min():.3f}")
        
        # Analyze class-wise predictions
        print("\nClass-wise Maximum Probabilities:")
        for i, cls in enumerate(self.target_classes):
            class_max = np.max(predictions[:, i])
            class_mean = np.mean(predictions[:, i])
            print(f"{cls}:")
            print(f"  Max probability: {class_max:.3f}")
            print(f"  Mean probability: {class_mean:.3f}")
    
    def plot_training_evaluation(self, predictions, submission):
        """Enhanced visualization of model predictions"""
        plt.style.use('seaborn')
        fig = plt.figure(figsize=(20, 15))
        
        # 1. Enhanced Class Distribution Plot
        plt.subplot(3, 2, 1)
        class_means = [submission[cls].mean() for cls in self.target_classes]
        bars = plt.bar(self.target_classes, class_means)
        plt.title('Mean Prediction Distribution', fontsize=12)
        plt.xticks(rotation=45)
        plt.ylabel('Mean Probability')
        
        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.3f}',
                    ha='center', va='bottom')
        
        # 2. Prediction Density Plot
        plt.subplot(3, 2, 2)
        for cls in self.target_classes:
            sns.kdeplot(data=submission[cls], label=cls)
        plt.title('Prediction Density Distribution', fontsize=12)
        plt.xlabel('Prediction Value')
        plt.ylabel('Density')
        plt.legend()
        
        # 3. Correlation Heatmap
        plt.subplot(3, 2, 3)
        correlation = submission[self.target_classes].corr()
        sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Prediction Correlation Matrix', fontsize=12)
        
        # 4. Confidence Distribution
        plt.subplot(3, 2, 4)
        max_probs = np.max(predictions, axis=1)
        plt.hist(max_probs, bins=50, edgecolor='black')
        plt.axvline(x=0.33, color='r', linestyle='--', label='33% threshold')
        plt.axvline(x=0.5, color='g', linestyle='--', label='50% threshold')
        plt.title('Model Confidence Distribution', fontsize=12)
        plt.xlabel('Maximum Prediction Probability')
        plt.ylabel('Count')
        plt.legend()
        
        # 5. Prediction Scatter Plot
        plt.subplot(3, 2, 5)
        plt.scatter(range(len(predictions)), max_probs, alpha=0.5)
        plt.axhline(y=0.33, color='r', linestyle='--', label='33% threshold')
        plt.axhline(y=0.5, color='g', linestyle='--', label='50% threshold')
        plt.title('Prediction Confidence by Sample', fontsize=12)
        plt.xlabel('Sample Index')
        plt.ylabel('Maximum Prediction Probability')
        plt.legend()
        
        # 6. Class-wise Box Plot
        plt.subplot(3, 2, 6)
        sns.boxplot(data=submission[self.target_classes])
        plt.title('Class-wise Prediction Distribution', fontsize=12)
        plt.xticks(rotation=45)
        
        plt.tight_layout()
        plt.savefig('model_evaluation.png', dpi=300, bbox_inches='tight')
        plt.close()
        
        # Print detailed analysis
        self.analyze_confidence_levels(predictions, submission)

    # ... rest of the Predictor class remains the same ...
    
    def print_evaluation_metrics(self, submission):
        """Print detailed evaluation metrics"""
        print("\n=== Detailed Evaluation Metrics ===")
        
        # Basic statistics
        print("\nClass-wise Statistics:")
        stats = submission[self.target_classes].describe()
        print(stats)
        
        # Confidence metrics
        print("\nConfidence Metrics:")
        for cls in self.target_classes:
            confident_preds = (submission[cls] > 0.5).sum()
            print(f"{cls}: {confident_preds} confident predictions "
                  f"({confident_preds/len(submission)*100:.2f}%)")
    
    def predict_single_model(self, model, test_data):
        try:
            predictions = model.predict(
                test_data,
                batch_size=self.config.training_batch,
                verbose=1
            )
            return predictions
        except Exception as e:
            print(f"Error in model prediction: {str(e)}")
            return None
    
    def ensemble_predict(self, models, test_data):
        print(f"\nGenerating predictions using {len(models)} models...")
        
        all_predictions = []
        
        for i, model in enumerate(models, 1):
            print(f"\nPredicting with model {i}/{len(models)}")
            model_preds = self.predict_single_model(model, test_data)
            
            if model_preds is not None:
                all_predictions.append(model_preds)
        
        if not all_predictions:
            raise ValueError("No valid predictions were generated!")
        
        final_predictions = np.mean(all_predictions, axis=0)
        return final_predictions

# Usage code
try:
    config = ModelConfig()
    predictor = Predictor(config)
    
    print("Generating predictions...")
    predictions = predictor.ensemble_predict(models, test_dataset)
    
    # Normalize predictions if needed
    predictions = predictions / predictions.sum(axis=1, keepdims=True)
    
    submission = pd.DataFrame({
        'id': test_df['id'],
        **{class_name: predictions[:, i] 
           for i, class_name in enumerate(config.target_classes)}
    })
    
    # Generate enhanced evaluation plots and metrics
    predictor.plot_training_evaluation(predictions, submission)
    
    # Save submission with normalized predictions
    submission.to_csv('submission.csv', index=False)
    print("\nSubmission file created successfully!")
    
except Exception as e:
    print(f"\nError in prediction process: {str(e)}")
    raise

finally:
    print("\nPrediction process completed.")

Generating predictions...

Generating predictions using 2 models...

Predicting with model 1/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step

Predicting with model 2/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step


  plt.style.use('seaborn')
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):



=== Confidence Analysis ===

Confidence Distribution:

Predictions with confidence > 0.33:
winner_model_a: 0 predictions (0.00%)
winner_model_b: 2 predictions (66.67%)
winner_tie: 2 predictions (66.67%)

Predictions with confidence > 0.40:
winner_model_a: 0 predictions (0.00%)
winner_model_b: 0 predictions (0.00%)
winner_tie: 1 predictions (33.33%)

Predictions with confidence > 0.45:
winner_model_a: 0 predictions (0.00%)
winner_model_b: 0 predictions (0.00%)
winner_tie: 1 predictions (33.33%)

Predictions with confidence > 0.50:
winner_model_a: 0 predictions (0.00%)
winner_model_b: 0 predictions (0.00%)
winner_tie: 0 predictions (0.00%)

Confidence Statistics:
Mean confidence: 0.393
Median confidence: 0.371
Max confidence: 0.457
Min confidence: 0.352

Class-wise Maximum Probabilities:
winner_model_a:
  Max probability: 0.329
  Mean probability: 0.311
winner_model_b:
  Max probability: 0.371
  Mean probability: 0.326
winner_tie:
  Max probability: 0.457
  Mean probability: 0.363

Subm