### Fine-tune and Test RoBERTa or DeBERTa (or any other BERT variant)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, AutoConfig
)
import json
import yaml
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, precision_score, recall_score
from sklearn.model_selection import KFold
import logging
from typing import Dict, List, Tuple, Optional
import os
from dataclasses import dataclass
import uuid
import wandb
wandb.login(key="your-key")

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpinkfloat[0m ([33mpinkfloat-berliner-hochschule-f-r-technik[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import transformers
print(transformers.__version__)

4.41.2


#### Data Paths

In [3]:
train_vehicles_path = "../../data/train_vehicles_info.yaml"
train_queries_path = "../../data/train_generated_questions.json"
test_vehicles_path = "../../data/test_vehicles_info.yaml"
test_queries_path = "../../data/test_generated_questions.json"

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

#### Settings for Training

In [4]:
@dataclass
class ModelConfig:
    """Configuration for the fine-tuning process"""
    model_name: str = "microsoft/deberta-v3-base"  # or "roberta-base"
    max_length: int = 512
    learning_rate: float = 2e-5
    num_epochs: int = 3
    batch_size: int = 16
    warmup_steps: int = 100
    weight_decay: float = 0.01
    output_dir: str = "./deberta"
    save_steps: int = 500
    eval_steps: int = 500
    logging_steps: int = 100


config = ModelConfig()

# Start main W&B run manually
random_id = uuid.uuid4().hex[:6] # e.g., 'a7c3d9'
wandb_group = f"5-fold-cv-deberta-{random_id}"
wandb_project = "vehicle-cross-encoder"

main_run = wandb.init(
    project=wandb_project,
    name=wandb_group + "-main",
    group=wandb_group,
    config=config.__dict__,
    reinit=True
)

run_name = wandb.run.name
config.output_dir = os.path.join(config.output_dir, run_name)



In [5]:
class VehicleDataset(Dataset):
    """Dataset class for vehicle-query pairs"""
    
    def __init__(self, data_pairs: List[Tuple[str, str, int]], tokenizer, max_length: int = 512):
        self.data_pairs = data_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data_pairs)
    
    def __getitem__(self, idx):
        query, vehicle_text, label = self.data_pairs[idx]
        
        # Tokenize the query-vehicle pair
        # Following cross-encoder format: [CLS] query [SEP] vehicle_text [SEP]
        encoding = self.tokenizer(
            query,
            vehicle_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [6]:
class VehicleCrossEncoder:
    """Main class for fine-tuning cross-encoder models"""
    
    def __init__(self, config: ModelConfig):
        self.config = config
        self.tokenizer = None
        self.model = None
        self.trainer = None

    def load_training_data(self, train_vehicles_file: str, train_questions_file: str, num_folds: int = 5) -> List[Tuple[List, List]]:
        """Load training data and return 5-fold cross-validation splits"""
        
        # Load training vehicle data
        with open(train_vehicles_file, 'r', encoding='utf-8') as f:
            vehicles_data = yaml.safe_load(f)

        # Load training questions data
        with open(train_questions_file, 'r', encoding='utf-8') as f:
            questions_data = json.load(f)

        # Prepare vehicle-level data
        vehicle_datasets = []
        for vehicle_url, vehicle_info in vehicles_data.items():
            vehicle_text = self._create_vehicle_description(vehicle_info)
            if vehicle_url in questions_data:
                questions = questions_data[vehicle_url]
                vehicle_pairs = [(q, vehicle_text, int(label)) for q, label in questions.items()]
                vehicle_datasets.append({'url': vehicle_url, 'pairs': vehicle_pairs})

        # Prepare non-overlapping folds from all training vehicles
        import random
        random.seed(42)
        random.shuffle(vehicle_datasets)
        
        # Calculate fold sizes
        total_vehicles = len(vehicle_datasets)
        fold_size = total_vehicles // num_folds
        remainder = total_vehicles % num_folds
        
        folds = []
        start_idx = 0
        
        for fold_idx in range(num_folds):
            # Calculate fold size (distribute remainder across first few folds)
            current_fold_size = fold_size + (1 if fold_idx < remainder else 0)
            
            # Get validation vehicles for this fold
            val_vehicles = vehicle_datasets[start_idx:start_idx + current_fold_size]
            
            # Get training vehicles (all others)
            train_vehicles = vehicle_datasets[:start_idx] + vehicle_datasets[start_idx + current_fold_size:]
            
            # Convert to pairs
            train_pairs = []
            val_pairs = []
            
            for vehicle in train_vehicles:
                train_pairs.extend(vehicle['pairs'])
            for vehicle in val_vehicles:
                val_pairs.extend(vehicle['pairs'])
            
            random.shuffle(train_pairs)
            folds.append((train_pairs, val_pairs))
            
            logger.info(f"Fold {fold_idx + 1}: {len(train_vehicles)} train vehicles, {len(val_vehicles)} val vehicles")
            logger.info(f"  → Train pairs: {len(train_pairs)}")
            logger.info(f"  → Val pairs:   {len(val_pairs)}")
            
            start_idx += current_fold_size

        return folds

    def load_test_data(self, test_vehicles_file: str, test_questions_file: str) -> List:
        """Load test data and return test pairs"""
        
        # Load test vehicle data
        with open(test_vehicles_file, 'r', encoding='utf-8') as f:
            vehicles_data = yaml.safe_load(f)

        # Load test questions data
        with open(test_questions_file, 'r', encoding='utf-8') as f:
            questions_data = json.load(f)

        # Prepare test pairs
        test_pairs = []
        for vehicle_url, vehicle_info in vehicles_data.items():
            vehicle_text = self._create_vehicle_description(vehicle_info)
            if vehicle_url in questions_data:
                questions = questions_data[vehicle_url]
                vehicle_pairs = [(q, vehicle_text, int(label)) for q, label in questions.items()]
                test_pairs.extend(vehicle_pairs)

        logger.info(f"Test set: {len(vehicles_data)} vehicles → {len(test_pairs)} pairs")
        
        return test_pairs
    
    def _create_vehicle_description(self, vehicle_info: Dict) -> str:
        """Create a comprehensive vehicle description from the data"""
        description_parts = []
        
        # Add information dictionary details
        if 'information_dict' in vehicle_info:
            info_dict = vehicle_info['information_dict']
            for key, value in info_dict.items():
                description_parts.append(f"{key}: {value}")
        
        # Add details list
        if 'details_list' in vehicle_info:
            details = " | ".join(vehicle_info['details_list'])
            description_parts.append(details)
        
        # Add details text if available
        if 'details_text' in vehicle_info:
            description_parts.append(vehicle_info['details_text'])
        
        return " | ".join(description_parts)
    
    def initialize_model(self):
        """Initialize tokenizer and model"""
        logger.info(f"Loading model: {self.config.model_name}")
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
        
        # Load model configuration and modify for binary classification
        model_config = AutoConfig.from_pretrained(self.config.model_name)
        model_config.num_labels = 2  # Binary classification
        
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.config.model_name,
            config=model_config,
            ignore_mismatched_sizes=True 
        )
        
        logger.info("Model and tokenizer loaded successfully")
    
    def compute_metrics(self, eval_pred):
        """Compute metrics for evaluation"""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        
        precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
        accuracy = accuracy_score(labels, predictions)
        
        return {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    def train(self, test_pairs: List, folds: List[Tuple[List, List]]):
        """Train and evaluate the model using k-fold cross-validation and a held-out test set."""
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model not initialized. Call initialize_model() first.")
        
        all_test_results = []

        for fold_idx, (train_pairs, val_pairs) in enumerate(folds):
            logger.info(f"\n====== Fold {fold_idx + 1} / {len(folds)} ======")
            logger.info(f"Train size: {len(train_pairs)} | Validation size: {len(val_pairs)}")

            # Create datasets
            train_dataset = VehicleDataset(train_pairs, self.tokenizer, self.config.max_length)
            val_dataset = VehicleDataset(val_pairs, self.tokenizer, self.config.max_length)
            test_dataset = VehicleDataset(test_pairs, self.tokenizer, self.config.max_length)

            # Set up fold-specific output directory
            fold_output_dir = os.path.join(self.config.output_dir, f"fold_{fold_idx + 1}")
            os.makedirs(fold_output_dir, exist_ok=True)

            training_args = TrainingArguments(
                output_dir=fold_output_dir,
                num_train_epochs=self.config.num_epochs,
                per_device_train_batch_size=self.config.batch_size,
                per_device_eval_batch_size=self.config.batch_size,
                learning_rate=self.config.learning_rate,
                weight_decay=self.config.weight_decay,
                warmup_steps=self.config.warmup_steps,
                logging_steps=self.config.logging_steps,
                evaluation_strategy="steps",
                eval_steps=self.config.eval_steps,
                save_steps=self.config.save_steps,
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                greater_is_better=True,
                save_total_limit=2,
                report_to="wandb",
                run_name=f"{wandb_group}-fold-{fold_idx + 1}"
            )

            fold_run = wandb.init(
                project=wandb_project,
                name=f"{wandb_group}-fold-{fold_idx + 1}",
                group=wandb_group,
                config=config.__dict__,
                reinit=True
            )

            # Reinitialize trainer (model stays the same unless you want to reinit for each fold)
            self.trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=self.compute_metrics,
            )

            # Train
            logger.info("Starting training...")
            self.trainer.train()

            # Evaluate on test set
            logger.info("Evaluating on held-out test set...")
            test_results = self.trainer.evaluate(test_dataset)
            logger.info(f"Fold {fold_idx + 1} test results: {test_results}")
            all_test_results.append(test_results)

            # Save model and tokenizer
            self.trainer.save_model()
            self.tokenizer.save_pretrained(fold_output_dir)

            # Finish wandb fold run
            fold_run.finish()

        logger.info("All folds completed.")
        main_run.finish() # finish wandb main run
        return all_test_results

    def predict(self, query: str, vehicle_text: str) -> Tuple[float, int]:
        """Make a prediction for a query-vehicle pair"""
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model not initialized or trained.")
        
        # Tokenize input
        encoding = self.tokenizer(
            query,
            vehicle_text,
            truncation=True,
            padding='max_length',
            max_length=self.config.max_length,
            return_tensors='pt'
        )
        
        # Make prediction
        with torch.no_grad():
            outputs = self.model(**encoding)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            
        # Get probability of positive class (index 1)
        positive_prob = predictions[0][1].item()
        predicted_label = int(positive_prob > 0.5)
        
        return positive_prob, predicted_label



In [7]:
# Initialize the cross-encoder
cross_encoder = VehicleCrossEncoder(config)

# Load training data for cross-validation
folds = cross_encoder.load_training_data(
    train_vehicles_file=train_vehicles_path,
    train_questions_file=train_queries_path
)

# Load test data for final evaluation
test_pairs = cross_encoder.load_test_data(
    test_vehicles_file=test_vehicles_path,
    test_questions_file=test_queries_path
)

INFO:__main__:Fold 1: 376 train vehicles, 95 val vehicles
INFO:__main__:  → Train pairs: 3760
INFO:__main__:  → Val pairs:   950
INFO:__main__:Fold 2: 377 train vehicles, 94 val vehicles
INFO:__main__:  → Train pairs: 3770
INFO:__main__:  → Val pairs:   940
INFO:__main__:Fold 3: 377 train vehicles, 94 val vehicles
INFO:__main__:  → Train pairs: 3770
INFO:__main__:  → Val pairs:   940
INFO:__main__:Fold 4: 377 train vehicles, 94 val vehicles
INFO:__main__:  → Train pairs: 3770
INFO:__main__:  → Val pairs:   940
INFO:__main__:Fold 5: 377 train vehicles, 94 val vehicles
INFO:__main__:  → Train pairs: 3770
INFO:__main__:  → Val pairs:   940
INFO:__main__:Test set: 82 vehicles → 820 pairs


**About the train / val / test split:**  
Each set contains the complete vehicle_info x 10 corresponding queries combinations to avoid data leakage about the cars between the sets.

In [8]:
def preview_cross_encoder_inputs():
    # Preview function
    def print_preview(pairs, set_name, fold_idx=None):
        header = f"\n=== Preview from {set_name.upper()} Set"
        if fold_idx is not None:
            header += f" (Fold {fold_idx + 1})"
        header += " (first 10 examples) ===\n"
        print(header)
        for i, (query, vehicle_text, label) in enumerate(pairs[:10]):
            print(f"[{i+1}] Label: {label}")
            print(f"Query: {query}")
            print(f"Vehicle Text: {vehicle_text}")
            print("-" * 80)

    # Print test set preview
    print_preview(test_pairs, "test")

    # Print preview for each fold
    for fold_idx, (train_pairs, val_pairs) in enumerate(folds):
        print_preview(train_pairs, "train", fold_idx=fold_idx)
        print_preview(val_pairs, "val", fold_idx=fold_idx)

preview_cross_encoder_inputs()


=== Preview from TEST Set (first 10 examples) ===

[1] Label: 1
Query: Looking for a 5-door all-terrain electric vehicle with automatic transmission, black metallic color, and keyless-go feature. Should have a power output of less than 100 KW and mileage not exceeding 15,000 km. Also, it should have been first registered after 08.2022.
Congestion assistant
--------------------------------------------------------------------------------
[2] Label: 1
Query: Searching for an electric vehicle with a minimum of 6 airbags, LED headlights, and a park assistant. The car should have a touchscreen, voice control system, and a power output of more than 110 KW.
Congestion assistant
--------------------------------------------------------------------------------
[3] Label: 1
Query: In search of an electric vehicle with automatic tailgate and lane keeping system. The car should have tinted rear windows and a fixed panorama roof. It should also have a power output of around 120 KW.
Congestion assist

#### Start Training

In [9]:
# Initialize model
cross_encoder.initialize_model()

# Train the model
test_results = cross_encoder.train(test_pairs, folds)

INFO:__main__:Loading model: microsoft/deberta-v3-base
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Model and tokenizer loaded successfully
INFO:__main__:
INFO:__main__:Train size: 3760 | Validation size: 950


INFO:__main__:Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.1942,0.146074,0.951579,0.951782,0.947808,0.955789


INFO:__main__:Evaluating on held-out test set...


INFO:__main__:Fold 1 test results: {'eval_loss': 0.20783278346061707, 'eval_accuracy': 0.9280487804878049, 'eval_f1': 0.9286577992744861, 'eval_precision': 0.9186602870813397, 'eval_recall': 0.9388753056234719, 'eval_runtime': 6.0405, 'eval_samples_per_second': 135.75, 'eval_steps_per_second': 8.609, 'epoch': 3.0}


0,1
eval/accuracy,█▁
eval/f1,█▁
eval/loss,▁█
eval/precision,█▁
eval/recall,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▃▄▆▆▇███
train/global_step,▁▂▃▄▆▆▇███

0,1
eval/accuracy,0.92805
eval/f1,0.92866
eval/loss,0.20783
eval/precision,0.91866
eval/recall,0.93888
eval/runtime,6.0405
eval/samples_per_second,135.75
eval/steps_per_second,8.609
total_flos,2967945930178560.0
train/epoch,3.0


INFO:__main__:
INFO:__main__:Train size: 3770 | Validation size: 940


INFO:__main__:Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.1128,0.149785,0.961702,0.961538,0.965665,0.957447


INFO:__main__:Evaluating on held-out test set...


INFO:__main__:Fold 2 test results: {'eval_loss': 0.25025442242622375, 'eval_accuracy': 0.9390243902439024, 'eval_f1': 0.9396135265700483, 'eval_precision': 0.9284009546539379, 'eval_recall': 0.9511002444987775, 'eval_runtime': 5.9874, 'eval_samples_per_second': 136.955, 'eval_steps_per_second': 8.685, 'epoch': 3.0}


0,1
eval/accuracy,█▁
eval/f1,█▁
eval/loss,▁█
eval/precision,█▁
eval/recall,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▃▄▆▆▇███
train/global_step,▁▂▃▄▆▆▇███

0,1
eval/accuracy,0.93902
eval/f1,0.93961
eval/loss,0.25025
eval/precision,0.9284
eval/recall,0.9511
eval/runtime,5.9874
eval/samples_per_second,136.955
eval/steps_per_second,8.685
total_flos,2975839403397120.0
train/epoch,3.0


INFO:__main__:
INFO:__main__:Train size: 3770 | Validation size: 940


INFO:__main__:Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.0929,0.124642,0.969149,0.969506,0.95842,0.980851


INFO:__main__:Evaluating on held-out test set...


INFO:__main__:Fold 3 test results: {'eval_loss': 0.21820294857025146, 'eval_accuracy': 0.9524390243902439, 'eval_f1': 0.9531812725090036, 'eval_precision': 0.9363207547169812, 'eval_recall': 0.9706601466992665, 'eval_runtime': 5.9758, 'eval_samples_per_second': 137.22, 'eval_steps_per_second': 8.702, 'epoch': 3.0}


0,1
eval/accuracy,█▁
eval/f1,█▁
eval/loss,▁█
eval/precision,█▁
eval/recall,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▃▄▆▆▇███
train/global_step,▁▂▃▄▆▆▇███

0,1
eval/accuracy,0.95244
eval/f1,0.95318
eval/loss,0.2182
eval/precision,0.93632
eval/recall,0.97066
eval/runtime,5.9758
eval/samples_per_second,137.22
eval/steps_per_second,8.702
total_flos,2975839403397120.0
train/epoch,3.0


INFO:__main__:
INFO:__main__:Train size: 3770 | Validation size: 940


INFO:__main__:Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.0793,0.144886,0.965957,0.966805,0.941414,0.993603


INFO:__main__:Evaluating on held-out test set...


INFO:__main__:Fold 4 test results: {'eval_loss': 0.24107877910137177, 'eval_accuracy': 0.95, 'eval_f1': 0.9512485136741974, 'eval_precision': 0.9259259259259259, 'eval_recall': 0.9779951100244498, 'eval_runtime': 5.9944, 'eval_samples_per_second': 136.794, 'eval_steps_per_second': 8.675, 'epoch': 3.0}


0,1
eval/accuracy,█▁
eval/f1,█▁
eval/loss,▁█
eval/precision,█▁
eval/recall,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▃▄▆▆▇███
train/global_step,▁▂▃▄▆▆▇███

0,1
eval/accuracy,0.95
eval/f1,0.95125
eval/loss,0.24108
eval/precision,0.92593
eval/recall,0.978
eval/runtime,5.9944
eval/samples_per_second,136.794
eval/steps_per_second,8.675
total_flos,2975839403397120.0
train/epoch,3.0


INFO:__main__:
INFO:__main__:Train size: 3770 | Validation size: 940


INFO:__main__:Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.0665,0.059263,0.988298,0.98836,0.983158,0.993617


INFO:__main__:Evaluating on held-out test set...


INFO:__main__:Fold 5 test results: {'eval_loss': 0.24021273851394653, 'eval_accuracy': 0.9536585365853658, 'eval_f1': 0.9549763033175356, 'eval_precision': 0.9264367816091954, 'eval_recall': 0.9853300733496333, 'eval_runtime': 5.9927, 'eval_samples_per_second': 136.833, 'eval_steps_per_second': 8.677, 'epoch': 3.0}


0,1
eval/accuracy,█▁
eval/f1,█▁
eval/loss,▁█
eval/precision,█▁
eval/recall,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▃▄▆▆▇███
train/global_step,▁▂▃▄▆▆▇███

0,1
eval/accuracy,0.95366
eval/f1,0.95498
eval/loss,0.24021
eval/precision,0.92644
eval/recall,0.98533
eval/runtime,5.9927
eval/samples_per_second,136.833
eval/steps_per_second,8.677
total_flos,2975839403397120.0
train/epoch,3.0


INFO:__main__:All folds completed.


#### Test the Model using ensemble methods

In [10]:
def evaluate_ensemble_from_folds(test_pairs, threshold=0.5, num_folds: int = 5, device=None):
    """Load models from fold directories and evaluate ensemble on the test set."""
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load tokenizer from any fold (assume same tokenizer for all)
    fold_dirs = [
        os.path.join(config.output_dir, f"fold_{i + 1}")
        for i in range(num_folds)
    ]

    tokenizer = AutoTokenizer.from_pretrained(fold_dirs[0])
    models = []
    for fold_dir in fold_dirs:
        model = AutoModelForSequenceClassification.from_pretrained(fold_dir)
        model.to(device)
        model.eval()
        models.append(model)

    # Ensemble prediction
    y_true, y_pred, y_prob = [], [], []

    for query, vehicle_text, label in test_pairs:
        encoding = tokenizer(query, vehicle_text, truncation=True, padding='max_length',
                            max_length=512, return_tensors='pt').to(device)
        probs = []
        with torch.no_grad():
            for model in models:
                output = model(**encoding)
                prob = torch.nn.functional.softmax(output.logits, dim=-1)[0][1].item()
                probs.append(prob)
        avg_prob = sum(probs) / len(probs)
        prediction = int(avg_prob >= threshold)

        y_true.append(label)
        y_pred.append(prediction)
        y_prob.append(avg_prob)

    # Compute metrics
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred)
    }

    # Log
    print("\n=== Final Ensemble Metrics on Held-out Test Set ===")
    for k, v in metrics.items():
        print(f"{k.capitalize()}: {v:.4f}")

    return metrics, y_true, y_pred, y_prob

In [11]:
metrics, y_true, y_pred, y_prob = evaluate_ensemble_from_folds(test_pairs)


=== Final Ensemble Metrics on Held-out Test Set ===
Accuracy: 0.9549
Precision: 0.9346
Recall: 0.9780
F1: 0.9558


In [12]:
def show_test_predictions(test_pairs, num_examples: int = 10, threshold: float = 0.5, num_folds: int = 5):
    print(f"\n=== Ensemble Predictions on {num_examples} Random Test Examples ===\n")
    
    import random
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load fold models
    fold_dirs = [
        os.path.join(config.output_dir, f"fold_{i + 1}")
        for i in range(num_folds)
    ]

    tokenizer = AutoTokenizer.from_pretrained(fold_dirs[0])
    models = []
    for fold_dir in fold_dirs:
        model = AutoModelForSequenceClassification.from_pretrained(fold_dir)
        model.to(device)
        model.eval()
        models.append(model)

    # Random sample
    sampled = random.sample(test_pairs, num_examples)

    for i, (query, vehicle_text, label) in enumerate(sampled):
        # Tokenize
        encoding = tokenizer(
            query,
            vehicle_text,
            truncation=True,
            padding='max_length',
            max_length=config.max_length,
            return_tensors='pt'
        ).to(device)

        # Get predictions from all models
        probs = []
        with torch.no_grad():
            for model in models:
                outputs = model(**encoding)
                prob = torch.nn.functional.softmax(outputs.logits, dim=-1)[0][1].item()
                probs.append(prob)

        avg_prob = sum(probs) / len(probs)
        pred = int(avg_prob >= threshold)

        print(f"[{i + 1}]")
        print(f"True Label:         {'Match' if label == 1 else 'No match'}")
        print(f"Predicted Label:    {'Match' if pred == 1 else 'No match'}")
        print(f"Avg Match Prob:     {avg_prob:.4f}")
        print(f"Individual Probs:   {[f'{p:.4f}' for p in probs]}")
        print(f"Query:              {query}")
        print(f"Vehicle Text:       {vehicle_text}")
        print("-" * 100)

show_test_predictions(test_pairs, num_examples=10, threshold=0.5, num_folds=5)


=== Ensemble Predictions on 10 Random Test Examples ===

[1]
True Label:         Match
Predicted Label:    Match
Avg Match Prob:     0.9841
Individual Probs:   ['0.9503', '0.9825', '0.9927', '0.9965', '0.9986']
Query:              Seeking a car with less than 50,000 kilometres on the clock, equipped with a head-up display, cruise control, and rain sensor. Does it have a parking assistant and servotronic as well?
Vehicle Text:       Category: Estate, 5 door | Colour: black Metallic | Colour name: Black Sapphire Metallic | Emission class: EURO 6d | Engine type: Otto | First registration: 06.2022 | Fuel type: Petrol | Location: D-47 | Power output: 135 KW / 184 PS | Read mileage: 43,700 Kilometres | Supplier: Subsidiary/authorised dealer of the brand(s): BMW | Total number of owners: 1 Owner | Transmission: Automatic | Vehicle release: Release of the vehicle will take place at the earliest 10 working days following receipt of payment. | ABS | Alarm system | Autobeam | Automatic dimming i