In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, AutoConfig
)
import json
import yaml
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import logging
from typing import Dict, List, Tuple, Optional
import os
from dataclasses import dataclass
import wandb
wandb.login(key="your-key")

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpinkfloat[0m ([33mpinkfloat-berliner-hochschule-f-r-technik[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import transformers
print(transformers.__version__)

4.41.2


In [3]:
vehicle_data_path = "../../data/vehicles_info.yaml"
queries_path = "../../data/generated_questions.json"

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Settings for Training

In [4]:
@dataclass
class ModelConfig:
    """Configuration for the fine-tuning process"""
    model_name: str = "roberta-base"  # or "microsoft/deberta-v3-base"
    max_length: int = 512
    learning_rate: float = 2e-5
    num_epochs: int = 3
    batch_size: int = 16
    warmup_steps: int = 100
    weight_decay: float = 0.01
    output_dir: str = "./roberta"
    save_steps: int = 500
    eval_steps: int = 500
    logging_steps: int = 100


config = ModelConfig()
wandb.init(project="vehicle-cross-encoder", config=config.__dict__)
run_name = wandb.run.name

config.output_dir = os.path.join(config.output_dir, run_name)


In [5]:
class VehicleDataset(Dataset):
    """Dataset class for vehicle-query pairs"""
    
    def __init__(self, data_pairs: List[Tuple[str, str, int]], tokenizer, max_length: int = 512):
        self.data_pairs = data_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data_pairs)
    
    def __getitem__(self, idx):
        query, vehicle_text, label = self.data_pairs[idx]
        
        # Tokenize the query-vehicle pair
        # Following cross-encoder format: [CLS] query [SEP] vehicle_text [SEP]
        encoding = self.tokenizer(
            query,
            vehicle_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [6]:
class VehicleCrossEncoder:
    """Main class for fine-tuning cross-encoder models"""
    
    def __init__(self, config: ModelConfig):
        self.config = config
        self.tokenizer = None
        self.model = None
        self.trainer = None
        
    def load_data(self, vehicles_file: str, questions_file: str) -> Tuple[List, List, List]:
        """Load and prepare data from YAML and JSON files with proper vehicle-level splitting"""
        
        # Load vehicle data
        with open(vehicles_file, 'r', encoding='utf-8') as f:
            vehicles_data = yaml.safe_load(f)
        
        # Load questions data
        with open(questions_file, 'r', encoding='utf-8') as f:
            questions_data = json.load(f)
        
        # Prepare vehicle-level data first
        vehicle_datasets = []
        
        for vehicle_url, vehicle_info in vehicles_data.items():
            # Create vehicle text description
            vehicle_text = self._create_vehicle_description(vehicle_info)
            
            # Get questions for this vehicle
            if vehicle_url in questions_data:
                questions = questions_data[vehicle_url]
                
                # Create all pairs for this vehicle
                vehicle_pairs = []
                for question, label in questions.items():
                    vehicle_pairs.append((question, vehicle_text, int(label)))
                
                # Store vehicle dataset (all pairs for one vehicle)
                vehicle_datasets.append({
                    'url': vehicle_url,
                    'pairs': vehicle_pairs
                })
        
        # Split vehicles into train/validation/test (70/15/15 split)
        import random
        random.seed(42)  # For reproducibility
        random.shuffle(vehicle_datasets)
        
        total_vehicles = len(vehicle_datasets)
        train_split = int(total_vehicles * 0.70)
        val_split = int(total_vehicles * 0.85)
        
        train_vehicles = vehicle_datasets[:train_split]
        val_vehicles = vehicle_datasets[train_split:val_split]
        test_vehicles = vehicle_datasets[val_split:]
        
        # Flatten into pairs while keeping vehicle grouping
        train_pairs = []
        val_pairs = []
        test_pairs = []
        
        for vehicle_data in train_vehicles:
            train_pairs.extend(vehicle_data['pairs'])
        
        for vehicle_data in val_vehicles:
            val_pairs.extend(vehicle_data['pairs'])
            
        for vehicle_data in test_vehicles:
            test_pairs.extend(vehicle_data['pairs'])
        
        # Shuffle the training pairs randomly
        random.shuffle(train_pairs)
        
        logger.info(f"Vehicle-level split:")
        logger.info(f"  Train vehicles: {len(train_vehicles)} ({len(train_pairs)} pairs)")
        logger.info(f"  Validation vehicles: {len(val_vehicles)} ({len(val_pairs)} pairs)")
        logger.info(f"  Test vehicles: {len(test_vehicles)} ({len(test_pairs)} pairs)")
        
        return train_pairs, val_pairs, test_pairs
    
    def _create_vehicle_description(self, vehicle_info: Dict) -> str:
        """Create a comprehensive vehicle description from the data"""
        description_parts = []
        
        # Add information dictionary details
        if 'information_dict' in vehicle_info:
            info_dict = vehicle_info['information_dict']
            for key, value in info_dict.items():
                description_parts.append(f"{key}: {value}")
        
        # Add details list
        if 'details_list' in vehicle_info:
            details = " | ".join(vehicle_info['details_list'])
            description_parts.append(details)
        
        # Add details text if available
        if 'details_text' in vehicle_info:
            description_parts.append(vehicle_info['details_text'])
        
        return " | ".join(description_parts)
    
    def initialize_model(self):
        """Initialize tokenizer and model"""
        logger.info(f"Loading model: {self.config.model_name}")
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
        
        # Load model configuration and modify for binary classification
        model_config = AutoConfig.from_pretrained(self.config.model_name)
        model_config.num_labels = 2  # Binary classification
        
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.config.model_name,
            config=model_config,
            ignore_mismatched_sizes=True 
        )
        
        logger.info("Model and tokenizer loaded successfully")
    
    def compute_metrics(self, eval_pred):
        """Compute metrics for evaluation"""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        
        precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
        accuracy = accuracy_score(labels, predictions)
        
        return {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }
    
    def train(self, train_pairs: List, val_pairs: List, test_pairs: List):
        """Train the model with proper train/validation/test splits"""
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model not initialized. Call initialize_model() first.")
        
        # Create datasets
        train_dataset = VehicleDataset(train_pairs, self.tokenizer, self.config.max_length)
        val_dataset = VehicleDataset(val_pairs, self.tokenizer, self.config.max_length)
        test_dataset = VehicleDataset(test_pairs, self.tokenizer, self.config.max_length)
        
        # Set up training arguments
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_epochs,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            learning_rate=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
            warmup_steps=self.config.warmup_steps,
            logging_steps=self.config.logging_steps,
            evaluation_strategy="steps",
            eval_steps=self.config.eval_steps,
            save_steps=self.config.save_steps,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=2,
            report_to="wandb"
        )
        
        # Initialize trainer
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,  # Use validation set for evaluation during training
            compute_metrics=self.compute_metrics,
        )
        
        # Train the model
        logger.info("Starting training...")
        self.trainer.train()
        
        # Evaluate on test set
        logger.info("Evaluating on test set...")
        test_results = self.trainer.evaluate(test_dataset)
        logger.info(f"Test results: {test_results}")
        
        # Save the final model
        self.trainer.save_model()
        self.tokenizer.save_pretrained(self.config.output_dir)
        
        logger.info(f"Training completed. Model saved to {self.config.output_dir}")
        
        return test_results

    def predict(self, query: str, vehicle_text: str) -> Tuple[float, int]:
        """Make a prediction for a query-vehicle pair"""
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model not initialized or trained.")
        
        # Tokenize input
        encoding = self.tokenizer(
            query,
            vehicle_text,
            truncation=True,
            padding='max_length',
            max_length=self.config.max_length,
            return_tensors='pt'
        )
        
        # Make prediction
        with torch.no_grad():
            outputs = self.model(**encoding)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            
        # Get probability of positive class (index 1)
        positive_prob = predictions[0][1].item()
        predicted_label = int(positive_prob > 0.5)
        
        return positive_prob, predicted_label

**About the train / val / test split:**  
Each set contains the complete vehicle_info x 10 corresponding queries combinations to avoid data leakage about the cars between the sets. The test set is randomly shuffled. The other two sets are not, which is why the `preview_cross_encoder_inputs()` function will show the vehicle_info x 10 corresponding queries combinations sorted for them.

In [7]:
def preview_cross_encoder_inputs():
    # Load config and initialize class
    cross_encoder = VehicleCrossEncoder(config)

    # Load the updated data splits
    train_pairs, val_pairs, test_pairs = cross_encoder.load_data(
        vehicles_file=vehicle_data_path,
        questions_file=queries_path
    )

    # Preview function
    def print_preview(pairs, set_name):
        print(f"\n=== Preview from {set_name.upper()} Set (first 10 examples) ===\n")
        for i, (query, vehicle_text, label) in enumerate(pairs[:10]):
            print(f"[{i+1}] Label: {label}")
            print(f"Query: {query}")
            print(f"Vehicle Text: {vehicle_text}")
            print("-" * 80)

    # Print samples from each set
    print_preview(train_pairs, "train")
    print_preview(val_pairs, "validation")
    print_preview(test_pairs, "test")

preview_cross_encoder_inputs()

INFO:__main__:Vehicle-level split:
INFO:__main__:  Train vehicles: 387 (3870 pairs)
INFO:__main__:  Validation vehicles: 83 (830 pairs)
INFO:__main__:  Test vehicles: 83 (830 pairs)



=== Preview from TRAIN Set (first 10 examples) ===

[1] Label: 1
Query: Looking for a blue Metallic Estate car with a petrol engine and automatic transmission. It should have features like keyless-go, LED headlights, and a touchscreen. Preferably with less than 20,000 kilometres on the odometer.
Vehicle Text: Category: Estate, 5 door | Colour: blue Metallic | Colour name: ascariblau | Engine type: Otto | First registration: 04.2023 | Fuel type: Petrol | Location: D-49 | Power output: 110 KW / 150 PS | Read mileage: 15,400 Kilometres | Supplier: Deutsche Bahn Connect GmbH | Total number of owners: 1 Owner | Transmission: Automatic | Vehicle origin: Rental car | Vehicle release: Release of the vehicle will take place at the earliest 5 working days following receipt of payment. | ABS | Airbags: 6 | Auto Hold | Autobeam | Automatic dimming exterior mirror | Automatic dimming internal mirror | Climatronic | Cruise control | Electric boot lid | Electric windows front and rear | Heated exter

Start Training:

In [8]:
config = ModelConfig()

# Initialize the cross-encoder
cross_encoder = VehicleCrossEncoder(config)

# Load data
train_pairs, val_pairs, test_pairs = cross_encoder.load_data(
    vehicles_file=vehicle_data_path,
    questions_file=queries_path
)

# Initialize model
cross_encoder.initialize_model()

# Train the model
test_results = cross_encoder.train(train_pairs, val_pairs, test_pairs)

INFO:__main__:Vehicle-level split:
INFO:__main__:  Train vehicles: 387 (3870 pairs)
INFO:__main__:  Validation vehicles: 83 (830 pairs)
INFO:__main__:  Test vehicles: 83 (830 pairs)
INFO:__main__:Loading model: roberta-base
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Model and tokenizer loaded successfully
INFO:__main__:Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.2433,0.24044,0.919277,0.922543,0.884701,0.963768


INFO:__main__:Evaluating on test set...


INFO:__main__:Test results: {'eval_loss': 0.2571873366832733, 'eval_accuracy': 0.9180722891566265, 'eval_f1': 0.9214780600461894, 'eval_precision': 0.88470066518847, 'eval_recall': 0.9614457831325302, 'eval_runtime': 4.2724, 'eval_samples_per_second': 194.27, 'eval_steps_per_second': 12.171, 'epoch': 3.0}
INFO:__main__:Training completed. Model saved to ./roberta


In [9]:
def show_test_predictions(num_examples: int = 10):
    print(f"\n=== Predictions on {num_examples} Random Test Examples ===\n")
    
    import random
    sampled = random.sample(test_pairs, num_examples)

    model = cross_encoder.model
    tokenizer = cross_encoder.tokenizer
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for i, (query, vehicle_text, label) in enumerate(sampled):
        # Tokenize
        encoding = tokenizer(
            query,
            vehicle_text,
            truncation=True,
            padding='max_length',
            max_length=config.max_length,
            return_tensors='pt'
        )

        # Move tensors to device
        encoding = {k: v.to(device) for k, v in encoding.items()}

        # Predict
        with torch.no_grad():
            outputs = model(**encoding)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            match_prob = probs[0][1].item()
            pred = int(match_prob > 0.5)

        print(f"[{i+1}]")
        print(f"True Label:        {'Match' if label == 1 else 'No match'}")
        print(f"Predicted Label:   {'Match' if pred == 1 else 'No match'}")
        print(f"Match Probability: {match_prob:.4f}")
        print(f"Query:             {query}")
        print(f"Vehicle Text:      {vehicle_text}")
        print("-" * 100)

# Call it
show_test_predictions(10)


=== Predictions on 10 Random Test Examples ===

[1]
True Label:        Match
Predicted Label:   Match
Match Probability: 0.9790
Query:             Looking for a vehicle with a satnav system and ready for telephone connection. The car should be a rental origin and have amenities like automatic dimming internal mirror and remote central locking.
Vehicle Text:      Category: Estate, 5 door | Colour: black Metallic | Emission class: EURO 5 | Engine type: Diesel | First registration: 01.2013 | Fuel type: Diesel | Location: D-59 | Power output: 105 KW / 143 PS | Read mileage: 195,100 Kilometres | Supplier: Subsidiary/authorised dealer of the brand(s): Audi | Total number of owners: 2 Owner | Transmission: 6-gear manual transmission | Vehicle origin: Rental car | ABS | Airbags: 6 | Automatic dimming internal mirror | Automatic tailgate | Bi-xenon | Climatronic | Cruise control | Electric windows front and rear | Fog light | Leather steering wheel | Multi-function steering wheel | On-board co