### Test: Amount of Tokens of our scraped information

We want to fine-tune a BERT model, yet BERT has a limitation of 512 input tokens.

After cleansing the scraped vehicle information and bringing it into a (hopefully easily digestible) format for a language model, we need to find out whether the scraped vehicle information exceeds the limit of 512 tokens.  

We are planning to use the fine-tuned models as Cross-Encoders. This means that the search query is concatenated with the vehicle information, with a `[SEP]` token in between, before being passed to the model. So the concatenation of both, `information text + query`, has to stay within the token limit as a whole.

#### Test 1: Amount of vehicle information text tokens

The first part of this notebook counts the tokens of our vehicle description texts, using `translated_vehicles_data.yaml` (which contains the cleansed vehicle information) as the data source, so that we can get an impression of whether we need to shorten the informational texts.

In [None]:
# Import libraries and setup
import yaml
import re
from transformers import BertTokenizer

# Initialize BERT tokenizer
# Kept as a module-level global: analyze_token_counts and check_token_limits
# below read `tokenizer` directly instead of taking it as a parameter.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load vehicle data (global)
def load_vehicle_data(path='../../data/translated_vehicles_data.yaml'):
    """Load and parse the vehicle YAML data.

    Args:
        path: Location of the YAML file. Defaults to the cleansed vehicle
            data file used by this notebook, so existing zero-argument
            calls keep working.

    Returns:
        The parsed vehicle data. If the parsed document exposes ``items()``
        it is converted to a plain ``dict``; otherwise the parsed object is
        returned unchanged.
    """
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    try:
        # SECURITY: yaml.unsafe_load can construct arbitrary Python objects
        # from the tags in the document. That is acceptable only because this
        # file is produced by our own pipeline -- never point `path` at
        # untrusted input.
        data = yaml.unsafe_load(content)
        vehicle_data = dict(data.items()) if hasattr(data, 'items') else data
    except yaml.YAMLError as e:
        print(f"Error with unsafe_load: {e}")
        # Fallback approach: strip the Python-specific defaultdict tags so
        # that the document can be parsed with the safe loader.
        cleaned_content = re.sub(r'!!python/object/apply:collections\.defaultdict\s*', '', content)
        cleaned_content = re.sub(r'args:\s*- !!python/name:builtins\.dict\s*\'\'?\s*', '', cleaned_content)
        data = yaml.safe_load(cleaned_content)
        # With the tags removed, the payload is presumably nested under
        # 'dictitems'. Only look for that key in a real mapping, so an empty
        # or non-mapping document is returned as-is instead of raising.
        if isinstance(data, dict) and 'dictitems' in data:
            vehicle_data = data['dictitems']
        else:
            vehicle_data = data

    return vehicle_data

def extract_text_from_vehicle(vehicle_data):
    """Extract all text content from a vehicle listing.

    The result joins, in this order and separated by single spaces:
    ``details_text``, every entry of ``details_list``, and one
    ``"key: value"`` fragment per item of ``information_dict``.

    Keys that are missing or whose value is ``None`` (e.g. an empty YAML
    field) are skipped instead of raising, ``None`` entries inside
    ``details_list`` are dropped, and non-string entries are converted with
    ``str`` so the final join cannot fail.

    Args:
        vehicle_data: Mapping that describes one vehicle listing.

    Returns:
        The combined text as one string; empty if no text is present.
    """
    text_parts = []

    details_text = vehicle_data.get('details_text')
    if details_text is not None:
        text_parts.append(str(details_text))

    # `or ()` turns an explicit None into "no entries".
    for entry in vehicle_data.get('details_list') or ():
        if entry is not None:
            text_parts.append(str(entry))

    # `or {}` does the same for a None information_dict.
    for key, value in (vehicle_data.get('information_dict') or {}).items():
        text_parts.append(f"{key}: {value}")

    return ' '.join(text_parts)

def analyze_token_counts(data, limit=512):
    """Count BERT tokens (special tokens included) for every vehicle.

    Args:
        data: Mapping of listing URL to that listing's vehicle data.
        limit: Token count above which a listing is flagged.

    Returns:
        A tuple ``(records, exceeding)``. ``records`` holds one dict per
        listing with the keys ``url``, ``token_count``, ``over_limit`` and
        ``text_preview`` (the first 100 characters followed by ``...`` when
        the text is longer); ``exceeding`` is the number of flagged listings.
    """
    records = []

    for listing_url, listing in data.items():
        listing_text = extract_text_from_vehicle(listing)
        n_tokens = len(tokenizer.encode(listing_text, add_special_tokens=True))

        # Only long texts are shortened and marked with an ellipsis.
        if len(listing_text) > 100:
            preview = listing_text[:100] + "..."
        else:
            preview = listing_text

        records.append({
            'url': listing_url,
            'token_count': n_tokens,
            'over_limit': n_tokens > limit,
            'text_preview': preview,
        })

    exceeding = sum(1 for record in records if record['over_limit'])
    return records, exceeding

def check_token_limits(data, limits=None):
    """Check how many texts exceed different token limits.

    Tokens are counted without special tokens, i.e. only the word pieces of
    the vehicle text itself.

    Args:
        data: Mapping of listing URL to that listing's vehicle data.
        limits: Integer token limits to test. Defaults to
            ``[400, 425, 450]``.

    Returns:
        Dict mapping each limit to the number of listings whose token count
        is strictly greater than that limit.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if limits is None:
        limits = [400, 425, 450]

    results = {limit: 0 for limit in limits}
    total_count = len(data)

    print(f"Checking {total_count} vehicle listings against token limits: {limits}")
    print("=" * 60)

    for vehicle_data in data.values():
        text = extract_text_from_vehicle(vehicle_data)
        token_count = len(tokenizer.encode(text, add_special_tokens=False))

        # Iterate the dict keys (unique limits) so that a limit passed twice
        # is not counted twice for the same listing.
        for limit in results:
            if token_count > limit:
                results[limit] += 1

    # Print results
    for limit in limits:
        over_limit = results[limit]
        # An empty data set would otherwise divide by zero.
        percentage = (over_limit / total_count) * 100 if total_count else 0.0
        print(f"Token limit {limit:3d}: {over_limit:3d} over limit ({percentage:5.1f}%)")

    return results

# Load vehicle data once
# The result is bound to the module-level name `vehicle_data`, which the
# following cells pass to analyze_token_counts and check_token_limits.
vehicle_data = load_vehicle_data()
print(f"Loaded {len(vehicle_data)} vehicle listings")

Loaded 2510 vehicle listings


In [8]:
# Analyze token counts with 512 token limit
# The limit is named once so that the analysis call and every message below
# always agree on the same value.
bert_token_limit = 512
# Number of over-limit vehicles printed in detail.
preview_count = 5

results, over_limit_count = analyze_token_counts(vehicle_data, limit=bert_token_limit)

# Print summary
total_vehicles = len(results)
# Guard the percentage against an empty data set (no division by zero).
percentage_over_limit = over_limit_count / total_vehicles * 100 if total_vehicles else 0.0
print(f"=== BERT Token Analysis Summary ({bert_token_limit} tokens) ===")
print(f"Total vehicles analyzed: {total_vehicles}")
print(f"Vehicles over {bert_token_limit} token limit: {over_limit_count}")
print(f"Percentage over limit: {percentage_over_limit:.1f}%")
print()

# Show vehicles that exceed the limit
over_limit_vehicles = [r for r in results if r['over_limit']]
if over_limit_vehicles:
    print(f"=== Vehicles Exceeding {bert_token_limit} Token Limit ===")
    for vehicle in over_limit_vehicles[:preview_count]:  # Show first few for brevity
        print(f"URL: {vehicle['url']}")
        print(f"Token count: {vehicle['token_count']}")
        print(f"Excess tokens: {vehicle['token_count'] - bert_token_limit}")
        print(f"Preview: {vehicle['text_preview']}")
        print("-" * 80)

    if len(over_limit_vehicles) > preview_count:
        print(f"... and {len(over_limit_vehicles) - preview_count} more vehicles")

Token indices sequence length is longer than the specified maximum sequence length for this model (778 > 512). Running this sequence through the model will result in indexing errors


=== BERT Token Analysis Summary (512 tokens) ===
Total vehicles analyzed: 2510
Vehicles over 512 token limit: 141
Percentage over limit: 5.6%

=== Vehicles Exceeding 512 Token Limit ===
URL: https://autobid.de/en/item/audi-a1-allstreet-35-tfsi-s-tronic-virtual-r-kam-17-3108334/details
Token count: 778
Excess tokens: 266
Preview: Highlights:
*Reversing camera*; Assistance systems:
*Audi Pre Sense Front for adaptive speed assista...
--------------------------------------------------------------------------------
URL: https://autobid.de/en/item/audi-a3-sportback-2-0-tdi-s-tronic-ambition-3110005/details
Token count: 517
Excess tokens: 5
Preview: Assistance package (parking aid at the front and back, acoustically and optically with a selective d...
--------------------------------------------------------------------------------
URL: https://autobid.de/en/item/audi-a4-allroad-2-0-tfsi-quattro-3105849/details
Token count: 619
Excess tokens: 107
Preview: Audi Drive Select
Equipment package: a

Since 141 of the 2510 vehicles already exceed the token limit with the informational text alone, the idea is to shorten these entries. But before we do that, we need an impression of what our real token limit for the vehicle information should be. For that, we need a limit for the queries, i.e. an idea of how long we want to allow the generated queries to become. The tokens of an example query are counted below to get a rough idea.

In [9]:
# Example query analysis
query = "I am looking for a grey, two-door small car whose first registration is no earlier than 2020, makes vetween 100 kW and 120 kW from a petrol engine, and shows under 30 000 km on the odometer."

# Count tokens for the query
# add_special_tokens=False: count only the query's own word pieces; the
# special-token overhead is added separately further down.
# NOTE(review): the example query above contains the typo "vetween"
# ("between"). In the recorded output it is split into three word pieces
# ('vet', '##wee', '##n'), so the count is presumably slightly higher than
# for the corrected wording.
query_tokens = tokenizer.encode(query, add_special_tokens=False)
query_token_count = len(query_tokens)

print(f"Query: {query}")
print(f"\nQuery token count: {query_token_count}")
print(f"Query tokens: {query_tokens}")
print(f"Decoded tokens: {tokenizer.convert_ids_to_tokens(query_tokens)}")

# For cross-encoder, account for special tokens
# Manual estimate: the pair input is assumed to carry three special tokens.
special_tokens_count = 3  # [CLS] + [SEP] + [SEP]
total_query_overhead = query_token_count + special_tokens_count

print(f"\nFor cross-encoder setup:")
print(f"Query tokens: {query_token_count}")
print(f"Special tokens ([CLS], [SEP], [SEP]): {special_tokens_count}")
print(f"Total query overhead: {total_query_overhead}")
# 512 is the BERT input limit this notebook works against.
print(f"Available tokens for document: {512 - total_query_overhead}")

# Test with actual cross-encoder format
def test_cross_encoder_format(query, document_text=""):
    """Return the token count of ``query`` and ``document_text`` encoded as a pair.

    Both texts are handed to the tokenizer together, with special tokens
    enabled and truncation disabled, so the result is the full length of the
    encoded pair.
    """
    pair_ids = tokenizer.encode(
        query,
        document_text,
        add_special_tokens=True,
        truncation=False,
    )
    return len(pair_ids)

# Test with empty document
# With an empty document text, the measured length covers the query plus
# whatever special tokens the tokenizer inserts for a pair input.
cross_encoder_tokens = test_cross_encoder_format(query, "")
print(f"\nActual cross-encoder tokens (query + special tokens): {cross_encoder_tokens}")

# Recommendation for document token limit
# What remains of the 512-token budget once the encoded query is subtracted.
recommended_doc_limit = 512 - cross_encoder_tokens
print(f"Recommended document token limit: {recommended_doc_limit}")

Query: I am looking for a grey, two-door small car whose first registration is no earlier than 2020, makes vetween 100 kW and 120 kW from a petrol engine, and shows under 30 000 km on the odometer.

Query token count: 47
Query tokens: [1045, 2572, 2559, 2005, 1037, 4462, 1010, 2048, 1011, 2341, 2235, 2482, 3005, 2034, 8819, 2003, 2053, 3041, 2084, 12609, 1010, 3084, 29525, 28394, 2078, 2531, 6448, 1998, 6036, 6448, 2013, 1037, 17141, 3194, 1010, 1998, 3065, 2104, 2382, 2199, 2463, 2006, 1996, 1051, 26173, 3334, 1012]
Decoded tokens: ['i', 'am', 'looking', 'for', 'a', 'grey', ',', 'two', '-', 'door', 'small', 'car', 'whose', 'first', 'registration', 'is', 'no', 'earlier', 'than', '2020', ',', 'makes', 'vet', '##wee', '##n', '100', 'kw', 'and', '120', 'kw', 'from', 'a', 'petrol', 'engine', ',', 'and', 'shows', 'under', '30', '000', 'km', 'on', 'the', 'o', '##dome', '##ter', '.']

For cross-encoder setup:
Query tokens: 47
Special tokens ([CLS], [SEP], [SEP]): 3
Total query overhead: 50
Av

In [10]:
# Check how many vehicles exceed lower token limits (for cross-encoder usage)
# NOTE: this rebinds `results`. It previously held the per-vehicle list
# returned by analyze_token_counts and now holds the {limit: count} dict
# returned by check_token_limits.
results = check_token_limits(vehicle_data, [400, 425, 450])

Checking 2510 vehicle listings against token limits: [400, 425, 450]
Token limit 400: 358 over limit ( 14.3%)
Token limit 425: 283 over limit ( 11.3%)
Token limit 450: 234 over limit (  9.3%)


Result of this analysis:

Cutting the entries of `translated_vehicles_data.yaml` so that they do not exceed a limit somewhere between 400 and 450 tokens seems sensible.

#### Test 2: Amount of query tokens

After ChatGPT generated multiple questions with different wordings for us, we can check how many tokens these search queries have.
Different types of queries have been grouped into different batches. One example has been pushed to GitLab: `car_match_questions_batch1.zip`.

However, this was just a first run of generating queries with an LLM and rating them. In later batches, we asked ChatGPT to use different query styles that look more like a plausible choice of words when searching for a car.

In [None]:
import zipfile
import json
from transformers import BertTokenizer

# Initialize BERT tokenizer
# Re-created here so that this cell can run on its own; it rebinds the
# module-level `tokenizer` to an equivalent instance.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Path to an example batch file
zip_path = '../../data/car_match_questions_batch1.zip'

# Token limit to check
token_limit = 109 # If our vehicle information limit is 400 tokens

# Set to store all unique queries
queries = set()

# Read queries directly from zip file
# Only archive members named questions_part*.json are read; the keys of each
# JSON object are the query strings.
with zipfile.ZipFile(zip_path, 'r') as zipf:
    for file_name in zipf.namelist():
        if file_name.startswith("questions_part") and file_name.endswith(".json"):
            with zipf.open(file_name) as f:
                data = json.load(f)
                queries.update(data.keys())

print(f"Loaded {len(queries)} unique queries from the zip archive.")

# Analyze token lengths
over_limit_count = 0
query_token_lengths = []
max_token_count = 0
longest_query = ""

# Iterate in sorted order: a set of strings has no stable iteration order
# across interpreter runs, which made both the reported "longest query" (on
# ties) and the order of query_token_lengths vary from run to run.
for query in sorted(queries):
    # NOTE(review): add_special_tokens=True includes [CLS]/[SEP] in every
    # count. If token_limit already had the special tokens subtracted
    # (512 - 400 - 3 = 109), they are counted twice and this check is
    # slightly stricter than necessary -- confirm the intent.
    tokens = tokenizer.encode(query, add_special_tokens=True)
    token_count = len(tokens)
    query_token_lengths.append((query, token_count))
    if token_count > token_limit:
        over_limit_count += 1
    if token_count > max_token_count:
        max_token_count = token_count
        longest_query = query

print(f"{over_limit_count} out of {len(queries)} queries exceed {token_limit} tokens.")
print(f"\nThe longest query is {max_token_count} tokens long.")
print("Query text:")
print(longest_query)

Loaded 998 unique queries from the zip archive.
0 out of 998 queries exceed 109 tokens.

The longest query is 42 tokens long.
Query text:
With 6 or more speeds, fitted with rear parking sensors, first registered between 2015 and 2022, making between 59 kW and 140 kW, showing between 70900 km and 124300 km.
