In [1]:
import yaml
import re

def clean_vehicle_yaml(input_file, output_file):
    """
    Clean a vehicle YAML file and write the result to a new file.

    The input is read as a mapping of vehicle URL -> vehicle data. For every
    vehicle whose data contains an 'information_dict' mapping, this function:

    1. Rewrites German-style thousands separators in 'Read mileage' (dots
       between digits) as commas, e.g. "1.234.567 km" -> "1,234,567 km".
    2. Removes the 'Vehicle Identification No.' field.
    3. Removes every field whose name contains both 'KBA' and 'Key'.

    Vehicles without a usable 'information_dict' are written through
    unchanged, as are mileage values that are not strings. An empty input
    file is treated as an empty mapping.

    Args:
        input_file: Path of the YAML file to read (opened as UTF-8).
        output_file: Path of the YAML file to write (opened as UTF-8 in
            'w' mode, so existing content is replaced).

    Returns:
        None. The cleaned data is written to ``output_file`` and a
        confirmation line is printed.

    Raises:
        OSError: If either file cannot be opened (e.g. FileNotFoundError
            when ``input_file`` does not exist).
    """

    # Read the YAML file
    with open(input_file, 'r', encoding='utf-8') as file:
        data = yaml.safe_load(file)

    # safe_load returns None for an empty document; treat it as "no vehicles"
    # instead of failing on None.items().
    if data is None:
        data = {}

    # Process each vehicle entry
    for vehicle_url, vehicle_data in data.items():
        # Entries may be null or scalars in hand-edited files; nothing to clean.
        if not isinstance(vehicle_data, dict):
            continue

        info_dict = vehicle_data.get('information_dict')
        if not isinstance(info_dict, dict):
            continue

        # Convert German-style mileage numbers (dots to commas).
        # Lookarounds are used so the digits themselves are not consumed:
        # a consuming pattern such as (\d+)\.(\d+) would turn "1.234.567"
        # into "1,234.567" because the second dot's left-hand digits have
        # already been matched.
        mileage = info_dict.get('Read mileage')
        if isinstance(mileage, str):
            info_dict['Read mileage'] = re.sub(r'(?<=\d)\.(?=\d)', ',', mileage)

        # Remove Vehicle Identification No. field
        info_dict.pop('Vehicle Identification No.', None)

        # Remove KBA Key fields (both manufacturer and type). Collect the
        # keys first so the dict is not modified while being iterated, and
        # ignore non-string keys, which cannot be substring-tested.
        keys_to_remove = [
            key for key in info_dict
            if isinstance(key, str) and 'KBA' in key and 'Key' in key
        ]
        for key in keys_to_remove:
            del info_dict[key]

    # Write the cleaned data back to YAML
    with open(output_file, 'w', encoding='utf-8') as file:
        yaml.dump(data, file, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"Cleaned YAML saved to {output_file}")

In [None]:
# Source and destination YAML files. Both are relative paths, so they are
# resolved against the kernel's current working directory.
in_file, out_file = (
    "../../data/truncated_vehicles_data.yaml",
    "../../data/final_vehicles_data.yaml",
)

# Read in_file, clean it, and write the result to out_file.
clean_vehicle_yaml(input_file=in_file, output_file=out_file)

FileNotFoundError: [Errno 2] No such file or directory: '../data/truncated_vehicles_data.yaml'