In [2]:
import yaml

def get_vehicles_as_dict(file_path):
    """
    Parse a YAML file and return its top-level entries re-serialized as YAML text.

    The file is expected to hold a single top-level mapping whose keys are
    vehicle URLs and whose values are the structured details of each vehicle.

    Args:
        file_path (str): Path to the YAML file

    Returns:
        dict: Dictionary where keys are URLs and values are YAML text for each
            vehicle. An empty dictionary is returned when the file contains no
            YAML document at all.
    """
    # Load the YAML file
    with open(file_path, 'r', encoding='utf-8') as file:
        data = yaml.safe_load(file)

    # safe_load yields None for an empty (or comment-only) file; treat that as
    # "no vehicles" instead of failing on None.items() below.
    if data is None:
        return {}

    # Serialize only the vehicle info (the URL stays as the key, not in the text).
    return {
        url: yaml.dump(vehicle_info, default_flow_style=False, allow_unicode=True)
        for url, vehicle_info in data.items()
    }

In [3]:
# Relative path to the vehicle YAML file; it is resolved against the current
# working directory when the file is opened.
vehicle_data_file = "../../data/truncated_vehicles_data.yaml"
# Mapping of each top-level key in the file to the YAML text of its value,
# as produced by get_vehicles_as_dict.
vehicle_data = get_vehicles_as_dict(vehicle_data_file)


In [11]:
import re
def replace_german_mileage(s):
    """
    Swap dots for commas in the number of every "Read mileage: ... Kilometres" entry.

    Only the digit/dot run between the "Read mileage:" label and the
    "Kilometres" unit is rewritten (e.g. "123.456" becomes "123,456"); the
    label, the unit, the surrounding whitespace and all other text are
    returned unchanged.

    Args:
        s (str): Text that may contain mileage entries

    Returns:
        str: The text with dots in the matched mileage numbers replaced by commas
    """
    def _with_commas(match):
        # Groups: label incl. trailing whitespace, the number, whitespace + unit.
        label, number, unit = match.groups()
        return label + number.replace('.', ',') + unit

    return re.sub(r'(Read mileage:\s+)([\d.]+)(\s+Kilometres)', _with_commas, s)

In [4]:
# System message for the chat completion request: tells the model to produce
# five matching and five non-matching search queries for a vehicle and to reply
# with nothing but a JSON object mapping each query to a boolean.
system_prompt = """
You are a helpful assistant that evaluates search queries based on detailed car descriptions. The user provides vehicle details in structured format. You must generate five realistic search queries that match the car and another five that don't (but aren't absurd), and return them as a JSON object where each key is a query and the value is a boolean indicating whether the query matches the car (true) or not (false). Respond only with the JSON object, nothing else.
"""

In [27]:
# Instruction text appended after a vehicle's YAML details to form the user
# prompt. The JSON example inside it must itself be valid JSON (no trailing
# comma), because the model is asked to answer in exactly that format.
question = """
Please create realistic search queries of someone who is searching for a car. Keep in mind that, when searching for a vehicle,
they won't already know details like exact read mileage and horse power. Mileage and horse power are important but a car dealer would
ask for a broad range. When asking for kilometers, hp or the number of previous owners always pick a specific number
(not the one from the text) and ask if the car has more or less of it. Milage is important so some queries should ask if it has more and some if it has less.
The same is true for registration date and horsepower/kW.
Also try to be specific and put in numbers for details that need them. Something like "low mileage" and "powerful engine" could be interpreted differently.
The search won't contain every detail of the car. Also vary the wording
and the chosen details of the search question for the queries and use synonyms for some of them. Make the queries multiple sentences
long and detailed. The car dealer has specific requirements.
When looking at the car details, the "information_dict:" contains the most valuable information about the car.

After that generate also 5 similar queries that don't match the car completely. The negative queries should contain some of the real vehicle details
but differ in some (some should be closer and some more different). The negative examples
shouldn't be too absurd and fit to searches a second hand car dealer could have. They should be detailed. Include
other details this car hasn't but are common for other cars. The negative queries should be close to the original car (and the true queries), but
differ in a few important details.
Make the negative queries multiple sentences long and detailed. Add enough details so the overall negative questions are similiar in length to the positive ones.

Some queries should be longer and some should be shorter. Some should be more detailed than others. Also the writing style should differ slightly.
This is true for the positive and negative queries. When there is already a question related to a detail, try to focus on another detail or use a synonym.

To JSON:

Please put the queries into a json item.
Like this:
{
"This is a matching query": true,
"This is another matching query": true,
"This is not a matching query": false
}

Only respond with the json item you created.

"""


# Use the third entry (index 2) of vehicle_data as the sample vehicle.
# NOTE(review): the index is hard-coded, so this raises IndexError when the
# data holds fewer than three vehicles.
url, yaml_text = list(vehicle_data.items())[2]
# Rewrite the dotted mileage figure with commas, then append the instructions.
user_prompt = replace_german_mileage(yaml_text) + question
print(user_prompt)


details_list:
- ABS
- 'Airbags: 6'
- Climatronic
- Cruise control
- Electric windows front and rear
- Fog light
- Leather steering wheel
- Multi-function steering wheel
- On-board computer
- Parking aid ex works rear
- Power steering
- Radio
- Remote central locking
- 'Satnav with map views: Infotainment-System: UConnect mit CD- und MP3-Player, Navigationssystem
  und DAB Radioempfang'
- Side mirror electrically adjustable
- Stability program ESP (Electronic Stability Program)
- Start/stop system
- 'Telephone: Preparation with Bluetooth'
- Voice control system
details_text: 'Equipment package: Lusso (Infotainment system: UConnect with CD and
  MP3 player, navigation system and DAB reception, Blue & Me Bluetooth hands-free
  system with voice recognition and USB port, seat upholstery: Exclusive leather,
  Venere, lumbar support for front left and right seats, electrically adjustable)

  Rear parking aid

  Front center armrest

  Seat upholstery: Exclusive leather, Venere, with stitchin

In [28]:
import os

import openai

# SECURITY: the API key must never be written into the notebook. It is read
# from the OPENAI_API_KEY environment variable instead; a KeyError here means
# the variable is not set in the kernel's environment.
# Any key that was ever committed with this notebook must be treated as
# compromised and revoked.
api_key = os.environ["OPENAI_API_KEY"]

client = openai.OpenAI(api_key=api_key)

# Single chat completion: the system message fixes the output format, the user
# message carries the vehicle details followed by the query instructions.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ],
    temperature=0.7
)

# Text of the first returned choice.
chatgpt_answer = response.choices[0].message.content
print(chatgpt_answer)


{
"Looking for a red, saloon type, 5 door vehicle, with manual transmission and petrol as fuel type. It should be equipped with ABS, climatronic, cruise control, and a multi-function steering wheel. Should have less than 150,000 kilometres on it and be first registered after 2014.": true,
"Seeking a vehicle with a power output of over 110 PS and a start/stop system. It should come with a parking aid system and be equipped with an on-board computer. The car should also be set up for Bluetooth and have a voice control system.": true,
"Searching for a car with an Otto engine type, produced by a manufacturer with a Federal Motor Transport Authority (KBA) Key Manufacturer code of '4136'. The vehicle should have a stability program ESP and electric windows at the front and rear.": true,
"I'm interested in a car with a leather steering wheel, remote central locking, and a Satnav with map views. It should have been first registered in 2015 or later and offer more than 85 KW power output.": tru

In [None]:
# Print every vehicle: its URL line followed by the YAML text of its details.
for url, yaml_text in vehicle_data.items():
    print(f"URL: {url}", f"Content:\n{yaml_text}", sep="\n")