In [1]:
from openai import OpenAI
import os
import pandas as pd
import base64
import io
from PIL import Image
import os
import json
import re

# Point to the local server
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
print(client.models.list())
model_name = 'google/gemma-3-27b'
output_dir = f"results_{model_name}"
character_schema = {
    "type": "json_schema",
    "json_schema": {
        "name": "Judgment",
        "schema": {
            "type": "object",
            "properties": {
                "Judgment": {"type": "string"},
                "Reasons": {"type": "string"}
            },
            "required": ["Judgment","Reasons"]
        },
    }
}
def escape_inner_quotes(json_str):
    reasons_match = re.search(r'"Reasons"\s*:\s*"(.*?)"', json_str, re.DOTALL)
    if reasons_match:
        original_reasons = reasons_match.group(1)
        cleaned_reasons = original_reasons.replace('"', '')
        json_str = json_str.replace(original_reasons, cleaned_reasons)
    return json_str
# Create the folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

SyncPage[Model](data=[Model(id='text-embedding-nomic-embed-text-v1.5@q4_k_m', created=None, object='model', owned_by='organization_owner'), Model(id='google/gemma-3-27b', created=None, object='model', owned_by='organization_owner'), Model(id='qwen/qwen2.5-vl-7b', created=None, object='model', owned_by='organization_owner'), Model(id='gemma-3-4b-it-qat', created=None, object='model', owned_by='organization_owner'), Model(id='llava-v1.5-7b', created=None, object='model', owned_by='organization_owner'), Model(id='granite-vision-3.2-2b', created=None, object='model', owned_by='organization_owner'), Model(id='text-embedding-nomic-embed-text-v1.5@q8_0', created=None, object='model', owned_by='organization_owner')], object='list')


### Zero-shot1

In [None]:
# Load metadata
prompt_method = "Zero-shot1-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:04d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose LLM prompt
    prompt_text = f"""
                    You are shown a side-by-side image with two street views at two different cities: the left half at {left_place} and the right half at {right_place}.
                    Which side looks more {study_question}?

                    Answer with only one word: left or right. Then explain your reasoning.

                    Format:
                    Judgment. Reasons.
                    """

    # Call LLM 
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    raw = response.choices[0].message.content.strip()
    raw = (
    raw.replace("“", '"')
       .replace("”", '"')
       .replace("‘", "'")
       .replace("’", "'")
    )
    
    # Step 2: Load JSON safely
    match = re.search(r'\{.*?\}', raw, re.DOTALL)
    if match:
        try:
            json_str = match.group(0)
            parsed = json.loads(json_str)
            model_judgement = parsed.get("Judgment", "").strip().lower()
            model_reason = parsed.get("Reasons", "").strip()
        except json.JSONDecodeError:
            #print("❌ JSON found but failed to parse. Falling back.")
            model_judgement = ""
            model_reason = ""
    else:
        # Fallback: Extract via regex
        #print("❌ No valid JSON object found in model output.")
        #print("🔎 Raw content was:\n", raw)
        model_judgement = ""
        model_reason = ""

        fallback_match = re.search(r'Judgment:\s*(\w+).*?Reasons:\s*(.+)', raw, re.DOTALL | re.IGNORECASE)
        if fallback_match:
            model_judgement = fallback_match.group(1).strip().lower()
            model_reason = fallback_match.group(2).strip()
    print(model_judgement,model_reason)

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "ground_truth": ground_truth,
        "left_vote":row["left_vote"],
        "right_vote":row["right_vote"],
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")

left The presence of multiple parked cars on the left side contributes to a perception of activity. Additionally, several individuals are visible walking along the sidewalk, which is associated with increased street-level engagement. The taller buildings in the background also play a role in shaping the impression of higher population density and potential for more frequent interactions. Conversely, the right side features fewer cars and no immediately visible pedestrians; this decreases the perception of current use. The lower building heights on the right are often linked to smaller populations, which influences the overall sense of liveliness.
left The presence of fewer pedestrians and vehicles on the left side contributes to a perception of lower risk. The wider, more open view of the road in Boston is associated with increased visibility, which plays a role in shaping the sense of safety. Conversely, the higher density of parked cars and moving traffic on the right side increases 

### Zero-shot1 with prompt guidance

In [146]:
# Load metadata
prompt_method = "Zero-shot1-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:04d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose LLM prompt
    prompt_text = f"""
                    You are shown a side-by-side image with two street views at two different cities: the left half at {left_place} and the right half at {right_place}.
                    Which side looks more {study_question}?

                    Based on the following six factors, what is the most important reason for your decision? Provide your top three ranked factors and explain your reasoning.
                    
                    1. Cleanliness – Absence of litter, graffiti, or visible disrepair.
                    Example: The area looks clean, with no visible trash or vandalism.

                    2. Maintenance – Condition of roads, signage, pavements, and other infrastructure.
                    Example: The markings, signals, and fencing appear well-maintained.

                    3. Vegetation and Greenery – Presence and upkeep of trees and green spaces, reflecting care and order.
                    Example: Well-maintained trees line the street, contributing positively.

                    4. Pedestrian Safety Features – Sidewalks, railings, and crossings indicating pedestrian-oriented planning.
                    Example: Sidewalks and fencing suggest the area is pedestrian-friendly.

                    5. Traffic Behavior and Road Orderliness – Visible lane discipline, rule-following, and organized traffic flow.
                    Example: Vehicles follow lane markings and seem to obey traffic rules.

                    6. Surveillance and Human Presence – Visibility of people or signs of monitoring (e.g., cameras) enhancing perceived safety.
                    Example: Pedestrians are visible, and the open environment allows for passive surveillance.
                    
                    Answer with only one word: left or right. Then explain your reasoning.

                    Format:
                    Judgment. Reasons.
                    """

    # Call LLM 
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    raw = response.choices[0].message.content.strip()
    raw = (
    raw.replace("“", '"')
       .replace("”", '"')
       .replace("‘", "'")
       .replace("’", "'")
    )
    
    # Step 2: Load JSON safely
    match = re.search(r'\{.*?\}', raw, re.DOTALL)
    if match:
        try:
            json_str = match.group(0)
            parsed = json.loads(json_str)
            model_judgement = parsed.get("Judgment", "").strip().lower()
            model_reason = parsed.get("Reasons", "").strip()
        except json.JSONDecodeError:
            #print("❌ JSON found but failed to parse. Falling back.")
            model_judgement = ""
            model_reason = ""
    else:
        # Fallback: Extract via regex
        #print("❌ No valid JSON object found in model output.")
        #print("🔎 Raw content was:\n", raw)
        model_judgement = ""
        model_reason = ""

        fallback_match = re.search(r'Judgment:\s*(\w+).*?Reasons:\s*(.+)', raw, re.DOTALL | re.IGNORECASE)
        if fallback_match:
            model_judgement = fallback_match.group(1).strip().lower()
            model_reason = fallback_match.group(2).strip()
    print(model_judgement,model_reason)

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "ground_truth": ground_truth,
        "left_vote":row["left_vote"],
        "right_vote":row["right_vote"],
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}_all.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")

left Based on the provided image, the Paris side (left) appears more lively for these reasons:

1. **Surveillance and Human Presence:** The left side shows significantly more people walking along the street and parked cars, indicating a higher level of activity and use. This suggests a busier, more populated area.
2. **Vegetation and Greenery:**  The presence of well-maintained green spaces (grass areas and trees) on the left contributes to a more inviting and pleasant atmosphere, which often correlates with liveliness.
3. **Maintenance:** While both sides appear reasonably maintained, the Paris side has slightly cleaner sidewalks and better-defined street markings, suggesting a greater level of care and upkeep that supports pedestrian activity.
left Based on the image provided, the Boston side (left) appears safer due to a combination of factors suggesting better order and less potential for chaotic situations.

1. **Traffic Behavior and Road Orderliness:** The road in Boston looks wi

OSError: Cannot save file into a non-existent directory: 'results_google\gemma-3-27b\llm_predictions_google'

In [155]:
df_result.to_csv(f"{output_dir}/llm_predictions_gemma-3-27b_{prompt_method}_all.csv", index=False)

ratio_number=0.7
# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

vote_total = df_result["left_vote"] + df_result["right_vote"]
left_ratio = df_result["left_vote"] / vote_total
right_ratio = df_result["right_vote"] / vote_total

filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal") &
    ((left_ratio >= ratio_number) | (right_ratio >= ratio_number))
]
# Identify rows with dominant votes
high_confidence_mask = (left_ratio >= ratio_number) | (right_ratio >= ratio_number)
high_confidence_count = high_confidence_mask.sum()

print(f"📊 Total rows with ≥70% majority vote: {high_confidence_count}")

correct_high_confidence = df_result[high_confidence_mask]["validation"].sum()
accuracy_high_conf = correct_high_confidence / high_confidence_count

print(f"✅ Accuracy among high-confidence rows: {accuracy_high_conf:.2%}")

📊 Total rows with ≥70% majority vote: 4899
✅ Accuracy among high-confidence rows: 62.16%


In [None]:
# Load metadata
prompt_method= "Zero_Shot2-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose updated prompt
    prompt_text = f"""
        Compare two street view images taken in different cities — the left image is from {left_place}, and the right image is from {right_place}. Based on the overall impression, decide which image better reflects the following quality:

        "{study_question}"

        Respond with a one-word judgment: left or right. Then explain your reasoning.

        Format:
        Judgment. Reasons.
        """

    # Call LLM
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    
    raw = response.choices[0].message.content.strip()
    raw = (
    raw.replace("“", '"')
       .replace("”", '"')
       .replace("‘", "'")
       .replace("’", "'")
    )
    
    # Step 2: Load JSON safely
    match = re.search(r'\{.*?\}', raw, re.DOTALL)
    if match:
        try:
            json_str = match.group(0)
            parsed = json.loads(json_str)
            model_judgement = parsed.get("Judgment", "").strip().lower()
            model_reason = parsed.get("Reasons", "").strip()
        except json.JSONDecodeError:
            #print("❌ JSON found but failed to parse. Falling back.")
            model_judgement = ""
            model_reason = ""
    else:
        # Fallback: Extract via regex
        #print("❌ No valid JSON object found in model output.")
        #print("🔎 Raw content was:\n", raw)
        model_judgement = ""
        model_reason = ""

        fallback_match = re.search(r'Judgment:\s*(\w+).*?Reasons:\s*(.+)', raw, re.DOTALL | re.IGNORECASE)
        if fallback_match:
            model_judgement = fallback_match.group(1).strip().lower()
            model_reason = fallback_match.group(2).strip()
    print(model_judgement,model_reason)

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "left_vote":row["left_vote"],
        "right_vote":row["right_vote"],
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")


left The Paris street view image on the left appears more lively and vibrant compared to the Dublin image on the right. The Parisian scene is bustling with people, cars, and buildings, creating a sense of energy and activity. In contrast, the Dublin image shows a relatively empty and quiet street, lacking the same level of human interaction and visual stimulation.
left The left image from Boston appears to be in a more urban and densely populated area, which might suggest a higher level of activity and potentially less safety compared to the right image from Sydney, which seems to be in a more open and suburban environment. The lack of people and vehicles on the road in the right image also contributes to this impression.
left The left image from Montreal appears to be safer than the right image from Warsaw. The streets in the left image are well-lit, with no signs of trash or debris on the sidewalks. In contrast, the right image shows dark and narrow alleys, which could potentially po

### Zero-Shot2

In [None]:
# Load metadata
prompt_method= "Zero_Shot2-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose updated prompt
    prompt_text = f"""
        Compare two street view images taken in different cities — the left image is from {left_place}, and the right image is from {right_place}. Based on the overall impression, decide which image better reflects the following quality:

        "{study_question}"

        Respond with a one-word judgment: left or right. Then explain your reasoning.

        Format:
        Judgment. Reasons.
        """

    # Call LLM
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    
    raw = response.choices[0].message.content.strip()
    raw = (
    raw.replace("“", '"')
       .replace("”", '"')
       .replace("‘", "'")
       .replace("’", "'")
    )
    
    # Step 2: Load JSON safely
    match = re.search(r'\{.*?\}', raw, re.DOTALL)
    if match:
        try:
            json_str = match.group(0)
            parsed = json.loads(json_str)
            model_judgement = parsed.get("Judgment", "").strip().lower()
            model_reason = parsed.get("Reasons", "").strip()
        except json.JSONDecodeError:
            #print("❌ JSON found but failed to parse. Falling back.")
            model_judgement = ""
            model_reason = ""
    else:
        # Fallback: Extract via regex
        #print("❌ No valid JSON object found in model output.")
        #print("🔎 Raw content was:\n", raw)
        model_judgement = ""
        model_reason = ""

        fallback_match = re.search(r'Judgment:\s*(\w+).*?Reasons:\s*(.+)', raw, re.DOTALL | re.IGNORECASE)
        if fallback_match:
            model_judgement = fallback_match.group(1).strip().lower()
            model_reason = fallback_match.group(2).strip()
    print(model_judgement,model_reason)

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "left_vote":row["left_vote"],
        "right_vote":row["right_vote"],
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")


left The Paris street view image on the left appears more lively and vibrant compared to the Dublin image on the right. The Parisian scene is bustling with people, cars, and buildings, creating a sense of energy and activity. In contrast, the Dublin image shows a relatively empty and quiet street, lacking the same level of human interaction and visual stimulation.
left The left image from Boston appears to be in a more urban and densely populated area, which might suggest a higher level of activity and potentially less safety compared to the right image from Sydney, which seems to be in a more open and suburban environment. The lack of people and vehicles on the road in the right image also contributes to this impression.
left The left image from Montreal appears to be safer than the right image from Warsaw. The streets in the left image are well-lit, with no signs of trash or debris on the sidewalks. In contrast, the right image shows dark and narrow alleys, which could potentially po

#Zero-Shot3_reasoning_with_neutral_word

In [None]:
# Load metadata
prompt_method = "Zero-shot3-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:04d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose LLM prompt
    prompt_text = f"""
        You are shown a side-by-side image with two street views from two different cities: the left half is from {left_place}, and the right half is from {right_place}.

        Which side appears more {study_question}? Respond with only one word: left or right.

        Then explain your reasoning using **neutral, causal language**. Describe which visual features are present in the selected image, and how those features influence your judgment. Use phrases like:
        - 'X contributes to Y'
        - 'X is associated with Y'
        - 'X increases the perception of Y'
        - 'X plays a role in shaping Y'

        Avoid emotional or evaluative language.

        **Output format:**
        Judgment. Reasoning.
        """

    # Call LLM 
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    raw = response.choices[0].message.content.strip()
    raw = (
    raw.replace("“", '"')
       .replace("”", '"')
       .replace("‘", "'")
       .replace("’", "'")
    )
    
    # Step 2: Load JSON safely
    match = re.search(r'\{.*?\}', raw, re.DOTALL)
    if match:
        try:
            json_str = match.group(0)
            parsed = json.loads(json_str)
            model_judgement = parsed.get("Judgment", "").strip().lower()
            model_reason = parsed.get("Reasons", "").strip()
        except json.JSONDecodeError:
            #print("❌ JSON found but failed to parse. Falling back.")
            model_judgement = ""
            model_reason = ""
    else:
        # Fallback: Extract via regex
        #print("❌ No valid JSON object found in model output.")
        #print("🔎 Raw content was:\n", raw)
        model_judgement = ""
        model_reason = ""

        fallback_match = re.search(r'Judgment:\s*(\w+).*?Reasons:\s*(.+)', raw, re.DOTALL | re.IGNORECASE)
        if fallback_match:
            model_judgement = fallback_match.group(1).strip().lower()
            model_reason = fallback_match.group(2).strip()
    print(model_judgement,model_reason)

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "ground_truth": ground_truth,
        "left_vote":row["left_vote"],
        "right_vote":row["right_vote"],
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")

### Chain-of-Thought

In [None]:
import pandas as pd
import base64
import io
from PIL import Image
import os

# Load metadata
prompt_method= "COT-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose updated prompt
    prompt_text = f"""
        Let’s think step by step.
        You are shown a side-by-side image with two street views at two different cities: the left half at {left_place} and the right half at {right_place}.
        
        Which side looks more {study_question}?

        What features do you see in the left image? What features in the left image contribute to or detract from that quality?

        What features do you see in the right image? What features in the right image contribute to or detract from that quality?

        Based on your reasoning, which image better reflects the quality?

        Respond with a one-word judgment: left or right. Then explain your reasoning.

        """

    # Call LLM
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    raw = response.choices[0].message.content.strip()
    raw = (
    raw.replace("“", '"')
       .replace("”", '"')
       .replace("‘", "'")
       .replace("’", "'")
    )
    
    # Step 2: Load JSON safely
    match = re.search(r'\{.*?\}', raw, re.DOTALL)
    if match:
        try:
            json_str = match.group(0)
            parsed = json.loads(json_str)
            model_judgement = parsed.get("Judgment", "").strip().lower()
            model_reason = parsed.get("Reasons", "").strip()
        except json.JSONDecodeError:
            #print("❌ JSON found but failed to parse. Falling back.")
            model_judgement = ""
            model_reason = ""
    else:
        # Fallback: Extract via regex
        #print("❌ No valid JSON object found in model output.")
        #print("🔎 Raw content was:\n", raw)
        model_judgement = ""
        model_reason = ""

        fallback_match = re.search(r'Judgment:\s*(\w+).*?Reasons:\s*(.+)', raw, re.DOTALL | re.IGNORECASE)
        if fallback_match:
            model_judgement = fallback_match.group(1).strip().lower()
            model_reason = fallback_match.group(2).strip()
    print(model_judgement,model_reason)

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "left_vote":row["left_vote"],
        "right_vote":row["right_vote"],
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")
print("✅ Output saved to llm_predictions.csv")


left The left side of the image appears more lively due to several factors. Firstly, the street is lined with trees and buildings, which creates a sense of depth and visual interest. The presence of people walking on the sidewalk also adds to the liveliness of the scene. In contrast, the right side of the image shows an empty street with no signs of life, making it appear less lively.
left The left side of the image appears to be safer due to several factors. Firstly, there are no pedestrians or vehicles in sight, which reduces the risk of accidents. Additionally, the road is well-lit and has a clear lane division, indicating proper maintenance and safety measures. In contrast, the right side shows a busy street with multiple vehicles and pedestrians, increasing the likelihood of accidents. Furthermore, the lighting on the right side appears to be dimmer, which could also contribute to reduced visibility and increased risk.
left The left side of the image appears to be safer due to sev

### Rule-Base + In-context learning

In [None]:
import pandas as pd
import base64
import io
from PIL import Image
import os

# Load metadata
prompt_method= "RBIL-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

def get_visual_perspective(study_question):
    rules = {
        "wealthier": (
            "Look for expensive cars, clean sidewalks, modern buildings, well-maintained facades, greenery, and overall tidiness. "
            "Signs of poverty such as trash, broken sidewalks, and older buildings detract from the feeling of wealth."
        ),
        "more beautiful": (
            "Aesthetics matter: symmetry, architectural design, vibrant colors, trees, flowers, sunlight, and open space. "
            "Avoid cluttered, gray, damaged, or visually unappealing features."
        ),
        "livelier": (
            "Look for crowds, people walking or biking, street vendors, bright signage, open businesses, and dynamic movement. "
            "Quiet, empty, or static scenes are less lively."
        ),
        "more depressing": (
            "Indicators include dark lighting, gray tones, boarded-up buildings, graffiti, trash, empty streets, and general neglect. "
            "Fewer people and lack of activity can amplify the depressing feeling."
        ),
        "safer": (
            "Well-lit areas, visible pedestrians, clear pathways, greenery, maintained infrastructure, and surveillance signs suggest safety. "
            "Broken lights, isolated alleys, graffiti, and damaged property indicate unsafety."
        ),
        "more boring": (
            "Uniform design, lack of people or variation, blank walls, closed businesses, and monotone architecture signal boredom. "
            "Diversity in design and visible life make a place less boring."
        )
    }

    return rules.get(study_question.lower(), "No rule found for this category.")

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    #print(left_place,right_place)
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose updated prompt
    visual_perspective = get_visual_perspective(study_question) 
    #print(visual_perspective)
    prompt_text = f"""
        You are shown a side-by-side image with two street views at two different cities: the left half at {left_place} and the right half at {right_place}.
        
        Which side looks more {study_question}?

        Use the following rules when making your decision:

        "{visual_perspective}"
        
        Then, apply the rules to the images and respond with a one-word judgment: left or right. Then explain your reasoning.

        """

    # Call LLM
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    raw = response.choices[0].message.content.strip()
    raw = (
    raw.replace("“", '"')
       .replace("”", '"')
       .replace("‘", "'")
       .replace("’", "'")
    )
    
    # Step 2: Load JSON safely
    match = re.search(r'\{.*?\}', raw, re.DOTALL)
    if match:
        try:
            json_str = match.group(0)
            parsed = json.loads(json_str)
            model_judgement = parsed.get("Judgment", "").strip().lower()
            model_reason = parsed.get("Reasons", "").strip()
        except json.JSONDecodeError:
            #print("❌ JSON found but failed to parse. Falling back.")
            model_judgement = ""
            model_reason = ""
    else:
        # Fallback: Extract via regex
        #print("❌ No valid JSON object found in model output.")
        #print("🔎 Raw content was:\n", raw)
        model_judgement = ""
        model_reason = ""

        fallback_match = re.search(r'Judgment:\s*(\w+).*?Reasons:\s*(.+)', raw, re.DOTALL | re.IGNORECASE)
        if fallback_match:
            model_judgement = fallback_match.group(1).strip().lower()
            model_reason = fallback_match.group(2).strip()
    print(model_judgement,model_reason)

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "left_vote":row["left_vote"],
        "right_vote":row["right_vote"],
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")
print("✅ Output saved to llm_predictions.csv")


left The left side of the image appears more lively as it shows a bustling street scene in Paris. There are people walking, biking, and standing on the sidewalk, indicating a high level of activity. Additionally, there are street vendors and bright signage visible, which further supports this conclusion. In contrast, the right side of the image depicts a relatively empty and static scene in Dublin, with no signs of crowds or movement. This stark difference in visual content leads me to conclude that the left side is more lively.
left The left side of the image appears safer as it is well-lit, has visible pedestrians, clear pathways, greenery, maintained infrastructure, and surveillance signs. In contrast, the right side lacks these safety indicators, with broken lights, isolated alleys, graffiti, and damaged property, suggesting unsafety.
left The image on the left side (Montreal) appears safer compared to the image on the right side (Warsaw). The Montreal image shows well-lit areas, v

### RBIL_COT

In [None]:
import pandas as pd
import base64
import io
from PIL import Image
import os

# Load metadata
prompt_method= "RBIL_COT-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

def get_visual_perspective(study_question):
    rules = {
        "wealthier": (
            "Look for expensive cars, clean sidewalks, modern buildings, well-maintained facades, greenery, and overall tidiness. "
            "Signs of poverty such as trash, broken sidewalks, and older buildings detract from the feeling of wealth."
        ),
        "more beautiful": (
            "Aesthetics matter: symmetry, architectural design, vibrant colors, trees, flowers, sunlight, and open space. "
            "Avoid cluttered, gray, damaged, or visually unappealing features."
        ),
        "livelier": (
            "Look for crowds, people walking or biking, street vendors, bright signage, open businesses, and dynamic movement. "
            "Quiet, empty, or static scenes are less lively."
        ),
        "more depressing": (
            "Indicators include dark lighting, gray tones, boarded-up buildings, graffiti, trash, empty streets, and general neglect. "
            "Fewer people and lack of activity can amplify the depressing feeling."
        ),
        "safer": (
            "Well-lit areas, visible pedestrians, clear pathways, greenery, maintained infrastructure, and surveillance signs suggest safety. "
            "Broken lights, isolated alleys, graffiti, and damaged property indicate unsafety."
        ),
        "more boring": (
            "Uniform design, lack of people or variation, blank walls, closed businesses, and monotone architecture signal boredom. "
            "Diversity in design and visible life make a place less boring."
        )
    }

    return rules.get(study_question.lower(), "No rule found for this category.")

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose updated prompt
    visual_perspective = get_visual_perspective(study_question) 
    #print(visual_perspective)
    prompt_text = f"""
        Let’s think step by step.
        You are shown a side-by-side image with two street views at two different cities: the left half at {left_place} and the right half at {right_place}.
        
        Which side looks more {study_question}?

        What features do you see in the left image? What features in the left image contribute to or detract from that quality?

        What features do you see in the right image? What features in the right image contribute to or detract from that quality?

        Based on the following rules, which image has features that better reflect the quality?

        "{visual_perspective}"
        
        Then, apply the rules to the images and respond with a one-word judgment: left or right. Then explain your reasoning.

        """

    # Call LLM
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    raw = response.choices[0].message.content.strip()
    raw = (
    raw.replace("“", '"')
       .replace("”", '"')
       .replace("‘", "'")
       .replace("’", "'")
    )
    
    # Step 2: Load JSON safely
    match = re.search(r'\{.*?\}', raw, re.DOTALL)

    if match:
        json_str = match.group(0)
    else:
        print("❌ No valid JSON object found in model output.")
        print("🔎 Raw content was:\n", raw)
        json_str = "{}"  # fallback to empty JSON to avoid crash
    json_str = escape_inner_quotes(json_str)
    print(json_str)
    full_response = json.loads(json_str)
    # Split judgment and reasoning
    # Extract from dict
    model_judgement = full_response.get("Judgment", "").strip().lower()
    model_reason = full_response.get("Reasons", "").strip()

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "left_vote":row["left_vote"],
        "right_vote":row["right_vote"],
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")
print("✅ Output saved to llm_predictions.csv")


{ "Judgment": "left" , "Reasons": "The left image shows more liveliness due to the presence of crowds, people walking or biking, street vendors, bright signage, open businesses, and dynamic movement. These features contribute to a lively atmosphere. In contrast, the right image appears quiet, empty, and static, lacking these essential elements that define a lively scene." }
{ "Judgment": "left" , "Reasons": "The left image appears safer due to the following features: well-lit areas, visible pedestrians, clear pathways, greenery, maintained infrastructure, and surveillance signs. These features are in line with the provided rules that suggest safety. On the other hand, the right image lacks these features, instead showing broken lights, isolated alleys, graffiti, and damaged property, which are indicative of unsafety." }
{ "Judgment": "left" , "Reasons": "The left image appears safer due to the following features: Well-lit areas (the street is well-illuminated), visible pedestrians, cle