In [49]:
from openai import OpenAI
import os
import pandas as pd
import base64
import io
from PIL import Image
import os
import json

# Point to the local server
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
print(client.models.list())
model_name = 'gemma-3-27b-it'
output_dir = f"results_{model_name}"
character_schema = {
    "type": "json_schema",
    "json_schema": {
        "name": "Judgment",
        "schema": {
            "type": "object",
            "properties": {
                "Judgment": {"type": "string"},
                "Reasons": {"type": "string"}
            },
            "required": ["Judgment","Reasons"]
        },
    }
}

# Create the folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

SyncPage[Model](data=[Model(id='gemma-3-27b-it', created=None, object='model', owned_by='organization_owner'), Model(id='gemma-3-4b-it-qat', created=None, object='model', owned_by='organization_owner'), Model(id='llava-v1.5-7b', created=None, object='model', owned_by='organization_owner'), Model(id='text-embedding-nomic-embed-text-v1.5@q4_k_m', created=None, object='model', owned_by='organization_owner'), Model(id='granite-vision-3.2-2b', created=None, object='model', owned_by='organization_owner'), Model(id='gemma-2-9b-it', created=None, object='model', owned_by='organization_owner'), Model(id='text-embedding-nomic-embed-text-v1.5@q8_0', created=None, object='model', owned_by='organization_owner')], object='list')


### Zero-shot1

In [None]:
# Load metadata
prompt_method = "Zero-shot1-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose LLM prompt
    prompt_text = f"""
                    You are shown a side-by-side image with two street views at two different cities: the left half at {left_place} and the right half at {right_place}.
                    Which side looks more {study_question}?

                    Answer with only one word: "left", "right", or "equal". Then explain your reasoning.

                    Format:
                    Judgment. Reasons.
                    """

    # Call LLM 
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    full_response = json.loads(response.choices[0].message.content)

    # Split judgment and reasoning
    # Extract from dict
    model_judgement = full_response.get("Judgment", "").strip().lower()
    model_reason = full_response.get("Reasons", "").strip()

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")

✅ Accuracy (all): 59.00%
✅ Accuracy (excluding 'equal'): 64.13%


### Zero-Shot2

In [None]:
import pandas as pd
import base64
import io
from PIL import Image
import os

# Load metadata
prompt_method= "Zero_Shot2-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose updated prompt
    prompt_text = f"""
        Compare two street view images taken in different cities — the left image is from {left_place}, and the right image is from {right_place}. Based on the overall impression, decide which image better reflects the following quality:

        "{study_question}"

        Respond with a one-word judgment: left, right, or equal. Then explain your reasoning.

        Format:
        Judgment. Reasons.
        """

    # Call LLM
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    full_response = json.loads(response.choices[0].message.content)

    # Split judgment and reasoning
    # Extract from dict
    model_judgement = full_response.get("Judgment", "").strip().lower()
    model_reason = full_response.get("Reasons", "").strip()

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")


✅ Accuracy (all): 58.00%
✅ Accuracy (excluding 'equal'): 62.37%


### Chain-of-Thought

In [None]:
import pandas as pd
import base64
import io
from PIL import Image
import os

# Load metadata
prompt_method= "COT-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose updated prompt
    prompt_text = f"""
        Let’s think step by step.
        Compare two street view images taken in different cities — the left image is from {left_place}, and the right image is from {right_place}. 
        Based on your overall impression, determine which one better reflects the following quality:

        "{study_question}"

        What features do you see in the left image? What features in the left image contribute to or detract from that quality?

        What features do you see in the right image? What features in the right image contribute to or detract from that quality?

        Based on your reasoning, which image better reflects the quality?

        Respond with a one-word judgment: left, right, or equal. Then explain your reasoning.

        """

    # Call LLM
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    full_response = json.loads(response.choices[0].message.content)

    # Split judgment and reasoning
    # Extract from dict
    model_judgement = full_response.get("Judgment", "").strip().lower()
    model_reason = full_response.get("Reasons", "").strip()

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")
print("✅ Output saved to llm_predictions.csv")


✅ Accuracy (all): 60.00%
✅ Accuracy (excluding 'equal'): 64.52%
✅ Output saved to llm_predictions.csv


### Rule-Base + In-context learning

In [None]:
import pandas as pd
import base64
import io
from PIL import Image
import os

# Load metadata
prompt_method= "RBIL-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

def get_visual_perspective(study_question):
    rules = {
        "wealthier": (
            "Look for expensive cars, clean sidewalks, modern buildings, well-maintained facades, greenery, and overall tidiness. "
            "Signs of poverty such as trash, broken sidewalks, and older buildings detract from the feeling of wealth."
        ),
        "more beautiful": (
            "Aesthetics matter: symmetry, architectural design, vibrant colors, trees, flowers, sunlight, and open space. "
            "Avoid cluttered, gray, damaged, or visually unappealing features."
        ),
        "livelier": (
            "Look for crowds, people walking or biking, street vendors, bright signage, open businesses, and dynamic movement. "
            "Quiet, empty, or static scenes are less lively."
        ),
        "more depressing": (
            "Indicators include dark lighting, gray tones, boarded-up buildings, graffiti, trash, empty streets, and general neglect. "
            "Fewer people and lack of activity can amplify the depressing feeling."
        ),
        "safer": (
            "Well-lit areas, visible pedestrians, clear pathways, greenery, maintained infrastructure, and surveillance signs suggest safety. "
            "Broken lights, isolated alleys, graffiti, and damaged property indicate unsafety."
        ),
        "more boring": (
            "Uniform design, lack of people or variation, blank walls, closed businesses, and monotone architecture signal boredom. "
            "Diversity in design and visible life make a place less boring."
        )
    }

    return rules.get(study_question.lower(), "No rule found for this category.")

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    #print(left_place,right_place)
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose updated prompt
    visual_perspective = get_visual_perspective(study_question) 
    #print(visual_perspective)
    prompt_text = f"""
        Compare two street view images taken in different cities — the left image is from {left_place}, and the right image is from {right_place}.
        Your task is to decide which image better reflects the following quality:

        "{study_question}"

        Use the following rules when making your decision:

        "{visual_perspective}"
        
        Then, apply the rules to the images and respond with a one-word judgment: left, right, or equal. Then explain your reasoning.

        """

    # Call LLM
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    full_response = json.loads(response.choices[0].message.content)

    # Split judgment and reasoning
    # Extract from dict
    model_judgement = full_response.get("Judgment", "").strip().lower()
    model_reason = full_response.get("Reasons", "").strip()

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")
print("✅ Output saved to llm_predictions.csv")


✅ Accuracy (all): 57.00%
✅ Accuracy (excluding 'equal'): 66.28%
✅ Output saved to llm_predictions.csv


### RBIL_COT

In [None]:
import pandas as pd
import base64
import io
from PIL import Image
import os

# Load metadata
prompt_method= "RBIL_COT-GEO"
metadata_df = pd.read_csv("merged_metadata.csv")
image_dir = "merged_images"
results = []

def get_visual_perspective(study_question):
    rules = {
        "wealthier": (
            "Look for expensive cars, clean sidewalks, modern buildings, well-maintained facades, greenery, and overall tidiness. "
            "Signs of poverty such as trash, broken sidewalks, and older buildings detract from the feeling of wealth."
        ),
        "more beautiful": (
            "Aesthetics matter: symmetry, architectural design, vibrant colors, trees, flowers, sunlight, and open space. "
            "Avoid cluttered, gray, damaged, or visually unappealing features."
        ),
        "livelier": (
            "Look for crowds, people walking or biking, street vendors, bright signage, open businesses, and dynamic movement. "
            "Quiet, empty, or static scenes are less lively."
        ),
        "more depressing": (
            "Indicators include dark lighting, gray tones, boarded-up buildings, graffiti, trash, empty streets, and general neglect. "
            "Fewer people and lack of activity can amplify the depressing feeling."
        ),
        "safer": (
            "Well-lit areas, visible pedestrians, clear pathways, greenery, maintained infrastructure, and surveillance signs suggest safety. "
            "Broken lights, isolated alleys, graffiti, and damaged property indicate unsafety."
        ),
        "more boring": (
            "Uniform design, lack of people or variation, blank walls, closed businesses, and monotone architecture signal boredom. "
            "Diversity in design and visible life make a place less boring."
        )
    }

    return rules.get(study_question.lower(), "No rule found for this category.")

for _, row in metadata_df.iterrows():
    merged_index = row["merged_index"]
    study_question = row["study_question"]
    left_place = row["place_name_left"]
    right_place = row["place_name_right"]
    ground_truth = str(row["choice"]).strip().lower()
    image_path = os.path.join(image_dir, f"merged_{merged_index:03d}.jpg")

    # Encode image to base64
    image = Image.open(image_path).convert("RGB")
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="JPEG")
    base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    # Compose updated prompt
    visual_perspective = get_visual_perspective(study_question) 
    #print(visual_perspective)
    prompt_text = f"""
        Let’s think step by step.
        Compare two street view images taken in different cities — the left image is from {left_place}, and the right image is from {right_place}.
        Based on your overall impression, determine which one better reflects the following quality:

        "{study_question}"

         What features do you see in the left image? What features in the left image contribute to or detract from that quality?

        What features do you see in the right image? What features in the right image contribute to or detract from that quality?

        Based on the following rules, which image has features that better reflect the quality?

        "{visual_perspective}"
        
        Then, apply the rules to the images and respond with a one-word judgment: left, right, or equal. Then explain your reasoning.

        """

    # Call LLM
    response = client.chat.completions.create(
        model= model_name,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text.strip()},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
        response_format=character_schema,
    )
    full_response = json.loads(response.choices[0].message.content)

    # Split judgment and reasoning
    # Extract from dict
    model_judgement = full_response.get("Judgment", "").strip().lower()
    model_reason = full_response.get("Reasons", "").strip()

    results.append({
        "merged_index": merged_index,
        "left": row["left"],
        "right": row["right"],
        "study_question": study_question,
        "ground_truth": ground_truth,
        "model_judgement": model_judgement,
        "model_reason": model_reason,
        "validation": int(model_judgement == ground_truth)
    })

# Save results
df_result = pd.DataFrame(results)
df_result.to_csv(f"{output_dir}/llm_predictions_{model_name}_{prompt_method}.csv", index=False)

# Print accuracy
# Accuracy including all responses
accuracy_all = df_result["validation"].mean()

# Accuracy excluding any 'equal' in ground truth or model judgement
filtered_df = df_result[
    (df_result["ground_truth"] != "equal") & 
    (df_result["model_judgement"] != "equal")
]
accuracy_excl_equal = filtered_df["validation"].mean()

# Print both
print(f"✅ Accuracy (all): {accuracy_all:.2%}")
print(f"✅ Accuracy (excluding 'equal'): {accuracy_excl_equal:.2%}")
print("✅ Output saved to llm_predictions.csv")


✅ Accuracy (all): 56.00%
✅ Accuracy (excluding 'equal'): 65.12%
✅ Output saved to llm_predictions.csv
