In [1]:
import base64
import json
import mimetypes
import os

import openai
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel

# Pull the variables defined in the local .env file into the environment,
# then read the API key from there
load_dotenv()
api_key = os.environ.get('api_key')

# Name of the chat model the agents will use
model = "gpt-4o-mini"

## 1. Agent Construction

In [2]:
# Function to encode the image
def encode_image(image_path):
    """
    Read the file at `image_path` in binary mode and return its contents
    as a base64-encoded string (decoded as UTF-8 text).
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
    
class MultiModalAgent:
    """
    Chat agent that keeps a running conversation history and accepts text
    messages with an optional image attachment.

    The history always starts with the system prompt. Every user message and
    assistant reply is appended to it, and the whole history is sent on each
    API call (so attached images stay in the history and are re-sent too).
    """

    def __init__(self, system_prompt, model_name=None):
        """
        Initialize the agent with a system prompt that defines its personality/role,
        and the desired model.

        Args:
            system_prompt: Text stored as the first ("system") message of the history.
            model_name: Model to use for this agent. When None, the notebook-level
                `model` setting is used.
        """
        self.client = OpenAI(api_key=api_key)
        self.model = model if model_name is None else model_name
        # Initialize conversation history with the system prompt.
        self.conversation_history = [
            {"role": "system", "content": system_prompt}
        ]

    @staticmethod
    def _build_user_message(text, image_path=None):
        """Build the "user" message dict: plain text, or text plus an inline base64 image."""
        if image_path is None:
            return {"role": "user", "content": text}

        base64_image = encode_image(image_path)
        # Derive the MIME type from the file extension; keep the previous
        # "image/jpeg" behavior when the extension is unknown or not an image.
        mime_type = mimetypes.guess_type(image_path)[0]
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        return {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": text,
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
                },
            ],
        }

    def send_message(self, text, image_path=None, response_format=None):
        """
        Send a message to the agent and get its reply. The conversation history is automatically updated.

        Args:
            text: The user's text.
            image_path: Optional path of an image file to attach to the message.
            response_format: Optional response format. When given, the request goes
                through `client.beta.chat.completions.parse`; otherwise through
                `client.chat.completions.create`.

        Returns:
            The full response object returned by the client (not just the reply text).

        Raises:
            Any exception raised while reading the image or calling the API. If the
            API call fails, the pending user message is removed from the history
            first so that a retry does not send it twice.
        """
        message = self._build_user_message(text, image_path)

        # Append the user's message
        self.conversation_history.append(message)

        # Call the Chat Completion API
        try:
            if response_format is None:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=self.conversation_history,
                )
            else:
                response = self.client.beta.chat.completions.parse(
                    model=self.model,
                    messages=self.conversation_history,
                    response_format=response_format
                )
        except Exception:
            # Roll back the unanswered user turn so the history stays consistent.
            if self.conversation_history and self.conversation_history[-1] is message:
                self.conversation_history.pop()
            raise

        # Extract the assistant's reply. `content` can be None (for example when the
        # message carries a refusal instead), so fall back to the refusal text or "".
        assistant_message = response.choices[0].message
        reply = assistant_message.content
        if reply is None:
            reply = getattr(assistant_message, "refusal", None) or ""
        reply = reply.strip()

        # Append the assistant's reply to maintain conversation context
        self.conversation_history.append({"role": "assistant", "content": reply})

        return response

    def reset_conversation(self):
        """
        Resets the conversation history while retaining the system prompt.
        """
        system_message = self.conversation_history[0]
        self.conversation_history = [system_message]

    def get_history(self):
        """
        Returns the full conversation history (the agent's own list, not a copy).
        """
        return self.conversation_history


In [3]:
# Create the first agent; the system prompt assigns it a persona
agent1 = MultiModalAgent(
    system_prompt="Suppose you are a college student from Hong Kong from now on. Please answer my questions."
)

In [None]:
# Test the role understanding.
# send_message returns the whole response object, so display only the reply text.
agent1.send_message("Who are you?").choices[0].message.content

"I'm a college student from Hong Kong, studying various subjects and navigating campus life. If you have any questions or need assistance, feel free to ask!"

In [6]:
# Test the image understanding.
# send_message returns the whole response object, so display only the reply text.
agent1.send_message(
    "What's in this image? Which country does this picture symbolize?",
    image_path="osfstorage-archive/Pictures/Chinese Pictures/1.jpg",
).choices[0].message.content

'The image features the logo of the 2008 Beijing Olympics. It symbolizes China, specifically Beijing, which hosted the Summer Olympics that year. The design incorporates elements of traditional Chinese art.'

## 2. Cultural Priming (single agent)

In [4]:
# Define the response format
class Prime_Response(BaseModel):
    # Structured-output schema for the priming questions; it is passed as
    # `response_format` to `send_message` in the cells below.
    # Comments are used instead of a docstring so the schema itself is unchanged.
    image_description: str  # text description of the picture
    rating: int  # liking rating; the prompt asks for 1 (strongly dislike) to 10 (strongly like), the range is not enforced here
    country: str  # country the picture symbolizes, as asked in the prompt

In [5]:
# Ask the two priming questions about one image and request the answer
# in the Prime_Response format
prompt = "How much do you like this picture from 1 (strongly dislike) to 10 (strongly like)? Which country does this picture symbolize?" # Prompt for the two questions
image_path = "osfstorage-archive/Pictures/Chinese Pictures/2.jpg"
response = agent1.send_message(text=prompt, image_path=image_path, response_format=Prime_Response)

# Show each parsed field on its own line
print(
    response.choices[0].message.parsed.image_description,
    response.choices[0].message.parsed.rating,
    response.choices[0].message.parsed.country,
    sep="\n",
)

The picture shows a scenic view of the Great Wall of China, with fortifications stretching across green hills under a clear blue sky.
9
China


In [6]:
# An example of saving a response and loading the saved response
def save_to_json(instance: BaseModel, filename="data.json"):
    """
    Save the response to a json file.

    Args:
        instance: Object whose `model_dump_json(indent=4)` output is written.
        filename: Destination path. Missing parent directories are created, so
            nested paths work on a fresh checkout.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:  # a bare filename has no directory to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(instance.model_dump_json(indent=4))  # Convert to JSON and write to file
    print(f"Saved to {filename}")

def load_from_json(model_class, filename="response.json"):
    """
    Read a JSON file and rebuild an object from it.

    The parsed JSON is passed to `model_class` as keyword arguments, and the
    resulting object is returned.
    """
    with open(filename, "r", encoding="utf-8") as json_file:
        fields = json.loads(json_file.read())
    return model_class(**fields)

# Round trip: write the parsed response to disk, then rebuild it from that file
save_path = "Responses/response1.json"
save_to_json(instance=response.choices[0].message.parsed, filename=save_path)
reponse = load_from_json(model_class=Prime_Response, filename=save_path)
reponse  # display the reloaded object

Saved to Responses/response1.json


Prime_Response(image_description='The picture shows a scenic view of the Great Wall of China, with fortifications stretching across green hills under a clear blue sky.', rating=9, country='China')

In [62]:
# An example of using a loop to prime the agent with multiple Chinese images
# and save each response to its own file
log_dir = "Responses/Chinese/Agent1"
os.makedirs(log_dir, exist_ok=True)  # the output folder does not exist on a fresh checkout
for i in range(1, 3):
    image_path = f"osfstorage-archive/Pictures/Chinese Pictures/{i}.jpg"
    log_path = f"{log_dir}/log{i}.json"
    response = agent1.send_message(prompt, image_path=image_path, response_format=Prime_Response)
    save_to_json(response.choices[0].message.parsed, log_path)

Saved to Responses/Chinese/Agent1/log1.json
Saved to Responses/Chinese/Agent1/log2.json


## 3. Attribution Task

In [9]:
# Define the response format
class Attribution_Response(BaseModel):
    # Response schema for the attribution task: one integer rating and one
    # text reason for each of five questions (q1-q5).
    # NOTE(review): the question wording and the rating scale are not visible in
    # this part of the notebook - confirm against the attribution prompt.
    # Comments are used instead of a docstring so the schema itself is unchanged.
    rating_q1: int
    reason_q1: str
    rating_q2: int
    reason_q2: str
    rating_q3: int
    reason_q3: str
    rating_q4: int
    reason_q4: str
    rating_q5: int
    reason_q5: str

### Internal