2023 Takahiro Shinozaki @ Tokyo Tech

Contributors: Ryota Komatsu, Zhou Zehua, Tingyuan Zhu, Kota Kawakita

Notebook for Google Colab


Learning Task:

We (i.e., the environment) show a picture to the agent. The picture contains one to three food objects. If the agent correctly answers the number of objects, it gets a positive reward.

Agent:
We use Deep Q-Network (DQN) to implement the agent. DQN is a Q-learning method, which is a value-based method. It learns the action-value function implemented by a neural network that estimates the value of taking a discrete action at a state. At each state, we can find the best action from the action-value function as the action that gives the largest value.

Stable Baselines3 (SB3) is an RL library. We use its DQN implementation.

In [None]:
%pip install stable-baselines3

In [None]:
from glob import glob
import os
import random
from typing import Tuple
import zipfile

import gymnasium as gym
from huggingface_hub import hf_hub_download
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from stable_baselines3.dqn.dqn import DQN
from stable_baselines3.common.evaluation import evaluate_policy
import warnings
warnings.filterwarnings("ignore")

## Download food images

In [None]:
# Directory the archive is unpacked into (the current working directory).
extract_dir = "./"

# Fetch the zipped image dataset from the Hugging Face Hub;
# hf_hub_download returns the local path of the downloaded file.
file_path = hf_hub_download(
    repo_id="tttslab/spolacq_dataset",
    filename="./data.zip",
    repo_type="dataset",
)

# Unpack every member of the archive into extract_dir.
with zipfile.ZipFile(file_path, mode="r") as zip_ref:
    zip_ref.extractall(path=extract_dir)

# Only the extracted files are used from here on, so drop the archive.
os.remove(file_path)

## Environment



We implement the environment using Gymnasium (the maintained successor to OpenAI Gym)

In [None]:
class Environment(gym.Env):
    """Single-step counting task: show one food image, reward the right count.

    Every episode consists of exactly one step. The observation is an RGB
    image containing one to three food objects, and the action is the agent's
    answer for how many objects are shown. Action ``a`` (0, 1, or 2) means
    "the picture contains ``a + 1`` objects". A correct answer gives reward 1,
    a wrong answer gives reward 0.

    Args:
        food (str): one of foods: "apple", "banana", "carrot", "cherry", "cucumber",
        "egg", "eggplant", "green_pepper", "hyacinth_bean", "kiwi_fruit",
        "lemon", "onion", "orange", "potato", "sliced_bread", "small_cabbage",
        "strawberry", "sweet_potato", "tomato", and "white_radish".
        split (str): dataset split. "train" or "test".

    Raises:
        ValueError: if ``split`` is neither "train" nor "test".
        FileNotFoundError: if no image is found for the given food and split.
    """

    def __init__(self, food: str = "apple", split: str = "train"):
        super().__init__()
        # Three possible answers: action k stands for "k + 1 foods are shown".
        self.action_space = gym.spaces.Discrete(3)
        # RGB image. The dataset images are assumed to be 224x224 already;
        # they are not resized here.
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(224, 224, 3), dtype=np.uint8)

        # Raise instead of assert: asserts are stripped when Python runs with -O.
        if split not in ("train", "test"):
            raise ValueError("dataset split must be 'train' or 'test'")

        # List of (image array, label) pairs, where label = number of foods - 1
        # so that it can be compared directly with the action.
        self.dataset = []
        for num_of_foods in range(1, 4):
            pattern = f"data/dataset/{food}/{split}_number{num_of_foods}/group*_*.jpg"
            # glob() returns files in arbitrary order; sorting keeps the dataset
            # order (and therefore seeded sampling) reproducible.
            for path in sorted(glob(pattern)):
                # The context manager closes the file handle once the pixels
                # have been copied into the array.
                with Image.open(path) as image:
                    # Force three channels to match the observation space.
                    pixels = np.array(image.convert("RGB"))
                self.dataset.append((pixels, num_of_foods - 1))

        if not self.dataset:
            raise FileNotFoundError(
                f"no images found under 'data/dataset/{food}/{split}_number*/'; "
                "check the food name and that the dataset has been extracted"
            )

        self.reset()

    def _sample(self) -> np.ndarray:
        """Pick a random dataset entry, store its label, and return its image."""
        # self.np_random is the environment's own generator, seeded by reset(seed=...).
        index = int(self.np_random.integers(len(self.dataset)))
        state, num_of_foods = self.dataset[index]
        self.correct_answer = num_of_foods
        return state

    def reset(self, seed: int | None = None, options: dict | None = None) -> Tuple[np.ndarray, dict]:
        """Start a new episode by drawing a random image.

        Args:
            seed: if given, re-seeds the environment's random generator so the
                following image draws are reproducible.
            options: accepted for compatibility with the Gymnasium API; unused.

        Returns:
            The image to be shown to the agent and an empty info dict.
        """
        super().reset(seed=seed)
        return self._sample(), {}

    def step(self, action) -> Tuple[np.ndarray, int, bool, bool, dict]:
        """Score the agent's answer and end the episode.

        Args:
            action: the agent's answer (0, 1, or 2, i.e. number of foods - 1).

        Returns:
            A tuple ``(new_state, reward, terminated, truncated, info)``.
            ``reward`` is 1 for a correct answer and 0 otherwise.
            ``terminated`` is always True because an episode is a single
            question; ``truncated`` is always False because the episode is
            never cut short by a time limit.
        """
        reward = 1 if action == self.correct_answer else 0

        # Update state
        new_state = self._sample()

        return new_state, reward, True, False, {}

In [None]:
# Food category used for the training environment
food_name = "apple"

# Environment that serves images from the training split
env = Environment(food=food_name, split="train")

# DQN agent with a CNN policy, since the observations are images.
agent = DQN(
    policy="CnnPolicy",
    env=env,
    buffer_size=100,    # size of the replay buffer
    learning_starts=0,  # no warm-up steps before learning begins
    verbose=1,          # 1: print information during training, 0: print nothing
)

### initialized agent

In [None]:
# Initialize state: draw a random image from the training environment and show it
state, info = env.reset()
plt.imshow(state)

# Agent gets an environment state and returns a decided action.
# learn() has not been called yet at this point, so the answer comes from
# the freshly initialized network. The second return value is not used here.
action, _ = agent.predict(state, deterministic=True)
# Actions are 0-based, so add 1 to print the number of foods the agent answered
print(f"Agent's answer: {action+1}")

# Environment gets an action from the agent, proceeds the time step,
# and returns the new state and reward etc.
# The reward is 1 for a correct answer and 0 otherwise.
state, reward, terminated, truncated, info = env.step(action)
print(f"Reward: {reward}")

In [None]:
# Train the agent for 2000 environment steps (each step is one image/answer pair)
agent.learn(total_timesteps=2000)

## Test agent

In [None]:
# Environment for the same food, but serving images from the test split
test_env = Environment(food = food_name, split="test")

### Run the following cell several times

In [None]:
# Initialize state: draw a random image from the test environment and show it
state, info = test_env.reset()
plt.imshow(state)

# Agent gets an environment state and returns a decided action.
# The second return value is not used here.
action, _ = agent.predict(state, deterministic=True)
# Actions are 0-based, so add 1 to print the number of foods the agent answered
print(f"Agent's answer: {action+1}")

# Environment gets an action from the agent, proceeds the time step,
# and returns the new state and reward etc.
# The reward is 1 for a correct answer and 0 otherwise.
state, reward, terminated, truncated, info = test_env.step(action)
print(f"Reward: {reward}")

### Evaluation

In [None]:
# evaluate_policy() runs the agent for n_eval_episodes episodes and returns the
# mean and std of the episode rewards of our trained model.
# Each episode is a single question rewarded with 0 or 1, so the mean reward
# is the fraction of correct answers.
# Evaluate on test_env (test split): evaluating on the training environment
# would only measure performance on the images the agent was trained on.

mean_reward, std_reward = evaluate_policy(agent, test_env, n_eval_episodes=20, render=False)
test_env.close()
env.close()
print(f"Mean reward: {mean_reward}")
print(f"Std reward: {std_reward}")
