At every step the reward is the cosine of the angle between the boat's heading and the wind direction, so the reward is largest when the two are aligned.

In [3]:
import gym
from gym import spaces
import numpy as np

class SailingEnv(gym.Env):
    """Toy sailing environment on an integer grid.

    Actions (``Discrete(3)``) set the rudder angle in degrees:
    ``0 -> -1``, ``1 -> 0``, anything else ``-> +1``.

    The observation is the boat's integer ``(x, y)`` position.  The reward
    of a step is ``cos(heading_angle - wind_angle)``.  An episode ends as
    soon as either coordinate is ``< 0`` or ``>= 10``.

    This class follows the classic Gym API: ``reset()`` returns only the
    observation and ``step()`` returns ``(observation, reward, done, info)``.

    NOTE(review): the declared observation space is ``[0, 10]`` but the
    terminal observation lies outside it (e.g. a negative x) - confirm
    whether the bounds or the termination rule should change.
    """

    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(low=0, high=10, shape=(2,), dtype=np.int64)
        self._reset_state()

    def _reset_state(self):
        """Put wind, position and rudder back to their initial values."""
        # Float from the start so the dtype does not flip from int to float
        # after the first call to step().
        self.wind_direction = np.array([0, 1], dtype=float)
        # Matches the dtype declared in observation_space.
        self.position = np.array([0, 0], dtype=np.int64)
        self.rudder_angle = 0

    def reset(self):
        """Start a new episode and return the initial position.

        Returns:
            A fresh ``(2,)`` integer array; it is a copy, so later steps do
            not modify it.
        """
        self._reset_state()
        return self.position.copy()

    def step(self, action):
        """Apply ``action``, move the boat and score the new heading.

        Args:
            action: ``0`` sets the rudder to -1 degree, ``1`` to 0 degrees,
                any other value to +1 degree.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is a
            copy of the position, ``reward`` is a Python float, ``done`` is a
            Python bool and ``info`` is an empty dict.
        """
        if action == 0:
            self.rudder_angle = -1
        elif action == 1:
            self.rudder_angle = 0
        else:
            self.rudder_angle = 1

        rudder_rad = np.deg2rad(self.rudder_angle)
        wind_angle = np.arctan2(self.wind_direction[1], self.wind_direction[0])
        drive_angle = wind_angle + rudder_rad

        boat_velocity = self.wind_direction - 5 * np.array([np.sin(drive_angle), np.cos(drive_angle)])
        relative_velocity = boat_velocity - self.wind_direction
        boat_direction = np.arctan2(relative_velocity[1], relative_velocity[0])
        heading_angle = boat_direction + rudder_rad

        # NOTE(review): astype(int) truncates toward zero, so a y-velocity of
        # 0.9999999999999997 (rudder 0, because cos(pi/2) is not exactly 0)
        # becomes 0 - confirm whether rounding was intended.
        # NOTE(review): with wind (0, 1) the x-velocity is about -5 for every
        # action, so the first step already leaves the grid and each episode
        # lasts a single step - confirm the intended physics.
        self.position += boat_velocity.astype(int)
        self.wind_direction = np.array([0, 1], dtype=float)

        reward = float(np.cos(heading_angle - wind_angle))
        x, y = self.position
        done = bool(x >= 10 or y >= 10 or x < 0 or y < 0)
        # Return a copy: position is updated in place above, and handing out
        # the live array would rewrite observations the caller already stored.
        return self.position.copy(), reward, done, {}

# Roll out episodes with randomly sampled actions, logging every transition
# and the summed reward of each episode.
env = SailingEnv()
num_episodes = 10000

for i in range(num_episodes):
    # Fresh episode: reset the boat and clear the per-episode accumulators.
    observation = env.reset()
    done, episode_reward = False, 0
    print(f"\nEpisode {i+1}")
    print(f"Initial Location: {observation}, Wind Direction: {env.wind_direction}, Rudder Angle: {env.rudder_angle}")

    while not done:
        # Sample an action from the action space and advance one step.
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        episode_reward = episode_reward + reward
        print(f"Location: {observation}, Wind Direction: {env.wind_direction}, Rudder Angle: {env.rudder_angle}")

    print(f"Total episode reward: {episode_reward}")
