In [3]:
import gym
import numpy as np

class SailingEnv(gym.Env):
    def __init__(self):
        self.action_space = gym.spaces.Discrete(3)
        self.observation_space = gym.spaces.Box(low=0, high=10, shape=(2,), dtype=np.int64)
        self.wind_direction = np.array([0, 1])
        self.position = np.array([0, 0])
        self.rudder_angle = 0
        self.target = np.array([8.0, 9.0])

    def reset(self):
        self.wind_direction = np.array([0, 1])
        self.position = np.array([0, 0])
        self.rudder_angle = 0
        return self.position

    def step(self, action):
        if action == 0:
            self.rudder_angle = -1
        elif action == 1:
            self.rudder_angle = 0
        else:
            self.rudder_angle = 1
        
        wind_angle = np.arctan2(self.wind_direction[1], self.wind_direction[0])
        boat_velocity = self.wind_direction - 5 * np.array([np.sin(wind_angle + np.deg2rad(self.rudder_angle)), np.cos(wind_angle + np.deg2rad(self.rudder_angle))])
        relative_velocity = boat_velocity - self.wind_direction
        boat_direction = np.arctan2(relative_velocity[1], relative_velocity[0])
        heading_angle = boat_direction + np.deg2rad(self.rudder_angle)
        self.position += boat_velocity.astype(int)
        self.wind_direction = np.array([0, 1], dtype=float)

        reward = np.cos(heading_angle - wind_angle)
        done = self.position[0] >= 10 or self.position[1] >= 10 or self.position[0] < 0 or self.position[1] < 0
        return self.position, reward, done, {}

env = SailingEnv()

# Q-learning algorithm
num_episodes = 1000
alpha = 0.5
gamma = 0.9
epsilon = 0.1
#q_table will have 10X10X4 cells
q_table = np.random.uniform(low=-2, high=0, size=([10,10] + [env.action_space.n]))
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPSILON=0.9

for i in range(num_episodes):
    done = False
    episode_reward = 0
    observation = env.reset()
    
    while not done:
        if np.random.random() < epsilon:
            # take a random action with probability epsilon
            action = env.action_space.sample()
        else:
            # choose the action with the highest Q-value
            action = np.argmax(q_table[observation[0], observation[1], :])
        
        next_observation, reward, done, _ = env.step(action)
        next_action = np.argmax(q_table[next_observation[0], next_observation[1], :])
        
        # update Q-value using Q-learning update rule
        q_table[observation[0], observation[1], action] += alpha * (reward + gamma * q_table[next_observation[0], next_observation[1], next_action] - q_table[observation[0], observation[1], action])
        observation = next_observation
        episode_reward += reward
    
    if (i + 1) % 100 == 0:
        print(f"Episode {i+1}/{num_episodes}, Reward: {episode_reward}")
        
print("Training complete.")
print("Final Q-table:")
print(q_table)



Episode 100/1000, Reward: -1.8369701987210297e-16
Episode 200/1000, Reward: -1.8369701987210297e-16
Episode 300/1000, Reward: -1.8369701987210297e-16
Episode 400/1000, Reward: -1.8369701987210297e-16
Episode 500/1000, Reward: -1.8369701987210297e-16
Episode 600/1000, Reward: -1.8369701987210297e-16
Episode 700/1000, Reward: -1.8369701987210297e-16
Episode 800/1000, Reward: -1.8369701987210297e-16
Episode 900/1000, Reward: -1.8369701987210297e-16
Episode 1000/1000, Reward: -1.8369701987210297e-16
Training complete.
Final Q-table:
[[[-1.07383553e+00 -1.03902092e-01 -8.79775665e-01]
  [-1.96599595e+00 -1.83907501e+00 -1.33296595e+00]
  [-7.32455449e-01 -1.90879683e+00 -6.31284365e-01]
  [-5.80859749e-01 -2.73254241e-01 -1.36776711e-02]
  [-1.08758387e+00 -1.37698212e+00 -7.65619409e-02]
  [-1.82147000e+00 -5.06613582e-01 -1.95780735e+00]
  [-4.75534790e-01 -6.64356370e-02 -1.17881811e-02]
  [-1.67323887e+00 -1.80942689e+00 -9.31844381e-01]
  [-1.61376664e-01 -1.17437777e+00 -4.35036897e-0

AttributeError: 'SailingEnv' object has no attribute 'current_step'

In this specific implementation, the agent randomly chooses actions using the env.action_space.sample() method. So, the output will differ each time you run the code. The important thing is to observe whether the total episode reward increases or not as the agent learns.

 Q-learning algorithm inside the main loop of your original code. The Q-table is initialized with zeros