<a href="https://colab.research.google.com/github/yongug/Fly_python/blob/main/deep_qlearn_reversi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Required packages & your custom environment

In [None]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/953.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/953.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m952.3/953.9 kB[0m [31m16.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r drive/MyDrive/gym_examples /content

# Imports & constants

In [None]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torchvision.transforms as T

# ---- Global hyper-parameters for the whole notebook ----
# NOTE(review): `seed` is defined, but no RNG is seeded with it in the code shown here.
seed = 42

# Discount factor applied to the bootstrapped next-state value
gamma = 0.99

# Epsilon-greedy exploration schedule: epsilon starts at epsilon_max and the
# action-selection code lowers it step by step, never below epsilon_min.
epsilon_min, epsilon_max = 0.1, 1.0
epsilon = epsilon_max
# Total amount by which epsilon shrinks over the whole schedule
epsilon_interval = epsilon_max - epsilon_min

# Number of transitions drawn from the replay buffer for one update
batch_size = 16
# Bound used by the per-episode step loop, and number of training episodes
max_steps_per_episode = 60
max_episodes = 5000

# Size of the flat action space (one action per cell of the 8 x 8 board)
num_actions = 64

In [None]:
# Replay memory, kept as parallel lists: index i of every list belongs to the
# same stored transition.
state_history, state_next_history = [], []
action_history, action_mask_history = [], []
rewards_history, done_history = [], []
# Total reward of recent episodes (feeds the running-average report)
episode_reward_history = []

# Counters that persist across episodes
frame_count = 0
episode_count = 0
running_reward = 0

# Frames reserved for purely random play
# NOTE(review): only referenced from a commented-out condition in the action-selection code.
epsilon_random_frames = 1000
# Number of action selections over which epsilon is annealed
epsilon_greedy_frames = 10000.0
# Replay memory capacity; the oldest transition is dropped beyond this size
max_memory_length = 500000
# A network update is attempted every `update_after_actions` frames
update_after_actions = 4
# The target network is refreshed every `update_target_network` frames
update_target_network = 10000

# Loading gym environment

In [None]:
# Create the custom Reversi environment shipped in the `gym_examples` package.
# The "module:env_id" form asks Gymnasium to import `gym_examples` first so
# that the environment id is registered before it is looked up.
env = gym.make('gym_examples:gym_examples/Reversi-v0', render_mode="text")

# Preprocessing

In [None]:
def preprocess_state(env_observ):
    """Turn a raw board observation into a one-hot, channels-first float tensor.

    Args:
        env_observ: numpy array of cell codes 0, 1 or 2. Singleton dimensions
            are squeezed away, so the result must be a 2-D board.

    Returns:
        float32 tensor whose first axis holds the 3 one-hot classes, followed
        by the two board axes (3 x 8 x 8 for an 8 x 8 board).
    """
    cells = torch.from_numpy(env_observ).squeeze().to(torch.int64)
    planes = torch.nn.functional.one_hot(cells, num_classes=3)
    # one_hot appends the class axis last; move it to the front
    return planes.permute(2, 0, 1).to(torch.float32)

# Model definition

In [None]:
class QModel(nn.Module):
    """Convolutional Q-network: board planes in, one Q-value per action out.

    Three convolutions (the first two keep the spatial size, the third has no
    padding and shrinks each side by 2) feed two fully connected layers.
    `fc1` expects 1152 = 32 * 6 * 6 features, which corresponds to an
    8 x 8 input board. Dropout (p=0.3) is applied after the second
    convolution and after `fc1`.
    """

    def __init__(self, num_actions):
        """Create the layers; `num_actions` is the size of the output vector."""
        super().__init__()
        self.dropout = nn.Dropout(p=0.3)
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=1, padding='same')
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=1, padding='same')
        self.conv3 = nn.Conv2d(32, 32, kernel_size=3, stride=1)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(1152, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        """Map a (batch, 3, 8, 8) float tensor to (batch, num_actions) Q-values."""
        relu = nn.functional.relu
        features = relu(self.conv2(relu(self.conv1(x))))
        features = relu(self.conv3(self.dropout(features)))
        hidden = self.dropout(relu(self.fc1(self.flatten(features))))
        return self.fc2(hidden)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Online network: predicts the Q-values used to choose actions and is the only
# network updated by the optimizer.
model = QModel(num_actions)
model.to(device)

# Target network: supplies the bootstrapped value of the next state. The
# training loop refreshes its weights from `model` every
# `update_target_network` frames so the regression target stays stable
# between refreshes.
model_target = QModel(num_actions)
model_target.to(device)
# Start from the same weights as the online network; otherwise, until the first
# refresh, the targets come from an unrelated random initialisation.
model_target.load_state_dict(model.state_dict())
# The target network is only ever used for inference: eval mode switches its
# dropout layers off so the targets are deterministic. The mode is not part of
# the state_dict, so later weight refreshes keep it.
model_target.eval()

loss_function = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00025)

In [None]:
device

device(type='cuda', index=0)

# Policies

In [None]:
def get_greedy_epsilon(model, state, mask):
    """Choose an action epsilon-greedily among the valid moves, then decay epsilon.

    Args:
        model: the torch model used to compute action-state values (Q-values);
            called with a batch of one state.
        state: float32 tensor (3 x 8 x 8) as produced by preprocess_state,
            located on the same device as `model`.
        mask: 64-element numpy array; entries equal to 1 mark valid actions.

    Returns:
        int: flat index of the selected action (a plain Python integer on
        both the exploration and the exploitation branch).

    Raises:
        ValueError: on the exploration branch when `mask` marks no valid
            action (raised by np.random.choice on an empty candidate set).

    Side effects:
        Lowers the global `epsilon` by epsilon_interval / epsilon_greedy_frames
        on every call, never below `epsilon_min`.
    """
    global epsilon

    valid = np.asarray(mask) == 1

    #if frame_count < epsilon_random_frames or np.random.rand(1)[0] < epsilon:
    if np.random.rand(1)[0] < epsilon:
        # Explore: uniform choice among the valid actions
        action = int(np.random.choice(np.flatnonzero(valid)))
    else:
        # Exploit: evaluate the network with dropout disabled, then restore
        # whatever mode the model was in so training is unaffected.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                # add a batch axis, compute the Q-values, drop the batch axis
                q_values = model(state.unsqueeze(0)).to('cpu').squeeze(0)
        finally:
            model.train(was_training)

        # Exclude invalid actions outright. Adding a fixed bonus to valid
        # actions instead only works while all Q-values lie within that bonus
        # of each other, and otherwise lets an invalid action win the argmax.
        if valid.any():
            q_values = q_values.masked_fill(~torch.from_numpy(valid), float('-inf'))
        # (with no valid action at all, the plain argmax is returned)
        action = int(torch.argmax(q_values, dim=0))

    # decay epsilon
    epsilon -= epsilon_interval / epsilon_greedy_frames
    epsilon = max(epsilon, epsilon_min)

    return action

In [None]:
def get_greedy_action(model, state, mask):
    """Return the valid action with the highest predicted Q-value.

    Args:
        model: the torch model used to compute Q-values; called with a batch
            of one state.
        state: float32 tensor (3 x 8 x 8) as produced by preprocess_state,
            located on the same device as `model`.
        mask: 64-element numpy array; entries equal to 1 mark valid actions.

    Returns:
        int: flat index of the selected action. When `mask` marks no valid
        action, the unrestricted argmax is returned.
    """
    valid = np.asarray(mask) == 1

    # Evaluate with dropout disabled so the greedy choice is deterministic,
    # then restore the mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # add a batch axis, compute the Q-values, drop the batch axis
            q_values = model(state.unsqueeze(0)).to('cpu').squeeze(0)
    finally:
        model.train(was_training)

    # Exclude invalid actions outright. Adding a fixed bonus to valid actions
    # instead only works while all Q-values lie within that bonus of each
    # other, and otherwise lets an invalid action win the argmax.
    if valid.any():
        q_values = q_values.masked_fill(~torch.from_numpy(valid), float('-inf'))

    return int(torch.argmax(q_values, dim=0))

# Replay Buffer Management

In [None]:
def sample_batch(_batch_size):
    """Draw `_batch_size` distinct transitions uniformly from the replay lists.

    Returns:
        A 6-tuple of numpy arrays, all in the same sampled order:
        states, next states, rewards (float32), actions, action masks and
        done flags (converted to float). The action mask of a transition is
        the mask of valid actions at its *next* state.

    Raises:
        ValueError: from np.random.choice when fewer transitions are stored
            than requested (sampling is without replacement).
    """
    picked = np.random.choice(range(len(done_history)), size=_batch_size, replace=False)

    def take(buffer, convert=lambda item: item):
        # Gather the sampled entries of one replay list, in sampled order
        return [convert(buffer[k]) for k in picked]

    def as_numpy(tensor):
        return tensor.squeeze(0).numpy()

    states = np.array(take(state_history, as_numpy))
    next_states = np.array(take(state_next_history, as_numpy))
    rewards = np.array(take(rewards_history), dtype=np.float32)
    actions = np.array(take(action_history))
    next_masks = np.array(take(action_mask_history))
    dones = np.array(take(done_history, float))

    return states, next_states, rewards, actions, next_masks, dones

In [None]:
def append_history(state, state_next, reward, action, action_mask, done):
    """Append one transition to the module-level replay lists.

    Each argument goes to the end of its own list, so all lists keep the same
    length and index i describes one transition everywhere.
    """
    for buffer, item in (
        (action_history, action),
        (action_mask_history, action_mask),
        (state_history, state),
        (state_next_history, state_next),
        (rewards_history, reward),
        (done_history, done),
    ):
        buffer.append(item)

In [None]:
# Function to update the Q-network
def update_network():
    """Run one DQN optimisation step on a batch drawn from the replay buffers.

    A batch of `batch_size` transitions is sampled with sample_batch. The
    target network provides the best Q-value among the actions that are valid
    in each next state. The online network is then regressed towards
    reward + gamma * best_next_q, where the bootstrap term is dropped for
    transitions flagged as done.

    Reads the module-level `model`, `model_target`, `loss_function`,
    `optimizer`, `device`, `gamma` and `batch_size`; returns nothing.
    """
    # sample a batch of transitions (numpy arrays)
    state_sample, state_next_sample, rewards_sample, action_sample, action_mask_sample, done_sample = \
        sample_batch(batch_size)

    # Convert numpy arrays to PyTorch tensors
    state_sample = torch.tensor(state_sample, dtype=torch.float32).to(device)
    state_next_sample = torch.tensor(state_next_sample, dtype=torch.float32).to(device)
    action_sample = torch.tensor(action_sample, dtype=torch.int64).to(device)
    action_mask_sample = torch.tensor(action_mask_sample, dtype=torch.int64).to(device)
    rewards_sample = torch.tensor(rewards_sample, dtype=torch.float32).to(device)
    done_sample = torch.tensor(done_sample, dtype=torch.float32).to(device)

    # Compute the target Q-values for the states
    with torch.no_grad():
        future_rewards = model_target(state_next_sample)

        # Best next-state Q-value restricted to valid actions (mask == 1).
        # Invalid entries are excluded outright: shifting valid entries by a
        # fixed offset gives wrong values as soon as the Q-values differ by
        # more than that offset.
        valid_next = action_mask_sample == 1
        max_q_values = future_rewards.masked_fill(~valid_next, float('-inf')).max(dim=1).values
        # Rows without any valid next action (e.g. finished games) would be
        # -inf; use 0 so that nothing is bootstrapped and no NaN can arise
        # when the term is multiplied by (1 - done).
        max_q_values = torch.where(
            valid_next.any(dim=1), max_q_values, torch.zeros_like(max_q_values))

        # compute the target q-value
        # if the step was final, max_q_values should not be added
        # NOTE(review): an earlier comment here described a sign-alternating
        # target (the opponent's return negated), but the code has always
        # *added* the bootstrap term, and that is kept. Confirm which form
        # matches the environment's turn handling.
        target_q_values = rewards_sample + gamma * max_q_values * (1. - done_sample)

    # Forward pass of the online network: Q-values of the actions actually taken
    q_values = model(state_sample)
    q_values_action = q_values.gather(dim=1, index=action_sample.unsqueeze(1)).squeeze(1)

    # Compute the loss
    loss = loss_function(q_values_action, target_q_values)

    # Perform the optimization step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Run DQN Training

In [None]:
# Start training from empty replay buffers: every name is rebound to its own
# fresh list.
(action_history, action_mask_history, state_history, state_next_history,
 rewards_history, done_history, episode_reward_history) = ([] for _unused in range(7))

In [None]:
# Main DQN training loop: play `max_episodes` episodes, store every transition
# in the replay buffers, and periodically update the online and target networks.
for _ in range(max_episodes):
    state, info = env.reset()
    state = preprocess_state(state)
    # Flatten the mask so that entry k refers to flat action index k
    action_mask = info['action_mask'].reshape((-1,))
    episode_reward = 0

    # NOTE: range(1, N) yields N - 1 iterations, so an episode is cut off after
    # max_steps_per_episode - 1 agent moves; `timestep` itself is never read.
    for timestep in range(1, max_steps_per_episode):
        frame_count += 1

        # Select an action
        #state_cuda = state.to(device)
        action = get_greedy_epsilon(model,
                      state.to(device),
                      action_mask)
        # Debug guard: dump the mask if a negative action index ever shows up
        if action < 0:
            print(action_mask)

        # Take the selected action; the flat index is split into (row, column).
        # NOTE(review): the 4th value returned by step() (Gymnasium's
        # `truncated` flag) is discarded into `_`, which also rebinds the outer
        # loop variable. That is harmless here because `_` is never read.
        state_next, reward, done, _, info = env.step((action // 8, action % 8))
        state_next = preprocess_state(state_next)
        # From here on `action_mask` describes the valid actions in `state_next`
        action_mask = info['action_mask'].reshape((-1,))

        episode_reward += reward

        # Store the transition in the replay buffer
        # (the stored mask is the next-state mask, which update_network uses
        # to restrict the bootstrap maximum)
        append_history(state, state_next, reward, action, action_mask, done)

        state = state_next

        # Update every `update_after_actions` frames, once the buffer holds
        # more than `batch_size` transitions
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            update_network()

        # Every `update_target_network` frames, copy the online weights into
        # the target network
        if frame_count % update_target_network == 0:
            model_target.load_state_dict(model.state_dict())

        # Limit the state and reward history: drop the oldest transition from
        # every buffer so they stay aligned
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del action_mask_history[:1]
            del done_history[:1]

        if done:
            break

    episode_count += 1
    episode_reward_history.append(episode_reward)

    # Running reward = mean episode reward over (at most) the last 100 episodes
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    if episode_count % 10 == 0:
        print(f"Episode: {episode_count}, Frame count: {frame_count}, Running reward: {running_reward}")

    # Checkpoint: torch.save(model, ...) stores the whole module object, not
    # just its state_dict
    if episode_count % 5000 == 0:
        torch.save(model, 'model.{}'.format(episode_count))
    #if running_reward > 20:
    #    print(f"Solved at episode {episode_count}!")
    #    break


# Save the final model (again the whole module object)
torch.save(model, 'model.final')

  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Episode: 10, Frame count: 299, Running reward: -73.0
Episode: 20, Frame count: 597, Running reward: -50.55
Episode: 30, Frame count: 895, Running reward: -57.766666666666666
Episode: 40, Frame count: 1195, Running reward: -64.525
Episode: 50, Frame count: 1493, Running reward: -64.18
Episode: 60, Frame count: 1791, Running reward: -64.05
Episode: 70, Frame count: 2068, Running reward: -59.4
Episode: 80, Frame count: 2367, Running reward: -60.9875
Episode: 90, Frame count: 2667, Running reward: -59.766666666666666
Episode: 100, Frame count: 2964, Running reward: -55.37
Episode: 110, Frame count: 3263, Running reward: -53.13
Episode: 120, Frame count: 3561, Running reward: -56.47
Episode: 130, Frame count: 3832, Running reward: -51.17
Episode: 140, Frame count: 4132, Running reward: -44.71
Episode: 150, Frame count: 4430, Running reward: -41.37
Episode: 160, Frame count: 4725, Running reward: -39.98
Episode: 170, Frame count: 5025, Running reward: -41.89
Episode: 180, Frame count: 5323, 

AssertionError: 

In [None]:
env.render()

Current state of the board:
    0 1 2 3 4 5 6 7
-------------------
0 | 2 2 2 2 2 2 2 2
1 | 2 2 1 1 1 1 2 2
2 | 2 1 2 2 2 1 2 2
3 | 2 1 2 2 2 2 2 2
4 | 2 2 1 1 1 2 2 2
5 | 2 2 2 2 2 2 2 2
6 | 2 2 2 2 1 1 1 2
7 | 1 . 1 1 1 1 . 1
<class 'str'>


In [None]:
# Save CPU copies of the weights without moving `model` itself. Calling
# `model.cpu()` relocates the module in place, after which feeding it inputs
# on `device` (CUDA) fails with a device-mismatch RuntimeError.
torch.save(
    {name: tensor.detach().cpu() for name, tensor in model.state_dict().items()},
    'model')

In [None]:
!ls

drive  gym_examples  model  sample_data


In [None]:
!cp model drive/MyDrive/SKT

# Evaluation (Agent vs. Gym's random play)

In [None]:
import time, sys
from IPython.display import clear_output

# Play one full game with the greedy agent against the environment's built-in
# opponent, redrawing the board after every move.

# The states below are moved to `device`, so make sure the network is there
# too; a model left on the CPU otherwise fails with a device-mismatch
# RuntimeError.
model.to(device)

board, info = env.reset()
state = preprocess_state(board)
action_mask = info['action_mask'].reshape((-1,))
done = False
env.render()

while not done:
    # int() accepts plain integers as well as 0-d tensors, so the move is
    # printed and passed to the environment as ordinary numbers.
    action = int(get_greedy_action(model, state.to(device), action_mask))
    print("action: ({}, {})".format(action // 8, action % 8))
    sys.stdout.flush()

    # Leave the move visible for a second before the output area is cleared
    time.sleep(1.0)
    clear_output(wait=False)
    # NOTE(review): the 4th value returned by step() (Gymnasium's `truncated`
    # flag) is ignored, so only `done` ends the game.
    board, reward, done, _, info = env.step((action // 8, action % 8))
    state = preprocess_state(board)
    action_mask = info['action_mask'].reshape((-1,))
    env.render()

Current state of the board:
    0 1 2 3 4 5 6 7
-------------------
0 | . . . . . . . .
1 | . . . . . . . .
2 | . . . . . . . .
3 | . . . 1 2 . . .
4 | . . . 2 1 . . .
5 | . . . . . . . .
6 | . . . . . . . .
7 | . . . . . . . .
<class 'str'>


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

# Evaluation (Agent vs. Human)

In [None]:
# Play one full game with the greedy agent, redrawing the board after every move.
# NOTE(review): this cell has the same logic as the previous evaluation cell and
# never asks a human for a move. It also relies on `time`, `sys` and
# `clear_output` having been imported by an earlier cell.

# The states below are moved to `device`, so make sure the network is there
# too; a model left on the CPU otherwise fails with a device-mismatch
# RuntimeError.
model.to(device)

board, info = env.reset()
state = preprocess_state(board)
action_mask = info['action_mask'].reshape((-1,))
done = False
env.render()

while not done:
    # int() accepts plain integers as well as 0-d tensors, so the move is
    # printed and passed to the environment as ordinary numbers.
    action = int(get_greedy_action(model, state.to(device), action_mask))
    print("action: ({}, {})".format(action // 8, action % 8))
    sys.stdout.flush()

    # Leave the move visible for a second before the output area is cleared
    time.sleep(1.0)
    clear_output(wait=False)
    # NOTE(review): the 4th value returned by step() (Gymnasium's `truncated`
    # flag) is ignored, so only `done` ends the game.
    board, reward, done, _, info = env.step((action // 8, action % 8))
    state = preprocess_state(board)
    action_mask = info['action_mask'].reshape((-1,))
    env.render()