In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os
import gc
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf
from tensorflow.keras import optimizers, callbacks, losses, layers
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input
from tensorflow.keras.models import Model, Sequential, load_model

# Seed every random number generator this notebook uses so runs are repeatable.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here most likely only affects child processes - confirm if hash order matters.
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured identically on every visible GPU,
        # so enable it for all of them instead of only the first device.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set at program start, before the GPUs are initialized.
        print(e)

def mish(x):
    """Mish activation: ``x * tanh(softplus(x))``."""
    gate = tf.math.tanh(tf.math.softplus(x))
    return x * gate

In [3]:
def DenseBlock(n, activation='relu', bn=False):
    """Build a ``Dense -> [BatchNormalization] -> Activation`` stack.

    Args:
        n: Number of units of the Dense layer.
        activation: Anything accepted by ``Activation`` (name or callable).
        bn: When true, insert ``BatchNormalization`` between the Dense layer
            and the activation.

    Returns:
        A one-argument callable. Each call creates fresh layers and applies
        them to the given tensor, returning the activated output.
    """
    def apply(tensor):
        out = Dense(n, kernel_initializer='he_normal')(tensor)
        out = BatchNormalization()(out) if bn else out
        return Activation(activation)(out)

    return apply

In [4]:
# gamma = 0.9  # Discount factor for past rewards
# epsilon = 1.0  # Epsilon greedy parameter
# epsilon_min = 0.1  # Minimum epsilon greedy parameter
# epsilon_max = 1.0  # Maximum epsilon greedy parameter
# epsilon_interval = (
#     epsilon_max - epsilon_min
# )  # Rate at which to reduce chance of random action being taken
# batch_size = 32  # Size of batch taken from replay buffer
# max_steps_per_episode = 500

In [5]:
import gym

# Create the CartPole environment and seed it with the notebook-wide SEED.
env = gym.make("CartPole-v1")
env.seed(SEED)

# Network dimensions derived from the environment:
# num_actions - size of the action space (one Q-value output per action)
# num_states  - length of the first axis of an observation (network input size)
num_actions = env.action_space.n
num_states = env.observation_space.shape[0]

In [6]:
env.observation_space.shape

(4,)

In [7]:
from collections import deque

class Memory:
    """Bounded replay buffer that keeps each transition field in its own deque.

    All deques share the same ``maxlen``; once ``capacity`` transitions have
    been stored, appending a new one drops the oldest.
    """

    def __init__(self, capacity):
        # Position i in every deque belongs to the same transition.
        self.states = deque(maxlen=capacity)
        self.actions = deque(maxlen=capacity)
        self.rewards = deque(maxlen=capacity)
        self.states_nexts = deque(maxlen=capacity)
        self.dones = deque(maxlen=capacity)

    def store(self, obs):
        """Append one transition given as ``(s, a, r, s', done)``."""
        fields = (self.states, self.actions, self.rewards,
                  self.states_nexts, self.dones)
        for position, buffer in enumerate(fields):
            buffer.append(obs[position])

    def extract(self, batch_size):
        """Sample ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A list ``[states, actions, rewards, next_states, dones]`` of NumPy
            arrays; row k of every array comes from the same transition.

        Raises:
            ValueError: propagated from ``np.random.choice`` when
                ``batch_size`` exceeds the number of stored transitions.
        """
        population = range(self.get_len())
        picked = np.random.choice(population, batch_size, replace=False)
        fields = (self.states, self.actions, self.rewards,
                  self.states_nexts, self.dones)
        return [np.array(buffer)[picked] for buffer in fields]

    def get_len(self):
        """Return the number of transitions currently stored."""
        return len(self.actions)

In [8]:
class Agent:
    """DQN agent with an online network, a target network and an epsilon-greedy policy.

    ``net`` is the network being trained; ``target_net`` starts as a copy of it
    and is only used to compute bootstrap targets. This class never refreshes
    ``target_net`` after construction - the caller is expected to copy the
    weights over (``target_net.set_weights(net.get_weights())``) when desired.

    Epsilon decays linearly from ``eps_max`` to ``eps_min`` over ``eps_steps``
    *training* steps (calls to ``train``), not environment steps.

    Args:
        memory: Replay buffer exposing ``extract(batch_size)`` that returns
            ``[states, actions, rewards, next_states, dones]``.
        num_states: Length of the observation vector (network input size).
        num_actions: Number of discrete actions (network output size).
        gamma: Discount factor applied to the bootstrapped value.
        eps_max: Initial exploration rate.
        eps_min: Floor of the exploration rate.
        eps_steps: Number of training steps over which epsilon is annealed.
        learning_rate: Adam learning rate.
    """

    def __init__(self, memory, num_states=num_states, num_actions=num_actions,
                 gamma=0.95, eps_max=1.0, eps_min=0.01, eps_steps=250,
                 learning_rate=1e-3):
        self.gamma = gamma
        self.memory = memory
        self.num_actions = num_actions

        self.eps = eps_max
        self.eps_max = eps_max
        self.eps_min = eps_min
        self.eps_interval = self.eps_max - self.eps_min
        self.eps_steps = eps_steps
        self.steps = 0  # number of completed training updates

        self.net = self.build_net(num_states)
        self.target_net = self.build_net(num_states)
        self.target_net.set_weights(self.net.get_weights())

        self.opt = optimizers.Adam(learning_rate)
        self.loss_fn = losses.MeanSquaredError()

    def build_net(self, num_states):
        """Build the Q-network: two ReLU dense blocks and one linear output per action."""
        inputs = Input(shape = (num_states, ))
        x = DenseBlock(64)(inputs)
        x = DenseBlock(32)(x)

        # Output width follows the action space instead of a hard-coded 2.
        outputs = Dense(self.num_actions)(x)
        return Model(inputs, outputs)

    def train(self, batch_size):
        """Run one gradient step on a batch sampled from the replay memory."""
        obs = self.memory.extract(batch_size)

        # Cast explicitly so the arithmetic below never mixes dtypes, whatever
        # the buffer happened to store (float64 states, bool/float flags, ...).
        state = tf.cast(obs[0], tf.float32)
        action = tf.cast(obs[1], tf.int32)
        reward = tf.cast(obs[2], tf.float32)
        state_next = tf.cast(obs[3], tf.float32)
        done = tf.cast(obs[4], tf.float32)

        # Call the model directly: ``predict`` is meant for large inputs and
        # adds noticeable per-call overhead when invoked on one small batch
        # inside a training loop.
        future_q = self.target_net(state_next, training=False)
        updated_q_values = reward + self.gamma*tf.reduce_max(future_q, axis=1)
        # Terminal transitions (done == 1) get a fixed target of -1: the
        # bootstrapped value, including the stored reward, is zeroed out and
        # replaced by the penalty.
        updated_q_values = updated_q_values * (1. - done) - done

        # One-hot mask selecting the Q-value of the action actually taken.
        masks = tf.one_hot(action, self.num_actions)

        with tf.GradientTape() as tape:
            q_values = self.net(state)
            q_actions = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)

            loss = self.loss_fn(updated_q_values, q_actions)

        grads = tape.gradient(loss, self.net.trainable_weights)
        self.opt.apply_gradients(zip(grads, self.net.trainable_weights))

        self.steps += 1

    def policy(self, state):
        """Pick an action epsilon-greedily for a batched single state.

        Args:
            state: Input accepted by ``self.net`` holding one observation
                (the caller adds the leading batch axis).

        Returns:
            The chosen action index as a Python ``int``.
        """
        rn = np.random.random()
        if rn < self.eps:
            action = np.random.randint(0, self.num_actions)
        else:
            action = int(np.argmax(self.net(state)))

        # Linear decay driven by the training-step counter, clipped at eps_min.
        self.eps = max(self.eps_min, self.eps_max-self.eps_interval*self.steps/self.eps_steps)
        return action

In [9]:
n_episode = 150

batch_size = 32
max_episode_steps = 200  # an episode is cut off once its step counter exceeds this
ep_reward_list = []

memory = Memory(2000)
agent = Agent(memory)

for e in range(n_episode):
    state = env.reset()
    episode_reward = 0.0

    steps = 0
    while True:
        # The network expects a leading batch axis.
        state_ = tf.expand_dims(tf.convert_to_tensor(state), 0)
        action = agent.policy(state_)

        state_next, reward, done, _ = env.step(action)
        # Store plain NumPy float32 scalars rather than TF tensors: the buffer
        # is converted with np.array() on every training step, and converting
        # thousands of EagerTensor objects one by one is needlessly slow.
        reward = np.float32(reward)
        agent.memory.store((state, action, reward, state_next, np.float32(done)))

        episode_reward += float(reward)

        state = state_next

        # Start learning once the buffer holds more than one batch.
        if agent.memory.get_len() > batch_size:
            agent.train(batch_size)

        if done or steps > max_episode_steps:
            # Sync the target network once per episode.
            agent.target_net.set_weights(agent.net.get_weights())
            break

        steps += 1
    ep_reward_list.append(episode_reward)
    print(f'episode {e} reward: {episode_reward}, {agent.eps}, {agent.steps}')

episode 0 reward: 29.0, 1.0, 0
episode 1 reward: 8.0, 0.98416, 5
episode 2 reward: 17.0, 0.91684, 22
episode 3 reward: 21.0, 0.83368, 43
episode 4 reward: 11.0, 0.79012, 54
episode 5 reward: 19.0, 0.71488, 73
episode 6 reward: 19.0, 0.63964, 92
episode 7 reward: 14.0, 0.5842, 106
episode 8 reward: 9.0, 0.5485599999999999, 115
episode 9 reward: 16.0, 0.4852000000000001, 131
episode 10 reward: 20.0, 0.406, 151
episode 11 reward: 11.0, 0.3624400000000001, 162
episode 12 reward: 10.0, 0.32284, 172
episode 13 reward: 12.0, 0.27532, 184
episode 14 reward: 11.0, 0.23175999999999997, 195
episode 15 reward: 10.0, 0.19216, 205
episode 16 reward: 11.0, 0.14860000000000007, 216
episode 17 reward: 10.0, 0.10899999999999999, 226
episode 18 reward: 10.0, 0.06940000000000002, 236
episode 19 reward: 9.0, 0.03376000000000001, 245
episode 20 reward: 10.0, 0.01, 255
episode 21 reward: 10.0, 0.01, 265
episode 22 reward: 10.0, 0.01, 275
episode 23 reward: 10.0, 0.01, 285
episode 24 reward: 10.0, 0.01, 295
e