### CartPole DQN

In [1]:
import numpy as np
import gym
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

In [2]:
class DQLAgent():
    """Deep Q-learning agent with a dense Q-network and a replay memory.

    The agent picks actions epsilon-greedily, stores transitions in a
    bounded deque and trains its Keras model on random minibatches drawn
    from that memory.
    """

    def __init__(self, env):
        """Read the problem dimensions from *env* and build the Q-network.

        Args:
            env: Environment exposing ``observation_space.shape`` and a
                discrete ``action_space`` with an ``n`` attribute.
        """
        # input size
        self.state_size = env.observation_space.shape[0]

        # output size
        self.action_size = env.action_space.n

        self.gamma = 0.95  # discount factor applied to the next state's best Q-value
        self.learning_rate = 0.001
        self.epsilon = 1  # initial exploration probability
        self.epsilon_decay = 0.995  # multiplicative decay of the exploration probability
        self.epsilon_min = 0.01  # lower bound for epsilon
        self.memory = deque(maxlen=1000)  # oldest transitions are dropped when full

        self.model = self.build_model()

    def build_model(self):
        """Return a compiled network mapping a state to one Q-value per action."""
        model = Sequential()
        model.add(Dense(48, input_dim=self.state_size, activation='tanh'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Input for ``self.model.predict``; only row 0 of the
                prediction is used, so a batch of one state is expected.

        Returns:
            A random action index with probability ``epsilon`` (exploration),
            otherwise the index of the highest predicted Q-value (exploitation).
        """
        if random.uniform(0, 1) <= self.epsilon:
            # Sample from the agent's own action count rather than a
            # module-level ``env`` so the class does not depend on globals.
            return random.randrange(self.action_size)

        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the model on *batch_size* transitions sampled from memory.

        Does nothing until the memory holds at least *batch_size* transitions.
        Each sampled transition is fitted individually, so the model is
        updated between samples.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            if done:
                # No future reward after a terminal transition.
                target = reward
            else:
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)

            # Only the taken action's Q-value is moved toward the target;
            # the other outputs keep their current predictions.
            train_target = self.model.predict(state, verbose=0)
            train_target[0][action] = target
            self.model.fit(state, train_target, verbose=0)

    def adaptiveEGreedy(self):
        """Decay epsilon by ``epsilon_decay`` without going below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [3]:
from gymnasium.experimental.wrappers import RecordVideoV0
from IPython.display import Video

# Train the DQN agent on CartPole while recording episodes to ./video.
# render_mode='rgb_array' makes the environment produce frames for the recorder.
env = gym.make('CartPole-v1', render_mode='rgb_array')
agent = DQLAgent(env)

batch_size = 16   # transitions sampled per replay call
episodes = 20     # number of training episodes
max_steps = 200   # per-episode step cap

# NOTE(review): the trigger returns the episode index itself, so episode 0 is
# presumably skipped (0 is falsy) and every later episode is recorded — confirm
# against the wrapper's documentation if episode 0 is wanted as well.
env = RecordVideoV0(env, "video", name_prefix="cartpole-dqn", disable_logger=True, episode_trigger=lambda x: x)

#env.reset(seed=42)

try:
    for e in range(episodes):
        state = env.reset()[0]
        state = np.reshape(state, [1, agent.state_size])
        steps = 0
        for i in range(max_steps):
            action = agent.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = np.reshape(next_state, [1, agent.state_size])
            # Only true termination is stored as `done`: a time-limit
            # truncation should still bootstrap from the next state.
            agent.remember(state, action, reward, next_state, terminated)
            state = next_state
            agent.replay(batch_size)
            agent.adaptiveEGreedy()
            steps += 1
            if i % 10 == 0:
                print(i, end=' ')
            if terminated or truncated:
                break
        # Printed for every episode, including those that hit the step cap.
        print('\nepisode: {}, time: {}'.format(e, steps))
finally:
    # Always close so the recorder finalizes its video files even on error.
    env.close()

  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


0 10 
episode: 0, time: 19
0 10 
episode: 1, time: 11
0 10 
episode: 2, time: 17
0 10 20 
episode: 3, time: 23
0 10 
episode: 4, time: 20
0 10 
episode: 5, time: 20
0 10 
episode: 6, time: 18
0 10 20 30 
episode: 7, time: 33
0 10 20 30 
episode: 8, time: 36
0 10 20 30 40 50 60 70 80 
episode: 9, time: 90
0 10 
episode: 10, time: 19
0 10 
episode: 11, time: 12
0 
episode: 12, time: 9
0 
episode: 13, time: 9
0 10 
episode: 14, time: 13
0 10 
episode: 15, time: 14
0 
episode: 16, time: 10
0 10 
episode: 17, time: 12
0 
episode: 18, time: 10
0 10 
episode: 19, time: 11


In [4]:
# Embed the recording of an early training episode in the notebook output.
# The path combines the "video" folder and the "cartpole-dqn" name prefix
# passed to the recording wrapper in the training cell.
Video("video/cartpole-dqn-episode-1.mp4", embed=True)

In [5]:
# Embed the recording of the final training episode (index 19 of 20) so it
# can be compared with the early-episode video.
Video("video/cartpole-dqn-episode-19.mp4", embed=True)