In [None]:
!pip install gymnasium

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import gymnasium as gym
import random
import numpy as np 
import tensorflow as tf 
from collections import deque 
from keras.models import Sequential 
from keras.layers import Dense

In [None]:
# Total number of episodes the training loop below iterates over.
EPISODES = 80

In [None]:
class DQNAgent:
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    The agent owns a bounded replay buffer of
    ``(state, action, reward, next_state, done)`` transitions and a small
    fully connected Keras network that outputs one Q-value per action.
    """

    def __init__(self, state_size, action_size):
        """Store the hyperparameters and build the Q-network.

        Args:
            state_size: Number of features in one observation (network input width).
            action_size: Number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer; oldest entries are evicted first
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01  # epsilon is no longer decayed once it is at or below this value
        self.epsilon_decay = 0.995  # multiplicative decay applied once per replay() call
        self.learning_rate = 0.001  # step size handed to the Adam optimizer
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model with two 24-unit ReLU hidden
            layers and a linear output layer of ``action_size`` units,
            trained with MSE loss and Adam.
        """
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; the legacy `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(loss='mse',
                      optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        Args:
            state: Observation before the action.
            action: Index of the action that was taken.
            reward: Reward received for the action.
            next_state: Observation after the action.
            done: Truthy when ``next_state`` must not be bootstrapped from.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Batch of one observation, as accepted by the model
                (the training loop passes shape ``(1, state_size)``).

        Returns:
            A uniformly random action index with probability ``epsilon``,
            otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0: a per-call progress bar would flood the notebook output.
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Fit the network on a random minibatch of stored transitions.

        For each sampled transition the Q-value of the taken action is moved
        towards ``reward`` when ``done`` is set, or towards
        ``reward + gamma * max(Q(next_state))`` otherwise; one ``fit`` call is
        issued per transition. Afterwards ``epsilon`` is multiplied by
        ``epsilon_decay`` if it is still above ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample without replacement.
                If the buffer holds fewer transitions than this, the call is
                a no-op (no training, no epsilon decay).
        """
        if len(self.memory) < batch_size:
            # random.sample would raise ValueError on an under-filled buffer.
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_q_values = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q_values)
            # Start from the current predictions so that only the Q-value of
            # the action actually taken contributes to the loss.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file path ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Write the network weights to the file path ``name``."""
        self.model.save_weights(name)

In [None]:
# --- Training run configuration -------------------------------------------
SEED = 32                  # seeds the environment RNG on the first reset only
MAX_STEPS = 500            # upper bound on steps taken per episode
FAILURE_REWARD = -10       # reward substituted when the episode terminates
SAVE_EVERY = 5             # checkpoint the weights every N episodes
# Keras 3 only accepts weight files whose name ends in ".weights.h5";
# older Keras versions accept this name as well.
WEIGHTS_PATH = "cartpole-dqn.weights.h5"

env = gym.make('CartPole-v1', render_mode='rgb_array')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
# agent.load(WEIGHTS_PATH)
batch_size = 8

for e in range(EPISODES):
    # Seed once: re-seeding on every reset would restart the environment RNG
    # and make every episode begin from the exact same initial state.
    state, reset_info = env.reset(seed=SEED if e == 0 else None)
    state = np.reshape(state, [1, state_size])
    for step in range(MAX_STEPS):
        if e > (EPISODES * 0.75):
            # NOTE(review): with render_mode='rgb_array' render() returns the
            # frame instead of displaying it; the frame is discarded here.
            env.render()
        action = agent.act(state)
        next_state, reward, term, trunk, step_info = env.step(action)
        done = term or trunk
        # Penalise only termination. Truncation means the time limit was
        # reached, which is the best possible outcome and must not be punished.
        if term:
            reward = FAILURE_REWARD
        next_state = np.reshape(next_state, [1, state_size])
        # Store `term` rather than `done`: a truncated episode was merely cut
        # short, so its last transition should still bootstrap from next_state.
        agent.memorize(state, action, reward, next_state, term)
        state = next_state
        if done:
            print("episode: {}/{},score:{}, e: {:.2}"
                  .format(e, EPISODES, step, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
    if e % SAVE_EVERY == 0:
        agent.save(WEIGHTS_PATH)



episode: 0/80,score:11, e: 0.99
episode: 1/80,score:9, e: 0.94
episode: 2/80,score:19, e: 0.86
episode: 3/80,score:23, e: 0.76
episode: 4/80,score:17, e: 0.7
episode: 5/80,score:13, e: 0.66
episode: 6/80,score:14, e: 0.61
episode: 7/80,score:14, e: 0.57
episode: 8/80,score:15, e: 0.53
episode: 9/80,score:10, e: 0.5
episode: 10/80,score:13, e: 0.47
