In [1]:
import random
from collections import deque

import gym
import tensorflow as tf
import os
import numpy as np
import pygame



In [2]:
# Process-wide environment flags, set together in one call.
# NOTE(review): presumably meant to quieten TensorFlow's native logging and to
# pin Python's hash seed; tensorflow is already imported in the first cell and
# the interpreter is already running, so confirm that setting them here still
# has the intended effect.
os.environ.update(
    {
        'TF_CPP_MIN_LOG_LEVEL': '3',
        'PYTHONHASHSEED': '0',
    }
)

In [3]:
# Seed Python's `random` module and TensorFlow's global generator with the
# same fixed value.
random.seed(100)
tf.random.set_seed(100)

# Module-level Adam optimizer; Agent._create_model passes it to model.compile.
opt = tf.keras.optimizers.legacy.Adam(learning_rate=1e-4)

In [4]:
class Agent:
    """Deep Q-learning agent for the gym ``CartPole-v1`` environment.

    The agent keeps a bounded replay memory of transitions, picks actions
    epsilon-greedily from a small dense Q-network, and after each training
    episode fits the network on a random mini-batch drawn from that memory.
    """

    def __init__(self):
        self.epsilon = 1.0  # probability of taking a random (exploratory) action
        self.epsilon_decay = 0.9975  # multiplicative decay applied per replay
        # Lower bound for epsilon. It has to be below the initial epsilon,
        # otherwise the decay in replay() never fires and the agent keeps
        # acting purely at random no matter how long it trains.
        self.epsilon_min = 0.1
        self.memory = deque(maxlen=2000)  # replay memory of past transitions
        self.batch_size = 32  # number of transitions sampled per replay
        self.gamma = 0.9  # discount applied to the predicted future reward
        self.treward = []  # episode lengths (frames survived), one per episode
        self.max_treward = 0  # best episode length seen so far
        # self.env = gym.make("CartPole-v1", render_mode="human")
        self.env = gym.make("CartPole-v1")
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the Q-network (4 inputs -> 24 -> 24 -> 2 outputs).

        The network is compiled with MSE loss and the module-level ``opt``
        optimizer.
        """
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(24, activation="relu", input_shape=(4,)))
        model.add(tf.keras.layers.Dense(24, activation="relu"))
        model.add(tf.keras.layers.Dense(2, activation="linear"))
        model.compile(loss="mse", optimizer=opt)
        return model

    def _q_values(self, state):
        """Return the model's prediction for one state as a (1, n) array."""
        # verbose=0 belongs to predict(): it suppresses the per-call progress
        # bar that would otherwise be printed on every environment step.
        return self.model.predict(state.reshape(1, -1), verbose=0)

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``self.epsilon`` a random action is sampled from the
        environment's action space; otherwise the index of the largest
        predicted Q-value is returned.
        """
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        return int(np.argmax(self._q_values(state)[0]))

    def replay(self):
        """Fit the model on a random mini-batch from memory, then decay epsilon.

        Does nothing when fewer than ``batch_size`` transitions are stored,
        because ``random.sample`` cannot draw a larger sample than the
        population.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, next_state, reward, done in batch:
            if not done:
                # Bootstrap: add the discounted best predicted value of the
                # next state to the immediate reward.
                reward += self.gamma * np.amax(self._q_values(next_state)[0])
            # Only the entry of the action actually taken is moved towards
            # the new estimate; the other outputs keep their predictions.
            target = self._q_values(state)
            target[0, action] = reward
            self.model.fit(state.reshape(1, -1), target, epochs=2, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train(self, episodes):
        """Run ``episodes`` training episodes, replaying after each one.

        Each episode is capped at 4999 steps. The episode length is appended
        to ``self.treward`` when the environment reports termination or
        truncation.
        """
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            for f in range(1, 5000):
                action = self.act(state)
                next_state, reward, done, trunc, _ = self.env.step(action)
                self.memory.append([state, action, next_state, reward, done])
                state = next_state
                if done or trunc:
                    self.treward.append(f)
                    self.max_treward = max(self.max_treward, f)
                    print(f"max reward: {self.max_treward}, current frame reached: {f}")
                    break
            if len(self.memory) > self.batch_size:
                self.replay()

    def test(self, episodes):
        """Run ``episodes`` episodes acting greedily (no exploration, no learning).

        Episode lengths are recorded in ``self.treward`` and
        ``self.max_treward`` exactly as during training.
        """
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            for f in range(1, 5000):
                action = int(np.argmax(self._q_values(state)[0]))
                next_state, reward, done, trunc, _ = self.env.step(action)
                state = next_state
                if done or trunc:
                    self.treward.append(f)
                    self.max_treward = max(self.max_treward, f)
                    print(f"max reward: {self.max_treward}")
                    break

In [8]:
# Create the agent; the constructor makes the CartPole-v1 environment and
# builds the Q-network.
agent = Agent()

In [9]:
# Train for 1500 episodes; one status line is printed per finished episode.
# The captured output below ends in a KeyboardInterrupt.
agent.train(1500)

max reward: 28, current frame reached: 28
max reward: 28, current frame reached: 10
max reward: 28, current frame reached: 10
max reward: 28, current frame reached: 12
max reward: 29, current frame reached: 29
max reward: 29, current frame reached: 25
max reward: 63, current frame reached: 63
max reward: 63, current frame reached: 28
max reward: 63, current frame reached: 15
max reward: 63, current frame reached: 11
max reward: 63, current frame reached: 20
max reward: 63, current frame reached: 13
max reward: 63, current frame reached: 26
max reward: 63, current frame reached: 23
max reward: 63, current frame reached: 45
max reward: 63, current frame reached: 36
max reward: 63, current frame reached: 21
max reward: 63, current frame reached: 18
max reward: 63, current frame reached: 35
max reward: 63, current frame reached: 11
max reward: 63, current frame reached: 28
max reward: 63, current frame reached: 14
max reward: 63, current frame reached: 15
max reward: 63, current frame reac

KeyboardInterrupt: 