# Exercise 5

## Reinforcement Learning


---

## Overview

Welcome to this Excercise. We are now going to use our new skills to build our first Deep Learning Reinforcement Learning Model. 




In [5]:

import random
import numpy as np

from collections import defaultdict

import gymnasium as gym
#from gymnasium import spaces
#from gymnasium import Env

class SimpleGridEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, grid_size=5):
        super(SimpleGridEnv, self).__init__()
        self.grid_size = grid_size
        self.action_space = gym.spaces.Discrete(4)  # 4 actions: up, down, left, right
        self.observation_space = gym.spaces.MultiDiscrete([grid_size, grid_size])
        self.state = None
        self.goal = (grid_size - 1, grid_size - 1)
    
    def reset(self):
        self.state = (0, 0)
        return np.array(self.state, dtype=np.int32)
    
    def step(self, action):
        x, y = self.state
        
        if action == 0:  # up
            x = max(0, x - 1)
        elif action == 1:  # down
            x = min(self.grid_size - 1, x + 1)
        elif action == 2:  # left
            y = max(0, y - 1)
        elif action == 3:  # right
            y = min(self.grid_size - 1, y + 1)
        
        self.state = (x, y)
        
        done = self.state == self.goal
        reward = 1 if done else -0.1
        
        return np.array(self.state, dtype=np.int32), reward, done, {}
    
    def render(self, mode='human'):
        grid = np.zeros((self.grid_size, self.grid_size), dtype=str)
        grid[:] = '.'
        grid[self.goal] = 'G'
        x, y = self.state
        grid[x, y] = 'A'
        print("\n".join(["".join(row) for row in grid]))
        print()

class QLearningAgent:
    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, epsilon=0.1):
        self.env = env
        self.q_table = defaultdict(lambda: np.zeros(env.action_space.n))
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
    
    def choose_action(self, state):
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.q_table[state])
    
    def learn(self, state, action, reward, next_state):
        best_next_action = np.argmax(self.q_table[next_state])
        td_target = reward + self.discount_factor * self.q_table[next_state][best_next_action]
        td_error = td_target - self.q_table[state][action]
        self.q_table[state][action] += self.learning_rate * td_error

def train_agent(env, agent, episodes=1000):
    for episode in range(episodes):
        state = tuple(env.reset())
        done = False
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = tuple(next_state)
            agent.learn(state, action, reward, next_state)
            state = next_state
        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1} completed")


if __name__ == "__main__":
    
    env = SimpleGridEnv()
    agent = QLearningAgent(env)
    train_agent(env, agent, episodes=1000)

    state = tuple(env.reset())
    done = False
    while not done:
        action = agent.choose_action(state)
        state, reward, done, _ = env.step(action)
        state = tuple(state)
        env.render()


Episode 100 completed
Episode 200 completed
Episode 300 completed
Episode 400 completed
Episode 500 completed
Episode 600 completed
Episode 700 completed
Episode 800 completed
Episode 900 completed
Episode 1000 completed
.A...
.....
.....
.....
....G

..A..
.....
.....
.....
....G

.....
..A..
.....
.....
....G

.....
.....
..A..
.....
....G

.....
.....
...A.
.....
....G

.....
.....
.....
...A.
....G

.....
.....
.....
....A
....G

.....
.....
.....
.....
....A



In [None]:
class CustomEnv(gym.Env):
    def __init__(self, df):
        super(CustomEnv, self).__init__()
        self.df = df
        self.action_space = gym.spaces.Discrete(1)  # Action space (predict F_1_d_returns)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(2,), dtype=np.float32)  # State space (1_d_returns, 2_d_returns)
        self.current_step = 0

    def reset(self):
        # Reset the environment to initial state
        self.current_step = 0
        self.state = self.df.iloc[self.current_step, 1:3].values  # Start with first row's 1_d_returns and 2_d_returns
        return self.state

    def step(self, action):
        # Take an action (not relevant here as we are predicting)
        self.current_step += 1
        done = self.current_step >= len(self.df) - 1
        if done:
            next_state = self.state
        else:
            next_state = self.df.iloc[self.current_step, 1:3].values
        reward = 0  # No reward for predicting
        info = {}   # Additional information (if needed)
        return next_state, reward, done, info


In [None]:
env = CustomEnv(df)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
#from tensorflow.keras.optimizers import Adam

In [None]:
states = env.observation_space.shape
actions = env.action_space.n

In [None]:
states

In [None]:
# Define your custom model
def build_model(input_shape, nb_actions):
    model = Sequential()
    model.add(Flatten(input_shape=input_shape))  # Adjust input shape here
    model.add(Dense(32, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(nb_actions, activation='linear'))
    return model


In [None]:
del model 

In [None]:
model = build_model(states, actions)

In [None]:
model.summary()

In [None]:
import tensorflow as tf
from keras import __version__
tf.keras.__version__ = __version__
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.optimizers.legacy import Adam


In [None]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

In [None]:

dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

In [None]:
dqn.summary

In [20]:
import numpy as np
import pandas as pd
import gym
import dopamine
import logging  # Add this line
from dopamine.agents.dqn import dqn_agent
from dopamine.replay_memory import circular_replay_buffer
from dopamine.colab import utils as colab_utils
import tensorflow as tf
import os

# Create your environment
class CustomEnv(gym.Env):
    def __init__(self, df):
        super(CustomEnv, self).__init__()
        self.df = df
        self.action_space = gym.spaces.Discrete(1)  # Action space (predict F_1_d_returns)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(2,), dtype=np.float32)  # State space (1_d_returns, 2_d_returns)
        self.current_step = 0

    def reset(self):
        # Reset the environment to initial state
        self.current_step = 0
        self.state = self.df.iloc[self.current_step, 1:3].values  # Start with first row's 1_d_returns and 2_d_returns
        return self.state

    def step(self, action):
        # Take an action (not relevant here as we are predicting)
        self.current_step += 1
        done = self.current_step >= len(self.df) - 1
        if done:
            next_state = self.state
        else:
            next_state = self.df.iloc[self.current_step, 1:3].values
        reward = 0  # No reward for predicting
        info = {}   # Additional information (if needed)
        return next_state, reward, done, info

# Create your environment

df = pd.DataFrame({
    'Target_Returns': [-0.038076, 0.083333, 0.060577, -0.013599, -0.020221],
    '1_d_returns': [-0.062030, -0.038076, 0.083333, 0.060577, -0.013599],
    '2_d_returns': [-0.133681, -0.097744, 0.042084, 0.148958, 0.046154]
})

env = CustomEnv(df)

# Set up logging
LOG_PATH = '/tmp/dopamine/logs'
logging.basicConfig(level=logging.INFO)

# Create a TensorFlow session
tf.compat.v1.reset_default_graph()
sess = tf.compat.v1.Session()

# Set up a replay buffer


# Set up a replay buffer with increased capacity
replay_buffer = circular_replay_buffer.WrappedReplayBuffer(
    observation_shape=(1,) + env.observation_space.shape,
    stack_size=1,
    replay_capacity=100000)  # Increased capacity


# Create the agent
# Create the agent with decreased min replay history
# Create the agent with a higher min replay history
agent = dqn_agent.DQNAgent(
    sess,
    num_actions=env.action_space.n,
    observation_shape=(1,) + env.observation_space.shape,
    observation_dtype=tf.float32,
    stack_size=1,
    network='dqn',
    gamma=0.99,
    update_horizon=1,
    min_replay_history=1000,  # Increase min replay history
    update_period=4,
    target_update_period=100,
    epsilon_fn=lambda x: 0.1)

# Create a checkpoint directory
checkpoint_dir = os.path.join(LOG_PATH, 'checkpoints')
checkpoint_file_prefix = os.path.join(checkpoint_dir, 'ckpt')
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Create a logger
logger = colab_utils.Logger(LOG_PATH)

# Initialize variables
sess.run(tf.compat.v1.global_variables_initializer())

# Train the agent
for episode in range(100):
    obs = env.reset()
    done = False
    while not done:
        action = agent.begin_episode(obs)
        next_obs, reward, done, _ = env.step(action)
        agent.end_episode(reward)
        replay_buffer.add(obs, action, reward, next_obs, done)
        obs = next_obs

        if len(replay_buffer) >= agent.min_replay_history:
            experience = replay_buffer.sample(1)
            agent.step(experience)

    if episode % 10 == 0:
        logger.scalar_summary('Return', reward, step=episode)

# Save the final checkpoint
checkpoint_path = agent._saver.save(sess, checkpoint_file_prefix)
print('Final checkpoint saved at: %s' % checkpoint_path)


INFO:absl:Creating a OutOfGraphReplayBuffer replay memory with the following parameters:
INFO:absl:	 observation_shape: (1, 2)
INFO:absl:	 observation_dtype: <class 'numpy.uint8'>
INFO:absl:	 terminal_dtype: <class 'numpy.uint8'>
INFO:absl:	 stack_size: 1
INFO:absl:	 replay_capacity: 100000
INFO:absl:	 batch_size: 32
INFO:absl:	 update_horizon: 1
INFO:absl:	 gamma: 0.990000
INFO:absl:	 checkpoint_duration: 4
INFO:absl:	 keep_every: None


RuntimeError: Cannot sample a batch with fewer than stack size (1) + update_horizon (1) transitions.
  In call to configurable 'WrappedReplayBuffer' (<class 'dopamine.replay_memory.circular_replay_buffer.WrappedReplayBuffer'>)

In [None]:
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

In [None]:

# Define the model
input_shape = env.observation_space.shape[0]
nb_actions = env.action_space.n

In [None]:

model = build_model(input_shape, nb_actions)






In [None]:
# Define the memory
memory = SequentialMemory(limit=10000, window_length=1)

# Define the policy
policy = BoltzmannQPolicy()

In [None]:
# Create the DQN agent
dqn = DQNAgent(model=model, memory=memory, policy=policy, nb_actions=nb_actions,
               nb_steps_warmup=100, target_model_update=1e-2)

In [None]:
from tensorflow.keras.optimizers import Adam
# Instantiate the optimizer
optimizer = Adam(learning_rate=0.001)  # Adjust learning rate as needed

In [None]:
optimizer._name

In [None]:
# Compile the model
dqn.compile(optimizer=optimizer, metrics=['mae'])

In [None]:
# Compile the model
dqn.compile(optimizer='adam', metrics=['mae'])

In [None]:



# Train the agent
dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)

# Predict using the trained agent
obs = env.reset()  # Reset the environment
for _ in range(len(df) - 1):
    action = dqn.forward(obs)
    obs, rewards, dones, info = env.step(action)
    # Here, obs contains the predicted F_1_d_returns for each step


In [29]:
!python –V

python: can't open file 'C:\\Users\\cramk\\Documents\\Metin\\building-a-workflow-for-aI\\l5-reinforcement-learning\\–V': [Errno 2] No such file or directory


In [31]:
import keras
print(keras.__version__)

2.13.1
