# Naive Deep Q-learning

This notebook implements a deep Q-learning algorithm in combination with a neural network. The neural network's objective is to approximate the optimal Q-value for each state-action pair in our environment. 

We use a neural network with two linear layers that take a low-level representation of the environment as input and output the Q-values, corresponding to the actions an agent can take from that state. Adam will be used as an optimizer with mean square error as a loss function. The learning rate will be specified as 0.001. This naive deep Q-learning algorithm's overall goal is to serve as a benchmark for further improvement, and thus hyperparameter tuning will not be considered in this notebook.

## Import Libraries

In [14]:
import torch 
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T
import numpy as np
import gym

from ipynb.fs.full.Plotting import test
import matplotlib.pyplot as plt

ImportError: cannot import name 'test' from 'ipynb.fs.full.Plotting' (unknown location)

## Naive Deep Q-network

In [5]:
class LinearDeepQNetwork(nn.Module):
    def __init__(self, observation_space, action_space, learning_rate):
        super(LinearDeepQNetwork, self).__init__()
        
        self.fc1 = nn.Linear(*observation_space, 128)
        self.fc2 = nn.Linear(128, action_space)
        
        self.optimizer = optim.Adam(self.parameters(), lr = learning_rate)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        
        self.to(self.device)
        
    def forward(self, state):
        layer1 = F.relu(self.fc1(state))
        actions = self.fc2(layer1)
        
        return actions  

# Reinforcement Learning Agent

In [6]:
class Agent():
    def __init__(self, observation_space, action_space, learning_rate = 0.001, discount_rate = 0.99, exploration_rate = 1.0,
                 max_exploration_rate = 1, min_exploration_rate = 0.01,  exploration_decay_rate = 0.001):
        
        self.observation_space = observation_space
        self.action_space = action_space
        
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        
        self.exploration_rate = exploration_rate
        self.max_exploration_rate = max_exploration_rate
        self.min_exploration_rate = min_exploration_rate
        self.exploration_decay_rate = exploration_decay_rate
        
        self.episode = 0
        self.Q = LinearDeepQNetwork(self.observation_space, self.action_space, self.learning_rate)
        
    def choice_action(self, state):
        exploration_rate_threshold = np.random.random()
        if exploration_rate_threshold > self.exploration_rate:
            state = T.tensor(state, dtype = T.float).to(self.Q.device)
            actions = self.Q.forward(state) # Action of state.
            action = T.argmax(actions).item() # Best action.
        else:
            action = np.random.choice(self.action_space)
            
        return action
    
    def decrease_exploration_rate(self):
        self.episode += 1
        self.exploration_rate = self.min_exploration_rate + \
            (self.max_exploration_rate - self.min_exploration_rate) * np.exp(- self.exploration_decay_rate * self.episode)

    def learn(self, state, action, reward, next_state):
        # Set gradient to zero.
        self.Q.optimizer.zero_grad()
        
        # Convert to tensors.
        states = T.tensor(state, dtype = T.float).to(self.Q.device)
        actions = T.tensor(action).to(self.Q.device)
        rewards = T.tensor(reward).to(self.Q.device)
        next_states = T.tensor(next_state, dtype = T.float).to(self.Q.device)
        
        # Calculate loss using Bellman equation.
        q_pred = self.Q.forward(states)[actions]
        q_next = self.Q.forward(next_states).max()
        q_target = reward + self.exploration_rate * q_next
        loss = self.Q.loss(q_target, q_pred).to(self.Q.device)
        
        # Optimizer in action.
        loss.backward()
        self.Q.optimizer.step()

## Main Program

In [7]:
episodes = 100
env = gym.make('CartPole-v1')
agent = Agent(env.observation_space.shape, env.action_space.n)

scores = []
average_scores = []
exploration_rate_history = []


for i in range(episodes):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        action = agent.choice_action(state)
        next_state, reward, done, info = env.step(action)
        agent.learn(state, action, reward, next_state)
        
        state = next_state
        score += reward

    scores.append(score)
    agent.decrease_exploration_rate()
    
    exploration_rate_history.append(agent.exploration_rate)
    
    if i % 100 == 0:
        avg_score = np.mean(scores[-100:])
        average_scores.append(avg_score)
        print("Episode: ", i, "\t", "Average score: ", avg_score, "\t", "Exploration Rate: ", agent.exploration_rate)

plot(episodes, scores, average_scores, exploration_rate_history)

Episode:  0 	 Average score:  12.0 	 Exploration Rate:  0.9990104948350412


NameError: name 'plt' is not defined

In [11]:
test()

NameError: name 'test' is not defined