# Human-level Control through Deep Reinforcement Learning

## Import Libraries

In [2]:
import torch 
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T
import gym

import collections
import cv2

import matplotlib.pyplot as plt
import numpy as np

# Preprocessing of Images
- Go from three channels to one channel
    - Screen images have three channels, while our agent only needs one channel. We convert the image to grayscale.
- Downscale to 84 x 84
    - The raw images are really large, which makes training slow. We resize each image to 84 x 84 to speed up learning.
- Take max of previous frames
    - We keep track of the two most recent frames and take the element-wise max over the two.
- Repeat action 4 times
    - We repeat each chosen action for four consecutive frames and return only one observation for the whole group, skipping the intermediate frames.
- Scale output
    - Pixel values are integers from 0 to 255, so we scale them to the range [0, 1] by dividing the image by 255.

In [None]:
# https://github.com/openai/gym/tree/master/gym/wrappers
# Class for repeating action and taking max frame over the previous two frames.

class RepeatActionAndMaxFrame(gym.Wrapper):
    """Repeat each action for several frames and return the max of two frames.

    Every call to ``step`` forwards the same action to the wrapped environment
    up to ``repeat`` times, sums the (optionally clipped) rewards, and returns
    the element-wise maximum of the two frames held in the frame buffer.

    Parameters
    ----------
    env : gym.Env
        Environment to wrap. Uses the old Gym API (``step`` returns a 4-tuple,
        ``reset`` returns only the observation).
    repeat : int
        Number of times each action is applied per ``step`` call.
    clip_reward : bool
        If true, every per-frame reward is clipped to ``[-1, 1]`` before summing.
    no_ops : int
        Upper bound on the random number of action-0 steps taken on ``reset``
        (0 disables them). Action 0 is presumably NOOP - confirm for the env.
    fire_fist : bool
        If true, action 1 (asserted to be ``'FIRE'``) is taken once on ``reset``.
        The misspelled parameter name is kept so existing keyword callers work.
    """

    def __init__(self, env = None, repeat = 4, clip_reward = False, no_ops = 0, fire_fist = False):
        super(RepeatActionAndMaxFrame, self).__init__(env)

        self.repeat = repeat
        self.shape = env.observation_space.low.shape
        self.frame_buffer = self._empty_frame_buffer()
        self.clip_reward = clip_reward
        self.no_ops = no_ops
        # ``reset`` reads ``fire_first``; the misspelled attribute is kept as an
        # alias for any external code that already reads it.
        self.fire_first = fire_fist
        self.fire_fist = fire_fist

    def _empty_frame_buffer(self):
        """Return a zeroed buffer with room for two frames of the env's shape."""
        # np.zeros_like((2, shape)) would try to mirror a ragged tuple instead of
        # allocating a (2, *shape) array, so the shape is built explicitly.
        return np.zeros((2,) + tuple(self.shape),
                        dtype = self.env.observation_space.low.dtype)

    def step(self, action):
        """Apply ``action`` up to ``repeat`` times.

        Returns
        -------
        tuple
            ``(max_frame, total_reward, done, info)`` where ``max_frame`` is the
            element-wise max of the two buffered frames and ``info`` comes from
            the last inner step (empty dict if no inner step ran).
        """
        t_reward = 0.0
        done = False
        info = {}  # stays defined even when repeat == 0

        for i in range(self.repeat):
            obs, reward, done, info = self.env.step(action)

            if self.clip_reward:
                reward = np.clip(np.array([reward]), -1, 1)[0]

            t_reward += reward

            # Alternate between the two buffer slots so they always hold the
            # two most recently written frames.
            self.frame_buffer[i % 2] = obs

            if done:
                break

        max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1])
        return max_frame, t_reward, done, info

    def reset(self):
        """Reset the wrapped env, apply optional no-op / fire steps, return the frame."""
        obs = self.env.reset()
        # randint(n) is in [0, n), so this yields between 1 and self.no_ops steps.
        no_ops = np.random.randint(self.no_ops) + 1 if self.no_ops > 0 else 0

        for _ in range(no_ops):
            # Track the newest observation so the returned frame matches the
            # state the environment is actually in.
            obs, _, done, _ = self.env.step(0)
            if done:
                obs = self.env.reset()

        if self.fire_first:
            assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE'
            obs, _, _, _ = self.env.step(1)

        self.frame_buffer = self._empty_frame_buffer()
        self.frame_buffer[0] = obs

        return obs

    # Backward-compatible alias for the original (misspelled) method name.
    rest = reset

In [25]:
class preprocessFrame(gym.ObservationWrapper):
    """Convert frames to grayscale, resize them, and scale them to [0, 1].

    Parameters
    ----------
    shape : sequence of int
        Target frame shape given as ``(height, width, channels)``.
    env : gym.Env
        Environment to wrap.
    """

    def __init__(self, shape, env = None):
        super(preprocessFrame, self).__init__(env)

        # Channels-first order (channels, height, width), the order in which PyTorch accepts image tensors.
        self.shape = (shape[2], shape[0], shape[1])
        # Frames are scaled, so the space is bounded by 0.0 and 1.0.
        self.observation_space = gym.spaces.Box(low = 0.0, high = 1.0, shape = self.shape, dtype = np.float32)

    def observation(self, obs):
        """Return ``obs`` as a float32 array of shape ``self.shape`` in [0, 1].

        Assumes ``obs`` is an RGB image accepted by ``cv2.cvtColor`` with
        ``COLOR_RGB2GRAY`` - confirm against the wrapped environment.
        """
        gray_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)

        # cv2.resize takes dsize as (width, height), while self.shape stores
        # (channels, height, width); swap so non-square targets are not scrambled.
        height, width = self.shape[1:]
        resized_screen = cv2.resize(gray_frame, (width, height), interpolation = cv2.INTER_AREA)

        # Cast to float32 before scaling so the result matches the dtype
        # declared in observation_space (uint8 / 255.0 would yield float64).
        new_obs = np.array(resized_screen, dtype = np.float32).reshape(self.shape)
        new_obs = new_obs / 255.0

        return new_obs

In [26]:
class StackFrames(gym.ObservationWrapper):
    """Stack the ``repeat`` most recent observations along axis 0.

    Parameters
    ----------
    env : gym.Env
        Environment to wrap; its observation space bounds are repeated along
        axis 0 to build the stacked observation space.
    repeat : int
        Number of observations kept in the stack.
    """

    def __init__(self, env, repeat):
        super(StackFrames, self).__init__(env)

        self.observation_space = gym.spaces.Box(env.observation_space.low.repeat(repeat, axis = 0),
                                               env.observation_space.high.repeat(repeat, axis = 0),
                                               dtype = np.float32)
        # Bounded deque: appending a new frame drops the oldest automatically.
        self.stack = collections.deque(maxlen = repeat)

    def _stacked(self):
        """Return the current stack as one array shaped like the observation space."""
        return np.array(self.stack).reshape(self.observation_space.low.shape)

    def reset(self):
        """Reset the env and fill the whole stack with the first observation."""
        self.stack.clear()
        observation = self.env.reset()

        for _ in range(self.stack.maxlen):
            self.stack.append(observation)

        return self._stacked()

    def observation(self, observation):
        """Push ``observation`` onto the stack and return the stacked array."""
        self.stack.append(observation)

        return self._stacked()

In [27]:
def make_env(env_name, shape = (84, 84, 1), repeat = 4, clip_rewards = False, 
             no_ops = 0, fire_first = False):
    """Create a Gym environment wrapped with the preprocessing pipeline.

    The environment is wrapped, in order, with ``RepeatActionAndMaxFrame``,
    ``preprocessFrame`` and ``StackFrames``.

    Parameters
    ----------
    env_name : str
        Environment id passed to ``gym.make``.
    shape : tuple
        Target frame shape as ``(height, width, channels)``.
    repeat : int
        Action-repeat count, also used as the number of stacked frames.
    clip_rewards : bool
        Forwarded to ``RepeatActionAndMaxFrame`` as its reward-clipping flag.
    no_ops : int
        Forwarded to ``RepeatActionAndMaxFrame`` as its no-op bound.
    fire_first : bool
        Forwarded to ``RepeatActionAndMaxFrame`` as its fire-on-reset flag.

    Returns
    -------
    gym.Env
        The fully wrapped environment.
    """
    # The created environment must be bound to ``env``; the wrappers below
    # receive it as their input.
    env = gym.make(env_name)
    # Arguments are passed positionally so the call does not depend on the
    # wrapper's parameter spelling.
    env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first)
    env = preprocessFrame(shape, env)
    env = StackFrames(env, repeat)

    return env