In [1]:
import torch
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms
from torch.autograd import Variable

In [2]:
# Define replay buffer
class replay_buffer:
    def __init__(self, n_stored, S_size, n_phi):
        self.D = np.empty(n_stored, dtype='object')
        self.counter = 0
        self.max_capacity = n_stored
        self.S_size = S_size
        self.n_phi = n_phi
        
    def add_replay(self, replay):

        if (self.max_capacity>=self.counter):
            # Case where added there is room for replays
            self.D[self.counter] = replay
            self.counter += 1
        else:
            # Case where a replay has to be supstituted
            replace_index = np.random.randint(self.max_capacity)
            self.D[replace_index] = replay
    
    def return_batch(self, batch_size):
        # Get index for batch
        indexes = np.random.randint(self.counter, size=batch_size)
        batch = self.D[indexes]
        
        # Define arrays for return
        S_array = np.zeros((batch_size,)+ (self.n_phi,) + self.S_size)
        S_next_array = np.zeros((batch_size,) + (self.n_phi,) + self.S_size)
        a_array = np.zeros(batch_size)
        r_array = np.zeros(batch_size)
        
        # Fill the batch
        i = 0
        for S, a, r, S_next in batch:
            S_array[i] = S
            a_array[i] = a
            r_array[i] = r
            S_next_array[i] = S_next
            i += 1
            
        return S_array, a_array, r_array, S_next_array

In [21]:
# Prepare model
class model:
    def __init__(self, S_size, n_actions, n_phi):
        # The classification network based on the transformed (cropped) image
        filter1_size = 32
        filter2_size = 64
        filter3_size = 64
        
        self.conv1 = nn.Conv2d(in_channels=1,
                               out_channels=filter1_size,
                               kernel_size=8,
                               stride=4,
                               padding=4)
        
        self.conv2 = nn.Conv2d(in_channels=filter1_size, 
                               out_channels=filter2_size,
                               kernel_size=4,
                               stride=2,
                               padding=2)
        
        self.conv3 = nn.Conv2d(in_channels=filter2_size, 
                               out_channels=filter3_size,
                               kernel_size=3,
                               stride=1,
                               padding=2)
        
        # fully connected output layers
        self.fc1 = nn.Linear(in_features=50176, 
                             out_features=n_actions)
        
    def forward(self, x):
        # Flatten input
        #x = x.astype("float")
        x = torch.from_numpy(x)
        #x = x.float()
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        
        x = x.view(-1)
        
        # Pass input
        x = F.relu(self.fc1(x))
        return x

# Define transformer of image
def phi_transformer(S, n_phi):
    pre_process = torchvision.transforms.Compose(
    [torchvision.transforms.ToPILImage(),
     torchvision.transforms.Grayscale(num_output_channels=1),
     torchvision.transforms.Resize([84,84]),
     torchvision.transforms.ToTensor(),
     torchvision.transforms.Normalize(mean=[0.], std=[1.])])

    lists = []
    for i in range(n_phi):
        lists.append(pre_process(S[i]))
        
    S = torch.stack(lists)
    return S.numpy()

In [22]:
env = gym.make('Riverraid-v0')
env.reset()
S_size = 4
n_phi = 4
n_actions = 2
buffer_size = 10**6
update_freq = 1000
Q_network = model(S_size, n_actions, n_phi)
Q_target = model(S_size, n_actions, n_phi)
buffer = replay_buffer(buffer_size, S_size, n_phi)
epsilon = 0.1
n_games = 100
gamma = 0.99
batch_size = 32

In [23]:
## Define some usefull values
a_space = env.action_space.n
S_space = env.observation_space.shape

In [24]:
### Prepare environments

for i in range(n_games):
    # Restart environment
    # Reset observation
    S = np.zeros((n_phi,) + S_space, dtype="uint8")
    S_next = np.zeros((n_phi,) + S_space, dtype="uint8")
    S[n_phi-1] = env.reset()
    done = False
    S = phi_transformer(S,n_phi) # Transform input
    
    while not done:
        ## Select random or non-random action
        if np.random.rand(1)[0]<epsilon:
            # Case ranom move selected
            a = np.random.randint(a_space)
        else:
            # Case non-random move
            with torch.no_grad():
                a = np.argmax(Q_network.forward(S).numpy())

        # Perform action for n_phi times
        r = 0
        for j in range(n_phi):
            S_next[j], r_temp, done, info = env.step(a)
            r += r_temp

            if (done): # Check if game done
                # Have to save process S_next
                S_next[(n_phi-j):n_phi] = S_next[0:j]
                S_next[0:(n_phi-j)] = S[j:n_phi]
                break
        S_next = phi_transformer(S_next, n_phi) # Transform input
        # Store data
        replay = [S, a, r, S_next]
        buffer.add_replay(replay)
        S = S_next # Switch state
        
        ## Perform update of Q_network
        [S_train, a_train, r_train, S_next_train] = buffer.return_batch(batch_size)
        y_target = r_train + np.argmax(Q_target.forward(S_next_train))
        y_train = Q_network.forward(S_train)
        batch_loss = criterion(y_target,y_train)
        batch_loss.backward()
        optimizer.step()
        
        # Update target_Q
        if (i % update_freq == 0):
            target_Q = Q_network.copy()
            target_Q.eval()

uint8
uint8


TypeError: can only concatenate tuple (not "int") to tuple

In [177]:
S, r_temp, done, info = env.step(2)

In [181]:
S

array([[[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [170, 170, 170],
        [170, 170, 170],
        [170, 170, 170]],

       ...,

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]]

In [15]:
S.astype("double")

array([[ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.03398085,  0.02914684,  0.01451927, -0.04932675]])

In [14]:
np.zeros((5))[np.random.randint(5,size=2)]

array([0., 0.])