In [1]:
import numpy as np
import random
from collections import deque
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import torch.nn as nn
import math
import copy

In [5]:
class ReplayBuffer:
    """Fixed-size circular store of (state, action, reward, next_state, done) tuples.

    The underlying list grows by one slot per ``push`` until it holds
    ``capacity`` entries; after that, each new transition overwrites the
    oldest one. ``capacity`` is expected to be a positive integer (it is used
    as a modulus when advancing the write cursor).
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        # Index of the slot the next push() will write to.
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition, evicting the oldest entry once the buffer is full."""
        transition = (state, action, reward, next_state, done)
        slot = self.position
        if len(self.buffer) < self.capacity:
            # Still filling up: create the slot before writing into it.
            self.buffer.append(None)
        self.buffer[slot] = transition
        # Advance the cursor, wrapping back to 0 at capacity.
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns a 5-tuple ``(state, action, reward, next_state, done)`` where
        each element is the ``np.stack`` of that field across the drawn
        transitions. ``random.sample`` raises ``ValueError`` if ``batch_size``
        exceeds the number of stored transitions.
        """
        transitions = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into per-field groups, then stack each.
        columns = [np.stack(field) for field in zip(*transitions)]
        state, action, reward, next_state, done = columns
        return state, action, reward, next_state, done

    def __len__(self):
        """Number of transitions currently stored (never more than ``capacity``)."""
        return len(self.buffer)

In [87]:
# Numerical constants. None of them is referenced in the cells visible here;
# the names suggest a clamp range for a log standard deviation and a small
# stabilising term — TODO confirm against the code that consumes them.
EPSILON = 1e-6
LOG_SIG_MIN = -20
LOG_SIG_MAX = 2

In [88]:
def weights_init_(m):
    """Initialise a single module in place; meant to be passed to ``nn.Module.apply``.

    For ``nn.Linear`` modules the weight is drawn with Xavier-uniform
    initialisation (gain 1) and the bias, when the layer has one, is set to
    zero. Any other module type is left untouched.

    Args:
        m: the module visited by ``apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.xavier_uniform_(m.weight, gain=1)
    # nn.Linear(..., bias=False) stores ``bias`` as None; initialising it
    # unconditionally would raise, so only touch a bias that exists.
    if m.bias is not None:
        torch.nn.init.constant_(m.bias, 0)

In [112]:
class ValueNetwork(nn.Module):
    """Fully connected network producing one scalar per input row.

    Architecture: three ``hidden_dim``-wide linear layers, each followed by
    ReLU, then a linear output layer of width 1. All linear layers are
    initialised through ``weights_init_``.

    Args:
        state_dim: size of the last dimension of the input.
        hidden_dim: width of every hidden layer.
    """

    def __init__(self, state_dim, hidden_dim):
        super(ValueNetwork, self).__init__()

        # Attribute names are kept as linear1..linear4: they are the
        # parameter-name prefixes exposed by named_parameters()/state_dict().
        self.linear1 = nn.Linear(state_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, hidden_dim)
        self.linear4 = nn.Linear(hidden_dim, 1)

        self.apply(weights_init_)

    def forward(self, state):
        """Map ``state`` (last dimension ``state_dim``) to an output with last dimension 1."""
        hidden = state
        for layer in (self.linear1, self.linear2, self.linear3):
            hidden = F.relu(layer(hidden))
        # No activation on the output layer.
        return self.linear4(hidden)

In [113]:
class QNetwork(nn.Module):
    """Two independent fully connected heads evaluated on the same (state, action) input.

    Each head (``*_q1`` and ``*_q2``) has three ``hidden_size``-wide linear
    layers followed by ReLU and a linear output layer of width 1. All linear
    layers are initialised through ``weights_init_``.

    Args:
        num_inputs: size of the state's feature dimension.
        num_actions: size of the action's feature dimension.
        hidden_size: width of every hidden layer in both heads.
        init_w: accepted for interface compatibility; not used by this class.
    """

    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        # Must name this class: the previous ``SoftQNetwork`` reference was
        # undefined and made construction fail with NameError.
        super(QNetwork, self).__init__()

        # Q1
        self.linear1_q1 = nn.Linear(num_inputs + num_actions, hidden_size)
        self.linear2_q1 = nn.Linear(hidden_size, hidden_size)
        self.linear3_q1 = nn.Linear(hidden_size, hidden_size)
        self.linear4_q1 = nn.Linear(hidden_size, 1)

        # Q2
        self.linear1_q2 = nn.Linear(num_inputs + num_actions, hidden_size)
        self.linear2_q2 = nn.Linear(hidden_size, hidden_size)
        self.linear3_q2 = nn.Linear(hidden_size, hidden_size)
        self.linear4_q2 = nn.Linear(hidden_size, 1)

        self.apply(weights_init_)

    def forward(self, state, action):
        """Evaluate both heads.

        ``state`` and ``action`` are concatenated along dimension 1, so both
        must be at least 2-D with matching sizes in every other dimension,
        and their combined width must be ``num_inputs + num_actions``.

        Returns:
            A tuple ``(x1, x2)`` with the outputs of the Q1 and Q2 heads,
            each with last dimension 1.
        """
        x_state_action = torch.cat([state, action], 1)

        x1 = F.relu(self.linear1_q1(x_state_action))
        x1 = F.relu(self.linear2_q1(x1))
        x1 = F.relu(self.linear3_q1(x1))
        # Each head has its own output layer; there is no shared ``linear4``.
        x1 = self.linear4_q1(x1)

        x2 = F.relu(self.linear1_q2(x_state_action))
        x2 = F.relu(self.linear2_q2(x2))
        x2 = F.relu(self.linear3_q2(x2))
        x2 = self.linear4_q2(x2)

        return x1, x2

In [115]:
# Dump every trainable parameter of `v` as "<name> <tensor>".
# NOTE(review): `v` is not defined in any cell visible here; the recorded
# output (linear1..linear4, 4x4 weights) suggests a ValueNetwork(4, 4)
# instance — confirm and define it in a preceding cell so a fresh run works.
for name, param in v.named_parameters():
    if param.requires_grad:
        # Single-argument print(...) yields the same text on Python 2 and 3;
        # the bare `print name, ...` statement is a SyntaxError on Python 3.
        print("%s %s" % (name, param.data))

linear1.weight tensor([[-0.4039,  0.0567,  0.4509, -0.0458],
        [ 0.1921, -0.6469,  0.5294,  0.2056],
        [-0.5741, -0.5269,  0.5190,  0.2072],
        [ 0.4458, -0.4891, -0.4066, -0.3442]])
linear1.bias tensor([0., 0., 0., 0.])
linear2.weight tensor([[-0.8503, -0.6781, -0.2077,  0.5219],
        [ 0.7573,  0.7078,  0.6030,  0.5681],
        [-0.7756, -0.3552,  0.1104,  0.1408],
        [ 0.0894, -0.7623, -0.2496,  0.1368]])
linear2.bias tensor([0., 0., 0., 0.])
linear3.weight tensor([[-0.1031, -0.6663,  0.5182,  0.2299],
        [-0.3999,  0.8554, -0.0243,  0.7566],
        [-0.7987,  0.5930,  0.7130, -0.2767],
        [ 0.0521,  0.2824,  0.3026,  0.5067]])
linear3.bias tensor([0., 0., 0., 0.])
linear4.weight tensor([[ 0.3170,  0.9309, -0.0888, -0.1267]])
linear4.bias tensor([0.])
