In [13]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from collections import namedtuple
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
import random
import math
import FluidEnv
%matplotlib inline

In [14]:
# Seed every RNG this notebook draws from so that Restart & Run All is
# repeatable. The stdlib `random` module drives both epsilon-greedy exploration
# and replay-buffer sampling; torch drives network weight initialisation.
# NumPy is seeded as well in case FluidEnv draws from it (TODO confirm).
# torch.manual_seed stays last so the cell still displays the Generator.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fb574e31f90>

In [15]:
# --- Hyperparameters ---------------------------------------------------------
# Number of tunable parameters; the action count is derived from it
# (two actions per parameter).
num_params = 4
# Magnitude of the adjustment a single action applies to one parameter.
change = 5e-2
# Optimiser step size.
# NOTE(review): not passed to the optimiser in the visible cells.
learning_rate = 1e-3
# Discount factor.
# NOTE(review): the training-config cell defines its own GAMMA, which is the
# value optimize_model actually reads.
gamma = 1
# Notebook-environment flag; not referenced in the visible cells.
is_ipython = True

In [16]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [17]:
# One step of experience: the state acted in, the action taken, the state that
# followed, and the reward received. optimize_model treats a `next_state` of
# None as "no next state" and does not bootstrap from it.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity ring buffer of `Transition` records.

    Transitions are appended until `capacity` entries are stored; from then on
    every push overwrites the oldest entry.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most `capacity` transitions."""
        self.capacity = capacity
        self.memory = []
        self.position = 0  # index the next push writes to

    def push(self, *args):
        """Saves a transition.

        Args:
            *args: the `Transition` fields in order
                (state, action, next_state, reward).

        Raises:
            TypeError: if the wrong number of fields is given. The buffer is
                left untouched in that case.
        """
        # Build the record before touching the buffer. The previous version
        # appended a None placeholder first, so a malformed push left a stray
        # None behind that later broke sampling/batching.
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        Raises:
            ValueError: if `batch_size` exceeds the number of stored
                transitions (raised by `random.sample`).
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [18]:
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Args:
        outputs: number of discrete actions (width of the output layer).
        inputs: length of the state vector. Defaults to ``outputs // 2``,
            which is what this notebook relies on (two actions per state
            component); pass it explicitly for any other layout.
    """

    def __init__(self, outputs, inputs=None):
        super(DQN, self).__init__()
        if inputs is None:
            # Backward-compatible default: the original network hard-wired
            # the input width to half the number of actions.
            inputs = outputs // 2
        self.lin1 = nn.Linear(inputs, 16)
        self.lin2 = nn.Linear(16, 32)
        self.lin3 = nn.Linear(32, 32)
        self.head = nn.Linear(32, outputs)

    def forward(self, x):
        """Return Q-values for `x`.

        Called with either a single state (to choose the next action) or a
        batch of states (during optimisation). The last dimension of `x` must
        equal the input width; the last dimension of the result has one entry
        per action.
        """
        for layer in (self.lin1, self.lin2, self.lin3):
            x = F.relu(layer(x))
        return self.head(x)

In [19]:
# --- Training configuration --------------------------------------------------
BATCH_SIZE = 64      # transitions sampled from replay memory per optimisation step
GAMMA = 1            # discount applied to the bootstrapped next-state value
EPS_START = 0.9      # exploration probability before any action has been selected
EPS_END = 0.05       # exploration probability the schedule decays towards
EPS_DECAY = 200      # decay constant of the schedule, in select_action calls
TARGET_UPDATE = 2    # episodes between copies of policy_net into target_net


# Two discrete actions (+change / -change) per tunable parameter.
n_actions = num_params * 2

policy_net = DQN(n_actions).to(device)
target_net = DQN(n_actions).to(device)
# The target network starts as an exact copy and is only ever evaluated.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# NOTE(review): RMSprop is built with its library-default learning rate; the
# `learning_rate` hyperparameter defined earlier is never passed in. Confirm
# whether `lr=learning_rate` was intended before wiring it up, since doing so
# changes training behaviour.
optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

# Number of select_action calls so far; drives the epsilon decay schedule.
steps_done = 0

# Action table, indexed by the action id chosen in select_action. Row 2*i
# raises parameter i by `change`, row 2*i + 1 lowers it by `change`, and all
# other entries are 0. Generated from num_params so it always has exactly
# n_actions rows (the previous literal table was fixed at four parameters).
actions = []
for param_index in range(num_params):
    for delta in (change, -change):
        step = [0] * num_params
        step[param_index] = delta
        actions.append(step)

def select_action(state):
    """Choose an action index for `state` with an epsilon-greedy policy.

    The exploration probability decays exponentially from EPS_START towards
    EPS_END as the global `steps_done` counter grows; the counter is
    incremented on every call.

    Args:
        state: tensor accepted by `policy_net` (a single state).

    Returns:
        A 1x1 ``torch.long`` tensor on `device` holding the chosen index into
        `actions`.
    """
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1

    if sample <= eps_threshold:
        # Explore: uniformly random action.
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

    # Exploit: action with the highest predicted Q-value.
    with torch.no_grad():
        # The state is moved to `device` first; policy_net lives there, and a
        # CPU state would make the forward pass fail when CUDA is selected.
        q_values = policy_net(state.to(device))
        return q_values.argmax().view(1, 1)


In [20]:
def optimize_model():
    """Run one gradient step on `policy_net` using a random replay batch.

    Returns immediately (doing nothing) until `memory` holds at least
    BATCH_SIZE transitions. Otherwise it samples a batch, regresses
    Q(s, a) from `policy_net` towards ``reward + GAMMA * max_a' Q_target(s', a')``
    with a Huber loss, clamps each gradient element to [-1, 1] and steps
    `optimizer`. Transitions whose `next_state` is None are treated as
    terminal: their next-state value is 0.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # True where the transition has a next state to bootstrap from.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [
        torch.as_tensor(s, dtype=torch.float32, device=device)
        for s in batch.next_state if s is not None
    ]

    # Everything is moved to `device` so the batch matches the networks even
    # when the stored tensors were created on the CPU.
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(batch.action).to(device)
    reward_batch = torch.cat(batch.reward).to(device)

    # Q(s_t, a_t): evaluate every action, then pick the column of the action
    # that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the "older" target_net; stays 0 for terminal transitions.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # Guard: torch.cat raises on an empty list, which happens when every
    # sampled transition is terminal.
    if non_final_next_states:
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(
                torch.cat(non_final_next_states)).max(1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Element-wise gradient clamp to [-1, 1]; skips parameters without a grad.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [21]:
import importlib

In [None]:
# Pick up edits to the environment module without restarting the kernel.
importlib.reload(FluidEnv)

# NOTE(review): the leading 4 presumably matches num_params; confirm against
# FluidEnv's constructor before parameterising it.
env = FluidEnv.FluidEnv(4, './data/sound.wav', 50)

num_episodes = 500
LOG_EVERY = 1  # print a progress line every LOG_EVERY episodes
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = env.reset()
    done = False
    t = 0  # steps taken in this episode
    while not done:
        # Select and perform an action
        action = select_action(state)
        next_state, reward, done, _ = env.step(torch.Tensor(actions[action.item()]))
        # Explicit float32 keeps the reward batch in the same dtype as the
        # network outputs it is combined with in optimize_model.
        reward = torch.tensor([reward], device=device, dtype=torch.float32)

        # Store the transition in memory. A terminal transition is stored
        # with next_state=None: optimize_model masks those out and does not
        # bootstrap from them. Previously the post-terminal state was always
        # stored, so that mask never fired and, with GAMMA = 1, terminal
        # steps kept bootstrapping.
        stored_next_state = None if done else next_state.unsqueeze(0)
        memory.push(state.unsqueeze(0), action, stored_next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        t += 1
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
    # Progress line: reward of the episode's last step and the final state.
    if i_episode % LOG_EVERY == 0:
        print('Episode {}\tReward: {}\t State: {}'.format(i_episode, reward, state.tolist()))

print('Complete')
plt.ioff()
plt.show()

Episode 0	Reward: tensor([-4.2894])	 State: [0.9000000953674316, 0.7000000476837158, 0.44999998807907104, 0.19999995827674866]
Episode 1	Reward: tensor([-7.7767])	 State: [0.049999963492155075, 0.2999999523162842, 0.6500000357627869, 0.8500000834465027]
Episode 2	Reward: tensor([-9.1265])	 State: [0.05000000074505806, 0.34999996423721313, 0.6000000238418579, 1.0]
Episode 3	Reward: tensor([-3.2116])	 State: [0.2999999523162842, 0.0, 0.3999999761581421, 0.6500000357627869]
Episode 4	Reward: tensor([-2.8495])	 State: [0.2999999523162842, 0.05000000074505806, 0.34999996423721313, 0.5]
Episode 5	Reward: tensor([-1.4991])	 State: [0.34999996423721313, 0.6000000238418579, 0.3999999761581421, 0.05000000074505806]
Episode 6	Reward: tensor([-5.9192])	 State: [0.1499999612569809, 0.2999999523162842, 0.0, 0.550000011920929]
Episode 7	Reward: tensor([-0.5325])	 State: [0.44999998807907104, 0.6000000238418579, 0.0, 0.2999999523162842]
Episode 8	Reward: tensor([-1.0455])	 State: [0.3999999761581421, 

Episode 78	Reward: tensor([-0.0587])	 State: [0.44999998807907104, 0.5, 0.05000000074505806, 0.34999996423721313]
Episode 79	Reward: tensor([-0.4961])	 State: [0.3999999761581421, 0.6500000357627869, 0.0, 0.3999999761581421]
Episode 80	Reward: tensor([-0.5401])	 State: [0.44999998807907104, 0.6000000238418579, 0.0, 0.5]
Episode 81	Reward: tensor([-0.2119])	 State: [0.44999998807907104, 0.550000011920929, 0.0, 0.2999999523162842]
Episode 82	Reward: tensor([-0.0935])	 State: [0.44999998807907104, 0.44999998807907104, 0.0, 0.09999996423721313]
Episode 83	Reward: tensor([-0.0935])	 State: [0.44999998807907104, 0.44999998807907104, 0.0, 0.0]
Episode 84	Reward: tensor([-0.2119])	 State: [0.44999998807907104, 0.550000011920929, 0.0, 0.6500000357627869]
Episode 85	Reward: tensor([-0.0673])	 State: [0.44999998807907104, 0.5, 0.0, 0.2999999523162842]
Episode 86	Reward: tensor([-0.0673])	 State: [0.44999998807907104, 0.5, 0.0, 0.24999995529651642]
Episode 87	Reward: tensor([-0.4010])	 State: [0.4