In [24]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from collections import namedtuple
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
import random
import math
import FluidEnv
%matplotlib inline

In [25]:
# Seed every random number generator the notebook imports, not only torch:
# epsilon-greedy action selection and replay sampling draw from the stdlib
# `random` module, so seeding torch alone does not make a run repeatable.
random.seed(1)
np.random.seed(1)
# Kept last so the cell still displays the torch Generator as its output.
torch.manual_seed(1)

<torch._C.Generator at 0x7fcbd0e5af90>

In [26]:
# Hyperparameters shared by the cells below.
# NOTE(review): no visible cell passes `learning_rate` to the optimizer - confirm it is still needed.
learning_rate = 0.00005
# Discount factor; the DQN settings cell defines its own upper-case GAMMA with the same value.
gamma = 1
# Number of tunable parameters: there are two actions per parameter and the
# environment is constructed with this value.
num_params = 6
# Magnitude by which a single action raises or lowers one parameter.
change = 0.05
# Flag marking a notebook session; not read by any visible cell.
is_ipython = True

In [27]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
# One recorded environment step: (state, action, next_state, reward).
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity ring buffer of :class:`Transition` records.

    Once ``capacity`` transitions are stored, each new push overwrites the
    oldest entry.

    Attributes:
        capacity: maximum number of transitions kept.
        memory: list holding the stored transitions.
        position: index that the next push at full capacity will overwrite.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: maximum number of transitions to keep; must be >= 1.

        Raises:
            ValueError: if ``capacity`` is smaller than 1.
        """
        # A zero or negative capacity would otherwise only fail later, inside
        # push(), with an unrelated IndexError / ZeroDivisionError.
        if capacity < 1:
            raise ValueError(
                'capacity must be a positive integer, got {!r}'.format(capacity))
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition.

        Args:
            *args: the four Transition fields, in order
                (state, action, next_state, reward).

        Raises:
            TypeError: if the arguments do not form a valid Transition; the
                buffer is left unchanged in that case.
        """
        # Build the record before touching the buffer so that a bad call
        # cannot leave a placeholder entry behind.
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [29]:
class DQN(nn.Module):
    """Fully connected Q-network: five ReLU hidden layers and a linear head.

    Args:
        outputs: number of values produced per input row (one per action).
        inputs: width of the input vector. Defaults to ``outputs // 2``,
            which is the width this network used before the parameter
            existed.
    """

    def __init__(self, outputs, inputs=None):
        super(DQN, self).__init__()
        if inputs is None:
            # Backward-compatible default: half as many inputs as outputs.
            inputs = outputs // 2
        # Layers are created in a fixed order so that seeded weight
        # initialisation stays reproducible.
        self.lin1 = nn.Linear(inputs, 32)
        self.lin2 = nn.Linear(32, 64)
        self.lin3 = nn.Linear(64, 64)
        self.lin4 = nn.Linear(64, 64)
        self.lin5 = nn.Linear(64, 32)
        self.head = nn.Linear(32, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization.
    def forward(self, x):
        """Return one value per action for ``x``.

        The last dimension of ``x`` must match the input width of ``lin1``;
        the output keeps the leading dimensions and has ``outputs`` entries
        in the last one.
        """
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = F.relu(self.lin3(x))
        x = F.relu(self.lin4(x))
        x = F.relu(self.lin5(x))
        return self.head(x)

In [30]:
# DQN training settings.
# Transitions sampled from replay memory per optimisation step.
BATCH_SIZE = 64
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 1
# Epsilon-greedy schedule: exploration probability starts at EPS_START and
# decays exponentially towards EPS_END with constant EPS_DECAY (in steps).
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
# The target network is refreshed every TARGET_UPDATE episodes.
TARGET_UPDATE = 2

# Two discrete actions per parameter: one +change step and one -change step.
n_actions = num_params * 2

# The target network starts as an exact copy of the policy network and is
# switched to eval mode.
policy_net = DQN(n_actions).to(device)
target_net = DQN(n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# NOTE(review): RMSprop is built with its default settings; the
# `learning_rate` hyperparameter defined earlier is not passed here -
# confirm that this is intended.
optimizer = optim.RMSprop(policy_net.parameters())
# Replay buffer holding at most 200 transitions.
memory = ReplayMemory(200)

# Number of select_action calls made so far; drives the epsilon decay.
steps_done = 0

# actions[i] adds `change` to parameter i and leaves the others at 0;
# actions[num_params + i] subtracts `change` from parameter i.
actions_add = [
    [change if j == i else 0 for j in range(num_params)] for i in range(num_params)
]
actions_sub = [
    [-change if j == i else 0 for j in range(num_params)] for i in range(num_params)
]
actions = actions_add + actions_sub

def select_action(state):
    """Pick an action index for ``state`` with an epsilon-greedy rule.

    The exploration probability decays exponentially from ``EPS_START``
    towards ``EPS_END`` as the global ``steps_done`` counter grows; the
    counter is incremented on every call.

    Returns:
        A 1x1 ``torch.long`` tensor on ``device`` holding the chosen index.
    """
    global steps_done
    sample = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    if sample <= eps_threshold:
        # Explore: uniform random action.
        chosen = random.randrange(n_actions)
    else:
        # Exploit: index of the largest value the policy network predicts.
        with torch.no_grad():
            chosen = policy_net(state).argmax()
    return torch.tensor([[chosen]], device=device, dtype=torch.long)


In [31]:
def optimize_model():
    """Run one Q-learning gradient step on a batch drawn from replay memory.

    Returns immediately (doing nothing) while ``memory`` holds fewer than
    ``BATCH_SIZE`` transitions. Otherwise it samples a batch, regresses
    ``policy_net``'s value for the taken action towards
    ``reward + GAMMA * max_a target_net(next_state)`` with a Huber loss,
    clips every gradient element to [-1, 1] and steps ``optimizer``.
    A transition whose ``next_state`` is ``None`` is treated as final and
    contributes a next-state value of 0.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Mask of the transitions that have a next state, plus those next states
    # as float32 tensors on the training device.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next = [torch.as_tensor(s, dtype=torch.float32, device=device)
                      for s in batch.next_state if s is not None]

    # .to(device) is a no-op for tensors that already live there.
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(batch.action).to(device)
    reward_batch = torch.cat(batch.reward).to(device)

    # Q(s_t, a): the model computes Q(s_t) for every action, then gather
    # selects the column of the action that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the "older" target_net; stays 0 for final states.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # Guard: torch.cat raises on an empty list, which would happen if every
    # sampled transition were final.
    if non_final_next:
        with torch.no_grad():
            next_state_values[non_final_mask] = \
                target_net(torch.cat(non_final_next)).max(1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Element-wise clamp of all gradients to [-1, 1]; unlike a manual loop
    # over param.grad this skips parameters that received no gradient.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [32]:
import importlib

In [33]:
# Pick up edits to the FluidEnv module without restarting the kernel.
importlib.reload(FluidEnv)

# NOTE(review): the meaning of the '' and 50 arguments is defined by FluidEnv,
# which is not visible from this notebook.
env = FluidEnv.FluidEnv(num_params, '', 50)

# Best reward seen so far and the state that produced it. Starting from -inf
# (rather than a magic finite sentinel) means the first finite reward always
# wins, and the empty tensor keeps `.tolist()` valid before any reward is
# recorded.
maxreward = float('-inf')
maxstate = torch.empty(0)
num_episodes = 500
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = env.reset()
    done = False
    t = 0  # steps taken in this episode
    while not done:
        # Select and perform an action
        action = select_action(state)
        next_state, reward, done, _ = env.step(torch.Tensor(actions[action.item()]))

        if reward > maxreward:
            maxreward = reward
            maxstate = next_state

        reward = torch.tensor([reward], device=device)
        # Store the transition in memory.
        # NOTE(review): next_state is stored even on the step where `done`
        # becomes True, so optimize_model never sees a final (None) next
        # state - confirm that bootstrapping past the episode end is wanted.
        memory.push(state.unsqueeze(0), action, next_state.unsqueeze(0), reward)

        # Move to the next state
        state = next_state

        # Perform one optimization step on the policy network
        optimize_model()
        t += 1
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
    # Report the last reward and final state of every episode.
    print('Episode {}\tReward: {}\t State: {}'.format(i_episode, reward, state.tolist()))

    if i_episode % 10 == 0:
        print('Episode {}\t Max Reward: {}\t Max State: {}'.format(i_episode, maxreward, maxstate.tolist()))

print('Complete', maxreward, maxstate.tolist())
plt.ioff()
plt.show()

Episode 0	Reward: tensor([-75.6399])	 State: [1.0, 0.6499999761581421, 0.4000000059604645, 0.550000011920929, 0.4000000059604645, 0.20000000298023224]
Episode 0	 Max Reward: -41.174991607666016	 Max State: [1.0, 0.6499999761581421, 0.3499999940395355, 0.550000011920929, 0.4000000059604645, 0.25]
Episode 1	Reward: tensor([-1452.1062])	 State: [0.75, 0.44999998807907104, 0.6499999761581421, 0.4000000059604645, 0.550000011920929, 0.0]
Episode 2	Reward: tensor([-8.6203])	 State: [0.30000001192092896, 0.550000011920929, 0.5, 0.6000000238418579, 1.0, 0.44999998807907104]
Episode 3	Reward: tensor([-8.3850])	 State: [0.25, 0.6499999761581421, 0.44999998807907104, 0.5, 0.949999988079071, 0.5]
Episode 4	Reward: tensor([-14.7798])	 State: [0.5, 0.5, 0.5, 0.550000011920929, 1.0, 0.550000011920929]
Episode 5	Reward: tensor([-16.8419])	 State: [0.550000011920929, 0.5, 0.44999998807907104, 0.44999998807907104, 1.0, 1.0]
Episode 6	Reward: tensor([-5.6689])	 State: [0.25, 0.44999998807907104, 0.5500000

Episode 56	Reward: tensor([-3.4730])	 State: [0.0, 0.4000000059604645, 0.20000000298023224, 0.6000000238418579, 0.6000000238418579, 0.550000011920929]
Episode 57	Reward: tensor([-3.7665])	 State: [0.0, 0.3499999940395355, 0.3499999940395355, 0.800000011920929, 0.8999999761581421, 0.44999998807907104]
Episode 58	Reward: tensor([-18.7416])	 State: [0.0, 0.4000000059604645, 0.0, 0.699999988079071, 0.550000011920929, 0.30000001192092896]
Episode 59	Reward: tensor([-43.2105])	 State: [0.0, 0.6000000238418579, 0.05000000074505806, 0.550000011920929, 0.550000011920929, 0.800000011920929]
Episode 60	Reward: tensor([-6.0335])	 State: [0.0, 0.0, 0.20000000298023224, 0.8999999761581421, 0.699999988079071, 0.6000000238418579]
Episode 60	 Max Reward: -0.27109068632125854	 Max State: [0.20000000298023224, 0.30000001192092896, 0.44999998807907104, 0.550000011920929, 0.75, 0.5]
Episode 61	Reward: tensor([-1.9700])	 State: [0.0, 0.0, 0.30000001192092896, 0.5, 0.6499999761581421, 0.44999998807907104]
Ep

Episode 111	Reward: tensor([-26.8582])	 State: [0.5, 0.0, 0.8999999761581421, 0.699999988079071, 0.8999999761581421, 0.6000000238418579]
Episode 112	Reward: tensor([-2.1681])	 State: [0.15000000596046448, 0.0, 0.550000011920929, 0.550000011920929, 0.75, 0.8500000238418579]
Episode 113	Reward: tensor([-6.9667])	 State: [0.4000000059604645, 0.10000000149011612, 0.44999998807907104, 0.75, 0.800000011920929, 1.0]
Episode 114	Reward: tensor([-2.4960])	 State: [0.0, 0.10000000149011612, 0.550000011920929, 0.6000000238418579, 0.800000011920929, 0.75]
Episode 115	Reward: tensor([-8.2919])	 State: [0.0, 0.4000000059604645, 0.6499999761581421, 0.800000011920929, 0.8999999761581421, 1.0]
Episode 116	Reward: tensor([-3.1561])	 State: [0.0, 0.10000000149011612, 0.44999998807907104, 0.699999988079071, 0.8500000238418579, 0.6499999761581421]
Episode 117	Reward: tensor([-1.9109])	 State: [0.0, 0.3499999940395355, 0.44999998807907104, 0.6499999761581421, 0.699999988079071, 0.4000000059604645]
Episode 1

Episode 164	Reward: tensor([-10.1474])	 State: [0.30000001192092896, 0.6000000238418579, 0.15000000596046448, 0.3499999940395355, 0.6499999761581421, 0.4000000059604645]
Episode 165	Reward: tensor([-8.7133])	 State: [0.3499999940395355, 0.5, 0.30000001192092896, 0.5, 0.949999988079071, 0.6499999761581421]
Episode 166	Reward: tensor([-3.6708])	 State: [0.0, 0.3499999940395355, 0.550000011920929, 0.30000001192092896, 0.949999988079071, 0.4000000059604645]
Episode 167	Reward: tensor([-17.8117])	 State: [0.5, 0.699999988079071, 0.0, 0.30000001192092896, 0.699999988079071, 0.6000000238418579]
Episode 168	Reward: tensor([-17.3078])	 State: [0.550000011920929, 0.0, 0.800000011920929, 0.25, 0.8999999761581421, 0.44999998807907104]
Episode 169	Reward: tensor([-8.5847])	 State: [0.44999998807907104, 0.20000000298023224, 0.4000000059604645, 0.699999988079071, 0.800000011920929, 0.699999988079071]
Episode 170	Reward: tensor([-6.6337])	 State: [0.10000000149011612, 0.0, 0.20000000298023224, 0.44999

Episode 216	Reward: tensor([-5.9115])	 State: [0.0, 0.44999998807907104, 0.30000001192092896, 0.550000011920929, 0.8500000238418579, 0.20000000298023224]
Episode 217	Reward: tensor([-10.4328])	 State: [0.15000000596046448, 0.20000000298023224, 0.10000000149011612, 0.10000000149011612, 0.5, 0.30000001192092896]
Episode 218	Reward: tensor([-3.9747])	 State: [0.0, 0.05000000074505806, 0.6000000238418579, 0.4000000059604645, 0.699999988079071, 0.4000000059604645]
Episode 219	Reward: tensor([-6.5015])	 State: [0.25, 0.6499999761581421, 0.0, 0.6000000238418579, 0.699999988079071, 0.3499999940395355]
Episode 220	Reward: tensor([-5.7183])	 State: [0.0, 0.550000011920929, 0.30000001192092896, 0.550000011920929, 0.6499999761581421, 0.30000001192092896]
Episode 220	 Max Reward: -5.650676030199975e-06	 Max State: [0.20000000298023224, 0.30000001192092896, 0.4000000059604645, 0.5, 0.699999988079071, 0.6000000238418579]
Episode 221	Reward: tensor([-22.2574])	 State: [0.3499999940395355, 0.4499999880

KeyboardInterrupt: 

In [23]:
# Show the discrete action table: one row per action, one column per parameter.
actions

[[0.05, 0, 0, 0, 0, 0],
 [0, 0.05, 0, 0, 0, 0],
 [0, 0, 0.05, 0, 0, 0],
 [0, 0, 0, 0.05, 0, 0],
 [0, 0, 0, 0, 0.05, 0],
 [0, 0, 0, 0, 0, 0.05],
 [-0.05, 0, 0, 0, 0, 0],
 [0, -0.05, 0, 0, 0, 0],
 [0, 0, -0.05, 0, 0, 0],
 [0, 0, 0, -0.05, 0, 0],
 [0, 0, 0, 0, -0.05, 0],
 [0, 0, 0, 0, 0, -0.05]]