In [21]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from collections import namedtuple
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
import random
import math
import FluidEnv
%matplotlib inline

In [22]:
# Seed every RNG this notebook draws from so that runs are repeatable.
# Python's `random` drives epsilon-greedy exploration and replay sampling;
# torch's generator is used when the networks are constructed. NumPy is
# seeded as well in case FluidEnv relies on it (not visible from here).
random.seed(1)
np.random.seed(1)
# Kept last so the cell still displays the torch Generator object.
torch.manual_seed(1)

<torch._C.Generator at 0x7f8759931f90>

In [23]:
# --- Hyperparameters --------------------------------------------------------
# Problem size and action step: `num_params` sizes the action table and is
# passed to FluidEnv; `change` is the +/- increment a single action applies
# to one parameter.
num_params = 6
change = 5e-2

# Declared here but not read by any of the cells shown in this notebook
# (the update rule uses the upper-case GAMMA defined with the DQN settings).
learning_rate = 5e-5
gamma = 1
is_ipython = True

In [24]:
# Prefer the GPU whenever a CUDA runtime is usable; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [25]:
# One environment step as stored in the replay buffer.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` records.

    New transitions are appended until ``capacity`` entries are stored; after
    that each push overwrites the oldest entry.

    Attributes are kept public because callers may inspect them:
        capacity: maximum number of stored transitions.
        memory:   list holding the stored ``Transition`` objects.
        position: index that the next push will write to.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition.

        Args:
            *args: the four ``Transition`` fields, in order
                (state, action, next_state, reward).

        Raises:
            TypeError: if the arguments do not match the ``Transition``
                fields. The buffer is left untouched in that case.
        """
        # Build the record before touching the buffer: if construction fails
        # no ``None`` placeholder is left behind to break a later sample().
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: (from ``random.sample``) if ``batch_size`` exceeds the
                number of stored transitions.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [26]:
class DQN(nn.Module):
    """Fully connected Q-network: five ReLU hidden layers and a linear head.

    Args:
        outputs: number of values produced per input state (one per action).
        inputs: width of the state vector. When omitted it falls back to
            ``outputs // 2``, the size the first layer was previously
            hard-wired to, so existing ``DQN(n_actions)`` calls are unchanged.
    """

    def __init__(self, outputs, inputs=None):
        super(DQN, self).__init__()
        if inputs is None:
            inputs = outputs // 2
        # Layers are created in the same order as before so that a given
        # torch seed still yields the same initial weights.
        self.lin1 = nn.Linear(inputs, 32)
        self.lin2 = nn.Linear(32, 64)
        self.lin3 = nn.Linear(64, 64)
        self.lin4 = nn.Linear(64, 64)
        self.lin5 = nn.Linear(64, 32)
        self.head = nn.Linear(32, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. The last dimension of the result has `outputs`
    # entries, one per action.
    def forward(self, x):
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = F.relu(self.lin3(x))
        x = F.relu(self.lin4(x))
        x = F.relu(self.lin5(x))
        return self.head(x)

In [27]:
# Replay / optimisation settings.
BATCH_SIZE = 64
GAMMA = 1
TARGET_UPDATE = 2

# Epsilon-greedy schedule read by select_action: starts at EPS_START and
# decays exponentially towards EPS_END with time constant EPS_DECAY.
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200

# Discrete action table: for every parameter one "+change" row and one
# "-change" row, followed by a final all-zero (no-op) row.
actions_add = [
    [change if col == row else 0 for col in range(num_params)]
    for row in range(num_params)
]
actions_sub = [
    [-change if col == row else 0 for col in range(num_params)]
    for row in range(num_params)
]
actions = actions_add + actions_sub + [[0] * num_params]
n_actions = 2 * num_params + 1

# Two networks with identical weights; the target copy is put in eval mode
# and is only refreshed by copying the policy network's state dict.
policy_net = DQN(n_actions).to(device)
target_net = DQN(n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# NOTE(review): RMSprop is built with its library defaults; the notebook's
# `learning_rate` value is not passed in. Confirm that this is intended.
optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(200)

# Number of select_action calls so far; drives the epsilon decay.
steps_done = 0

def select_action(state):
    """Pick an action index for ``state`` with an epsilon-greedy policy.

    The exploration probability starts at EPS_START and decays exponentially
    towards EPS_END as the global ``steps_done`` counter grows (time constant
    EPS_DECAY). The counter is incremented on every call.

    Args:
        state: tensor accepted by ``policy_net`` describing a single state.

    Returns:
        A ``(1, 1)`` ``torch.long`` tensor on ``device`` holding the chosen
        action index in ``[0, n_actions)``.
    """
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # The network lives on `device`; move the state there as well so
            # the forward pass also works when CUDA is selected.
            q_values = policy_net(state.to(device))
            # argmax() over the flattened output is the index of the largest
            # Q-value, i.e. the greedy action for a single state.
            return q_values.argmax().view(1, 1)
    # Explore: uniformly random action index.
    return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)


In [28]:
def optimize_model():
    """Run one optimisation step of ``policy_net`` on a replay minibatch.

    Does nothing until ``memory`` holds at least BATCH_SIZE transitions.
    Otherwise samples BATCH_SIZE transitions, builds the one-step target
    ``reward + GAMMA * max_a target_net(next_state)[a]`` (the bootstrap term
    is 0 for transitions whose ``next_state`` is ``None``), minimises the
    Huber loss between that target and ``policy_net``'s value for the action
    taken, clips each gradient element to [-1, 1] and steps ``optimizer``.

    Returns:
        None.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Mask of transitions that have a stored next state (a final state is
    # represented by None), plus the list of those next states.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    # Everything is moved to `device` so the networks (which live there) can
    # consume the batch when CUDA is selected; on CPU these are no-ops.
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(batch.action).to(device)
    reward_batch = torch.cat(batch.reward).to(device)

    # Q(s_t, a): the model computes Q(s_t) for every action, then gather
    # selects the column of the action that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the "older" target_net; entries for final states stay 0.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # Guard: torch.cat raises on an empty list, which would happen if every
    # sampled transition were final.
    if non_final_next_states:
        with torch.no_grad():
            next_states = torch.cat(non_final_next_states).to(device)
            next_state_values[non_final_mask] = target_net(next_states).max(1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Element-wise clip of every gradient to [-1, 1]; unlike clamping
    # param.grad by hand this skips parameters whose grad is None.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [29]:
import importlib

In [30]:
# Re-import FluidEnv so that edits to the module are picked up without
# restarting the kernel.
importlib.reload(FluidEnv)

env = FluidEnv.FluidEnv(num_params, '', 60)

# Best single-step reward seen so far and the state that produced it.
# -inf / None sentinels: any real reward beats -inf, and the reporting below
# tolerates a best state that was never set (the previous list sentinel had
# no .tolist() and would have crashed the print).
maxreward = float('-inf')
maxstate = None
num_episodes = 500
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = env.reset()
    done = False
    while not done:
        # Select and perform an action
        action = select_action(state)
        next_state, reward, done, _ = env.step(torch.Tensor(actions[action.item()]))

        if reward > maxreward:
            maxreward = reward
            # Keep a private copy so later changes to the environment's
            # tensor cannot alter the recorded best state.
            maxstate = next_state.detach().clone()

        reward = torch.tensor([reward], device=device)
        # Store the transition in memory.
        # NOTE(review): next_state is stored even when `done` is True, so
        # optimize_model never sees a final (None) next state and the last
        # step of an episode still bootstraps from target_net. That is fine
        # if `done` only signals a step limit; if it marks a true terminal
        # state, None should be stored instead. Confirm against FluidEnv.
        memory.push(state.unsqueeze(0), action, next_state.unsqueeze(0), reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
    # Per-episode report: reward and state of the episode's last step.
    print('Episode {}\tReward: {}\t State: {}'.format(i_episode, reward, state.tolist()))

    if i_episode % 10 == 0:
        print('Episode {}\t Max Reward: {}\t Max State: {}'.format(
            i_episode, maxreward, maxstate.tolist() if maxstate is not None else None))

print('Complete', maxreward, maxstate.tolist() if maxstate is not None else None)
# No figure is created in this cell; the calls are kept so the cell's effect
# on matplotlib's interactive mode is unchanged.
plt.ioff()
plt.show()

Episode 0	Reward: tensor([-1053.5951])	 State: [0.6000000238418579, 0.25, 0.550000011920929, 0.5, 0.30000001192092896, 0.0]
Episode 0	 Max Reward: -359.64251708984375	 Max State: [0.5, 0.550000011920929, 0.6000000238418579, 0.5, 0.5, 0.44999998807907104]
Episode 1	Reward: tensor([-385.7376])	 State: [0.25, 1.0, 0.6499999761581421, 0.3499999940395355, 0.5, 0.6499999761581421]
Episode 2	Reward: tensor([-445.3192])	 State: [0.5, 0.6499999761581421, 0.6499999761581421, 0.0, 0.550000011920929, 0.4000000059604645]
Episode 3	Reward: tensor([-535.8611])	 State: [0.0, 0.25, 0.5, 0.0, 0.44999998807907104, 0.800000011920929]
Episode 4	Reward: tensor([-404.6412])	 State: [0.550000011920929, 0.6499999761581421, 1.0, 0.800000011920929, 0.44999998807907104, 0.44999998807907104]
Episode 5	Reward: tensor([-377.8961])	 State: [0.44999998807907104, 0.550000011920929, 0.5, 1.0, 0.44999998807907104, 0.5]
Episode 6	Reward: tensor([-416.3029])	 State: [0.15000000596046448, 0.6499999761581421, 0.4000000059604

Episode 61	Reward: tensor([-756.6718])	 State: [0.3499999940395355, 0.05000000074505806, 1.0, 0.5, 0.550000011920929, 0.0]
Episode 62	Reward: tensor([-509.9061])	 State: [0.6499999761581421, 0.25, 1.0, 0.30000001192092896, 0.5, 0.5]
Episode 63	Reward: tensor([-540.1023])	 State: [0.05000000074505806, 0.550000011920929, 0.6000000238418579, 0.0, 0.6000000238418579, 0.20000000298023224]
Episode 64	Reward: tensor([-563.8500])	 State: [0.5, 0.699999988079071, 0.699999988079071, 0.5, 0.800000011920929, 1.0]
Episode 65	Reward: tensor([-455.8900])	 State: [0.5, 0.550000011920929, 0.5, 0.5, 0.6000000238418579, 1.0]
Episode 66	Reward: tensor([-467.0141])	 State: [0.5, 0.8500000238418579, 0.5, 0.44999998807907104, 0.0, 1.0]
Episode 67	Reward: tensor([-615.2443])	 State: [1.0, 0.949999988079071, 0.05000000074505806, 0.949999988079071, 0.44999998807907104, 0.550000011920929]
Episode 68	Reward: tensor([-621.9124])	 State: [1.0, 0.8999999761581421, 0.550000011920929, 1.0, 0.5, 0.44999998807907104]
Ep

Episode 124	Reward: tensor([-428.1645])	 State: [0.15000000596046448, 0.5, 0.75, 1.0, 0.5, 1.0]
Episode 125	Reward: tensor([-625.4690])	 State: [1.0, 0.699999988079071, 0.6000000238418579, 1.0, 0.550000011920929, 0.5]
Episode 126	Reward: tensor([-720.5627])	 State: [1.0, 1.0, 0.5, 0.949999988079071, 0.8999999761581421, 0.5]
Episode 127	Reward: tensor([-578.8191])	 State: [0.6499999761581421, 0.30000001192092896, 0.0, 1.0, 0.4000000059604645, 0.5]
Episode 128	Reward: tensor([-385.3286])	 State: [0.5, 0.30000001192092896, 0.5, 1.0, 0.44999998807907104, 0.5]
Episode 129	Reward: tensor([-422.0763])	 State: [0.4000000059604645, 0.5, 0.800000011920929, 0.0, 0.44999998807907104, 0.5]
Episode 130	Reward: tensor([-505.7321])	 State: [0.6000000238418579, 0.75, 0.5, 0.0, 0.550000011920929, 0.4000000059604645]
Episode 130	 Max Reward: -288.9600830078125	 Max State: [0.4000000059604645, 0.6499999761581421, 1.0, 0.6000000238418579, 0.5, 0.550000011920929]
Episode 131	Reward: tensor([-437.3762])	 Sta

Episode 184	Reward: tensor([-435.4691])	 State: [0.3499999940395355, 0.5, 0.699999988079071, 0.0, 0.5, 0.550000011920929]
Episode 185	Reward: tensor([-562.0112])	 State: [0.10000000149011612, 0.5, 0.4000000059604645, 0.0, 0.30000001192092896, 0.5]
Episode 186	Reward: tensor([-704.2757])	 State: [0.949999988079071, 0.75, 1.0, 0.0, 0.6499999761581421, 0.5]
Episode 187	Reward: tensor([-560.4247])	 State: [0.6499999761581421, 0.550000011920929, 0.75, 1.0, 0.6499999761581421, 0.75]
Episode 188	Reward: tensor([-438.7585])	 State: [0.5, 0.5, 0.550000011920929, 1.0, 0.20000000298023224, 0.5]
Episode 189	Reward: tensor([-493.5969])	 State: [0.5, 0.5, 0.5, 1.0, 0.0, 0.550000011920929]
Episode 190	Reward: tensor([-406.2553])	 State: [0.4000000059604645, 0.5, 0.550000011920929, 1.0, 0.550000011920929, 0.25]
Episode 190	 Max Reward: -288.9600830078125	 Max State: [0.4000000059604645, 0.6499999761581421, 1.0, 0.6000000238418579, 0.5, 0.550000011920929]
Episode 191	Reward: tensor([-471.0737])	 State:

Episode 245	Reward: tensor([-472.1459])	 State: [0.44999998807907104, 0.5, 1.0, 0.5, 0.05000000074505806, 0.5]
Episode 246	Reward: tensor([-474.3829])	 State: [0.44999998807907104, 0.3499999940395355, 0.949999988079071, 0.5, 0.0, 0.30000001192092896]
Episode 247	Reward: tensor([-428.9323])	 State: [0.0, 0.6000000238418579, 1.0, 0.5, 0.6499999761581421, 0.44999998807907104]
Episode 248	Reward: tensor([-483.6745])	 State: [0.0, 0.5, 0.4000000059604645, 0.550000011920929, 0.5, 0.44999998807907104]
Episode 249	Reward: tensor([-566.1998])	 State: [0.10000000149011612, 0.6000000238418579, 0.0, 0.699999988079071, 0.5, 0.4000000059604645]
Episode 250	Reward: tensor([-463.4248])	 State: [0.550000011920929, 0.6000000238418579, 0.75, 0.6499999761581421, 0.0, 1.0]
Episode 250	 Max Reward: -288.9600830078125	 Max State: [0.4000000059604645, 0.6499999761581421, 1.0, 0.6000000238418579, 0.5, 0.550000011920929]
Episode 251	Reward: tensor([-520.1821])	 State: [0.10000000149011612, 0.20000000298023224, 

Episode 303	Reward: tensor([-400.4512])	 State: [0.5, 0.0, 0.30000001192092896, 0.550000011920929, 0.44999998807907104, 0.5]
Episode 304	Reward: tensor([-368.7222])	 State: [0.5, 0.44999998807907104, 0.5, 0.550000011920929, 0.550000011920929, 0.5]
Episode 305	Reward: tensor([-385.1554])	 State: [0.5, 0.4000000059604645, 0.5, 0.5, 0.550000011920929, 0.4000000059604645]
Episode 306	Reward: tensor([-500.7650])	 State: [0.5, 0.0, 0.3499999940395355, 0.699999988079071, 0.4000000059604645, 0.15000000596046448]
Episode 307	Reward: tensor([-583.6570])	 State: [0.800000011920929, 0.05000000074505806, 0.5, 0.0, 0.05000000074505806, 0.5]
Episode 308	Reward: tensor([-467.9384])	 State: [0.550000011920929, 0.0, 0.44999998807907104, 1.0, 0.6000000238418579, 0.5]
Episode 309	Reward: tensor([-401.4454])	 State: [0.5, 0.05000000074505806, 0.20000000298023224, 0.6499999761581421, 0.5, 0.5]
Episode 310	Reward: tensor([-381.8508])	 State: [0.5, 0.0, 0.5, 0.5, 0.44999998807907104, 0.5]
Episode 310	 Max Rew

KeyboardInterrupt: 

In [None]:
# Display the action table: one row per discrete action, one entry per
# parameter giving the increment that action applies.
actions