In [258]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
import FluidEnv
%matplotlib inline

In [259]:
# Seed PyTorch's global RNG so torch-driven randomness in this notebook (layer
# initialisation, dropout masks, action sampling) repeats across runs.
# Only torch is seeded here; NumPy and anything random inside FluidEnv are not.
torch.manual_seed(1)

<torch._C.Generator at 0x7fcb86717fb0>

In [267]:
# Hyperparameters
learning_rate = 0.0005  # step size handed to the Adam optimizer
gamma = 0.99  # discount factor applied to future rewards in update_policy()
num_params = 4  # width of both the policy's input (state) and output (action) vectors
max_change = 0.05  # factor the sampled action is multiplied by in select_action()

In [274]:
class Policy(nn.Module):
    """Two-layer network that maps a state vector to per-dimension action means.

    Architecture: Linear(state_space, 128) -> Dropout(0.6) -> ReLU ->
    Linear(128, action_space) -> Sigmoid, so every output lies in (0, 1).
    Both linear layers are bias-free.

    The instance also carries the bookkeeping lists/tensors that the
    training code reads and writes between optimisation steps.

    Args:
        state_space: input width; defaults to the module-level ``num_params``.
        action_space: output width; defaults to the module-level ``num_params``.
        discount: reward discount stored as ``self.gamma``; defaults to the
            module-level ``gamma``.
    """

    def __init__(self, state_space=None, action_space=None, discount=None):
        super(Policy, self).__init__()
        # Fall back to the notebook-level hyperparameters when no explicit
        # value is supplied, which keeps the bare `Policy()` call working.
        self.state_space = num_params if state_space is None else state_space
        self.action_space = num_params if action_space is None else action_space

        self.l1 = nn.Linear(self.state_space, 128, bias=False)
        self.l2 = nn.Linear(128, self.action_space, bias=False)
        # Dropout is registered as a submodule so that train()/eval() toggle it.
        # Building it inside forward() would create a new layer on every call,
        # and new layers always start in training mode, so eval() had no effect.
        # It has no parameters, so state_dict keys are unchanged.
        self.dropout = nn.Dropout(p=0.6)

        self.gamma = gamma if discount is None else discount

        # Per-episode buffers: log-probabilities of the actions taken and the
        # rewards received. Both are reset after each policy update.
        self.policy_history = torch.empty(0)
        self.reward_episode = []
        # Per-run history: total reward and loss of every finished episode.
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):
        """Return the sigmoid-squashed network output for state tensor ``x``."""
        hidden = torch.relu(self.dropout(self.l1(x)))
        return torch.sigmoid(self.l2(hidden))

In [275]:
def select_action(state):
    """Sample a continuous action for ``state`` and record its log-probability.

    The policy output is used as the mean of one independent Normal
    distribution per action dimension; all dimensions share a fixed standard
    deviation of exp(0.1). The joint log-probability of the drawn sample is
    appended to ``policy.policy_history`` for the next policy update.

    Args:
        state: tensor fed directly to ``policy``
            (presumably of length ``num_params`` -- TODO confirm against FluidEnv).

    Returns:
        The sampled values multiplied by ``max_change``; carries no gradient.
    """
    means = policy(state)
    # NOTE(review): sigma = exp(0.1) ~= 1.105 is wide compared with the
    # sigmoid-bounded means in (0, 1) -- confirm this spread is intended.
    sigs = torch.exp(torch.full((num_params,), 0.1))

    dists = torch.distributions.Normal(means, sigs)
    samples = dists.sample()

    # The dimensions are independent, so the joint log-probability is the SUM
    # of the per-dimension log-probabilities. Multiplying them, as the code
    # did before, gives a value whose sign and scale are meaningless as a
    # policy-gradient term.
    joint_log_prob = dists.log_prob(samples).sum()
    policy.policy_history = torch.cat(
        (policy.policy_history, joint_log_prob.reshape(1))
    )

    return samples * max_change

In [276]:
def update_policy():
    """Run one REINFORCE update from the episode stored on ``policy``.

    Reads ``policy.reward_episode`` and ``policy.policy_history``, takes one
    ``optimizer`` step on the loss ``-sum(log_prob * return)``, appends the
    loss and the episode's total reward to the history lists, and clears the
    per-episode buffers.

    Returns are standardised (zero mean, unit sample standard deviation) when
    the episode has more than one step. A one-step episode uses its raw
    return, because the sample standard deviation of a single value is NaN
    and would otherwise turn the loss and the weights into NaN.
    """
    # Discounted return for every time step, accumulated from the last step
    # backwards. Appending and reversing once avoids the quadratic cost of
    # inserting at the front of the list.
    running = 0
    returns = []
    for r in reversed(policy.reward_episode):
        running = r + policy.gamma * running
        returns.append(running)
    returns.reverse()

    returns = torch.tensor(returns, dtype=torch.float32)
    if returns.numel() > 1:
        # eps keeps the division finite when all returns are identical.
        eps = np.finfo(np.float32).eps
        returns = (returns - returns.mean()) / (returns.std() + eps)

    # Policy-gradient loss: maximise return-weighted log-probability.
    loss = -(policy.policy_history * returns).sum()

    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Save and reset the episode buffers. The loss is stored as a detached
    # 0-d tensor so the history does not keep the autograd graph alive.
    policy.loss_history.append(loss.detach())
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = torch.empty(0)
    policy.reward_episode = []

In [277]:
import importlib

In [278]:
# Re-import FluidEnv so edits to that module take effect without a kernel restart.
importlib.reload(FluidEnv)

# NOTE(review): the meaning of the constructor arguments (4, wav path, 20) is
# defined inside FluidEnv and is not visible here; the leading 4 presumably has
# to agree with num_params -- TODO confirm.
env = FluidEnv.FluidEnv(4, './data/sound.wav', 20)
# Fresh policy network plus the Adam optimizer that update_policy() steps.
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

def main(episodes, log_every=1):
    """Train the global ``policy`` on the global ``env`` for ``episodes`` episodes.

    Each episode resets the environment, steps it with actions from
    ``select_action`` until the environment reports ``done``, stores every
    reward on ``policy.reward_episode``, and then calls ``update_policy()``.

    Args:
        episodes: number of episodes to run.
        log_every: print a progress line every this many episodes. The default
            of 1 logs every episode, as before. 0 or None disables logging.
    """
    for episode in range(episodes):
        state = env.reset()  # Reset environment and record the starting state
        done = False

        while not done:
            action = select_action(state)
            # Step through the environment using the chosen action. The action
            # is passed detached so the environment never sees an autograd graph.
            state, reward, done, _ = env.step(action.detach())

            # Save reward
            policy.reward_episode.append(reward)

        update_policy()

        # The logged reward and state are those of the episode's final step.
        if log_every and episode % log_every == 0:
            print('Episode {}\tReward: {}\t State: {}'.format(episode, reward, state.tolist()))


In [1]:
# Train for a fixed number of episodes. This cell depends on `main` (and the
# `env` / `policy` / `optimizer` globals it uses) having been defined by the
# earlier cells in the current kernel; run first in a fresh kernel it raises
# NameError.
episodes = 500
main(episodes)

NameError: name 'main' is not defined