In [1]:
import os
import time
from itertools import count

import gym
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import torch
from torch.distributions import Categorical
from comet_ml import Experiment
%matplotlib inline
import matplotlib.pyplot as plt

import models

In [2]:
# Every tunable of the run lives in this one mapping; it is also logged to Comet.
hyper_dict = dict(
    # --- optimisation ---
    LEARNING_RATE=1e-5,         # learning rate given to the Adam optimizer
    GAMMA=0.99,                 # discount rate applied to future rewards
    N_UPDATES=100000,           # number of training iterations (one model update each)
    GAMES_PER_UPDATE=3,         # games played before each model update
    # --- checkpointing ---
    # NOTE: the key spelling "UPATE" is kept on purpose, getGradients() reads this exact key.
    SAVE_EVERY_UPATE=25,        # save the weights every this many iterations
    SAVE_FILE="./models/pong_policy_9.state",
    RESUME_FILE="./models/pong_policy_8.state",
    RESUME=True,                # load RESUME_FILE into the policy before training
    # --- display ---
    RENDER=True,                # call env.render() on every step
    SLOW_DOWN=0.02,             # seconds to sleep after every step; None disables the pause
    # --- experiment tracking / environment ---
    PROJECT_NAME="pong-policy",
    ENV="Pong-v0",
)

# Environment selected by the ENV entry of the configuration.
env = gym.make(hyper_dict["ENV"])

# Policy network, constructed with the size of the environment's action space
# and then moved to the GPU.
nnpolicy = models.ConvNet(env.action_space.n)
nnpolicy = nnpolicy.cuda()

# Optionally continue training from previously saved weights.
if hyper_dict["RESUME"]:
    nnpolicy.load_state_dict(torch.load(hyper_dict["RESUME_FILE"]))

def prepro(I):
    """Turn a raw frame into a binary, downsampled, single-channel image.

    Rows 35..194 of the frame are kept, every second row and column is taken,
    and only colour channel 0 is used. Pixels whose value is 0, 144 or 109
    (the two background colours) become 0.0; every other pixel (paddles,
    ball) becomes 1.0.

    The input frame is left untouched: the mask is computed into a new array
    instead of being written through a view of ``I``.

    Args:
        I: array indexable as ``[row, column, channel]`` with at least 195
            rows (a 210x160x3 frame yields an 80x80 result).

    Returns:
        ``np.float64`` array with a leading axis of length 1, i.e. shape
        ``(1, 80, 80)`` for a 210x160 frame.
    """
    # crop + downsample by a factor of 2 + first colour channel, in one slice
    frame = np.asarray(I)[35:195:2, ::2, 0]
    # foreground = anything that is neither already 0 nor a background colour
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    # np.float64 replaces the np.float alias that was removed from NumPy
    return np.expand_dims(foreground.astype(np.float64), axis=0)


def get_outputs(obs):
    """Run the policy network on a single observation.

    Args:
        obs: NumPy array holding one observation, without a batch axis.

    Returns:
        Whatever ``nnpolicy`` returns for a batch that contains only ``obs``.
    """
    # Follow the device the model lives on instead of assuming CUDA.
    device = next(nnpolicy.parameters()).device
    # float32 tensor with a leading batch axis of size 1
    batch = torch.from_numpy(obs).float().unsqueeze(0).to(device)
    # Tensors track gradients themselves since PyTorch 0.4, so the deprecated
    # Variable wrapper is not needed.
    return nnpolicy(batch)

In [3]:
# The Comet API key is a secret, so it is taken from the environment rather
# than stored in the notebook.
# NOTE(review): any key that was ever committed in this cell should be treated
# as exposed and revoked in the Comet account settings.
comet_api_key = os.environ.get("COMET_API_KEY")
if not comet_api_key:
    raise RuntimeError(
        "Set the COMET_API_KEY environment variable before creating the Comet experiment."
    )

experiment = Experiment(api_key=comet_api_key,
                        project_name=hyper_dict['PROJECT_NAME'])
# Record the full configuration alongside the run.
experiment.log_multiple_params(hyper_dict)

jupyter comet_ml enable
COMET INFO: old comet version (1.0.9) detected. current: 1.0.13 please update your comet lib with command: `pip install --no-cache-dir --upgrade comet_ml`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/syrios/pong-policy/0c6d2a29788e423ba12a52609533f5a0



In [4]:
def discount_rewards(rewards, discount_rate):
    """
    Compute the discounted return for every step of one game.

    Walking backwards through the game, each entry becomes its own reward
    plus discount_rate times the entry that follows it, so the last entry
    is just the last reward and earlier entries accumulate the discounted
    future.

    Returns a NumPy float array with one value per entry of `rewards`.
    """
    n_steps = len(rewards)
    returns = np.empty(n_steps)
    running_return = 0
    # iterate from the final step back to the first
    for idx in range(n_steps - 1, -1, -1):
        running_return = rewards[idx] + discount_rate * running_return
        returns[idx] = running_return
    return returns

def discount_and_normalize_rewards(rewards, discount_rate):
    """
    Discount the rewards of every game, then normalize them jointly.

    `rewards` is a list with one reward list per game. Each game is
    discounted separately with discount_rewards; the mean and standard
    deviation are taken over all games pooled together, and every game's
    discounted rewards are shifted and scaled by them.

    Returns a list with one normalized array per game.
    """
    per_game_returns = []
    for game_rewards in rewards:
        per_game_returns.append(discount_rewards(game_rewards, discount_rate))

    # statistics over every step of every game
    pooled = np.concatenate(per_game_returns)
    reward_mean = pooled.mean()
    reward_std = pooled.std()
    # small constant keeps the division defined when the std is zero
    eps = np.finfo(np.float32).eps

    normalized = []
    for game_returns in per_game_returns:
        normalized.append((game_returns - reward_mean) / (reward_std + eps))
    return normalized

def update_model(all_rewards, all_gradients, discount_rate):
    """
    Apply one policy-gradient update from a batch of played games.

    The rewards of every game are discounted and normalized. For each game
    the loss is the sum over its steps of -1 * normalized_reward * log_prob
    (the sign is flipped because the optimizer minimizes while the
    log-probability objective is to be maximized), so actions followed by
    good outcomes are made more likely and actions followed by bad outcomes
    less likely. The per-game losses are averaged and a single optimizer
    step is taken.

    Args:
        all_rewards: list with one list of per-step rewards per game.
        all_gradients: list with one list of per-step action
            log-probability tensors per game, aligned with all_rewards.
        discount_rate: discount factor passed to
            discount_and_normalize_rewards.

    Uses the module-level `optimizer`.
    """
    returns = discount_and_normalize_rewards(all_rewards, discount_rate)

    game_losses = []
    for game_returns, game_log_probs in zip(returns, all_gradients):
        n_steps = len(game_log_probs)
        # One row per step. Stacking once replaces the per-step Python loop,
        # which created a separate autograd node for every time step.
        log_probs = torch.stack(list(game_log_probs)).reshape(n_steps, -1)
        # Put the weights on the same device/dtype as the log-probs rather
        # than assuming CUDA.
        weights = torch.as_tensor(game_returns,
                                  dtype=log_probs.dtype,
                                  device=log_probs.device).unsqueeze(1)
        # Sum over the steps of this game; result has one entry per column.
        game_losses.append(-(weights * log_probs).sum(dim=0))

    loss = torch.cat(game_losses)
    optimizer.zero_grad()
    # Mean over the games of the batch smooths the update (mini-batch).
    policy_loss = loss.mean()
    policy_loss.backward()
    optimizer.step()
    
    
def getGradients(n_iters, n_games_per_gradient, discount_rate):
    """
    Train the policy by repeatedly playing batches of games.

    For each of the n_iters iterations:
        Play n_games_per_gradient games, sampling every action from the
        distribution built from the policy output
        Store the per-step rewards and action log-probabilities
        Hand the whole batch to update_model
        Save the weights every SAVE_EVERY_UPATE iterations

    The total reward of every game and its running average are logged to
    `experiment`.
    """
    running_reward = None
    for iteration in range(n_iters):
        batch_rewards = []
        batch_log_probs = []
        for game in range(n_games_per_gradient):
            # index of this game over the whole run, used as the metric step
            game_index = game + iteration * n_games_per_gradient
            episode_rewards = []
            episode_log_probs = []
            episode_total = 0

            obs = env.reset()
            last_frame = None
            done = False
            while not done:
                if hyper_dict['RENDER']:
                    env.render()
                # the network sees the change between consecutive
                # preprocessed frames (all zeros on the first step)
                frame = prepro(obs)
                if last_frame is None:
                    frame_delta = np.zeros((1,80,80))
                else:
                    frame_delta = frame - last_frame
                last_frame = frame
                # sample an action from the distribution given by the policy
                policy = Categorical(get_outputs(frame_delta))
                sampled_action = policy.sample()
                # keep the log-probability; update_model builds the loss from it
                episode_log_probs.append(policy.log_prob(sampled_action))
                # apply the action and record the reward
                obs, reward, done, _ = env.step(sampled_action.item())
                episode_total += reward
                episode_rewards.append(reward)
                if hyper_dict["SLOW_DOWN"] is not None:
                    time.sleep(hyper_dict["SLOW_DOWN"])

            # game finished: update the running average and log both metrics
            if running_reward is None:
                running_reward = episode_total
            else:
                running_reward = running_reward * 0.99 + episode_total * 0.01
            experiment.log_metric("reward sum", episode_total, step=game_index)
            experiment.log_metric("reward mean", running_reward, step=game_index)

            batch_rewards.append(episode_rewards)
            batch_log_probs.append(episode_log_probs)

        # one model update per batch of games
        update_model(batch_rewards, batch_log_probs, discount_rate)
        if iteration % hyper_dict["SAVE_EVERY_UPATE"] == 0:
            print("Iteration: {}".format(iteration))
            print("Saving Model...")
            torch.save(nnpolicy.state_dict(), hyper_dict["SAVE_FILE"])


In [None]:
# Adam over every parameter of the policy network. update_model() relies on
# this module-level name, so this cell must run before training starts.
optimizer = optim.Adam(
    params=nnpolicy.parameters(),
    lr=hyper_dict["LEARNING_RATE"],
)

In [None]:
# Start training with the settings stored in hyper_dict.
getGradients(
    n_iters=hyper_dict["N_UPDATES"],
    n_games_per_gradient=hyper_dict["GAMES_PER_UPDATE"],
    discount_rate=hyper_dict["GAMMA"],
)