# Train / test networks with Gym model with additional steps

Use a trained policy (trained with the original weight = 10 kg) to run the pendulum model with weight = 5 kg.

In [None]:
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from torch.distributions import MultivariateNormal

import numpy as np
import torch.optim as optim
import matplotlib.pyplot as plt
import gym
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import random


# Policy

In [None]:
class Network(nn.Module):
    """Fully connected network with two ReLU hidden layers and a bounded output.

    The final linear layer is squashed with ``2 * tanh``, so every output
    component lies in the interval [-2, 2].
    """

    def __init__(self, size_in, size_out, size_hidden):
        """Build the three linear layers.

        Args:
            size_in: Number of input features.
            size_out: Number of output features.
            size_hidden: Width shared by both hidden layers.
        """
        super().__init__()
        self.layer1 = nn.Linear(size_in, size_hidden)
        self.layer2 = nn.Linear(size_hidden, size_hidden)
        self.layer3 = nn.Linear(size_hidden, size_out)

    def forward(self, obs):
        """Map an observation (or a batch of observations) to the network output.

        Args:
            obs: A ``numpy.ndarray`` or a ``torch.Tensor``; it is cast to
                float32 before entering the first layer.

        Returns:
            A float32 tensor equal to ``2 * tanh(layer3(...))``.
        """
        # NumPy observations are turned into float tensors first.
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float)

        hidden = F.relu(self.layer1(obs.float()))
        hidden = F.relu(self.layer2(hidden))

        # tanh is applied functionally (no module attribute), so instances
        # restored without running __init__ still work.
        squashed = 2 * torch.tanh(self.layer3(hidden))
        return squashed.float()


# PPO Algo

In [None]:
def PPO_gen(seed=0, profix="", timesteps_per_batch=5000, name="", m=None):
    """Continue PPO training of a saved actor/critic pair on ``Pendulum-v1``.

    The function loads ``name + "_actor.pth"`` and ``name + "_critic.pth"``,
    runs a fixed number of collect/update iterations, and then writes the
    following files, all prefixed with ``f"{name}_{timesteps_per_batch}_gen"``
    (plus ``f"_m{m}"`` when ``m`` is given):

    * ``*_collect_rewards.jpg`` / ``*_var.jpg`` -- reward and variance plots
    * ``*_actor.pth`` / ``*_critic.pth``        -- the updated networks
    * ``*-rew.csv`` / ``*-var.csv``             -- one value per iteration

    The code is written against the Gym API in which ``env.reset()`` returns
    only the observation and ``env.step()`` returns a 4-tuple.

    Args:
        seed: Seed passed to ``numpy``, ``torch`` and ``random``.
        profix: Unused; kept so existing callers keep working.
        timesteps_per_batch: Minimum number of environment steps collected
            before each update. Must be positive.
        name: Path prefix of the saved networks to load.
        m: Optional value for the pendulum's ``m`` attribute. ``None`` keeps
            the environment's default.

    Raises:
        ValueError: If ``timesteps_per_batch`` is not positive.
    """
    if timesteps_per_batch <= 0:
        raise ValueError("timesteps_per_batch must be positive")

    # ---- hyperparameters ----
    max_timesteps_per_episode = 200   # max steps per episode
    total_timesteps = 200             # number of collect/update iterations
    n_updates_per_iteration = 1       # actor/critic updates per iteration
    lr = 0.005                        # learning rate of both optimizers
    gamma = 0.95                      # discount factor for rewards-to-go
    clip = 0.2                        # PPO ratio clipping threshold

    env = gym.make("Pendulum-v1")
    filename = name + f"_{timesteps_per_batch}_gen"
    if m is not None:
        # gym.make() returns the environment inside wrappers. Assigning
        # ``env.m`` would only create an attribute on the outer wrapper and
        # leave the simulated pendulum untouched, so set it on the base env.
        env.unwrapped.m = m
        filename += f"_m{m}"

    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)

    # ---- networks ----
    act_dim = env.action_space.shape[0]

    # NOTE(review): torch.load unpickles complete module objects here; only
    # load checkpoint files from a trusted source.
    actor = torch.load(name + "_actor.pth")
    critic = torch.load(name + "_critic.pth")

    actor_optim = optim.Adam(actor.parameters(), lr=lr)
    critic_optim = optim.Adam(critic.parameters(), lr=lr)

    # Fixed diagonal covariance used for the Gaussian action distribution.
    cov_var = torch.full(size=(act_dim,), fill_value=0.5)
    cov_mat = torch.diag(cov_var)

    all_average_reward = []   # mean per-step reward of each iteration
    all_variance = []         # variance of per-step rewards of each iteration

    def compute_rtgs(batch_rews):
        """Return discounted rewards-to-go for every step, in collection order."""
        batch_rtgs = []
        for ep_rews in batch_rews:
            discounted_reward = 0
            ep_rtgs = []
            # Walk each episode backwards so every return reuses the next one.
            for rew in reversed(ep_rews):
                discounted_reward = rew + discounted_reward * gamma
                ep_rtgs.append(discounted_reward)
            # Restore chronological order before adding to the batch.
            batch_rtgs.extend(reversed(ep_rtgs))
        return torch.tensor(batch_rtgs, dtype=torch.float)

    def get_action(obs):
        """Sample an action for ``obs``; return it with its log-probability."""
        mean = actor(obs)
        dist = MultivariateNormal(mean, cov_mat)
        action = dist.rsample()
        log_prob = dist.log_prob(action)
        return action.detach().numpy(), log_prob.detach()

    def evaluate(batch_obs, batch_acts):
        """Return critic values and current log-probs of the batch actions."""
        V = critic(batch_obs).squeeze()
        mean = actor(batch_obs)
        dist = MultivariateNormal(mean, cov_mat)
        log_probs = dist.log_prob(batch_acts)
        return V, log_probs

    def rollout():
        """Collect at least ``timesteps_per_batch`` steps of experience."""
        batch_obs = []
        batch_acts = []
        batch_log_probs = []
        batch_rews = []     # one reward list per episode
        batch_lens = []     # length of each episode
        reward_all = []     # every reward of the batch, flat

        t = 0  # steps collected so far in this batch
        while t < timesteps_per_batch:
            ep_rews = []
            obs = env.reset()

            for ep_t in range(max_timesteps_per_episode):
                t += 1
                batch_obs.append(obs)

                action, log_prob = get_action(obs)
                obs, rew, done, _ = env.step(action)

                ep_rews.append(rew)
                reward_all.append(rew)
                batch_acts.append(action)
                batch_log_probs.append(log_prob)

                if done:
                    break

            batch_lens.append(ep_t + 1)
            batch_rews.append(ep_rews)

        batch_obs = torch.from_numpy(np.array(batch_obs))
        batch_acts = torch.from_numpy(np.array(batch_acts))
        batch_log_probs = torch.stack(batch_log_probs)

        batch_rtgs = compute_rtgs(batch_rews)
        r_all_average = sum(reward_all) / len(reward_all)
        # Use the flat reward list so episodes of different lengths are fine.
        all_variance.append(torch.tensor(reward_all).var().item())

        return batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens, r_all_average

    print(f"Learning... Running {max_timesteps_per_episode} timesteps per episode, ", end='')
    print(f"{timesteps_per_batch} timesteps per batch for a total of {total_timesteps} timesteps")

    t_so_far = 0  # iterations completed so far
    while t_so_far < total_timesteps:
        batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens, r_all_average = rollout()
        all_average_reward.append(r_all_average)
        t_so_far += 1

        # Advantage estimate: rewards-to-go minus the critic's value, normalized.
        V, _ = evaluate(batch_obs, batch_acts)
        A_k = batch_rtgs - V.detach()
        A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

        for _ in range(n_updates_per_iteration):
            V, curr_log_probs = evaluate(batch_obs, batch_acts)

            # Probability ratio between the current and the collecting policy.
            ratios = torch.exp(curr_log_probs - batch_log_probs)

            # Clipped surrogate objective.
            surr1 = ratios * A_k
            surr2 = torch.clamp(ratios, 1 - clip, 1 + clip) * A_k

            actor_loss = (-torch.min(surr1, surr2)).mean()
            critic_loss = nn.MSELoss()(V, batch_rtgs)

            actor_optim.zero_grad()
            actor_loss.backward(retain_graph=True)
            actor_optim.step()

            critic_optim.zero_grad()
            critic_loss.backward()
            critic_optim.step()

        if t_so_far % 100 == 0:
            print(f"Episode: {t_so_far}/{total_timesteps}, average reward: {all_average_reward[-1]}")

    # ---- plots and checkpoints ----
    x = range(1, len(all_average_reward) + 1)

    # Each plot gets its own figure, closed afterwards, so repeated calls do
    # not draw on top of earlier curves.
    reward_fig = plt.figure()
    plt.plot(x, all_average_reward, label='average rewards')
    plt.axhline(y=sum(all_average_reward) / len(all_average_reward), c='r', ls="--")
    plt.axhline(y=0, c='g', ls="--")
    plt.legend()
    plt.ylim((-9, 1))
    plt.title(filename)
    plt.xlabel('Episodes')
    plt.savefig(filename + '_collect_rewards' + '.jpg')
    plt.show()
    plt.close(reward_fig)

    print(filename)
    torch.save(actor, filename + '_actor' + '.pth')
    torch.save(critic, filename + '_critic' + '.pth')
    print("---")

    var_fig = plt.figure()
    plt.plot(x, all_variance, label='all_variance')
    plt.axhline(y=sum(all_variance) / len(all_variance), c='r', ls="--")
    plt.legend()
    plt.title(filename + "_var")
    plt.xlabel('Episodes')
    plt.savefig(filename + '_var.jpg')
    plt.show()
    plt.close(var_fig)

    # ---- CSV output ----
    # Rewards: one value per line, followed by two newlines.
    with open(filename + "-rew.csv", 'w') as file:
        file.write("\n".join([str(value) for value in all_average_reward]))
        file.write("\n")
        file.write("\n")

    # Variances: one value per line, no trailing newline.
    with open(filename + "-var.csv", 'w') as file:
        file.write("\n".join([str(value) for value in all_variance]))



In [None]:

# Checkpoint prefixes to evaluate: eleven timestamped runs trained with 5000
# steps per batch, followed by eleven "train-3000" runs (seeds 0..10).
policynames = [
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[0]_0412-1244",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[1]_0412-1457",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[2]_0412-1716",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[3]_0412-1919",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[4]_0412-2115",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[5]_0412-2309",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[6]_0413-1144",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[7]_0413-1429",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[8]_0413-1715",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[9]_0413-1954",
    "PPOtanh__md2_total[200]_up[1]_ep[5000]_step[200]_lr[0.005]_cp[0.2]_sd[10]_0413-2211",
] + [
    f"train-3000/PPOtanh__md2_total[200]_up[1]_ep[3000]_step[200]_lr[0.005]_cp[0.2]_sd[{sd}]"
    for sd in range(11)
]

# Run every checkpoint twice: first with the default pendulum (m=None),
# then with weight = 5 kg.
for name in policynames:
    for mass in (None, 5):
        PPO_gen(seed=0, profix="r-tanh", timesteps_per_batch=2000, name=name, m=mass)
