# Train REINFORCE

In [97]:
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import numpy as np
import torch.optim as optim
import matplotlib.pyplot as plt
import gym
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import random

# state value baseline

In [98]:
class StateValue(nn.Module):
    """State-value network: one hidden ReLU layer followed by a linear output.

    The output is left unbounded (no tanh/hardtanh squashing).

    Args:
        s_size: number of state features. Defaults to 3.
        h_size: width of the hidden layer. Defaults to 64.
        out_size: number of output values. Defaults to 1.
    """

    def __init__(self, s_size=3, h_size=64, out_size=1) -> None:
        super().__init__()
        self.input_layer = nn.Linear(s_size, h_size)
        self.output_layer = nn.Linear(h_size, out_size)

    def forward(self, state):
        """Return the value estimate for ``state`` with a leading batch axis added.

        ``state`` may be a numpy array, a tensor or a nested sequence of
        numbers; it is converted to float32 before use.
        """
        # as_tensor handles ndarray, tensor and list inputs alike, whereas
        # torch.from_numpy only accepted ndarrays.
        x = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        hidden = F.relu(self.input_layer(x))
        return self.output_layer(hidden)

In [101]:
def angle_normalize(x):
    """Wrap an angle in radians into the interval [-pi, pi), using ``torch.pi``."""
    half_turn = torch.pi
    shifted = x + half_turn
    wrapped = shifted % (2 * half_turn)
    return wrapped - half_turn

def angle_normalize_np(x):
    """Wrap an angle in radians into the interval [-pi, pi), using ``np.pi``."""
    full_turn = 2 * np.pi
    return (x + np.pi) % full_turn - np.pi


# policy for continuous action

In [102]:
class Network(nn.Module):
    """Two-layer perceptron whose output is squashed into [-2, 2].

    Args:
        size_in: number of input features.
        size_out: number of output values.
        size_hidden: width of the single hidden layer.
    """

    def __init__(self, size_in, size_out, size_hidden):
        super().__init__()
        self.layer1 = nn.Linear(size_in, size_hidden)
        self.layer2 = nn.Linear(size_hidden, size_out)

    def forward(self, obs):
        """Map an observation (numpy array or tensor) to ``2 * tanh(...)``."""
        # numpy observations are copied into a float tensor first
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float)

        hidden = F.relu(self.layer1(obs))
        pre_activation = self.layer2(hidden)
        return 2 * torch.tanh(pre_activation)

# the main training algorithm for REINFORCE main part with Gym

In [103]:
import matplotlib.pyplot as plt
import time
def conti(ns, n_episodes=5000, state_value=False, white=False, number=0,
          lr_n=2, lr_baseline=1, h_size_policy=64, h_size_baseline=64,
          learning_rate_policy=None, learning_rate_baseline=None):
    """Train REINFORCE on Gym's ``Pendulum-v1`` and save the results.

    Every episode runs for a fixed 200 steps. After training, the function
    plots the learning curve, saves the policy with ``torch.save`` and writes
    the per-episode averages to CSV files. All output names derive from a
    filename built out of the configuration and the seed.

    Args:
        ns (int): random seed for numpy, torch, ``random`` and the environment.
        n_episodes (int, optional): number of training episodes. Defaults to 5000.
        state_value (bool, optional): subtract a state-value baseline from the
            return. Defaults to False.
        white (bool, optional): whiten the rewards of each episode (subtract
            the mean, divide by the standard deviation). Defaults to False.
        number (int, optional): run index; currently not used by this function.
            Defaults to 0.
        lr_n (int, optional): the policy learning rate is ``10**(-lr_n)``.
            Defaults to 2.
        lr_baseline (int, optional): the baseline learning rate is
            ``10**(-lr_baseline)``. Defaults to 1.
        h_size_policy (int, optional): hidden layer size of the policy.
            Defaults to 64.
        h_size_baseline (int, optional): hidden layer size of the baseline
            network. Defaults to 64.
        learning_rate_policy (float, optional): explicit policy learning rate;
            overrides ``lr_n`` when given. Defaults to None.
        learning_rate_baseline (float, optional): explicit baseline learning
            rate; overrides ``lr_baseline`` when given. Defaults to None.
    """

    ''' configuration '''
    time_1 = time.time()
    max_t = 200                 # steps simulated per episode
    gamma = 0.9                 # discount factor
    entropy_weight = 0.0001     # weight of the entropy bonus in the loss

    lr = 10 ** (-lr_n)
    conti_baseline_lr = 10 ** (-lr_baseline)
    if learning_rate_policy is not None:
        lr = learning_rate_policy
    if learning_rate_baseline is not None:
        conti_baseline_lr = learning_rate_baseline
        lr_baseline = learning_rate_baseline    # the filename reports the value in use

    np.random.seed(ns)
    torch.manual_seed(ns)
    random.seed(ns)

    isContiStateValue = state_value
    whitening = white

    filename = "Gym-Reinforce-ori_model-"
    if isContiStateValue:
        filename = filename + f'-baseline-hs{h_size_baseline}-svlr{lr_baseline}'
    if whitening:
        filename = filename + "-w"
    filename = filename + "-" + f"sd{ns}"
    print(filename)

    ''' init environment, policy '''
    env = gym.make("Pendulum-v1")
    env.seed(ns)
    policy = Policy_conti_np(hidden_size=h_size_policy, num_inputs=3, action_space=env.action_space)

    if isContiStateValue:    # baseline
        conti_state_value_policy = StateValue(h_size=h_size_baseline)
        # NOTE(review): this optimizer is created but never stepped, so the
        # baseline network keeps its initial weights for the whole run.
        # Confirm whether a value-function update was meant to follow the
        # policy update.
        optimizer_state_value = optim.Adam(conti_state_value_policy.parameters(), lr=conti_baseline_lr)

    average_score = []
    average_state_value = []
    all_variance = []       # only filled when whitening is enabled

    optimizer = optim.Adam(policy.parameters(), lr=lr)

    ''' training process '''
    for i_episode in range(1, n_episodes + 1):
        episode_log_probs = []
        episode_rewards = []
        episode_entropies = []
        conti_episode_state_values = []
        state = env.reset()

        ''' roll out one episode '''
        for _ in range(max_t):
            action, log_prob, entropy = policy(state)

            if isContiStateValue:   # baseline estimate for the state acted on
                conti_episode_state_values.append(conti_state_value_policy(state))
            state, reward, _done, _info = env.step(action.detach().numpy()[0])

            episode_entropies.append(entropy)
            episode_log_probs.append(log_prob)
            episode_rewards.append(reward)

        ''' record averages before the rewards are transformed '''
        average_score.append(sum(episode_rewards).item() / max_t)
        if isContiStateValue:
            average_state_value.append(sum(conti_episode_state_values).item() / len(conti_episode_state_values))
            # copies the values into a new tensor, i.e. the baseline enters
            # the policy loss as a constant
            sv = torch.FloatTensor(conti_episode_state_values)

        ''' process reward with whitening '''
        if whitening:
            episode_rewards = torch.FloatTensor(episode_rewards)
            episode_rewards.requires_grad = True
            all_variance.append(episode_rewards.var().item())
            episode_rewards = (episode_rewards - episode_rewards.mean()) / episode_rewards.std()

        ''' update policy '''
        R = torch.zeros(1, 1)   # discounted return, accumulated from the last step backwards
        loss = 0
        for i in reversed(range(len(episode_rewards))):
            R = gamma * R + episode_rewards[i]
            # with a baseline the log-probability is weighted by
            # (return - state value), otherwise by the return itself
            weight = R - sv[i] if isContiStateValue else R
            loss = (loss
                    - (episode_log_probs[i] * (weight.expand_as(episode_log_probs[i]))).sum()
                    - (entropy_weight * episode_entropies[i]).sum())

        loss = loss / len(episode_rewards)
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        if i_episode % 1000 == 0:
            if isContiStateValue:
                print(f"Episode: {i_episode}/{n_episodes}, average:{average_score[-1]}, average_state_value: {average_state_value[-1]}")
            else:
                print(f"Episode: {i_episode}/{n_episodes}, average:{average_score[-1]}")

    ''' show results '''
    time_2 = time.time()
    x = range(1, len(average_score) + 1)

    if isContiStateValue:
        plt.plot(x, average_state_value, label='average state value', c='r')
    plt.plot(x, average_score, label='average reward')
    plt.axhline(y=sum(average_score) / len(average_score), c='r', ls="--")
    plt.axhline(y=0, c='g', ls="--")
    plt.legend()
    plt.ylim((-9, 1))
    plt.text(1, -8, time_2 - time_1)
    plt.title(filename)
    plt.xlabel('Episodes')
    plt.savefig(filename + '.jpg')
    plt.show()

    # Variance is only recorded when whitening is on. Without this guard the
    # plot raised on the empty list (length mismatch / division by zero)
    # before the model and the CSV files below were saved.
    if all_variance:
        plt.plot(x, all_variance, label='all_variance')
        plt.axhline(y=sum(all_variance) / len(all_variance), c='r', ls="--")
        plt.legend()
        plt.title(filename + "_var")
        plt.xlabel('Episodes')
        plt.savefig(filename + '_var.jpg')
        plt.show()

    print(filename)
    torch.save(policy, filename + '.pth')
    print("time:", time_2 - time_1)     # elapsed seconds (was printed negated)

    ''' save training curve in file '''
    with open(filename + "-rew.csv", 'w') as reward_file:
        reward_file.write("\n".join(map(str, average_score)))
        reward_file.write("\n")
        reward_file.write("\n")

    ''' save variance in file '''
    with open(filename + "-var.csv", 'w') as var_file:
        variance_text = "\n".join(map(str, all_variance))
        var_file.write(variance_text)
        if isContiStateValue:
            if variance_text:
                # keep the marker on its own line instead of glued to the
                # last variance value
                var_file.write("\n")
            var_file.write("statevalue")    # marks that a state-value baseline was used
            var_file.write("\n")
            var_file.write(";".join(map(str, average_state_value)))
    print("---")

# Training processes 

In [104]:
# for seed in range(0,11):
#     conti(seed, n_episodes = 5000, state_value=False, white=True, number=0, lr_n=3, h_size_policy=50)


# NN suitable for differentiable model

In [105]:
class Network(nn.Module):
    """Policy-mean network: linear -> ReLU -> linear -> ``2 * tanh``.

    Args:
        size_in: number of input features.
        size_out: number of output values.
        size_hidden: width of the hidden layer.
    """

    def __init__(self, size_in, size_out, size_hidden):
        super().__init__()
        self.layer1 = nn.Linear(size_in, size_hidden)
        self.layer2 = nn.Linear(size_hidden, size_out)

    def forward(self, obs):
        """Return the mean value for ``obs``, bounded to [-2, 2]."""
        inputs = obs
        if isinstance(inputs, np.ndarray):
            # numpy observations are copied into a float tensor
            inputs = torch.tensor(inputs, dtype=torch.float)

        features = torch.relu(self.layer1(inputs))
        return torch.tanh(self.layer2(features)) * 2   # the mean value

# Train REINFORCE with differentiable model

In [111]:
# conti_mean_model
import matplotlib.pyplot as plt
import time
from han_pendulum2 import Han_Pendulum2 # differentiable model
def conti_mean_model(ns, n_episodes=5000, state_value=False, white=False, number=0,
                     lr_n=2, lr_baseline=1, h_size_policy=64, h_size_baseline=64,
                     learning_rate_policy=None, learning_rate_baseline=None,
                     prefix="", method=0):
    """Train REINFORCE against the differentiable ``Han_Pendulum2`` model.

    The policy network outputs the mean of a unit-variance normal
    distribution. Actions are drawn with ``rsample()`` and passed to the
    environment as tensors. Every episode runs for a fixed 200 steps. After
    training, the function plots the learning curve, saves the policy with
    ``torch.save`` and writes the per-episode averages to one CSV file.

    Args:
        ns (int): random seed for numpy, torch, ``random`` and the environment.
        n_episodes (int, optional): number of episodes. Defaults to 5000.
        state_value (bool, optional): build a state-value baseline network.
            Defaults to False.
        white (bool, optional): whiten the rewards of each episode (subtract
            the mean, divide by the standard deviation). Defaults to False.
        number (int, optional): run index, appended to the filename.
            Defaults to 0.
        lr_n (int, optional): the policy learning rate is ``10**(-lr_n)``.
            Defaults to 2.
        lr_baseline (int, optional): the baseline learning rate is
            ``10**(-lr_baseline)``. Defaults to 1.
        h_size_policy (int, optional): hidden layer size of the policy.
            Defaults to 64.
        h_size_baseline (int, optional): hidden layer size of the baseline
            network. Defaults to 64.
        learning_rate_policy (float, optional): explicit policy learning rate;
            overrides ``lr_n`` (and replaces it in the filename) when given.
            Defaults to None.
        learning_rate_baseline (float, optional): explicit baseline learning
            rate; overrides ``lr_baseline`` when given. Defaults to None.
        prefix (str, optional): prefix of the filename. Defaults to "".
        method (int, optional): per-step loss term, with return ``R`` and
            log-probability ``logp``:
            0: ``-logp * R.detach()``; 1: ``-logp * R``;
            2: ``-exp(logp) * R``; 3: ``-R``;
            4: ``-(logp * R.detach() + R)``. Defaults to 0.

    Raises:
        ValueError: if ``method`` is not one of 0-4.
    """
    # Fail fast: an unknown method used to leave the loss as a plain number
    # and only failed at ``loss.backward()`` after a full episode.
    if method not in (0, 1, 2, 3, 4):
        raise ValueError(f"method must be one of 0, 1, 2, 3, 4; got {method!r}")

    ''' configuration '''
    time_1 = time.time()
    max_t = 200     # steps simulated per episode
    gamma = 0.9     # discount factor
    h_size = h_size_policy

    lr = 10 ** (-lr_n)
    conti_baseline_lr = 10 ** (-lr_baseline)
    if learning_rate_policy is not None:
        lr = learning_rate_policy
        lr_n = learning_rate_policy     # the filename reports the value in use
    if learning_rate_baseline is not None:
        conti_baseline_lr = learning_rate_baseline

    np.random.seed(ns)
    torch.manual_seed(ns)
    random.seed(ns)
    isContiStateValue = state_value
    whitening = white

    filename = f"{n_episodes}-{max_t}-{lr_n}-hs{h_size}-"
    prefix = prefix + "_"
    filename = "R-tanh-" + f"{method}_" + prefix + filename
    if whitening:
        filename = filename + "-wh"
    filename = filename + f"_sd{ns}_" + str(number)
    print(filename)

    policy = Network(size_hidden=h_size, size_in=3, size_out=1)

    if isContiStateValue:    # baseline
        # NOTE(review): the baseline network and its optimizer are created
        # but never evaluated or stepped in this function. They are kept so
        # the construction order of the original is unchanged — confirm
        # whether the baseline should be wired in.
        conti_state_value_policy = StateValue(h_size=h_size_baseline)
        optimizer_state_value = optim.Adam(conti_state_value_policy.parameters(), lr=conti_baseline_lr)
    optimizer = optim.Adam(policy.parameters(), lr=lr)

    average_score = []
    average_state_value = []    # never filled here, see the NOTE above
    all_variance = []           # only filled when whitening is enabled

    ''' init env '''
    env = Han_Pendulum2(seed=ns)

    ''' training phase '''
    for i_episode in range(1, n_episodes + 1):
        episode_log_probs = []
        episode_rewards = []
        obs = env.reset()

        ''' roll out one episode '''
        for _ in range(max_t):
            # Unit-variance normal around the policy mean. rsample() keeps
            # the action differentiable, while the log-probability is
            # evaluated at the detached action.
            mean = policy(obs)
            normal = Normal(mean, 1)
            action = normal.rsample()
            log_prob = normal.log_prob(action.detach())

            obs, reward, _done, _info = env.step(action)

            episode_log_probs.append(log_prob)
            episode_rewards.append(reward)

        ''' record the average before the rewards are transformed '''
        average_score.append(sum(episode_rewards).item() / max_t)

        ''' process reward with whitening '''
        if whitening:
            # NOTE(review): this rebuilds the rewards as a new tensor, so the
            # whitened returns presumably no longer carry gradients from the
            # environment — confirm this is intended for methods 1-4.
            episode_rewards = torch.FloatTensor(episode_rewards)
            episode_rewards.requires_grad = True
            all_variance.append(episode_rewards.var().item())
            episode_rewards = (episode_rewards - episode_rewards.mean()) / episode_rewards.std()

        ''' update policy '''
        R = torch.zeros(1, 1)   # discounted return, accumulated from the last step backwards
        loss = 0
        for i in reversed(range(len(episode_rewards))):
            R = gamma * R + episode_rewards[i]
            log_prob = episode_log_probs[i]

            if method == 0:
                step_term = log_prob * (R.detach())
            elif method == 1:
                step_term = log_prob * (R)
            elif method == 2:
                step_term = torch.exp(log_prob) * (R)
            elif method == 3:
                step_term = R
            else:   # method == 4
                step_term = log_prob * R.detach() + R
            loss = loss - step_term.sum()

        loss = loss / len(episode_rewards)
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        if i_episode % 1000 == 0:
            print(f"Episode: {i_episode}/{n_episodes}, average:{average_score[-1]}")

    ''' save plot '''
    time_2 = time.time()
    x = range(1, len(average_score) + 1)

    # average_state_value stays empty in this function; plotting it against
    # x used to raise whenever state_value=True.
    if isContiStateValue and average_state_value:
        plt.plot(x, average_state_value, label='average state value', c='r')
    plt.plot(x, average_score, label='average reward')
    plt.axhline(y=sum(average_score) / len(average_score), c='r', ls="--")
    plt.axhline(y=0, c='g', ls="--")
    plt.legend()
    plt.ylim((-9, 1))
    plt.text(1, -8, time_2 - time_1)
    plt.title(filename)
    plt.xlabel('Episodes')
    plt.savefig(filename + '.jpg')
    plt.show()

    # Variance is only recorded when whitening is on. Without this guard the
    # plot raised on the empty list before the model and the CSV file were
    # saved.
    if all_variance:
        plt.plot(x, all_variance, label='all_variance')
        plt.axhline(y=sum(all_variance) / len(all_variance), c='r', ls="--")
        plt.legend()
        plt.title(filename + "_var")
        plt.xlabel('Episodes')
        plt.savefig(filename + '_var.jpg')
        plt.show()

    ''' save data in file '''
    print(filename)
    torch.save(policy, filename + '.pth')
    print("time:", time_2 - time_1)     # elapsed seconds (was printed negated)

    with open(filename + ".csv", 'w') as out_file:
        out_file.write("\n".join(map(str, average_score)))
        out_file.write("\n")
        out_file.write("\n")
        out_file.write("\n")
        out_file.write("varience")
        out_file.write("\n")
        out_file.write("\n")
        variance_text = "\n".join(map(str, all_variance))
        out_file.write(variance_text)
        if isContiStateValue:
            if variance_text:
                # keep the marker on its own line instead of glued to the
                # last variance value
                out_file.write("\n")
            out_file.write("statevalue")    # marks that a state-value baseline was requested
            out_file.write("\n")
            out_file.write(";".join(map(str, average_state_value)))
    print("---")