In [None]:
#drive-data setup: mount Google Drive inside Colab and switch to the project folder
from google.colab import drive
#mount the drive under /content/gdrive (force_remount=True is passed so a stale mount is replaced)
drive.mount('/content/gdrive',force_remount=True)
#NOTE(review): hard-coded, user-specific folder; adjust before running from another account
%cd '/content/gdrive/My Drive/sem7/cs6886:sysdl/rl4dlc/sun/ppo_test/'
#list the folder contents as a quick check that the directory change worked
!ls
#capture the output of the shell command `pwd` into a Python variable and show it
current_loc=!pwd
print(current_loc)

In [None]:
#package imports
import torch
import torch.nn as nn
print(torch.__version__)
import gym
import copy
import numpy as np
from matplotlib import pyplot as plt

In [None]:
#rl-agent
#compute device used by this cell and the main cell: CUDA when available, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class PolicywithValue(nn.Module):
    """Two independent 3-layer MLPs: a categorical policy head and a state-value head.

    ``forward`` does not return anything; it stores its results on the module:

    * ``act_probs``         -- softmax action probabilities (last dim = ``act_dim``)
    * ``v_preds``           -- value predictions (last dim = 1)
    * ``act_deterministic`` -- argmax action per input row
    * ``act_stochastic``    -- one action sampled from ``act_probs`` per input row
    """

    def __init__(self,obs_space,act_space):
        """
        Args:
            obs_space: number of input features of an observation.
            act_space: number of discrete actions.
        """
        super(PolicywithValue,self).__init__()
        self.obs_dim,self.act_dim = obs_space,act_space
        policy_h1,policy_h2 = 20,20  #policy_hidden_nodes
        value_h1,value_h2 = 20,20  #value_hidden_nodes
        self.policy_l1 = nn.Linear(self.obs_dim,policy_h1)  #policy_network: policy_layer1
        self.policy_l2 = nn.Linear(policy_h1,policy_h2)
        self.policy_l3 = nn.Linear(policy_h2,self.act_dim)
        self.value_l1 = nn.Linear(self.obs_dim,value_h1)  #value_network: value_layer1
        self.value_l2 = nn.Linear(value_h1,value_h2)
        self.value_l3 = nn.Linear(value_h2,1)
        self.softmax = nn.Softmax(dim=-1)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self,x,activation='tanh',temp=1):
        """Run both networks on ``x`` and cache the results as attributes.

        Args:
            x: float tensor whose last dimension is ``obs_dim``.
            activation: ``'tanh'`` or ``'relu'``; applied after every policy layer
                (including the last one) and after the two hidden value layers.
            temp: kept for interface compatibility. NOTE(review): the value passed
                in is always overwritten by the activation-specific softmax
                temperature (0.1 for tanh, 1 for relu).

        Raises:
            ValueError: if ``activation`` is not one of the supported names.
        """
        if activation=='tanh':
            temp=0.1
            self.actv=self.tanh
        elif activation=='relu':
            self.actv=self.relu
            temp=1
        else:
            #fail loudly instead of silently reusing the activation of a previous call
            raise ValueError("unsupported activation {!r}; expected 'tanh' or 'relu'".format(activation))
        out_p1=self.actv(self.policy_l1(x))
        out_p2=self.actv(self.policy_l2(out_p1))
        out_p3=self.actv(self.policy_l3(out_p2))
        #temperature-scaled softmax over the (activated) policy outputs
        self.act_probs=self.softmax(torch.div(out_p3,temp))
        out_v1=self.actv(self.value_l1(x))
        out_v2=self.actv(self.value_l2(out_v1))
        self.v_preds=self.value_l3(out_v2)
        self.act_deterministic=torch.argmax(self.act_probs,dim=-1)
        #multinomial returns a trailing sample dimension of size 1; flatten it away
        self.act_stochastic=torch.reshape(torch.multinomial(self.act_probs,num_samples=1),shape=[-1])

class PPOAgent(nn.Module):
    """PPO agent (clipped surrogate objective) around a policy/value module.

    The wrapped ``policy`` is expected to expose ``obs_dim`` and, after being
    called on a batch of observations, the attributes ``act_probs``,
    ``v_preds``, ``act_stochastic`` and ``act_deterministic``.

    Usage: call the agent on an observation to get ``(action, value)``, then
    call :meth:`update` with the reward and terminal flag of that step.  Every
    ``update_freq`` finished episodes the stored transitions are used for
    ``epochs`` minibatch updates and the memory is cleared.
    """

    def __init__(self, policy, old_policy, horizon, learning_rate, epochs,batch_size, gamma, lmbd, clip_value, value_coeff, entropy_coeff, update_freq, memory_size,scheduler=False):
        """
        Args:
            policy: module being trained.
            old_policy: module of the same architecture holding the frozen
                pre-update weights used in the probability ratio.
            horizon: number of stored transitions to sub-sample before the
                epochs, or -1 to draw minibatches from the whole memory.
            learning_rate: Adam learning rate.
            epochs: minibatch updates per training round.
            batch_size: minibatch size (sampled with replacement).
            gamma: discount factor.
            lmbd: GAE lambda.
            clip_value: PPO ratio clipping range.
            value_coeff: weight of the value loss.
            entropy_coeff: weight of the entropy bonus.
            update_freq: train every this many finished episodes.
            memory_size: maximum number of stored transitions.
            scheduler: if truthy, attach a StepLR scheduler to the optimizer.
        """
        super(PPOAgent,self).__init__()
        self.policy = policy
        self.old_policy = old_policy
        self.horizon = horizon
        self.batch_size = batch_size
        self.epochs = epochs
        self.optimizer = torch.optim.Adam(self.policy.parameters(),lr=learning_rate,eps=1e-5)
        self.scheduler = scheduler
        if scheduler:
            self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=self.epochs,gamma=0.999)
        self.criterion = nn.MSELoss()
        self.gamma = gamma
        self.lmbd = lmbd
        self.clip_value = clip_value
        self.value_coeff = value_coeff
        self.entropy_coeff = entropy_coeff
        self.update_freq = update_freq
        self.memory_size = memory_size
        self.list_observations = []  #memory
        self.list_actions = []
        self.list_v_preds = []
        self.list_rewards = []
        self.count=0  #number of finished episodes seen so far

    def _policy_device(self):
        """Device of the policy parameters (avoids relying on a module-level global)."""
        return next(self.policy.parameters()).device

    def _to_one_hot(self, y, num_classes):
        """One-hot encode integer tensor ``y`` along a new trailing axis (same dtype as ``y``)."""
        scatter_dim = len(y.size())
        y_tensor = y.view(*y.size(), -1).type(torch.int64)  #scatter needs int64 indices
        zeros = torch.zeros(*y.size(), num_classes, dtype=y.dtype, device=y.device)
        return zeros.scatter(scatter_dim, y_tensor, 1)

    def forward(self, observation, stochastic=True):
        """Choose an action for one observation and remember the transition.

        Args:
            observation: a single observation (array-like of ``obs_dim`` floats).
            stochastic: sample from the policy if True, else take the argmax.

        Returns:
            ``(action, v_pred)`` as a Python int and float.
        """
        obs = torch.as_tensor(observation, dtype=torch.float32).to(self._policy_device())
        with torch.no_grad():  #acting needs no autograd graph
            self.policy(obs)
        #use this agent's own policy, not whatever global happens to be called `policy`
        act = self.policy.act_stochastic if stochastic else self.policy.act_deterministic
        act = act.item()
        v_pred = self.policy.v_preds.item()
        if len(self.list_observations)>=self.memory_size:  #drop the oldest entry when full
            self.list_observations=self.list_observations[1:]
            self.list_actions=self.list_actions[1:]
            self.list_v_preds=self.list_v_preds[1:]
        self.list_observations.append(observation)
        self.list_actions.append(act)
        self.list_v_preds.append(v_pred)
        return act, v_pred

    def update(self, reward, terminal):
        """Record the reward of the latest step and train when an update is due.

        Training happens only on a terminal step and only every ``update_freq``
        finished episodes; afterwards the memory is emptied.
        """
        if len(self.list_rewards)>=self.memory_size:
            self.list_rewards=self.list_rewards[1:]
        self.list_rewards.append(reward)
        if not terminal:
            return
        self.count+=1
        if self.count%self.update_freq!=0:
            return
        print('db: ',self.count,len(self.list_rewards),len(self.list_observations),len(self.list_actions),len(self.list_v_preds))
        assert len(self.list_rewards)==len(self.list_observations)==len(self.list_actions)==len(self.list_v_preds)
        dev = self._policy_device()
        #v_preds_next from v_preds; the step after the last stored one is worth 0.
        #NOTE(review): with update_freq > 1 this bootstraps across episode boundaries
        #because terminal flags of earlier episodes are not stored.
        self.list_v_preds_next = self.list_v_preds[1:] + [0]
        self.list_gaes = self._get_gaes(self.list_rewards, self.list_v_preds, self.list_v_preds_next)  #generalized advantage estimations
        observations = torch.as_tensor(np.asarray(self.list_observations), dtype=torch.float32).reshape(-1, self.policy.obs_dim).to(dev)
        actions = torch.as_tensor(self.list_actions, dtype=torch.int32).to(dev)
        rewards = torch.as_tensor(self.list_rewards, dtype=torch.float32).to(dev)
        v_preds_next = torch.as_tensor(self.list_v_preds_next, dtype=torch.float32).to(dev)
        gaes = torch.as_tensor(self.list_gaes, dtype=torch.float32).to(dev)
        if gaes.numel() > 1:
            #epsilon keeps the division finite when all advantages are equal;
            #a single sample is left as is because its std is undefined (NaN)
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        input_samples = [observations, actions, rewards, v_preds_next, gaes]
        self._update_old_policy()
        num_samples = observations.shape[0]
        if self.horizon != -1:  #sample horizon
            horizon_indices = torch.as_tensor(np.random.randint(low=0, high=num_samples, size=self.horizon), dtype=torch.int64).to(dev)
            pool, pool_size = [sample[horizon_indices] for sample in input_samples], self.horizon
        else:
            pool, pool_size = input_samples, num_samples
        for epoch in range(self.epochs):
            batch_indices = torch.as_tensor(np.random.randint(low=0, high=pool_size, size=self.batch_size), dtype=torch.int64).to(dev)
            batch_samples = [sample[batch_indices] for sample in pool]
            self.learn(observations=batch_samples[0], actions=batch_samples[1], rewards=batch_samples[2], v_preds_next=batch_samples[3], gaes=batch_samples[4])
        self.list_observations = []
        self.list_actions = []
        self.list_v_preds = []
        self.list_rewards = []

    def learn(self, observations, actions, rewards, v_preds_next, gaes, stochastic=True):
        """One gradient step on the PPO objective for a minibatch.

        Maximises ``clip_objective - value_coeff * value_loss + entropy_coeff * entropy``.
        ``stochastic`` is unused and kept only for interface compatibility.
        """
        #old policy first and without a graph: it is a constant in the ratio
        with torch.no_grad():
            self.old_policy(observations)
            probs_old_all = self.old_policy.act_probs
        self.policy(observations)
        probs_all = self.policy.act_probs
        taken = self._to_one_hot(actions,num_classes=probs_all.shape[-1])
        act_probs = torch.sum(probs_all * taken, dim=1)  #probability of the action actually taken
        act_probs_old = torch.sum(probs_old_all * taken, dim=1)
        ratios = torch.exp(torch.log(act_probs)-torch.log(act_probs_old))  #clipped surrogate objective
        clipped_ratios = torch.clamp(ratios, 1-self.clip_value, 1+self.clip_value)
        loss_clip = torch.min(torch.mul(gaes, ratios), torch.mul(gaes, clipped_ratios))
        loss_clip = torch.mean(loss_clip)
        v_preds = self.policy.v_preds
        #one-step TD target; unsqueeze to match the (batch, 1) value output
        loss_v = self.criterion(torch.unsqueeze(rewards + self.gamma * v_preds_next,dim=-1), v_preds)
        entropy = -torch.sum(probs_all * torch.log(torch.clamp(probs_all, 1e-10, 1.0)), dim=1)
        entropy = torch.mean(entropy, dim=0)
        loss = loss_clip - self.value_coeff * loss_v + self.entropy_coeff * entropy
        loss = -loss  #the optimizer minimises, the objective is to be maximised
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.scheduler:
            self.scheduler.step()

    def _get_gaes(self, rewards, v_preds, v_preds_next):  #generalized advantage estimate
        """Return the list of GAE advantages for aligned reward / value sequences."""
        gaes = [r + self.gamma * v_next - v for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
        #backward recursion: A_t = delta_t + gamma * lambda * A_{t+1}
        for t in reversed(range(len(gaes) - 1)):
            gaes[t] = gaes[t] + self.gamma * self.lmbd * gaes[t+1]
        return gaes

    def _update_old_policy(self):  #update old policy with policy
        """Copy the current policy weights into ``old_policy``."""
        for old_param,param in zip(self.old_policy.parameters(),self.policy.parameters()):
            old_param.data=param.data.clone().detach()

In [None]:
#main: train the PPO agent on CartPole-v1 and collect the total reward of every episode
render = False
env = gym.make('CartPole-v1')
print(len(env.observation_space.shape))  #box
print(len(env.action_space.shape))  #discrete
#the observation width is read the same way whatever the kind of action space
obs_space = env.observation_space.shape[-1]
#action spaces with a non-empty shape give their size via shape, the others via .n
act_space = env.action_space.shape[-1] if len(env.action_space.shape) >= 1 else env.action_space.n
policy = PolicywithValue(obs_space, act_space).to(device)
old_policy = PolicywithValue(obs_space, act_space).to(device)
ppo_settings = dict(
    horizon=-1,
    learning_rate=0.02,  #1e-4,
    epochs=4,
    batch_size=128,
    gamma=0.99,  #0.95,0.99
    lmbd=0.95,  #1.0,0.95
    clip_value=0.2,
    value_coeff=1.0,
    entropy_coeff=0.01,
    update_freq=1,
    memory_size=400,
)
agent = PPOAgent(policy, old_policy, **ppo_settings).to(device)
count=0  #consecutive episodes whose total reward exceeded 195
rewards=[]  #total reward of each finished episode
for e in range(2000):
    avg_reward=0  #running total of the rewards of the current episode
    observation = env.reset()  #initialize OpenAI Gym environment
    for t in range(500):
        if render:
            env.render()
        #ask the agent for an action, apply it, and report the outcome back
        action, value  = agent(observation)
        observation, reward, done, info = env.step(action)
        avg_reward+=reward
        agent.update(reward=reward, terminal=done)
        if not done:
            continue
        print("Episode {} finished after {} timesteps with reward {}".format(e+1, t+1, avg_reward))
        rewards.append(avg_reward)
        count = count+1 if avg_reward>195.0 else 0
        if count==100:
            print('done!')
        break

In [None]:
from matplotlib import pyplot as plt
#learning curve: total reward of each finished episode of the training loop
plt.plot(rewards)
plt.xlabel('episode')
plt.ylabel('total reward')
plt.title('PPO on CartPole-v1')
plt.show()

In [None]:
#size of the last axis of the observation space (the value passed as obs_space in the main cell)
env.observation_space.shape[-1]

In [None]:
#number of discrete actions of the policy (PolicywithValue stores it as act_dim; it has no act_space attribute)
policy.act_dim

In [None]:
#action probabilities cached on the policy by its most recent forward call
policy.act_probs

In [None]:
#shape of one randomly sampled action
#NOTE(review): only works if sample() returns an object with a .shape attribute (e.g. a numpy value) -- confirm for the installed gym version
env.action_space.sample().shape

In [None]:
import torch
import gym
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt
import pandas as pd


# Compute device used by the classes defined in this cell: CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import torch
import torch.nn as nn


class MlpPolicy(nn.Module):
    """Small actor-critic MLP: two shared hidden layers, a policy head and a value head."""

    def __init__(self, action_size, input_size=4):
        """
        Args:
            action_size: number of discrete actions (width of the policy head).
            input_size: number of observation features.
        """
        super().__init__()
        self.action_size = action_size
        self.input_size = input_size
        hidden = 24
        self.fc1 = nn.Linear(input_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3_pi = nn.Linear(hidden, action_size)
        self.fc3_v = nn.Linear(hidden, 1)
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)

    def _shared_features(self, x):
        """ReLU features of the two hidden layers used by both heads."""
        return self.relu(self.fc2(self.relu(self.fc1(x))))

    def pi(self, x):
        """Action probabilities (softmax over the last dimension)."""
        logits = self.fc3_pi(self._shared_features(x))
        return self.softmax(logits)

    def v(self, x):
        """State-value estimate with a trailing dimension of size 1."""
        return self.fc3_v(self._shared_features(x))

class AgentConfig:
    """Hyper-parameters and switches read by ``Agent`` as class attributes."""

    # Optimisation schedule
    learning_rate = 0.02
    k_epoch = 3          # gradient steps per training round
    update_freq = 1      # train every this many episodes

    # PPO / advantage estimation
    gamma = 0.99         # discount factor
    lmbda = 0.95         # GAE lambda
    eps_clip = 0.2       # ratio clipping range
    v_coef = 1           # value-loss weight
    entropy_coef = 0.01  # entropy-bonus weight

    # Memory
    memory_size = 400

    # Reporting / environment selection
    plot_every = 10      # plot every this many episodes
    train_cartpole = True

class Agent(AgentConfig):
    """PPO agent for CartPole-v0 that owns its environment, network and replay memory.

    Hyper-parameters are inherited from ``AgentConfig``.  Transitions are kept
    in ``self.memory`` (a dict of parallel lists plus a ``td_target`` tensor),
    limited to the most recent ``memory_size`` steps.

    NOTE(review): the code unpacks ``env.step`` into four values and ignores the
    return value of ``env.reset``; confirm this matches the installed gym version.
    """

    def __init__(self):
        self.env = gym.make('CartPole-v0')
        self.action_size = self.env.action_space.n  # 2 for cartpole
        if self.train_cartpole:
            self.policy_network = MlpPolicy(action_size=self.action_size).to(device)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=self.learning_rate)
        self.memory = {
            'state': [], 'action': [], 'reward': [], 'next_state': [], 'action_prob': [], 'terminal': [], 'count': 0,
            'advantage': [], 'td_target': torch.FloatTensor([])
        }
        self.loss = 0
        self.criterion = nn.MSELoss()

    def new_random_game(self):
        """Reset the environment and take one random action.

        Returns:
            ``(state, reward, action, terminal)`` of that first random step.
        """
        self.env.reset()
        action = self.env.action_space.sample()
        screen, reward, terminal, info = self.env.step(action)
        return screen, reward, action, terminal

    def train(self):
        """Run episodes until the mean reward of the last 100 episodes is at least 195.

        After every ``update_freq`` episodes the network is updated ``k_epoch``
        times; every ``plot_every`` episodes (and once when solved) the reward
        curve is written via ``plot_graph``.  The environment is closed on exit.
        """
        episode = 0
        step = 0
        reward_history = []
        avg_reward = []
        solved = False

        # A new episode
        while not solved:
            start_step = step
            episode += 1

            # Get initial state (new_random_game resets the environment itself)
            state, reward, action, terminal = self.new_random_game()
            current_state = state
            total_episode_reward = 1  # starts at 1 to account for the random first step

            # A step in an episode
            while True:
                step += 1

                # Choose action
                prob_a = self.policy_network.pi(torch.FloatTensor(current_state).to(device))
                action = torch.distributions.Categorical(prob_a).sample().item()

                # Act
                state, reward, terminal, _ = self.env.step(action)
                new_state = state

                reward = -1 if terminal else reward

                # rewards are stored scaled by 1/10; the episode score uses the unscaled value
                self.add_memory(current_state, action, reward/10.0, new_state, terminal, prob_a[action].item())

                current_state = new_state
                total_episode_reward += reward

                if terminal:
                    episode_length = step - start_step
                    reward_history.append(total_episode_reward)
                    avg_reward.append(sum(reward_history[-10:])/10.0)

                    self.finish_path(episode_length)

                    print('episode: %.2f, total step: %.2f, last_episode length: %.2f, last_episode_reward: %.2f, '
                          'loss: %.4f' % (episode, step, episode_length, total_episode_reward, self.loss))

                    # average over exactly the last 100 episodes
                    if len(reward_history) >= 100 and sum(reward_history[-100:]) / 100 >= 195:
                        solved = True

                    break

            # leave through the loop condition (not exit()) so the environment gets closed
            if not solved and episode % self.update_freq == 0:
                for _ in range(self.k_epoch):
                    self.update_network()

            if solved or episode % self.plot_every == 0:
                plot_graph(reward_history, avg_reward)

        self.env.close()

    def update_network(self):
        """One gradient step on the clipped PPO loss over the whole memory."""
        states = torch.FloatTensor(self.memory['state']).to(device)
        actions = torch.tensor(self.memory['action']).to(device)            # (N, 1)
        advantages = torch.FloatTensor(self.memory['advantage']).to(device)  # (N, 1)
        td_target = self.memory['td_target'].to(device)                      # (N, 1)

        # get ratio
        pi = self.policy_network.pi(states)
        new_probs_a = torch.gather(pi, 1, actions)
        # stored as a flat list of floats; reshape to (N, 1) so the subtraction
        # below is element-wise instead of broadcasting to an (N, N) matrix
        old_probs_a = torch.FloatTensor(self.memory['action_prob']).to(device).view(-1, 1)
        ratio = torch.exp(torch.log(new_probs_a) - torch.log(old_probs_a))

        # surrogate loss
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
        pred_v = self.policy_network.v(states)
        v_loss = 0.5 * (pred_v - td_target).pow(2)  # half squared error
        # keep the entropy attached to the graph so the bonus actually produces gradients
        entropy = torch.distributions.Categorical(pi).entropy().unsqueeze(-1)
        self.loss = (-torch.min(surr1, surr2) + self.v_coef * v_loss - self.entropy_coef * entropy).mean()

        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

    def add_memory(self, s, a, r, next_s, t, prob):
        """Append one transition, dropping the oldest entry once the memory is full.

        Args:
            s: state before the action.
            a: action index.
            r: reward to store.
            next_s: state after the action.
            t: terminal flag; stored inverted as ``1 - t``.
            prob: probability the policy assigned to ``a`` when it was chosen.
        """
        if self.memory['count'] < self.memory_size:
            self.memory['count'] += 1
        else:
            # advantage / td_target are trimmed too so they stay aligned with
            # the per-step lists once finish_path appends the current episode
            for key in ('state', 'action', 'reward', 'next_state', 'terminal',
                        'action_prob', 'advantage', 'td_target'):
                self.memory[key] = self.memory[key][1:]

        self.memory['state'].append(s)
        self.memory['action'].append([a])
        self.memory['reward'].append([r])
        self.memory['next_state'].append(next_s)
        self.memory['terminal'].append([1 - t])
        self.memory['action_prob'].append(prob)

    def finish_path(self, length):
        """Compute TD targets and GAE advantages for the last ``length`` stored steps.

        The results are appended to ``memory['td_target']`` and
        ``memory['advantage']``.
        """
        state = self.memory['state'][-length:]
        reward = self.memory['reward'][-length:]
        next_state = self.memory['next_state'][-length:]
        terminal = self.memory['terminal'][-length:]

        # targets and advantages are constants for the later update: no graph needed
        with torch.no_grad():
            next_v = self.policy_network.v(torch.FloatTensor(next_state).to(device))
            # the stored terminal entry is (1 - done), so it zeroes the bootstrap at episode end
            td_target = torch.FloatTensor(reward).to(device) + \
                        self.gamma * next_v * torch.FloatTensor(terminal).to(device)
            delta = td_target - self.policy_network.v(torch.FloatTensor(state).to(device))
        delta = delta.cpu().numpy()

        # get advantage (backward recursion over the episode)
        advantages = []
        adv = 0.0
        for d in delta[::-1]:
            adv = self.gamma * self.lmbda * adv + d[0]
            advantages.append([adv])
        advantages.reverse()

        if self.memory['td_target'].numel() == 0:
            self.memory['td_target'] = td_target
        else:
            self.memory['td_target'] = torch.cat((self.memory['td_target'].to(device), td_target), dim=0)
        self.memory['advantage'] += advantages


def plot_graph(reward_history, avg_reward):
    """Plot the per-episode reward curve and save it to ``score.png``.

    Args:
        reward_history: total reward of every finished episode.
        avg_reward: running averages, one per episode (must have the same
            length as ``reward_history``; currently not drawn).
    """
    df = pd.DataFrame({'x': range(len(reward_history)), 'Reward': reward_history, 'Average': avg_reward})
    # the seaborn style sheets were renamed in matplotlib 3.6; use whichever name exists
    for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    palette = plt.get_cmap('Set1')

    # draw on a fresh figure so repeated calls do not pile curves on top of each other
    fig, ax = plt.subplots()
    ax.plot(df['x'], df['Reward'], marker='', color=palette(1), linewidth=0.8, alpha=0.9, label='Reward')

    ax.set_title("CartPole", fontsize=14)
    ax.set_xlabel("episode", fontsize=12)
    ax.set_ylabel("score", fontsize=12)

    fig.savefig('score.png')
    plt.close(fig)  # release the figure; this function is called many times during training

In [None]:
def main():
    """Create the CartPole PPO agent and run its training loop."""
    Agent().train()


if __name__ == '__main__':
    main()

In [None]:
# show which compute device (cuda or cpu) is currently selected
print(device)