In [None]:
#drive-data setup
# Colab-only cell: mounts Google Drive, moves the working directory into the
# experiment folder stored on it, lists that folder and prints its path.
from google.colab import drive
# Mount point is /content/gdrive; force_remount=True is passed on every run.
drive.mount('/content/gdrive',force_remount=True)
# Later cells use relative paths (`import env`, './outputs/out1/...'), so they
# depend on this %cd having been executed first.
%cd '/content/gdrive/My Drive/sem7/cs6886:sysdl/rl4dlc/sun/expts/'
!ls
# `name = !cmd` captures the command's output lines in a list-like object,
# which is why the print below shows a list rather than a bare string.
current_loc=!pwd
print(current_loc)

In [None]:
#package imports
import torch
import torch.nn as nn
print(torch.__version__)
import gym
import copy
import random
import math
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
#rl-agent
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class PolicywithValue(nn.Module):
    """Multi-head categorical policy with a separate state-value network.

    The policy consists of a shared trunk (obs_dim -> 20 -> 20) followed by
    one independent head per entry of ``act_space``; head ``a`` has three
    linear layers that are each ``act_space[a]`` wide.  The value network is
    a separate stack (obs_dim -> 20 -> 20 -> 20 -> 1) that does not share
    layers with the policy.

    ``forward`` returns nothing; it publishes its results as attributes:

    * ``act_probs``         - list with one probability tensor per head
    * ``act_deterministic`` - list with the argmax action of every head
    * ``act_stochastic``    - list with one sampled action per head
    * ``v_preds``           - output of the value network

    Args:
        obs_space: shape of one observation; must have exactly one dimension.
        act_space: sequence holding the number of choices of every head.
    """

    def __init__(self,obs_space,act_space):
        super(PolicywithValue,self).__init__()
        assert len(obs_space)==1
        self.obs_dim,self.act_space = obs_space[-1],act_space
        policy_c_ = [self.obs_dim,20,20]  #policy_common_nodes
        policy_uc_ = []  #policy_un-common_nodes (one width list per head)
        for a in range(len(act_space)):
            policy_uc_.append([policy_c_[-1]]+[act_space[a]]*3)
        value_ = [self.obs_dim,20,20,20,1]  #value_nodes
        # Layers are created in the same order as before (trunk, heads, value)
        # so parameter order and state_dict keys are unchanged.
        self.policy_c,self.policy_uc=nn.ModuleList([]),nn.ModuleList([])
        for pc in range(len(policy_c_)-1):  #policy_network_layers(common)
            self.policy_c.append(nn.Linear(policy_c_[pc],policy_c_[pc+1]))
        for a in range(len(act_space)):  #policy_network_layers(un-common)
            self.policy_uc.append(nn.ModuleList([]))
            for puc in range(len(policy_uc_[a])-1):
                self.policy_uc[a].append(nn.Linear(policy_uc_[a][puc],policy_uc_[a][puc+1]))
        self.value = nn.ModuleList([])
        for v in range(len(value_)-1):  #value_network_layers
            self.value.append(nn.Linear(value_[v],value_[v+1]))
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self,x,activation='tanh',temp=1):
        """Evaluate all policy heads and the value network on ``x``.

        Args:
            x: observation tensor whose last dimension is ``obs_dim``.
            activation: ``'tanh'`` or ``'relu'``; used for every hidden layer
                and also for the last layer of each head, i.e. before the
                softmax.
            temp: kept for signature compatibility only.  The softmax
                temperature is always replaced by the activation-specific
                value (0.1 for tanh, 1 for relu), so a caller-supplied value
                has no effect.

        Raises:
            ValueError: if ``activation`` is not one of the supported names.
                Previously such a call failed with an unrelated
                AttributeError or silently reused the activation of an
                earlier call.
        """
        if activation=='tanh':
            self.actv=self.tanh
            temp=0.1
        elif activation=='relu':
            self.actv=self.relu
            temp=1
        else:
            raise ValueError("activation must be 'tanh' or 'relu', got {!r}".format(activation))
        # The layers never modify their input in place, so no defensive
        # copies of x are needed.
        out_pc,out_v=x,x
        for pc in range(len(self.policy_c)):
            out_pc=self.actv(self.policy_c[pc](out_pc))
        self.act_probs,self.act_deterministic,self.act_stochastic=[],[],[]
        for a in range(len(self.policy_uc)):
            out_puc=out_pc
            for puc in range(len(self.policy_uc[a])):
                out_puc=self.actv(self.policy_uc[a][puc](out_puc))
            # Temperature-scaled softmax over the choices of this head.
            act_probs=torch.softmax(torch.div(out_puc,temp),dim=-1)
            self.act_probs.append(act_probs)
            self.act_deterministic.append(torch.argmax(act_probs,dim=-1))
            # multinomial appends a sample dimension of size 1; flatten it away.
            self.act_stochastic.append(torch.reshape(torch.multinomial(act_probs,num_samples=1),shape=[-1]))
        for v in range(len(self.value)-1):
            out_v=self.actv(self.value[v](out_v))
        self.v_preds=self.value[-1](out_v)  #linear

class PPOAgent(nn.Module):
    """PPO agent (clipped surrogate objective) around a policy/value module.

    The wrapped ``policy`` must expose ``obs_dim`` and ``act_space`` and, after
    being called, the attributes ``act_probs``, ``act_stochastic``,
    ``act_deterministic`` and ``v_preds``.  ``old_policy`` is a second module
    of the same structure; it receives a copy of the policy weights before
    every training round and supplies the denominator of the probability
    ratio.

    Usage: call the agent on one observation to obtain an action, then call
    ``update(reward, terminal)`` with the resulting reward.  Transitions are
    buffered (at most ``memory_size`` of them) and training runs on every
    ``update_freq``-th terminal call, after which the buffer is emptied.

    Args:
        policy: module being trained.
        old_policy: module holding the pre-update weights.
        horizon: -1 to draw mini-batches from the whole buffer, otherwise the
            number of buffered transitions (drawn with replacement) that the
            mini-batches are taken from.
        learning_rate: Adam learning rate.
        epochs: number of mini-batch gradient steps per training round.
        batch_size: mini-batch size (indices are drawn with replacement).
        gamma: discount factor.
        lmbd: lambda of the generalized advantage estimate.
        clip_value: clipping range of the probability ratio.
        value_coeff: weight of the value loss.
        entropy_coeff: weight of the entropy bonus.
        update_freq: train on every ``update_freq``-th terminal signal.
        memory_size: maximum number of buffered transitions.
        schedule: when True, a StepLR scheduler (gamma 0.999 every ``epochs``
            steps) is stepped after every gradient step.
    """

    def __init__(self, policy, old_policy, horizon, learning_rate, epochs,batch_size, gamma, lmbd, clip_value, value_coeff, entropy_coeff, update_freq, memory_size, schedule=False):
        super(PPOAgent,self).__init__()
        self.policy = policy
        self.old_policy = old_policy
        self.horizon = horizon  #hyper_parameters
        self.batch_size = batch_size
        self.epochs = epochs
        self.optimizer = torch.optim.Adam(self.policy.parameters(),lr=learning_rate,eps=1e-5)
        self.schedule = schedule
        if schedule:
            self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=self.epochs,gamma=0.999)
        self.criterion = nn.MSELoss()
        self.gamma = gamma
        self.lmbd = lmbd
        self.clip_value = clip_value
        self.value_coeff = value_coeff
        self.entropy_coeff = entropy_coeff
        self.update_freq = update_freq
        self.memory_size = memory_size
        self.list_observations = []  #memory_elements
        self.list_actions = []
        self.list_v_preds = []
        self.list_rewards = []
        self.count=0  # number of terminal signals seen so far

    def _policy_device(self):
        """Return the device of the policy weights (CPU if it has none).

        Tensors built by the agent are placed here, so the agent follows the
        policy instead of relying on a notebook-level global.
        """
        first_param = next(self.policy.parameters(), None)
        if first_param is None:
            return torch.device("cpu")
        return first_param.device

    def _random_indices(self, high, size, dev):
        """Draw ``size`` indices from [0, high) with replacement as int64."""
        drawn = np.random.randint(low=0, high=high, size=size)
        return torch.from_numpy(drawn).long().to(dev)

    def _to_one_hot(self, y, num_classes):
        """One-hot encode integer tensor ``y`` along a new last dimension.

        The result has the dtype and device of ``y``.
        """
        scatter_dim = len(y.size())
        index = y.unsqueeze(-1).type(torch.int64)
        zeros = torch.zeros(*y.size(), num_classes, dtype=y.dtype, device=y.device)
        return zeros.scatter(scatter_dim, index, 1)

    def forward(self, observation, stochastic=True):
        """Choose an action for one observation and store the step in memory.

        Args:
            observation: a single observation (sequence of numbers); batches
                are not supported because every head must yield one action.
            stochastic: sample from the action distribution when True,
                otherwise take the argmax.

        Returns:
            ``(actions, v_pred)``: a list with one Python int per action head
            and the value prediction as a detached tensor.
        """
        obs_tensor = torch.as_tensor(observation, dtype=torch.float32, device=self._policy_device())
        # Acting needs no gradients; `learn` re-evaluates the policy with
        # autograd enabled, so nothing stored here has to carry a graph.
        with torch.no_grad():
            self.policy(obs_tensor)
        chosen = self.policy.act_stochastic if stochastic else self.policy.act_deterministic
        actions = [head_action.item() for head_action in chosen]
        v_pred = self.policy.v_preds.detach()
        if len(self.list_observations)>=self.memory_size:
            # Buffer is full: forget the oldest step.
            self.list_observations=self.list_observations[1:]
            self.list_actions=self.list_actions[1:]
            self.list_v_preds=self.list_v_preds[1:]
        self.list_observations.append(observation)
        self.list_actions.append(actions)
        self.list_v_preds.append(v_pred.item())  # plain float
        return actions, v_pred

    def update(self, reward, terminal):
        """Record ``reward`` for the latest step and train when due.

        Training happens only when ``terminal`` is truthy and the number of
        terminal signals seen is a multiple of ``update_freq``.  After a
        training round the memory is emptied.
        """
        if len(self.list_rewards)>=self.memory_size:
            self.list_rewards=self.list_rewards[1:]
        self.list_rewards.append(reward)
        if not terminal:
            return
        self.count+=1
        if self.count%self.update_freq!=0:
            return
        assert len(self.list_rewards)==len(self.list_observations)==len(self.list_actions)==len(self.list_v_preds) and len(self.list_v_preds)<=self.memory_size
        dev = self._policy_device()
        # Value of the following stored step; 0 after the last one.
        # NOTE(review): steps are chained across the whole buffer, so episode
        # boundaries inside it are not treated specially - confirm intended.
        self.list_v_preds_next = self.list_v_preds[1:] + [0]  #v_preds_next from v_preds
        self.list_gaes = self._get_gaes(self.list_rewards, self.list_v_preds, self.list_v_preds_next)  #generalized advantage estimations
        observations = torch.tensor(self.list_observations, dtype=torch.float32).reshape(-1, self.policy.obs_dim).to(dev)
        actions = torch.tensor(self.list_actions, dtype=torch.int32).to(dev)
        rewards = torch.tensor(self.list_rewards, dtype=torch.float32).to(dev)
        v_preds_next = torch.tensor(self.list_v_preds_next, dtype=torch.float32).to(dev)
        gaes = self.list_gaes.to(device=dev, dtype=torch.float32)
        if gaes.numel() > 1:
            # The epsilon keeps identical advantages from producing 0/0 = NaN.
            gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        # With a single sample the standard deviation is undefined, so the
        # advantage is left unnormalized instead of turning into NaN.
        input_samples = [observations, actions, rewards, v_preds_next, gaes]
        self._update_old_policy()  #update old_policy with policy params
        n_stored = observations.shape[0]
        if self.horizon != -1:  #sample horizon
            horizon_indices = self._random_indices(n_stored, self.horizon, dev)
            pool = [input_sample[horizon_indices] for input_sample in input_samples]
            pool_size = self.horizon
        else:
            pool = input_samples
            pool_size = n_stored
        for epoch in range(self.epochs):
            batch_indices = self._random_indices(pool_size, self.batch_size, dev)
            batch_samples = [input_sample[batch_indices] for input_sample in pool]
            self.learn(observations=batch_samples[0], actions=batch_samples[1], rewards=batch_samples[2], v_preds_next=batch_samples[3], gaes=batch_samples[4])
        self.list_observations = []
        self.list_actions = []
        self.list_v_preds = []
        self.list_rewards = []

    def learn(self, observations, actions, rewards, v_preds_next, gaes, stochastic=True):
        """Take one gradient step on a mini-batch.

        The maximized objective is, summed over the action heads, the clipped
        surrogate plus ``entropy_coeff`` times the entropy, minus
        ``value_coeff`` times the MSE between ``rewards + gamma *
        v_preds_next`` and the current value prediction.  ``stochastic`` is
        accepted for compatibility and not used.
        """
        self.policy(observations)
        # The old policy only provides constants for the ratio.
        with torch.no_grad():
            self.old_policy(observations)
        objective_terms=[]
        for i in range(len(self.policy.act_space)):
            probs = self.policy.act_probs[i]
            probs_old = self.old_policy.act_probs[i]
            # Probability that each policy assigns to the action actually taken.
            taken = torch.sum(probs * self._to_one_hot(actions[:,i],num_classes=probs.shape[-1]), dim=-1)
            taken_old = torch.sum(probs_old * self._to_one_hot(actions[:,i],num_classes=probs_old.shape[-1]), dim=-1)
            # Clamp before log so a zero probability cannot yield -inf/NaN.
            ratios = torch.exp(torch.log(torch.clamp(taken, min=1e-10))-torch.log(torch.clamp(taken_old, min=1e-10)))
            clipped_ratios = torch.clamp(ratios, 1-self.clip_value, 1+self.clip_value)
            loss_clip = torch.min(torch.mul(gaes, ratios), torch.mul(gaes, clipped_ratios))  #clipped surrogate objective
            loss_clip = torch.mean(loss_clip)
            entropy = -torch.sum(probs * torch.log(torch.clamp(probs, 1e-10, 1.0)), dim=1)
            entropy = torch.mean(entropy, dim=0)
            objective_terms.append(loss_clip + self.entropy_coeff * entropy)
        v_preds = self.policy.v_preds
        loss_v = self.criterion(torch.unsqueeze(rewards + self.gamma * v_preds_next,dim=-1), v_preds)
        objective = sum(objective_terms) - self.value_coeff * loss_v
        loss = -objective  # the optimizer minimizes, the objective is maximized
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.schedule:
            self.scheduler.step()

    def _get_gaes(self, rewards, v_preds, v_preds_next):  #generalized advantage estimate
        """Return the generalized advantage estimates as a 1-D float tensor.

        delta_t = r_t + gamma * v_next_t - v_t, accumulated backwards with the
        factor gamma * lmbd.
        """
        deltas = [float(r + self.gamma * v_next - v) for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
        gaes = torch.tensor(deltas, dtype=torch.float32)
        for t in reversed(range(len(gaes) - 1)):
            gaes[t] = gaes[t] + self.gamma * self.lmbd * gaes[t+1]
        return gaes

    def _update_old_policy(self):  #update old policy with policy
        """Overwrite every old-policy parameter with a copy of the policy's."""
        for old_param,param in zip(self.old_policy.parameters(),self.policy.parameters()):
            old_param.data=param.data.clone().detach()

In [None]:
#main
# Builds the environment, two identically shaped networks (trained policy and
# its pre-update copy) and the PPO agent, then runs the training episodes.
import env as e
env = e.envi()
policy = PolicywithValue(env.state_shape,env.action_shape).to(device)
old_policy = PolicywithValue(env.state_shape,env.action_shape).to(device)
agent = PPOAgent(policy, old_policy,
                 horizon=-1,
                 learning_rate=1e-3,  #0.02,1e-4,
                 epochs=3,
                 batch_size=64,
                 gamma=0.9,  #0.95,0.99
                 lmbd=0.99,  #1.0,0.95
                 clip_value=0.3,
                 value_coeff=1.0,
                 entropy_coeff=0.1,
                 update_freq=1,
                 memory_size=1000).to(device)

# The loop variable is named `episode` so that it no longer overwrites the
# module alias `e` created by `import env as e` above.
for episode in range(10):  #128,30
    avg_reward=0  # running sum of rewards; divided by the step count when reported
    observation = env.reset()
    for t in range(500):
        # The observation is a mapping; only its values are fed to the agent.
        action, value  = agent(list(observation.values()))
        observation, reward, done, info = env.step(action,episode,t)
        avg_reward+=reward
        print('e:',episode,',t:',t,',action:',action,',state:',observation,',reward:',reward)
        # NOTE(review): the agent is told the episode is terminal at t==1,
        # while the loop below only stops at t==99 - confirm that these two
        # step indices are meant to differ.
        agent.update(reward=reward, terminal=done or (t==1))
        if done or t==99:
            print("Episode {} finished after {} timesteps with reward {}".format(episode+1, t+1, avg_reward/(t+1)))
            break

In [None]:
#rewards_stats
# Read the overall statistics CSV and pull two of its columns out as arrays.
#df=pd.read_csv(env.stats_dir)  #'./outputs/out1/overall_stats.csv'
df = pd.read_csv('./outputs/out1/overall_stats.csv')
# Column names in this file start with a space.
# rewards <- ' Store Util%' (reward), overflows <- ' Step' (overflow_count)
rewards, overflows = (np.array(df[column]) for column in (' Store Util%', ' Step'))

In [None]:
#histogram
# Distribution of the rewards, normalised to a density over 30 bins
# (`density=False` would show raw counts instead).
x = np.array(rewards)
plt.gca().hist(x, bins=30, density=True)
plt.gca().set(ylabel='Probability', xlabel='Rewards');

In [None]:
#base_reward,max_reward(+index)
# Largest reward and where it occurs.
max_rewards = max(rewards)
print('Maximum reward: %f'%(max_rewards))
print('Maximum reward index: ',np.argmax(rewards))
#print('Base reward: ',env.base_reward)
# Number (and fraction) of strictly positive rewards.
count = sum(1 for reward_value in rewards if reward_value>0)
print('count: ',count,count/len(rewards))

In [None]:
#plots
# Line plot of the first 400 reward values.
plt.gca().plot(rewards[:400])
#plt.plot(overflows)
plt.gca().set(
    xlabel='Steps',
    ylabel='Reward',
    title='Reward vs Steps for constant_p=100, constant_n=20, prop_neg=True',
)
plt.show()

In [None]:
#rough
# Scratch inspection of the statistics frame `df` loaded in an earlier cell.
# Only the last expression of a cell is displayed, so the max() result below
# is computed and then discarded; the cell shows the ' Step' column.
max(df[' Store Util%'])
df[' Step']

In [None]:
#resnet50
# Loads a ResNet-50 through torch.hub from the pytorch/vision repository at
# tag v0.9.0, passing pretrained=True to the hub entry point.
# torch is already imported in the package-imports cell; this import repeats it.
import torch
model = torch.hub.load('pytorch/vision:v0.9.0', 'resnet50', pretrained=True)
# or any of these variants
# model = torch.hub.load('pytorch/vision:v0.9.0', 'resnet34', pretrained=True)
# model = torch.hub.load('pytorch/vision:v0.9.0', 'resnet50', pretrained=True)
# model = torch.hub.load('pytorch/vision:v0.9.0', 'resnet101', pretrained=True)
# model = torch.hub.load('pytorch/vision:v0.9.0', 'resnet152', pretrained=True)
# Puts the model in evaluation mode; as the cell's last expression, the value
# returned by eval() is what the notebook displays.
model.eval()