<a href="https://colab.research.google.com/github/yejoonlee/NCPW/blob/main/NCPW_DL/create_agent/DQN_100_reward_improved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
import pandas as pd
import matplotlib as plt

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression: shows the selected device as the notebook cell output.
DEVICE

'cuda'

In [None]:
# Skill tables consumed by boss_raid_simulater.
#   idx      : position of the skill in its list; also names its cooldown slot
#              in the state (`b_cool_<idx>` / `p_cool_<idx>`)
#   name     : the simulator special-cases the player skills 'heal' and 'jump'
#   damage   : HP subtracted from the opponent when the skill fires
#   cool     : cooldown value applied after the skill fires
#   hit_rate : not read by the simulator code in this notebook

# Boss skills.
b_skills = [
    {'idx':0,
     'name':'kick',
     'damage':25,
     'cool':2,
     'hit_rate':0.9
    },
    {'idx':1,
     'name':'punch',
     'damage':10,
     'cool':1,
     'hit_rate':1.0
    }
]

# Player skills: the boss skills plus 'heal' and 'jump' (both deal no damage).
p_skills = [
    {'idx':0,
     'name':'kick',
     'damage':25,
     'cool':2,
     'hit_rate':0.9
    },
    {'idx':1,
     'name':'punch',
     'damage':10,
     'cool':1,
     'hit_rate':1.0
    },
    {'idx':2,
     'name':'heal',
     'damage':0,
     'cool':5,
     'hit_rate':1.0
    },
    {'idx':3,
     'name':'jump',
     'damage':0,
     'cool':1,
     'hit_rate':1.0
    },
]

In [None]:
from types import prepare_class
def to_tensor(np_array: np.ndarray, size=None) -> torch.Tensor:
    """Convert a NumPy array into a float32 torch tensor.

    :param np_array: array to convert (passed to ``torch.from_numpy``)
    :param size: optional target shape; when given the tensor is reshaped
        with ``Tensor.view(size)``
    :return: float32 tensor, reshaped to ``size`` if one was requested
    """
    torch_tensor = torch.from_numpy(np_array).float()
    if size is None:
        return torch_tensor
    return torch_tensor.view(size)


def to_numpy(torch_tensor: torch.Tensor) -> np.ndarray:
    """Convert a torch tensor into a NumPy array.

    The tensor is detached from the autograd graph and moved to the CPU
    before conversion, so tensors that require grad or live on another
    device are accepted.

    :param torch_tensor: tensor to convert
    :return: NumPy array holding the tensor's values
    """
    return torch_tensor.detach().cpu().numpy()


class EMAMeter:
    """Exponential moving average of a stream of values.

    ``s`` holds the current average and stays ``None`` until the first
    ``update`` call.
    """

    def __init__(self,
                 alpha: float = 0.5):
        # alpha is the weight given to each new observation.
        self.alpha = alpha
        self.s = None

    def update(self, y):
        """Fold the observation ``y`` into the running average."""
        if self.s is not None:
            self.s = self.alpha * y + (1 - self.alpha) * self.s
            return
        # First observation: it becomes the average as-is.
        self.s = y

from random import sample
class ReplayMemory:
    """Fixed-capacity ring buffer of experiences with uniform random sampling."""

    def __init__(self, max_size):
        # A pre-allocated list is used as a circular buffer: unlike a deque
        # it supports cheap random access for sampling.
        self.max_size = max_size
        self.buffer = [None] * max_size
        self.size = 0    # number of slots currently filled
        self.index = 0   # slot the next push writes to

    def push(self, obj):
        """Store ``obj``, overwriting the oldest entry once the buffer is full."""
        slot = self.index
        self.buffer[slot] = obj
        if self.size < self.max_size:
            self.size += 1
        self.index = (slot + 1) % self.max_size

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored entries chosen at random."""
        chosen = sample(range(self.size), batch_size)
        return [self.buffer[slot] for slot in chosen]

    def __len__(self):
        return self.size

def soft_update(net, net_target, tau):
    """Blend the weights of ``net`` into ``net_target`` in place.

    Every target parameter becomes ``(1 - tau) * target + tau * source``;
    ``net`` itself is left unchanged.
    """
    pairs = zip(net_target.parameters(), net.parameters())
    for target_param, source_param in pairs:
        blended = target_param.data * (1.0 - tau) + source_param.data * tau
        target_param.data.copy_(blended)

def prepare_training_inputs(sampled_exps, device='cpu'):
    """Collate sampled experiences into batched tensors.

    Each experience is indexed as (state, action, reward, next_state, done),
    every field being a tensor that can be concatenated along dim 0.

    :param sampled_exps: list of experience tuples
    :param device: device the batched tensors are moved to
    :return: (states, actions, rewards, next_states, dones); all but
        ``actions`` are cast to float
    """
    def _collect(field):
        # Concatenate one field of every experience along the batch axis.
        return torch.cat([exp[field] for exp in sampled_exps], dim=0)

    states = _collect(0).float().to(device)
    actions = _collect(1).to(device)   # keeps its dtype (used as gather index)
    rewards = _collect(2).float().to(device)
    next_states = _collect(3).float().to(device)
    dones = _collect(4).float().to(device)
    return states, actions, rewards, next_states, dones

class DQN(nn.Module):
    """DQN agent: epsilon-greedy policy, target network and one-step TD update.

    ``epsilon`` is registered as a buffer, so it is saved and restored
    together with the network weights by ``state_dict``/``load_state_dict``.
    """

    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 qnet: nn.Module,
                 qnet_target: nn.Module,
                 lr: float,
                 gamma: float,
                 epsilon: float):
        """
        :param state_dim: input state dimension
        :param action_dim: number of discrete actions; exploration samples
            uniformly from ``range(action_dim)``, so it must match the number
            of outputs of ``qnet`` for every action to be explored
        :param qnet: main q network
        :param qnet_target: target q network (overwritten with qnet's weights)
        :param lr: learning rate
        :param gamma: discount factor of MDP
        :param epsilon: E-greedy factor (probability of a random action)
        """

        super(DQN, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.qnet = qnet
        self.lr = lr
        self.gamma = gamma
        # Only the main network is optimised; the target network is updated
        # by copying weights from outside this class.
        self.opt = torch.optim.Adam(params=self.qnet.parameters(), lr=lr)
        self.register_buffer('epsilon', torch.ones(1) * epsilon)

        # target network related
        qnet_target.load_state_dict(qnet.state_dict())
        self.qnet_target = qnet_target
        self.criteria = nn.SmoothL1Loss()

    def get_action(self, state):
        """Pick an action for a single state with the epsilon-greedy rule.

        :param state: state tensor for ONE state (the greedy branch converts
            the argmax to a Python int, which only works for one element)
        :return: action index as a Python int
        """
        # Explore: the Q-network is not evaluated at all on this branch.
        if np.random.uniform(0.0, 1.0) <= float(self.epsilon):
            return int(np.random.choice(range(self.action_dim)))

        # Exploit: no gradient is needed to select an action.
        with torch.no_grad():
            return int(self.qnet(state).argmax(dim=-1))

    def update(self, state, action, reward, next_state, done):
        """Run one gradient step on a batch of transitions.

        :param state: batch of states fed to the main network
        :param action: integer tensor of taken actions, used as the index of
            ``gather`` along dim 1
        :param reward: batch of rewards
        :param next_state: batch of successor states fed to the target network
        :param done: 1 for terminal transitions, 0 otherwise; terminal
            transitions get no bootstrapped value
        """
        # compute Q-Learning target with 'target network'
        with torch.no_grad():
            q_max, _ = self.qnet_target(next_state).max(dim=-1, keepdim=True)
            q_target = reward + self.gamma * q_max * (1 - done)

        # Q-value of the action that was actually taken.
        q_val = self.qnet(state).gather(1, action)
        loss = self.criteria(q_val, q_target)

        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

class MLP(nn.Module):
    """Fully connected network: Linear layers with an activation after each.

    ``layers`` alternates Linear and activation modules, so the Linear
    layers sit at the even indices (0, 2, 4, ...).
    """

    def __init__(self,
                 input_dim: int,
                 output_dim: int,
                 num_neurons: list = None,
                 hidden_act: str = 'ReLU',
                 out_act: str = 'Identity'):
        """
        :param input_dim: size of the input features
        :param output_dim: size of the output
        :param num_neurons: hidden layer widths; defaults to ``[64, 32]``
        :param hidden_act: name of the ``torch.nn`` activation class applied
            after every hidden layer
        :param out_act: name of the ``torch.nn`` activation class applied
            after the last layer
        """
        super(MLP, self).__init__()

        # A fresh list per instance: the previous mutable default was stored
        # by reference and therefore shared between all instances.
        hidden = [64, 32] if num_neurons is None else list(num_neurons)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_neurons = hidden
        self.hidden_act = getattr(nn, hidden_act)()
        self.out_act = getattr(nn, out_act)()

        input_dims = [input_dim] + hidden
        output_dims = hidden + [output_dim]
        last = len(input_dims) - 1

        self.layers = nn.ModuleList()
        for i, (in_dim, out_dim) in enumerate(zip(input_dims, output_dims)):
            self.layers.append(nn.Linear(in_dim, out_dim))
            # The same activation instance is reused after every hidden layer.
            self.layers.append(self.out_act if i == last else self.hidden_act)

    def forward(self, xs):
        """Apply all layers in order and return the result."""
        for layer in self.layers:
            xs = layer(xs)
        return xs

class boss_raid_simulater():
    """Turn-based 1-vs-1 boss raid environment.

    Each ``step`` the player uses the chosen skill and the boss uses a
    randomly chosen one. The observation is the array
    ``[b_hp, b_cool_0 .. b_cool_{B-1}, p_hp, p_cool_0 .. p_cool_{P-1}]``
    where B/P are the numbers of boss/player skills. An episode ends as soon
    as the boss or the player has no HP left.
    """

    def __init__(self, b_skills, p_skills, reward_rate):
        """
        :param b_skills: list of boss skill dicts ('idx', 'name', 'damage', 'cool')
        :param p_skills: list of player skill dicts (same keys)
        :param reward_rate: four weights
            ``[boss_hp, player_hp, episode_length, outcome]`` used by ``step``
        :raises ValueError: if fewer than four weights are given (they are
            all needed once an episode terminates, so fail before playing)
        """
        if len(reward_rate) < 4:
            raise ValueError(
                "reward_rate needs 4 weights "
                "[boss_hp, player_hp, episode_length, outcome], "
                f"got {len(reward_rate)}")
        self.reward_rate = reward_rate

        self.full_hp = 100

        self.num_bs = len(b_skills)
        self.num_ps = len(p_skills)
        self.b_skills = b_skills
        self.p_skills = p_skills

        # Insertion order defines the layout of the observation array.
        self.state_dict = dict()
        self.state_dict['b_hp'] = self.full_hp
        for i in range(self.num_bs):
            self.state_dict[f'b_cool_{i}'] = 0
        self.state_dict['p_hp'] = self.full_hp
        for i in range(self.num_ps):
            self.state_dict[f'p_cool_{i}'] = 0

        self.state_pre_dict = self.state_dict.copy()
        self.state = np.array(list(self.state_dict.values()))

        self.boss_action = []  # history of boss skill indices, one per step
        self.done = 0
        self.len_t = 0         # number of steps played
        self.len_st = 1        # divisor that shrinks the HP reward in long episodes

        self.reward = 0

    def observe(self):
        """Return the current observation array."""
        return self.state

    def step(self, p_action):
        """Play one turn.

        :param p_action: index of the player skill to use
        :return: ``(next_state, reward, done)`` with ``done`` being 0 or 1
        """
        self.len_t += 1
        p_skill = self.p_skills[int(p_action)]

        # The boss picks skill 0 or 1 with equal probability (a scalar draw
        # avoids converting a 1-element array to int, deprecated in NumPy).
        # NOTE(review): hard-wired to exactly two boss skills.
        b_action = int(round(np.random.uniform(0.0, 1.0)))
        self.boss_action.append(b_action)
        b_skill = self.b_skills[b_action]

        self.battle_with_cool(p_skill, b_skill)

        # After 20 steps the HP terms are divided by a growing factor, so
        # stalling an episode pays less and less.
        if self.len_t > 20:
            self.len_st += 0.2

        b_hp = self.state_dict['b_hp']
        p_hp = self.state_dict['p_hp']
        self.reward = - (b_hp/self.len_st)*self.reward_rate[0]\
                      + (p_hp/self.len_st)*self.reward_rate[1]

        if b_hp <= 0 or p_hp <= 0:
            self.done = 1
            # Length penalty, then a fixed bonus (boss down) or penalty.
            self.reward -= self.len_t*self.reward_rate[2]
            outcome = (self.full_hp
                       * max(self.reward_rate[0], self.reward_rate[1])
                       * self.reward_rate[3])
            if b_hp <= 0:
                self.reward += outcome
            else:
                self.reward -= outcome

        self.state_pre_dict = self.state_dict.copy()
        return self.observe(), self.reward, self.done

    def battle_with_cool(self, p_skill, b_skill):
        """Resolve one turn (boss first, then player) honouring cooldowns.

        A skill only takes effect when its cooldown counter is 0; using it
        sets the counter to ``cool + 1`` and every counter of that side is
        then decremented by one (never below 0).
        """
        b_i = b_skill['idx']
        p_i = p_skill['idx']
        p_n = p_skill['name']

        # Cooldowns as they were at the start of the turn.
        p_sc = self.state_dict[f'p_cool_{p_i}']
        b_sc = self.state_dict[f'b_cool_{b_i}']

        if b_sc == 0:
            # Choosing 'jump' gives a 70% chance to avoid the boss's hit.
            # NOTE(review): the dodge is not gated by jump's own cooldown.
            dodged = p_n == 'jump' and np.random.uniform(0.0, 1.0) < 0.7
            if not dodged:
                self.state_dict['p_hp'] -= b_skill['damage']
            self.state_dict[f'b_cool_{b_i}'] = b_skill['cool'] + 1
        for i in range(self.num_bs):
            self.state_dict[f'b_cool_{i}'] = max(self.state_dict[f'b_cool_{i}'] - 1, 0)

        if p_sc == 0:
            if p_n == 'heal':
                # NOTE(review): healing is not capped at full_hp.
                self.state_dict['p_hp'] += 20
            else:
                self.state_dict['b_hp'] -= p_skill['damage']
            self.state_dict[f'p_cool_{p_i}'] = p_skill['cool'] + 1
        for i in range(self.num_ps):
            self.state_dict[f'p_cool_{i}'] = max(self.state_dict[f'p_cool_{i}'] - 1, 0)

        self.state = np.array(list(self.state_dict.values()))

100퍼센트 승률을 보이는 에이전트 학습 코드.
해당 에이전트를 통해서 수행되는 에피소드들의 기록을 log_normal로 정의하여 다른 방식으로 학습된 에이전트들의 행동방식과 비교군으로 사용

In [None]:
# Module-level hyperparameters read by get_agent().
lr = 1e-4 * 5                # Adam learning rate
batch_size = 256             # transitions sampled per training step
gamma = 0.88                 # discount factor
memory_size = 50000          # replay memory capacity
total_eps = 2000             # NOTE: get_agent() has its own total_eps parameter (default 2000) and does not read this global
eps_max = 0.08               # epsilon at episode 0
eps_min = 0.01               # lower bound of epsilon (also scales the decay rate)
sampling_only_until = 2000   # training starts once the memory holds this many transitions
target_update_interval = 10  # episodes between target-network syncs

def get_agent(reward_rate, print_all=False, total_eps=2000):
    """Train a DQN agent on the boss raid simulator and return it.

    Reads the module-level hyperparameters ``lr``, ``gamma``, ``batch_size``,
    ``memory_size``, ``eps_max``, ``eps_min``, ``sampling_only_until`` and
    ``target_update_interval``.

    Per episode: epsilon is set from a linear schedule, one full episode is
    played and stored in the replay memory, at most one gradient step is
    taken (once the memory holds ``sampling_only_until`` transitions) and the
    target network is synced every ``target_update_interval`` episodes.

    :param reward_rate: reward weights forwarded to ``boss_raid_simulater``
    :param print_all: print progress every 100 episodes when True
    :param total_eps: number of training episodes
    :return: the trained ``DQN`` agent
    """
    # Throw-away environment, only used to read the state dimension.
    s_dim = boss_raid_simulater(b_skills, p_skills, reward_rate).state.shape[0]
    a_dim = len(p_skills)

    qnet = MLP(s_dim, a_dim, num_neurons=[64, 64])
    qnet_target = MLP(s_dim, a_dim, num_neurons=[64, 64])

    # initialize target network same as the main network.
    qnet_target.load_state_dict(qnet.state_dict())
    # action_dim must be the number of player skills: with the previous value
    # of 1 every exploratory action was skill 0.
    agent = DQN(s_dim, a_dim, qnet=qnet, qnet_target=qnet_target,
                lr=lr, gamma=gamma, epsilon=1.0)
    memory = ReplayMemory(memory_size)

    print_every = 100

    for n_epi in range(total_eps):
        # epsilon scheduling: linear decay from eps_max, floored at eps_min
        epsilon = max(eps_min, eps_max - eps_min * (n_epi / 200))
        agent.epsilon = torch.tensor(epsilon)
        env = boss_raid_simulater(b_skills, p_skills, reward_rate)
        s = env.observe()
        cum_r = 0

        while True:
            s = to_tensor(s, size=(1, s_dim))
            a = agent.get_action(s)
            ns, r, done = env.step(a)

            # The reward is scaled down by 100 before it is stored.
            memory.push((s,
                         torch.tensor(a).view(1, 1),
                         torch.tensor(r / 100.0).view(1, 1),
                         torch.tensor(ns).view(1, s_dim),
                         torch.tensor(done).view(1, 1)))

            s = ns
            cum_r += r
            if done:
                break

        if len(memory) >= sampling_only_until:
            # train agent: one gradient step per episode
            batch = prepare_training_inputs(memory.sample(batch_size))
            agent.update(*batch)

        if n_epi % target_update_interval == 0:
            qnet_target.load_state_dict(qnet.state_dict())

        if print_all and n_epi % print_every == 0:
            print("Episode : {:4.0f} | Cumulative Reward : {:4.0f} | Epsilon : {:.3f}".format(n_epi, cum_r, epsilon))

    return agent

def get_play_log(agent, reward_rate, print_all, n_eps=100, print_every=500):
    """Let ``agent`` play 10 rounds of ``n_eps`` episodes and report win rates.

    An episode counts as a win when the boss ends with less HP than the
    player.

    :param agent: object with ``get_action(state_tensor) -> int``
    :param reward_rate: reward weights forwarded to ``boss_raid_simulater``
    :param print_all: print the win rate of every round and the average
    :param n_eps: episodes per round
    :param print_every: unused; kept for backward compatibility
    :return: log of the LAST round only, a dict with the keys 'states',
        'p_actions', 'b_actions' and 'rewards', each a list holding one
        entry per episode
    """
    n_rounds = 10
    sum_wr = 0

    for _ in range(n_rounds):
        log = {'states': [],
               'p_actions': [],
               'b_actions': [],
               'rewards': []}

        wins = 0
        for ep in range(n_eps):
            env = boss_raid_simulater(b_skills, p_skills, reward_rate)
            s_dim = env.state.shape[0]
            # The player's HP follows the boss HP and the boss cooldowns.
            php_idx = 1 + env.num_bs
            s = env.observe()

            states = []
            actions = []
            rewards = []

            while True:
                s = to_tensor(s, size=(1, s_dim))
                a = agent.get_action(s)
                ns, r, done = env.step(a)

                states.append(s)
                actions.append(a)
                rewards.append(r)

                s = ns
                if done:
                    if s[0] < s[php_idx]:
                        wins += 1
                    break

            log['states'].append(states)
            log['p_actions'].append(actions)
            log['rewards'].append(rewards)
            log['b_actions'].append(env.boss_action)

        # Divide by the episodes actually played, not a hard-coded 100.
        win_rate = wins / n_eps if n_eps else 0.0
        if print_all:
            print(f'num wins: {wins}  /  win rate: {win_rate}')
        sum_wr += win_rate

    if print_all:
        print(f'win rate avg: {sum_wr/n_rounds}')

    return log

def info_from_log(log):
    """Print and return summary statistics of a play log.

    :param log: dict with 'states' (one list of states per episode, each
        state indexable as ``state[0][k]``) and 'p_actions' (one list of
        player action indices per episode)
    :return: dict with
        'len_epi' - average episode length,
        'avg_bhp' / 'avg_php' - boss / player HP averaged within each
        episode and then over episodes,
        'kick' / 'punch' / 'heal' / 'jump' - average share of each action
        per episode, in percent
    :raises ValueError: if the log contains no episodes
    """
    episodes = log['states']
    n_eps = len(episodes)
    if n_eps == 0:
        raise ValueError("log contains no episodes")

    # Positions inside a state row; 3 assumes two boss cooldown slots
    # between the boss HP and the player HP.
    bhp_idx, php_idx = 0, 3

    def _sum_of_episode_means(idx):
        # Sum over episodes of the within-episode mean of state[0][idx].
        total = 0
        for epi in episodes:
            total += sum(state[0][idx] for state in epi) / len(epi)
        return total

    lens = sum(len(epi) for epi in episodes)
    print(f"avg len of episode:   {lens/n_eps}")

    sum_phps = _sum_of_episode_means(php_idx)
    print(f"avg HP of player in episode:   {sum_phps/n_eps}")

    sum_bhps = _sum_of_episode_means(bhp_idx)
    print(f"avg HP of boss in episode:   {sum_bhps/n_eps}")

    rate_dict = dict()
    rate_dict['len_epi'] = lens/n_eps
    rate_dict['avg_bhp'] = sum_bhps/n_eps
    rate_dict['avg_php'] = sum_phps/n_eps

    # Summed per-episode fractions scaled to percent; for 100 episodes the
    # scale is exactly 1.0, i.e. the plain sum.
    to_percent = 100 / n_eps
    for name, action_idx in (('kick', 0), ('punch', 1), ('heal', 2), ('jump', 3)):
        total = 0
        for actions in log['p_actions'][:n_eps]:
            total += actions.count(action_idx)/len(actions)
        rate_dict[name] = total * to_percent

    print(rate_dict)
    return rate_dict

def create_agent_log(reward_rate, print_all=False):
    """Train an agent, let it play, and summarise the resulting play log.

    :param reward_rate: reward weights used for both training and playing
    :param print_all: forwarded to ``get_agent`` and ``get_play_log``; also
        enables the first two section banners
    :return: ``(agent, log, info)`` - the trained agent, the play log
        returned by ``get_play_log`` and the summary dict returned by
        ``info_from_log``
    """
    if print_all:
        print("============= AGENT TRAINING ==============\n")
    agent = get_agent(reward_rate, print_all)

    if print_all:
        print("\n============= PLAYING SOME ==============\n")
    log = get_play_log(agent, reward_rate, print_all)

    # Printed unconditionally, like the summary lines of info_from_log.
    print("\n============= GET INFO FROM PLAYING LOG ==============\n")
    info = info_from_log(log)

    return agent, log, info

# SAFE AGENT
자신의 안정성을 더 우선하여 플레이하는 스타일의 에이전트를 학습시키는 것이 목적
성능을 그대로 가져가기 위해 전체적인 하이퍼 파라미터들을 되도록 변경하지 않고 reward만 변경하여 구현하고싶지만 맘대로 되진 않겠지

In [None]:
# NOTE: only three weights. boss_raid_simulater.step() reads reward_rate[3]
# when an episode terminates, which is the IndexError shown in this cell's output.
reward_rate_safe = [1,1.5,0.5]

agent_safe, log_safe, info_safe = create_agent_log(reward_rate_safe)

IndexError: ignored

# DPS AGENT
생각보다 유지형 플레이어가 잘 나와서 딜러 에이전트를 만들어보자

In [None]:
# NOTE(review): also only three weights, while step() reads reward_rate[3] at
# the end of an episode. The output below presumably comes from an earlier
# version of the simulator - confirm before re-running.
reward_rate_dps = [1.5,1,1]

agent_dps, log_dps, info_dps = create_agent_log(reward_rate_dps)


Episode :    0 | Cumulative Reward : -976 | Epsilon : 0.080
Episode :  100 | Cumulative Reward : -975 | Epsilon : 0.075
Episode :  200 | Cumulative Reward : -118 | Epsilon : 0.070
Episode :  300 | Cumulative Reward : -202 | Epsilon : 0.065
Episode :  400 | Cumulative Reward :  292 | Epsilon : 0.060
Episode :  500 | Cumulative Reward :  227 | Epsilon : 0.055
Episode :  600 | Cumulative Reward :  406 | Epsilon : 0.050
Episode :  700 | Cumulative Reward :  362 | Epsilon : 0.045
Episode :  800 | Cumulative Reward :  478 | Epsilon : 0.040
Episode :  900 | Cumulative Reward :  456 | Epsilon : 0.035
Episode : 1000 | Cumulative Reward :  448 | Epsilon : 0.030
Episode : 1100 | Cumulative Reward :  348 | Epsilon : 0.025
Episode : 1200 | Cumulative Reward :  422 | Epsilon : 0.020
Episode : 1300 | Cumulative Reward :  352 | Epsilon : 0.015
Episode : 1400 | Cumulative Reward :  392 | Epsilon : 0.010
Episode : 1500 | Cumulative Reward :  755 | Epsilon : 0.010
Episode : 1600 | Cumulative Reward :  3

# 각 reward rate에 따라 생성된 플레이 결과 비교

위의 두 실험에서 에피소드를 길게 끌수록 보상을 더 많이 얻는 꼼수를 Agent가 학습하는 경향이 나타났다. 이를 없애기 위해 reward_rate의 요소를 4개로 늘리고 reward 계산 방식을 수정함
  
밸런스형: reward_rate_normal = [1, 1, 2, 2]  
방어형: reward_rate_safe = [1, 1.5, 2, 2]  
공격형: reward_rate_dps = [2, 1.5, 2, 2]  
  
  
위의 조건으로 학습시켰을 때 아래와 같은 결과가 나왔다.  
avg_bhp값이 공격형 Agent에서 가장 작고  
avg_php값이 방어형 Agent에서 가장 큰 결과를 보면  

reward가 조정된 효과대로 결과가 나온 것으로 보인다.

In [None]:
# Summary dicts returned by create_agent_log for the three reward settings.
# NOTE(review): info_normal and info_dps are only assigned by calls that are
# commented out in the training cell further down - run those first.
print(info_normal)
print(info_safe)
print(info_dps)

{'len_epi': 12.15, 'avg_bhp': tensor(51.5978), 'avg_php': tensor(76.3833), 'kick': 49.28781339433514, 'punch': 0.0, 'heal': 8.928884038666638, 'jump': 41.78330256699822}
{'len_epi': 20.78, 'avg_bhp': tensor(59.3858), 'avg_php': tensor(95.0629), 'kick': 23.364255140982767, 'punch': 0.4050171526586621, 'heal': 20.249031941546505, 'jump': 55.98169576481206}
{'len_epi': 11.77, 'avg_bhp': tensor(44.9711), 'avg_php': tensor(75.7785), 'kick': 36.26567859885735, 'punch': 24.899297619218192, 'heal': 9.620030474217101, 'jump': 29.214993307707342}


In [None]:
# Training cell: enable one reward_rate/create_agent_log pair at a time.
print_all = True

# reward_rate_normal = [1, 1, 2, 2]
# agent_normal, log_normal, info_normal = create_agent_log(reward_rate_normal, print_all)

reward_rate_safe = [1, 3, 2, 2]
agent_safe, log_safe, info_safe = create_agent_log(reward_rate_safe, print_all)

# reward_rate_dps = [2, 1.5, 2, 2]
# agent_dps, log_dps, info_dps = create_agent_log(reward_rate_dps, print_all)

# NOTE: three weights only; step() needs reward_rate[3] at the end of an episode.
# reward_rate_speed = [0.1,0.1,10]
# agent_speed, log_speed, info_speed = create_agent_log(reward_rate_speed)


Episode :    0 | Cumulative Reward :  -13 | Epsilon : 0.080
Episode :  100 | Cumulative Reward :  -60 | Epsilon : 0.075
Episode :  200 | Cumulative Reward : -237 | Epsilon : 0.070
Episode :  300 | Cumulative Reward : 1625 | Epsilon : 0.065
Episode :  400 | Cumulative Reward : 1510 | Epsilon : 0.060
Episode :  500 | Cumulative Reward : 2551 | Epsilon : 0.055
Episode :  600 | Cumulative Reward : 2413 | Epsilon : 0.050
Episode :  700 | Cumulative Reward : 1837 | Epsilon : 0.045
Episode :  800 | Cumulative Reward : 1828 | Epsilon : 0.040
Episode :  900 | Cumulative Reward : 2638 | Epsilon : 0.035
Episode : 1000 | Cumulative Reward : 2353 | Epsilon : 0.030
Episode : 1100 | Cumulative Reward : 1843 | Epsilon : 0.025
Episode : 1200 | Cumulative Reward : 2173 | Epsilon : 0.020
Episode : 1300 | Cumulative Reward : 2413 | Epsilon : 0.015
Episode : 1400 | Cumulative Reward : 2308 | Epsilon : 0.010
Episode : 1500 | Cumulative Reward : 1858 | Epsilon : 0.010
Episode : 1600 | Cumulative Reward : 21

# AGENT 저장 및 불러오기

In [None]:
# Colab only: mount Google Drive so checkpoints can be saved to / loaded from it.
from google.colab import drive

drive.mount("/contents/")
# import sys; sys.path.append('/contents/MyDrive/FASTCAMPUS/강화학습/ReinforcementLearningAtoZ') # add project root to the python path

Mounted at /contents/


In [None]:
!ls /contents/MyDrive/Colab_Notebooks/projects/NCPW/RL

 A2C_BASIC_50.ipynb  'DQN_agent_[1_1_03_1].ptb'     REINFORCE_BASIC_70.ipynb
 DDPG.ipynb	     'DQN_agent_[1_14_03_09].ptb'   REINFORCE_SAFE_70.ipynb
 DQN_100.ipynb	     'DQN_agent_[14_1_03_09].ptb'


In [None]:
# Save the agent's state_dict (network weights plus the registered epsilon buffer).
# torch.save(agent_normal.state_dict(), '/contents/MyDrive/Colab_Notebooks/projects/NCPW/RL/DQN_agent_[10_10_20_20].ptb')
torch.save(agent_safe.state_dict(), '/contents/MyDrive/Colab_Notebooks/projects/NCPW/RL/agent/DQN_agent_[10_30_20_20]_test.ptb')
# torch.save(agent_dps.state_dict(), '/contents/MyDrive/Colab_Notebooks/projects/NCPW/RL/DQN_agent_[20_15_20_20].ptb')

In [None]:
def get_agent_to_load(reward_rate, print_all=False, memory_size=1, batch_size=1, ):
    """Build a DQN agent with the training architecture, to load weights into.

    The agent plays a single episode and takes one gradient step before it
    is returned.
    NOTE(review): that warm-up does not look necessary for
    ``load_state_dict`` - confirm and drop it if so.

    :param reward_rate: reward weights forwarded to ``boss_raid_simulater``
    :param print_all: unused; kept for backward compatibility
    :param memory_size: capacity of the temporary replay memory
    :param batch_size: number of transitions used for the single update
    :return: ``DQN`` agent with randomly initialised (once updated) weights
    """
    env = boss_raid_simulater(b_skills, p_skills, reward_rate)
    s_dim = env.state.shape[0]
    a_dim = len(p_skills)

    qnet = MLP(s_dim, a_dim, num_neurons=[64, 64])
    qnet_target = MLP(s_dim, a_dim, num_neurons=[64, 64])

    # initialize target network same as the main network.
    qnet_target.load_state_dict(qnet.state_dict())
    # action_dim must be the number of player skills: with the previous value
    # of 1 every exploratory action was skill 0.
    agent = DQN(s_dim, a_dim, qnet=qnet, qnet_target=qnet_target,
                lr=1e-4 * 5, gamma=0.88, epsilon=1.0)
    memory = ReplayMemory(memory_size)

    # Replaces the epsilon buffer with a 0-dim tensor, as get_agent does -
    # presumably so its shape matches checkpoints saved from get_agent;
    # confirm before changing.
    epsilon = 0.8
    agent.epsilon = torch.tensor(epsilon)

    s = env.observe()
    while True:
        s = to_tensor(s, size=(1, s_dim))
        a = agent.get_action(s)
        ns, r, done = env.step(a)

        memory.push((s,
                     torch.tensor(a).view(1, 1),
                     torch.tensor(r / 100.0).view(1, 1),
                     torch.tensor(ns).view(1, s_dim),
                     torch.tensor(done).view(1, 1)))

        s = ns
        if done:
            break

    # train agent: a single gradient step
    batch = prepare_training_inputs(memory.sample(batch_size))
    agent.update(*batch)

    qnet_target.load_state_dict(qnet.state_dict())

    return agent


def load_agent(agent_name):
    """Load a saved agent from a state_dict checkpoint.

    :param agent_name: path of a file written with
        ``torch.save(agent.state_dict(), path)``
    :return: ``DQN`` agent carrying the saved weights and epsilon
    """
    # The reward weights are irrelevant here; they are only needed to build
    # the environment used for the agent skeleton.
    reward_rate = [0, 0, 0, 0]
    agent_load = get_agent_to_load(reward_rate)

    # The skeleton is never moved off the CPU, so map the checkpoint there:
    # a file saved from a GPU then also loads on a CPU-only host.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    agent_load.load_state_dict(torch.load(agent_name, map_location='cpu'))

    return agent_load

In [None]:
def get_single_play_log(agent):
    """Play one episode with ``agent`` and return its log as strings.

    :param agent: object with ``get_action(state_tensor) -> int``
    :return: dict with the keys 'states', 'p_actions' and 'b_actions', each
        value being ``str()`` of the corresponding per-step list
    """
    # Rewards are not logged, so all reward weights are zero.
    env = boss_raid_simulater(b_skills, p_skills, [0, 0, 0, 0])
    s_dim = env.state.shape[0]

    visited = []
    chosen = []
    obs = env.observe()
    finished = False
    while not finished:
        obs_tensor = to_tensor(obs, size=(1, s_dim))
        act = agent.get_action(obs_tensor)
        obs, _, finished = env.step(act)

        # Log the state the action was taken in, not the successor state.
        visited.append(list(to_numpy(obs_tensor)[0]))
        chosen.append(act)

    return {'states': str(visited),
            'p_actions': str(chosen),
            'b_actions': str(env.boss_action)}

In [None]:
# Restore a previously saved agent from Google Drive.
agent_load = load_agent('/contents/MyDrive/Colab_Notebooks/projects/NCPW/RL/DQN_agent_[20_15_20_20].ptb')

In [None]:
# Play one episode with the loaded agent; the returned log is the cell output.
get_single_play_log(agent_load)

{'b_actions': '[0, 1, 1, 1, 1, 1, 0, 1, 0, 1]',
 'p_actions': '[0, 2, 1, 0, 3, 3, 0, 1, 0, 0]',
 'states': '[[100.0, 0.0, 0.0, 100.0, 0.0, 0.0, 0.0, 0.0], [75.0, 2.0, 0.0, 75.0, 2.0, 0.0, 0.0, 0.0], [75.0, 1.0, 1.0, 85.0, 1.0, 0.0, 5.0, 0.0], [65.0, 0.0, 0.0, 85.0, 0.0, 1.0, 4.0, 0.0], [40.0, 0.0, 1.0, 75.0, 2.0, 0.0, 3.0, 0.0], [40.0, 0.0, 0.0, 75.0, 1.0, 0.0, 2.0, 1.0], [40.0, 0.0, 1.0, 75.0, 0.0, 0.0, 1.0, 0.0], [15.0, 2.0, 0.0, 50.0, 2.0, 0.0, 0.0, 0.0], [5.0, 1.0, 1.0, 40.0, 1.0, 1.0, 0.0, 0.0], [5.0, 0.0, 0.0, 40.0, 0.0, 0.0, 0.0, 0.0]]'}

In [None]:
# Evaluate the loaded agent (10 rounds of 100 episodes) and summarise the last round.
info_from_log(get_play_log(agent_load,[0,0,0,0],True))

num wins: 91  /  win rate: 0.91
num wins: 97  /  win rate: 0.97
num wins: 91  /  win rate: 0.91
num wins: 95  /  win rate: 0.95
num wins: 96  /  win rate: 0.96
num wins: 97  /  win rate: 0.97
num wins: 93  /  win rate: 0.93
num wins: 96  /  win rate: 0.96
num wins: 91  /  win rate: 0.91
num wins: 91  /  win rate: 0.91
win rate avg: 0.938
avg len of episode:   73.1
avg HP of player in episode:   75.17560577392578
avg HP of boss in episode:   26.963857650756836
{'len_epi': 73.1, 'avg_bhp': tensor(26.9639), 'avg_php': tensor(75.1756), 'kick': 8.264146846635132, 'punch': 5.455127759223847, 'heal': 21.92095764838223, 'jump': 64.35976774575873}


{'avg_bhp': tensor(26.9639),
 'avg_php': tensor(75.1756),
 'heal': 21.92095764838223,
 'jump': 64.35976774575873,
 'kick': 8.264146846635132,
 'len_epi': 73.1,
 'punch': 5.455127759223847}

In [None]:
# Load three saved agents; the bracketed suffix of each file name differs.
agent_load_a = load_agent('/contents/MyDrive/Colab_Notebooks/projects/NCPW/RL/DQN_agent_[10_10_20_20].ptb')
agent_load_b = load_agent('/contents/MyDrive/Colab_Notebooks/projects/NCPW/RL/DQN_agent_[10_15_20_20].ptb')
agent_load_c = load_agent('/contents/MyDrive/Colab_Notebooks/projects/NCPW/RL/DQN_agent_[20_15_20_20].ptb')

In [None]:
# Play with each loaded agent (win-rate printing off) and keep the summary dicts.
a_result = info_from_log(get_play_log(agent_load_a,[0,0,0,0],False))
b_result = info_from_log(get_play_log(agent_load_b,[0,0,0,0],False))
c_result = info_from_log(get_play_log(agent_load_c,[0,0,0,0],False))

avg len of episode:   11.92
avg HP of player in episode:   74.11561584472656
avg HP of boss in episode:   52.157413482666016
{'len_epi': 11.92, 'avg_bhp': tensor(52.1574), 'avg_php': tensor(74.1156), 'kick': 53.438247899761066, 'punch': 0.0, 'heal': 9.452469935035712, 'jump': 37.109282165203204}
avg len of episode:   19.44
avg HP of player in episode:   93.72418212890625
avg HP of boss in episode:   60.840614318847656
{'len_epi': 19.44, 'avg_bhp': tensor(60.8406), 'avg_php': tensor(93.7242), 'kick': 24.8147592002096, 'punch': 0.8808847414762441, 'heal': 20.112249190335852, 'jump': 54.19210686797827}
avg len of episode:   12.04
avg HP of player in episode:   76.85961151123047
avg HP of boss in episode:   45.200618743896484
{'len_epi': 12.04, 'avg_bhp': tensor(45.2006), 'avg_php': tensor(76.8596), 'kick': 34.37449421997812, 'punch': 25.90424736754996, 'heal': 9.575498284638137, 'jump': 30.145760127833753}
