In [1]:
import pandas as pd
import random

import cv2
import gym
import matplotlib.pyplot as plt
from matplotlib import animation
import numpy as np

import torch
from torch import nn, optim

from IPython.display import HTML

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
device = 'cuda'

In [6]:
# プレイの様子を動画で見てみるための関数
def display_video(frames):
    plt.figure(figsize=(8, 8), dpi=50)
    patch = plt.imshow(frames[0], cmap='gray')
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=50)
    display(HTML(anim.to_jshtml(default_mode='once')))
    plt.close()

In [52]:
env = gym.make("PongNoFrameskip-v4", full_action_space=True)
#env = gym.make("PooyanNoFrameskip-v4", full_action_space=True)
#Atari preprocessing wrapper
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
#Frame stacking
env = gym.wrappers.FrameStack(env, 4)

In [72]:
"""
obs = env.reset()
frames = []
actions = []
total_reward = 0
done = False

while not done:
    frames.append(obs[0])
    action = env.action_space.sample()  # 行動空間から一様ランダムに行動をサンプル
    next_obs, reward, done, _ = env.step(action)
    total_reward += reward
    obs = next_obs
    actions.append(action)

print('Reward: ', total_reward)
display_video(frames)
"""


"\nobs = env.reset()\nframes = []\nactions = []\ntotal_reward = 0\ndone = False\n\nwhile not done:\n    frames.append(obs[0])\n    action = env.action_space.sample()  # 行動空間から一様ランダムに行動をサンプル\n    next_obs, reward, done, _ = env.step(action)\n    total_reward += reward\n    obs = next_obs\n    actions.append(action)\n\nprint('Reward: ', total_reward)\ndisplay_video(frames)\n"

In [57]:
"""
    リプレイバッファの宣言
"""
buffer_size = 100000  #　リプレイバッファに入る経験の最大数
initial_buffer_size = 10000  # 学習を開始する最低限の経験の数
replay_buffer = PrioritizedReplayBuffer(buffer_size)


"""
    ネットワークの宣言
"""
net = CNNQNetwork(env.observation_space.shape, n_action=env.action_space.n).to(device)
target_net = CNNQNetwork(env.observation_space.shape, n_action=env.action_space.n).to(device)
target_update_interval = 2000  # 学習安定化のために用いるターゲットネットワークの同期間隔


"""
    オプティマイザとロス関数の宣言
"""
optimizer = optim.Adam(net.parameters(), lr=1e-4)  # オプティマイザはAdam
loss_func = nn.SmoothL1Loss(reduction='none')  # ロスはSmoothL1loss（別名Huber loss）


"""
    Prioritized Experience Replayのためのパラメータβ
"""
beta_begin = 0.4
beta_end = 1.0
beta_decay = 500000
# beta_beginから始めてbeta_endまでbeta_decayかけて線形に増やす
beta_func = lambda step: min(beta_end, beta_begin + (beta_end - beta_begin) * (step / beta_decay))


"""
    探索のためのパラメータε
"""
epsilon_begin = 1.0
epsilon_end = 0.01
epsilon_decay = 50000
# epsilon_beginから始めてepsilon_endまでepsilon_decayかけて線形に減らす
epsilon_func = lambda step: max(epsilon_end, epsilon_begin - (epsilon_begin - epsilon_end) * (step / epsilon_decay))


"""
    その他のハイパーパラメータ
"""
gamma = 0.99  #　割引率
batch_size = 32
n_episodes = 30  # 学習を行うエピソード数

In [58]:
def update(batch_size, beta):
    obs, action, reward, next_obs, done, indices, weights = replay_buffer.sample(batch_size, beta)
    obs, action, reward, next_obs, done, weights \
        = obs.float().to(device), action.to(device), reward.to(device), next_obs.float().to(device), done.to(device), weights.to(device)

    #　ニューラルネットワークによるQ関数の出力から, .gatherで実際に選択した行動に対応する価値を集めてきます.
    q_values = net(obs).gather(1, action.unsqueeze(1)).squeeze(1)

    # 目標値の計算なので勾配を追跡しない
    with torch.no_grad():
        # Double DQN.
        # ① 現在のQ関数でgreedyに行動を選択し,
        greedy_action_next = torch.argmax(net(next_obs), dim=1)
        # ②　対応する価値はターゲットネットワークのものを参照します.
        q_values_next = target_net(next_obs).gather(1, greedy_action_next.unsqueeze(1)).squeeze(1)

    # ベルマン方程式に基づき, 更新先の価値を計算します.
    # (1 - done)をかけているのは, ゲームが終わった後の価値は0とみなすためです.
    target_q_values = reward + gamma * q_values_next * (1 - done)

    # Prioritized Experience Replayのために, ロスに重み付けを行なって更新します.
    optimizer.zero_grad()
    loss = (weights * loss_func(q_values, target_q_values)).mean()
    loss.backward()
    optimizer.step()

    #　TD誤差に基づいて, サンプルされた経験の優先度を更新します.
    replay_buffer.update_priorities(indices, (target_q_values - q_values).abs().detach().cpu().numpy())

    return loss.item()

In [59]:
from src.pruning.slth.edgepopup import modify_module_for_slth
net = modify_module_for_slth(net, 0.3)
stop

#Modules: 16
No modification  <class '__main__.CNNQNetwork'>
No modification conv_layers <class 'torch.nn.modules.container.Sequential'>
Replace nn.Conv2d with SubnetConv: conv_layers.0
No modification conv_layers.1 <class 'torch.nn.modules.activation.ReLU'>
Replace nn.Conv2d with SubnetConv: conv_layers.2
No modification conv_layers.3 <class 'torch.nn.modules.activation.ReLU'>
Replace nn.Conv2d with SubnetConv: conv_layers.4
No modification conv_layers.5 <class 'torch.nn.modules.activation.ReLU'>
No modification fc_state <class 'torch.nn.modules.container.Sequential'>
Replace nn.Linear with SubnetLinear: fc_state.0
No modification fc_state.1 <class 'torch.nn.modules.activation.ReLU'>
Replace nn.Linear with SubnetLinear: fc_state.2
No modification fc_advantage <class 'torch.nn.modules.container.Sequential'>
Replace nn.Linear with SubnetLinear: fc_advantage.0
No modification fc_advantage.1 <class 'torch.nn.modules.activation.ReLU'>
Replace nn.Linear with SubnetLinear: fc_advantage.2


NameError: name 'stop' is not defined

In [60]:
step = 0
rewards = []
for episode in range(n_episodes):
    obs = env.reset()
    done = False
    total_reward = 0

    while not done:
        # ε-greedyで行動を選択
        obs_np = np.array(obs)
        # NumPy配列をPyTorchのテンソルに変換
        obs = torch.tensor(obs_np, dtype=torch.float)
        action = net.act(obs.to(device), epsilon_func(step))
        # 環境中で実際に行動
        next_obs, reward, done, _ = env.step(action)
        total_reward += reward

        next_obs_np = np.array(next_obs)
        # NumPy配列をPyTorchのテンソルに変換
        next_obs = torch.tensor(next_obs_np, dtype=torch.float)

        # リプレイバッファに経験を蓄積
        replay_buffer.push([obs, action, reward, next_obs, done])
        obs = next_obs
        obs = obs.cpu()

        # ネットワークを更新
        if len(replay_buffer) > initial_buffer_size:
            update(batch_size, beta_func(step))

        # ターゲットネットワークを定期的に同期させる
        if (step + 1) % target_update_interval == 0:
            target_net.load_state_dict(net.state_dict())

        step += 1

    print('Episode: {},  Step: {},  Reward: {}'.format(episode + 1, step + 1, total_reward))
    rewards.append(total_reward)


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [74]:
net.state_dict()

AttributeError: 'collections.OrderedDict' object has no attribute 'shape'

In [None]:
frames = []
obs = env.reset()
total_reward = 0
done = False

while not done:
    frames.append(env.render(mode='rgb_array')[:, :, 0])
    # テスト時なので, ランダム行動は切る
    obs_np = np.array(obs)
    # NumPy配列をPyTorchのテンソルに変換
    obs = torch.tensor(obs_np, dtype=torch.float).to(device)
    action = net.act(obs.float().to(device), epsilon=0.0)
    next_obs, reward, done, _ = env.step(action)
    total_reward += reward
    obs = next_obs

print('Reward: ', total_reward)
display_video(frames)

Reward:  -20.0
