# Reinforcement Learning

### Imports:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchaudio
from os import path, walk
import torch.nn as nn
from IPython.display import Audio, display
from tqdm import tqdm

### Hyperparameters

In [None]:
# Sampling rate used throughout the notebook.
fs = 8000

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

### Data collecting function

In [None]:
def initial_states_to_list(random_sampling_file):
    """
    Reads the random-sampling file and returns its (BRIR, sample) pairs.

    Every non-blank line must hold two fields separated by a single space:
    the BRIR file name followed by the sample file name.

    random_sampling_file: path of the text file to read.
    Returns: a list of (brir_name, sample_name) tuples, in file order.
    Raises: ValueError when a non-blank line does not contain exactly two
            space-separated fields.
    """
    initial_states = []
    # The context manager closes the file even if a malformed line raises.
    with open(random_sampling_file, "r") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no state.
            if not line.strip():
                continue
            brir, file = line.removesuffix("\n").split(" ")
            initial_states.append((brir, file))
    return initial_states

## Deep Q-network

In [None]:
class DQN(nn.Module):
    """
    Deep Q-network: three stacked single-layer GRUs (256, 128 and 64 hidden
    units) with dropout in between, followed by a linear layer that maps the
    GRU outputs of the first two time steps to one value per action.
    """

    def __init__(self, n_observations, n_angles):
        """
        n_observations: number of input features per time step.
        n_angles: number of outputs of the final linear layer.
        """
        super().__init__()
        self.n_hidden = 1000
        gru_options = dict(num_layers=1, batch_first=True, bidirectional=False)
        self.gru1 = nn.GRU(n_observations, 256, **gru_options)
        self.gru2 = nn.GRU(256, 128, **gru_options)
        self.gru3 = nn.GRU(128, 64, **gru_options)
        # The head sees the 64-dimensional outputs of two time steps side by side.
        self.fc = nn.Linear(2 * 64, n_angles)
        self.dropout20 = nn.Dropout(p=0.2)
        self.dropout50 = nn.Dropout(p=0.5)

    def forward(self, x):
        """
        x: batch-first input with at least two time steps
           (assumed to be (batch, 2, n_observations) - confirm against callers).
        Returns: the output of the linear head, one row per batch element.
        """
        stages = (
            (self.gru1, self.dropout20),
            (self.gru2, self.dropout20),
            (self.gru3, self.dropout50),
        )
        for recurrent, dropout in stages:
            x, _ = recurrent(x)
            x = dropout(x)
        first_step = x[:, 0, :]
        second_step = x[:, 1, :]
        return self.fc(torch.cat((first_step, second_step), dim=1))

## Memory buffer

In [None]:
class MemoryBuffer():
    """
    Fixed-size ring buffer of (s, a, s', r, t) transitions.

    Once buffer_length transitions have been stored, every new transition
    overwrites the oldest one.
    """

    def __init__(self, buffer_length, batch_size, state_shape=(2, 4000)):
        """
        buffer_length: maximum number of transitions that are kept.
        batch_size: number of transitions returned by sample_from_buffer.
        state_shape: shape of a single state (defaults to the original 2 x 4000).
        """
        self.buffer_length = buffer_length
        # Total number of transitions added so far; the write slot is idx % buffer_length.
        self.idx = 0
        self.s_buffer = torch.zeros((self.buffer_length, *state_shape))
        self.a_buffer = torch.zeros((self.buffer_length, 1), dtype=torch.int)
        self.s1_buffer = torch.zeros((self.buffer_length, *state_shape))
        self.r_buffer = torch.zeros((self.buffer_length, 1))
        self.t_buffer = torch.zeros((self.buffer_length, 1), dtype=torch.bool)
        self.batch_size = batch_size

    def add_into_buffer(self, s, a, s_prime, r, t, az=None, el=None):
        """
        Function that adds a (s, a, s', r, t) tuple into the buffer.
        s: state
        a: action
        s_prime: new state
        r: reward
        t: terminal state
        az, el: accepted so the call matches BalancedMemoryBuffer.add_into_buffer;
                they are not used here.
        """
        slot = self.idx % self.buffer_length
        self.s_buffer[slot] = s
        self.a_buffer[slot] = torch.tensor([a], dtype=torch.int)
        self.s1_buffer[slot] = s_prime
        self.r_buffer[slot] = torch.tensor([r])
        self.t_buffer[slot] = torch.tensor([t], dtype=torch.bool)
        # Always advance by one slot; a larger step would skip slots once the
        # buffer has wrapped around.
        self.idx += 1

    def sample_from_buffer(self):
        """
        Function that returns random samples from the buffer.
        Returns: up to batch_size distinct transitions as the tuple
                 (s, a, s', r, t), drawn from everything stored so far.
        """
        n_stored = min(self.idx, self.buffer_length)
        indices = torch.randperm(n_stored)[:self.batch_size]
        return self.s_buffer[indices], self.a_buffer[indices], \
            self.s1_buffer[indices], self.r_buffer[indices], self.t_buffer[indices]


class BalancedMemoryBuffer():
    """
    Replay buffer made of a shared warm-up buffer and one ring buffer per class.

    The first start_buffer_length transitions go into the "start" buffers.
    Every later transition is stored in the buffer of the class returned by
    find_idx(az, el). Sampling uses the start buffers plus the same number of
    entries from every class (as many as the least-filled class holds).
    """

    def __init__(self, batch_size, num_classes, az_angles, el_angles, device=None,
                 state_shape=(2, 4000), start_buffer_length=5000, class_buffer_length=1000):
        """
        batch_size: number of transitions returned by sample_from_buffer.
        num_classes: number of per-class buffers.
        az_angles: list of azimuth angles, used by find_idx.
        el_angles: list of elevation angles, used by find_idx.
        device: device that holds the buffers; defaults to CUDA when it is
                available and to the CPU otherwise.
        state_shape: shape of a single state.
        start_buffer_length: capacity of the warm-up buffer.
        class_buffer_length: capacity of each per-class buffer.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        self.az_angles = az_angles
        self.el_angles = el_angles

        self.buffer_length = 100000
        self.start_buffer_length = start_buffer_length
        self.class_buffer_length = class_buffer_length
        self.num_classes = num_classes
        self.batch_size = batch_size

        state_shape = tuple(state_shape)
        start_rows = (self.start_buffer_length,)
        class_rows = (self.num_classes, self.class_buffer_length)

        # Buffers are left uninitialised: only rows that were written are ever read.
        self.start_s = torch.empty(start_rows + state_shape, dtype=torch.float, device=self.device)
        self.start_a = torch.empty(start_rows + (1,), dtype=torch.int, device=self.device)
        self.start_s1 = torch.empty(start_rows + state_shape, dtype=torch.float, device=self.device)
        self.start_r = torch.empty(start_rows + (1,), dtype=torch.float, device=self.device)
        self.start_t = torch.empty(start_rows + (1,), dtype=torch.bool, device=self.device)

        self.class_s = torch.empty(class_rows + state_shape, dtype=torch.float, device=self.device)
        self.class_a = torch.empty(class_rows + (1,), dtype=torch.int, device=self.device)
        self.class_s1 = torch.empty(class_rows + state_shape, dtype=torch.float, device=self.device)
        self.class_r = torch.empty(class_rows + (1,), dtype=torch.float, device=self.device)
        self.class_t = torch.empty(class_rows + (1,), dtype=torch.bool, device=self.device)

        # Number of transitions in the warm-up buffer (stops at start_buffer_length).
        self.idx = 0
        # Number of transitions ever added to each class buffer.
        self.class_idxs = torch.zeros(self.num_classes, dtype=torch.int, device=self.device)

    def find_idx(self, az, el):
        """
        Maps an (azimuth, elevation) pair to its class index.
        az: azimuth angle, must be an element of az_angles.
        el: elevation angle, must be an element of el_angles.
        Returns: az_index + el_index * len(az_angles).
        Raises: ValueError when an angle is not in the corresponding list.
        """
        az_angle_idx = self.az_angles.index(az)
        el_angle_idx = self.el_angles.index(el)

        return az_angle_idx + el_angle_idx*len(self.az_angles)

    def add_into_buffer(self, s, a, s_prime, r, t, az, el):
        """
        Function that adds a (s, a, s', r, t) tuple into the buffer for a specific class.
        s: state
        a: action
        s_prime: new state
        r: reward
        t: terminal state
        az: azimuth angle (ignored while the warm-up buffer is being filled)
        el: elevation angle (ignored while the warm-up buffer is being filled)
        """
        if self.idx < self.start_buffer_length:
            self.start_s[self.idx] = s
            self.start_a[self.idx] = torch.tensor([a], dtype=torch.int)
            self.start_s1[self.idx] = s_prime
            self.start_r[self.idx] = torch.tensor([r])
            self.start_t[self.idx] = torch.tensor([t], dtype=torch.bool)

            self.idx += 1
            return

        class_idx = self.find_idx(az, el)
        # Read the counter once as a Python int instead of converting it on every use.
        count = int(self.class_idxs[class_idx])
        slot = count % self.class_buffer_length

        self.class_s[class_idx, slot] = s
        self.class_a[class_idx, slot] = torch.tensor([a], dtype=torch.int)
        self.class_s1[class_idx, slot] = s_prime
        self.class_r[class_idx, slot] = torch.tensor([r])
        self.class_t[class_idx, slot] = torch.tensor([t], dtype=torch.bool)

        if count % 20 == 0:
            # Every 20 additions the rows filled before this addition are
            # shuffled (all rows once the buffer is full), so that the prefix
            # used by sample_from_buffer is a random subset of the class.
            filled = min(count, self.class_buffer_length)
            order = torch.cat((
                torch.randperm(filled),
                torch.arange(start=filled, end=self.class_buffer_length),
            )).to(self.device)

            # Indexing with a tensor copies, so assigning back is safe.
            self.class_s[class_idx] = self.class_s[class_idx, order]
            self.class_a[class_idx] = self.class_a[class_idx, order]
            self.class_s1[class_idx] = self.class_s1[class_idx, order]
            self.class_r[class_idx] = self.class_r[class_idx, order]
            self.class_t[class_idx] = self.class_t[class_idx, order]

        self.class_idxs[class_idx] += 1

    def sample_from_buffer(self):
        """
        Function that returns random samples from the buffer with equal representation from each class.
        Returns: up to batch_size distinct transitions as the tuple (s, a, s', r, t).
        """
        if self.idx < self.start_buffer_length:
            indices = torch.randperm(self.idx)[:self.batch_size]
            return self.start_s[indices], self.start_a[indices], \
                self.start_s1[indices], self.start_r[indices], self.start_t[indices]

        # Every class contributes as many rows as the least-filled class holds.
        per_class = min(int(torch.min(self.class_idxs)), self.class_buffer_length)
        total = self.start_buffer_length + self.num_classes * per_class
        indices = torch.randperm(total)[:self.batch_size].to(self.device)

        # Virtual layout: [start rows | class 0 prefix | class 1 prefix | ...].
        # Only the sampled rows are gathered; the buffers are never concatenated.
        from_start = indices < self.start_buffer_length
        from_class = ~from_start
        start_rows = indices[from_start]
        offsets = indices[from_class] - self.start_buffer_length
        divisor = max(per_class, 1)  # offsets is empty when per_class == 0
        class_rows = torch.div(offsets, divisor, rounding_mode="floor")
        class_cols = offsets % divisor

        def gather(start_buf, class_buf):
            out = torch.empty((indices.numel(), *start_buf.shape[1:]),
                              dtype=start_buf.dtype, device=self.device)
            out[from_start] = start_buf[start_rows]
            out[from_class] = class_buf[class_rows, class_cols]
            return out

        return gather(self.start_s, self.class_s), gather(self.start_a, self.class_a), \
            gather(self.start_s1, self.class_s1), gather(self.start_r, self.class_r), \
            gather(self.start_t, self.class_t)



## Environment

In [None]:
class environment():
    """
    Sound-localisation environment for the DQN agent.

    A state is identified by a BRIR file name. When that name is split on "_",
    fields 12 and 14 hold the goal azimuth/elevation and fields 16 and 18 hold
    the current azimuth/elevation (field 18 carries the ".wav" suffix).
    """

    def __init__(self, fs, n_windows, length_windows, az_angles, el_angles, random_sampling_file, target_network, policy_network, memory_buffer, n_actions, actions, device):
        """
        fs: sampling rate.
        n_windows: number of windows each sample is split into.
        length_windows: number of audio samples kept per convolved window.
        az_angles: ordered list of azimuth angles (strings, as used in file names).
        el_angles: ordered list of elevation angles (strings, as used in file names).
        random_sampling_file: file with one "BRIR sample" pair per line.
        target_network: network used for action selection and bootstrap targets.
        policy_network: network that is optimised.
        memory_buffer: replay buffer offering add_into_buffer, sample_from_buffer and idx.
        n_actions: number of available actions.
        actions: the [az, el] index movements, one row per action.
        device: torch device on which the networks and batches are placed.
        """
        self.device=device
        self.fs = fs
        self.n_windows = n_windows
        self.length_windows = length_windows
        self.az_angles = az_angles
        self.el_angles = el_angles
        self.init_states = initial_states_to_list(random_sampling_file)
        self.target_network = target_network.to(device)
        self.policy_network = policy_network.to(device)
        self.memory_buffer = memory_buffer
        self.actions = actions
        self.n_actions = n_actions
        self.batch_size = 1024
        self.criterion = nn.SmoothL1Loss()
        self.lr = 0.00025
        self.optimizer = torch.optim.AdamW(self.policy_network.parameters(), lr=self.lr, amsgrad=True)
        self.gamma = 0.99

    def get_initial_state(self, idx):
        """
        Gives a new sample and initial BRIR/state for a new epoch.
        idx: the index of the new epoch.
        Returns: a tuple with the new BRIR and new sample.
        """
        return self.init_states[idx]

    def open_sample_split(self, sample_name):
        """
        Gets the data of the sample, and splits it into the amount of specified windows.
        Only every second audio sample of the file is kept.
        sample_name: the name of the sample.
        Returns: a 2D array with the sample split into windows.
        """
        sample, _ = torchaudio.load("samples_10s/"+sample_name, format="flac")
        sample = sample[:,::2]
        return sample.reshape((self.n_windows,-1))

    def get_azel_from_brir(self, brir_name):
        """
        Extracts the current position from a BRIR file name.
        brir_name: the name of the BRIR.
        Returns: the azimuth angle (field 16) and the first three characters
                 of field 18 (the elevation angle without the file suffix).
        """
        fields = brir_name.split("_")
        return fields[16], fields[18][:3]

    def open_brir(self, brir_name):
        """
        Opens the BRIR given the name of the BRIR.
        Keeps every second sample of the loaded BRIR (the original note says the
        BRIR has Fs=16000Hz and the samples Fs=8000Hz - confirm against the files
        in "BRIRs_downsampled/").
        brir_name: the name of the BRIR.
        Returns: the two channels (for each ear) of the BRIR.
        """
        brir, _ = torchaudio.load("BRIRs_downsampled/"+brir_name, format="wav")
        return brir[:,::2]

    def convolve_sound(self, window, brir):
        """
        Convolves the BRIR with the window, and cuts result at the size of the window length.
        window: the window of the sample that should be convolved; it is
                repeated twice along the first dimension before the convolution.
        brir: the BRIR as returned by open_brir.
        Returns: the convolved signal, truncated to length_windows samples.
        """
        return torchaudio.functional.convolve(window.repeat([2,1]), brir)[:,:self.length_windows]

    def get_Q_values_from_state(self, observation):
        """
        Puts the observation into the target network.
        observation: the sample convolved with the BRIR.
        Returns: the Q-values produced by the target network.
        """
        return self.target_network(observation)

    def get_best_actions_from_Q_values(self, Q_values):
        """
        Chooses the best action index given the Q-values.
        Q_values: the Q-values as returned by the target network.
        Returns: index of the best action (argmax over all elements).
        """
        return torch.argmax(Q_values)

    def sample_action_epsilon_greedily(self, best_action, epsilon):
        """
        Samples an action epsilon-greedily.
        best_action: the index of the best action.
        epsilon: the greediness parameter.
        Returns: the action [az,el] and its index.
        """
        p_action = epsilon/self.n_actions
        probability_table = np.full(self.n_actions, p_action)
        # int() accepts Python ints as well as zero-dimensional tensors on any device.
        probability_table[int(best_action)] = 1 - epsilon + p_action
        index = np.random.choice(np.arange(self.n_actions), p=probability_table)
        return self.actions[index], index

    def take_action(self, actions, az, el):
        """
        Finds the next azimuth and elevation angle after taking the action.
        A movement that would leave the grid of angles is ignored for that axis.
        actions: two values, the index movements for the azimuth and elevation angles.
        az: the current azimuth angle.
        el: the current elevation angle.
        Returns: the new azimuth angle and the new elevation angle.
        """
        current_az_angle_idx = self.az_angles.index(az)
        current_el_angle_idx = self.el_angles.index(el)

        # Plain ints keep the list indexing below independent of tensor type/device.
        az_mov, el_mov = (int(step) for step in actions)

        new_az_idx = current_az_angle_idx + az_mov
        if not 0 <= new_az_idx < len(self.az_angles):
            new_az_idx = current_az_angle_idx

        new_el_idx = current_el_angle_idx + el_mov
        if not 0 <= new_el_idx < len(self.el_angles):
            new_el_idx = current_el_angle_idx

        return self.az_angles[new_az_idx], self.el_angles[new_el_idx]

    def _grid_distance(self, az, el, goal_az, goal_el):
        """Euclidean distance between two positions, measured in angle-list indices."""
        az_steps = self.az_angles.index(az) - self.az_angles.index(goal_az)
        el_steps = self.el_angles.index(el) - self.el_angles.index(goal_el)
        return np.sqrt(az_steps**2 + el_steps**2)

    def get_reward(self, new_az, new_el, old_az, old_el, brir_name):
        """
        Gives reward based on Euclidean distance (in grid indices) to the goal.
        new_az: the new azimuth angle
        new_el: the new elevation angle
        old_az: the old azimuth angle
        old_el: the old elevation angle
        brir_name: the name of the BRIR (holds the goal angles)
        Returns: 1 when the goal is reached, 0.1 when the step is as good as the
                 best available action, 0 when the distance shrank without being
                 optimal, and -0.2 otherwise.
        """
        split_brir = brir_name.split("_")
        goal_az = split_brir[12]
        goal_el = split_brir[14]

        if new_az == goal_az and new_el == goal_el:
            return 1

        # Smallest distance reachable from the old position with any single action.
        optimal_dist = min(
            self._grid_distance(*self.take_action(action, old_az, old_el), goal_az, goal_el)
            for action in self.actions
        )
        actual_dist = self._grid_distance(new_az, new_el, goal_az, goal_el)
        old_dist = self._grid_distance(old_az, old_el, goal_az, goal_el)

        if optimal_dist == actual_dist:
            return 0.1    # Optimal distance improvement
        elif old_dist > actual_dist:
            return 0      # Suboptimal distance improvement
        else:
            return -0.2   # Distance deterioration

    def next_state(self, current_brir, new_az, new_el):
        """
        Finds the name of the next BRIR.
        current_brir: the name of the current brir
        new_az: the new azimuth angle after the action.
        new_el: the new elevation angle after the action.
        Returns: the filename of the BRIR.
        """
        brir_name = current_brir.split("_")
        brir_name[16] = new_az
        brir_name[18] = new_el+".wav"

        return "_".join(brir_name)

    def is_terminal(self, current_brir, az, el):
        """
        Returns whether the agent is on a terminal state (when the speaker and agent have the same angle)
        current_brir: the name of the BRIR (holds the goal angles)
        az: azimuth angle of the agent
        el: elevation angle of the agent
        """
        brir_name = current_brir.split("_")
        return brir_name[12] == az and brir_name[14] == el

    def append_buffer(self, observation, action_indices, new_observation, reward, terminal, az, el):
        """
        Appends an observation into the buffer.
        observation: state s_t
        action_indices: action a_t
        new_observation: new state s_(t+1)
        reward: r
        terminal: boolean t
        az: azimuth angle of new state
        el: elevation angle of new state
        """
        self.memory_buffer.add_into_buffer(observation, action_indices, new_observation, reward, terminal, az, el)

    def train_model(self):
        """
        Trains the policy network on one batch sampled from the memory buffer.
        Returns: the loss as computed with the Huber loss function (self.criterion),
                 or None when the buffer holds too few transitions.
        """
        if self.memory_buffer.idx < 1+self.batch_size:
            return None

        s_batch, a_batch, s1_batch, r_batch, t_batch = self.memory_buffer.sample_from_buffer()

        # Use the device this environment was built with, not a module-level global.
        s_batch = s_batch.to(self.device)
        a_batch = a_batch.type(torch.int64).to(self.device)
        s1_batch = s1_batch.to(self.device)
        r_batch = r_batch.to(self.device)
        t_batch = t_batch.to(self.device)

        # Q-value of the action that was actually taken in each transition.
        Q_vals = self.policy_network(s_batch).gather(1, a_batch).squeeze(1)

        with torch.no_grad():
            Q1_vals = self.target_network(s1_batch).max(1)[0]

        # Terminal transitions do not bootstrap from the next state.
        target_Q_vals = r_batch.squeeze(1) + self.gamma * Q1_vals * (~t_batch.squeeze(1))

        # The configured criterion is used so the loss matches the documented Huber loss.
        loss = self.criterion(Q_vals, target_Q_vals)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def distance(self, az, el, brir_name):
        """
        Gives the least amount of actions needed to get to the goal (Chebyshev distance)
        az: current azimuth angle
        el: current elevation angle
        brir_name: name of the BRIR file
        returns: the least amount of actions needed to get to the goal
        """
        split_brir = brir_name.split("_")
        goal_az = split_brir[12]
        goal_el = split_brir[14]

        return max(abs(self.az_angles.index(az) - self.az_angles.index(goal_az)), abs(self.el_angles.index(el) - self.el_angles.index(goal_el)))



In [None]:
# Number of input features per time step of the networks.
n_observations = 4000

# The eight possible [azimuth, elevation] index movements.
actions = torch.tensor([
    [-1, -1], [-1, 0], [-1, 1],
    [0, -1], [0, 1],
    [1, -1], [1, 0], [1, 1],
])
n_actions = 8

policy_network = DQN(n_observations, n_actions).to(device)
target_network = DQN(n_observations, n_actions).to(device)

# Both networks start from random weights; give the policy network the
# weights of the target network so that they begin identical.
initial_weights = target_network.state_dict()
policy_network.load_state_dict(initial_weights)

# Azimuth angles from 270 degrees over 0 to 90 degrees in steps of 15,
# written as zero-padded three-character strings.
az_angles = [f"{angle % 360:03d}" for angle in range(-90, 91, 15)]
el_angles = ["-45", "-20", "000", "020", "045"]

memory_buffer = BalancedMemoryBuffer(
    batch_size=1024,
    num_classes=65,
    az_angles=az_angles,
    el_angles=el_angles,
)

In [None]:
# Google Colab only: mount the user's Google Drive so the data archives and
# the random-sampling file under '/content/drive' can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# IPython shell commands: extract the sample and BRIR archives from the
# mounted Drive into /content.
!unzip "/content/drive/My Drive/samples_10s.zip" -d "/content"
!unzip "/content/drive/My Drive/BRIRs_16000Hz.zip" -d "/content"

In [None]:
# Making the environment
fs = 8000
n_windows = 20
length_windows = 4000
az_angles = ["270", "285", "300", "315", "330", "345", "000", "015", "030", "045", "060", "075", "090"]
el_angles = ["-45", "-20", "000", "020", "045"]
random_sampling_file = "/content/drive/My Drive/random_sampling_file_brir.txt"

env = environment(fs, n_windows, length_windows, az_angles, el_angles, random_sampling_file, target_network, policy_network, memory_buffer, n_actions, actions, device)

In [None]:
episodes = 62400
epsilon = 0.2
tau = 0.005  # only needed by the disabled soft update below

# Per-episode statistics, used by the plotting cells.
episode_length = list()
all_loss = list()
start_distances = list()
end_distances = list()
absolute_distances = list()
relative_distances = list()
trajectories = list()
all_rewards = list()
goals = list()


def _signed_azimuth(az):
    """Maps an azimuth string to degrees for plotting; values above 90 become negative (270 -> -90)."""
    az = int(az)
    return az if az <= 90 else -((-az) % 360)


for ep in tqdm(range(episodes)):
    trajectory = list()
    total_reward = 0

    # Initialise s1:
    brir_name, samp_name = env.get_initial_state(ep)
    name_fields = brir_name.split("_")
    goals.append((name_fields[12], name_fields[14]))
    az, el = env.get_azel_from_brir(brir_name)
    start_distances.append(env.distance(az, el, brir_name))
    trajectory.append([_signed_azimuth(az), int(el)])
    windows = env.open_sample_split(samp_name).to(device)
    # open_brir already returns a tensor, so convert it with .to() rather than torch.tensor().
    brir = env.open_brir(brir_name).to(device=device, dtype=torch.float)
    observation = env.convolve_sound(windows[0:1], brir)[0:2].to(device)

    # Loop over all other windows:
    for window in range(1, env.n_windows):
        # Select action epsilon-greedily.
        # The sampled action gets its own name so the global `actions` tensor is not overwritten.
        Q_vals = env.get_Q_values_from_state(observation.unsqueeze(0))
        best_action = env.get_best_actions_from_Q_values(Q_vals)
        action, action_idx = env.sample_action_epsilon_greedily(best_action, epsilon)

        # Execute action and observe reward:
        new_az, new_el = env.take_action(action, az, el)
        trajectory.append([_signed_azimuth(new_az), int(new_el)])
        reward = env.get_reward(new_az, new_el, az, el, brir_name)
        total_reward += reward

        # Observe the new state (an all-zero observation once the goal is reached):
        terminal = env.is_terminal(brir_name, new_az, new_el)
        if not terminal:
            brir_name = env.next_state(brir_name, new_az, new_el)
            brir = env.open_brir(brir_name).to(device=device, dtype=torch.float)
            new_observation = env.convolve_sound(windows[window:window+1], brir)[0:2].to(device)
        else:
            new_observation = torch.zeros(2, length_windows).to(device)

        # Store into buffer:
        env.append_buffer(observation, action_idx, new_observation, reward, terminal, new_az, new_el)

        # Go to next state:
        az, el = new_az, new_el
        observation = new_observation

        if terminal:
            episode_length.append(window)
            break
        if window == env.n_windows - 1:
            # The goal was not reached within the available windows.
            episode_length.append(window+1)

    end_distances.append(env.distance(az, el, brir_name))
    all_rewards.append(total_reward)

    absolute_distances.append(start_distances[-1] - end_distances[-1])
    if end_distances[-1] != 0:
        relative_distances.append(start_distances[-1] / end_distances[-1])
    else:
        # The ratio is undefined when the goal was reached; 10 is stored instead.
        relative_distances.append(10)

    # Optimize model (loss is None until the buffer holds enough transitions)
    loss = env.train_model()
    all_loss.append(loss)

    trajectories.append(trajectory)

    if ep%240 == 0:
        # Soft update (disabled):
        # target_net = env.target_network.state_dict()
        # policy_net = env.policy_network.state_dict()
        # for key in policy_net:
        #     target_net[key] = policy_net[key]*tau + target_net[key] * (1-tau)
        # env.target_network.load_state_dict(target_net)
        # Hard update: copy the policy weights into the target network.
        env.target_network.load_state_dict(env.policy_network.state_dict())
        print(np.mean(episode_length[-240:]), all_loss[-1], np.mean(all_rewards[-240:]))
        torch.save(env.policy_network.state_dict(), 'policy_network_architecture_hrtf_4.pth')

    # if ep%2400 == 0:
    #     plt.figure()
    #     plt.plot(np.array(trajectory).T[0], np.array(trajectory).T[1])
    #     plt.xlim([-95,95])
    #     plt.ylim([-50,50])
    #     plt.show()

    # Shrink epsilon by a fixed fraction of its current value every episode.
    epsilon -= (epsilon/episodes)

print(np.mean(episode_length[-240:]), all_loss[-1])
torch.save(env.policy_network.state_dict(), 'hrtf_weights.pth')


## Make plots:

### Draw trajectories:

In [None]:
colors = ['red', 'orange', 'yellow', 'green', 'lime', 'cyan', 'magenta', 'pink', 'purple', 'brown']
# Every 2000 episodes, draw the next len(colors) trajectories in one figure.
# The upper bound keeps i+j inside the recorded episodes for any run length.
for i in range(0, len(trajectories) - len(colors) + 1, 2000):
    plt.figure()
    for j, color in enumerate(colors):
        az_path, el_path = np.array(trajectories[i+j]).T
        label = f"trajectory {j+1}, episode length {episode_length[i+j]}, reward {all_rewards[i+j]:.2f}"
        plt.plot(az_path, el_path, label=label, color=color)
        # 'o' marks the start of the trajectory, 'x' its end and '*' the goal.
        plt.scatter(az_path[0], el_path[0], marker='o', color=color)
        plt.scatter(az_path[-1], el_path[-1], marker='x', color=color)
        # Goals use the same signed azimuth as the trajectories (270 -> -90);
        # raw values of 270..345 would fall outside the x-axis limits.
        goal_az = int(goals[i+j][0])
        if goal_az > 90:
            goal_az = -((-goal_az) % 360)
        plt.scatter(np.array(goal_az), np.array(int(goals[i+j][1])), marker='*', color=color)
    plt.xlim([-95,95])
    plt.ylim([-50,50])
    plt.xticks([90, 75, 60, 45, 30, 15, 0, -15, -30, -45, -60, -75, -90])
    plt.yticks([45, 20, 0, -20, -45])
    plt.grid(True)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.title("Trajectory at episodes "+ str(i)+" to "+ str(i+9))
    plt.show()


### Plot end distances:

In [None]:
# Moving average over 200 episodes of the distance to the goal at the end of
# each episode. The number of windows follows the recorded data instead of a
# hard-coded episode count.
ma_window = 200
ma_abs = [
    np.mean(np.array(end_distances[i:i+ma_window]))
    for i in range(len(end_distances) - ma_window)
]
plt.figure()
plt.plot(ma_abs)
plt.title("Moving average of distance to goal at end of episode")
plt.xlabel("Episode")
plt.ylabel("Chebishev distance")
plt.show()

### Plot reward:

In [None]:
# Moving average over 200 episodes of the total reward per episode. The number
# of windows follows the recorded data instead of a hard-coded episode count.
ma_window = 200
ma_rew = [
    np.mean(np.array(all_rewards[i:i+ma_window]))
    for i in range(len(all_rewards) - ma_window)
]
plt.figure()
plt.plot(ma_rew)
plt.title("Moving average of reward")
plt.xlabel("episode")
plt.ylabel("Reward")
plt.show()

### Plot episode length:

In [None]:
# Moving average over 200 episodes of the episode length. The number of
# windows follows the recorded data instead of a hard-coded episode count.
ma_window = 200
ma = [
    np.mean(np.array(episode_length[i:i+ma_window]))
    for i in range(len(episode_length) - ma_window)
]

plt.figure()
plt.plot(ma)
plt.title("Moving average of episode length")
plt.xlabel("Episode")
plt.ylabel("Episode length")
plt.show()

### Plot loss:

In [None]:
# One loss value per episode, as collected in all_loss during training.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(all_loss)
loss_ax.set_title("Loss during training")
loss_ax.set_xlabel("Episode")
loss_ax.set_ylabel("Loss (Huber)")
plt.show()