In [1]:
import numpy as np

from collections import deque

import matplotlib.pyplot as plt
%matplotlib inline

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Gym
import gym
# import gym_pygame

# Hugging Face Hub
# from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.
import imageio

In [2]:
# Create two instances of the same environment: `env` is the one stepped by the
# training loop, `eval_env` is created with render_mode='rgb_array' and is the
# one handed to the replay-recording cell.
env_id = "gym_tetris:tetris_rl"
env = gym.make(env_id, render_mode='none')
eval_env = gym.make(env_id, render_mode='rgb_array')
# Observation shape and action-space size; both are stored in `hyperparameters`
# and end up as the input/output dimensions of the Policy network.
s_size = env.observation_space.shape
a_size = env.action_space.n

Build Tetris v2.0
Build Tetris v2.0


  logger.warn(
  logger.warn(


In [3]:
# Use the first CUDA device when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [4]:
# Sanity-check the environment: show the observation shape and action count
# captured above, together with one random sample drawn from each space.
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation

print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample()) # Take a random action

# Same value as `s_size`, read directly from the environment this time.
print(env.observation_space.shape)

_____OBSERVATION SPACE_____ 

The State Space is:  (1, 29, 10)
Sample observation [[[0.5001674  1.9444618  0.12471315 1.3674952  1.4446001  1.2329413
   0.80900884 1.471096   0.3056068  1.4130958 ]
  [0.92113864 0.16315493 1.1360261  0.06658313 0.21662323 0.3295408
   0.73913896 1.6171783  1.9990059  0.7195149 ]
  [0.55860716 1.9216217  0.5916439  1.224774   1.0354251  0.01833664
   1.2239193  1.3778841  1.7783587  0.52304804]
  [1.9775467  0.6524452  1.1320696  1.3931639  1.577205   1.4921728
   0.11724084 0.12208962 1.9882985  0.22672121]
  [0.80771726 0.8356227  1.7874684  0.28889617 0.88067275 1.5303949
   1.1525766  1.7875986  1.4853914  1.817581  ]
  [1.1156816  0.54448175 1.3786387  1.3334324  0.14255355 1.9888345
   0.40824574 1.9945737  0.64459604 0.97752464]
  [1.3538848  0.09452998 1.2625912  1.9983845  0.6797982  1.8307283
   0.20424323 1.9423357  0.74914616 0.12598668]
  [1.0699419  0.6570036  1.9892824  1.9916095  0.9264592  1.6120689
   1.3292024  0.5010722  0.74973434 1

In [5]:
class Policy(nn.Module):
    """Convolutional policy network mapping observations to action probabilities.

    Two 2-D convolutions (each followed by a ReLU) feed a two-layer fully
    connected head; the head's output is normalised with a softmax.

    Args:
        s_size: Observation shape ``(channels, height, width)``. The first entry
            is the number of input channels of the first convolution.
        a_size: Number of actions, i.e. the width of the output layer.
        h_size: Number of output channels of both convolutions.
        h2_size: Width of the hidden fully connected layer.
    """

    def __init__(self, s_size, a_size, h_size, h2_size):
        super().__init__()

        # Feature extractor. Sub-module names/indices are kept as they were so
        # that previously saved state dicts still load.
        self.conv = nn.Sequential(
            nn.Conv2d(s_size[0], h_size, 4), nn.ReLU(),
            nn.Conv2d(h_size, h_size, kernel_size=3, stride=1), nn.ReLU(),
        )

        # Infer the flattened feature count by pushing a single all-zero
        # observation through the conv stack. This is pure shape bookkeeping,
        # so no autograd graph is recorded for it.
        with torch.no_grad():
            n_features = self.conv(torch.zeros(1, *s_size)).numel()

        self.fc = nn.Sequential(
            nn.Linear(n_features, h2_size),
            nn.ReLU(),
            nn.Linear(h2_size, a_size),
        )

    def forward(self, x):
        """Return action probabilities for a batch of observations.

        Args:
            x: Float tensor of shape ``(batch, *s_size)``.

        Returns:
            Tensor of shape ``(batch, a_size)``; every row sums to 1.
        """
        # Keep the batch dimension and flatten everything the convs produced.
        features = self.conv(x).flatten(start_dim=1)
        return F.softmax(self.fc(features), dim=1)

    def act(self, state):
        """Sample one action for a single observation.

        Args:
            state: NumPy array holding one observation (no batch dimension).

        Returns:
            ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is a CPU tensor of shape ``(1,)`` that is still
            attached to the autograd graph.
        """
        # Run on whatever device the network itself lives on instead of relying
        # on a module-level global that may not match.
        own_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(own_device)
        # Calling the module (not .forward directly) keeps hooks working.
        probs = self(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

In [6]:
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every, train_env=None):
    """Train ``policy`` with the REINFORCE (Monte-Carlo policy-gradient) update.

    For every episode the policy is rolled out for at most ``max_t`` steps,
    the discounted return of every step is computed, the returns are
    standardised, and one gradient step is taken on
    ``sum(-log_prob_t * return_t)``.

    Args:
        policy: Object exposing ``act(state) -> (action, log_prob)`` where
            ``log_prob`` is a 1-element tensor attached to the autograd graph.
        optimizer: Optimizer over the policy's parameters.
        n_training_episodes: Number of episodes to run.
        max_t: Maximum number of environment steps per episode.
        gamma: Discount factor.
        print_every: Print the running average score every this many episodes.
        train_env: Environment to train on. It must offer ``reset() -> state``
            and ``step(action) -> (state, reward, done, info)``. Defaults to
            the notebook-level ``env``.

    Returns:
        List with the undiscounted score (sum of rewards) of every episode.
    """
    if train_env is None:
        # Backward-compatible default: the environment created at the top of
        # the notebook.
        train_env = env

    # Added to the standard deviation to avoid dividing by zero.
    eps = np.finfo(np.float32).eps.item()

    # Sliding window used for the printed running average.
    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, n_training_episodes + 1):
        # ---- roll out one episode ------------------------------------------
        saved_log_probs = []
        rewards = []
        state = train_env.reset()
        for _step in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _info = train_env.step(action)
            rewards.append(reward)
            if done:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # An episode without any step (max_t <= 0) has nothing to learn from;
        # torch.cat on an empty list would raise.
        if saved_log_probs:
            # ---- discounted returns, G_t = r_t + gamma * G_{t+1} -----------
            # Accumulate from the last step backwards so every return is
            # computed once, then restore chronological order.
            discounted = []
            running_return = 0
            for reward in reversed(rewards):
                running_return = gamma * running_return + reward
                discounted.append(running_return)
            discounted.reverse()

            returns = torch.tensor(discounted)
            # Standardise for training stability. The sample standard deviation
            # of a single value is NaN, which would poison the loss and, after
            # the optimizer step, every parameter - so a one-step episode keeps
            # its raw return.
            if returns.numel() > 1:
                returns = (returns - returns.mean()) / (returns.std() + eps)

            # ---- policy-gradient step --------------------------------------
            # The negative sign turns gradient ascent on the expected return
            # into the gradient descent that PyTorch optimizers perform.
            policy_loss = torch.cat(
                [-log_prob * disc_return
                 for log_prob, disc_return in zip(saved_log_probs, returns)]
            ).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

In [7]:
# Central place for every value used to build and train the agent.
hyperparameters = {
    "h_size": 64,                   # output channels of both convolutions in Policy
    "h2_size": 512,                 # width of Policy's hidden fully connected layer
    "n_training_episodes": 50000,   # episodes per call of reinforce()
    "n_evaluation_episodes": 10,    # NOTE(review): not read by any cell visible here
    "max_t": 1000,                  # step cap per training episode
    "gamma": 0.99,                  # discount factor for the returns
    "lr": 1e-4,                     # learning rate given to the Adam optimizer
    "env_id": env_id,
    "state_space": s_size,          # observation shape -> Policy's s_size
    "action_space": a_size,         # action count -> Policy's a_size
}

In [8]:
# Create policy and place it to the device
torch.manual_seed(50) # Don't change this
tetris_policy = Policy(hyperparameters["state_space"], hyperparameters["action_space"], hyperparameters["h_size"], hyperparameters["h2_size"]).to(device)
tetris_optimizer = optim.Adam(tetris_policy.parameters(), lr=hyperparameters["lr"])

[W NNPACK.cpp:64] Could not initialize NNPACK! Reason: Unsupported hardware.


In [13]:
# Loading existing policy
state = torch.load('./training_state_2023.12.07-11.04.44.data')
tetris_optimizer.load_state_dict(state['optimizer'])
tetris_policy.load_state_dict(state['model'])

<All keys matched successfully>

In [9]:
# One-shot flag: the training cell creates an empty `scores` list only while this
# is True and then clears it, so re-running the training cell keeps appending.
first = True

In [10]:
# Start the score history on the very first run; later runs of this cell keep
# appending to it so interrupted training can be continued.
if first:
    scores = []
    first = False
scores.extend(
    reinforce(
        policy=tetris_policy,
        optimizer=tetris_optimizer,
        n_training_episodes=hyperparameters["n_training_episodes"],
        max_t=hyperparameters["max_t"],
        gamma=hyperparameters["gamma"],
        print_every=100,
    )
)

  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(f"{pre} is not within the observation space.")


Episode 100	Average Score: -2.12
Episode 200	Average Score: -2.52
Episode 300	Average Score: -2.35
Episode 400	Average Score: -2.79
Episode 500	Average Score: -2.85
Episode 600	Average Score: -2.28
Episode 700	Average Score: -2.19
Episode 800	Average Score: -2.77
Episode 900	Average Score: -2.92
Episode 1000	Average Score: -3.19
Episode 1100	Average Score: -3.17
Episode 1200	Average Score: -1.95
Episode 1300	Average Score: -2.64
Episode 1400	Average Score: -2.03
Episode 1500	Average Score: -1.70
Episode 1600	Average Score: -1.72
Episode 1700	Average Score: -1.78
Episode 1800	Average Score: -1.79
Episode 1900	Average Score: -1.75
Episode 2000	Average Score: -2.06
Episode 2100	Average Score: -3.03
Episode 2200	Average Score: -2.36
Episode 2300	Average Score: -2.74
Episode 2400	Average Score: -2.29
Episode 2500	Average Score: -1.84
Episode 2600	Average Score: -2.55
Episode 2700	Average Score: -2.80
Episode 2800	Average Score: -2.10
Episode 2900	Average Score: -1.75
Episode 3000	Average Sc

KeyboardInterrupt: 

In [11]:
# Saving current policy for continued training
from time import gmtime, strftime
curr_time = strftime("%Y.%m.%d-%H.%M.%S", gmtime())
training_state = {'model':tetris_policy.state_dict(), 'optimizer':tetris_optimizer.state_dict()}
torch.save(training_state, f'./training_state_conv_{curr_time}.data')

In [17]:
# Dump every episode score collected so far to a text file: one line, each
# score followed by a single space. The `with` block guarantees the handle is
# flushed and closed even if a write raises.
with open('./training_scores_200k.txt', 'w') as file:
    file.write(''.join(f'{item} ' for item in scores))
    file.write('\n')

In [13]:
def record_video(env, policy, out_directory, fps=30, max_frames=5000):
  """
  Roll out one episode with the given policy and save the rendered frames as a video.

  :param env: environment to replay; ``render()`` must return one frame per call
  :param policy: agent exposing ``act(state) -> (action, log_prob)``
  :param out_directory: path of the video file handed to ``imageio.mimsave``
  :param fps: frames per second of the written video
  :param max_frames: maximum number of environment steps to record
  """
  state = env.reset()
  # The first frame shows the freshly reset environment.
  images = [env.render()]
  # Replay only: no gradients are needed, so skip building autograd graphs.
  with torch.no_grad():
    for _ in range(max_frames):
      # The action is sampled from the policy's distribution (not an argmax).
      action, _log_prob = policy.act(state)
      state, _reward, done, _info = env.step(action)
      images.append(env.render())
      if done:
        break
  imageio.mimsave(out_directory, [np.array(img) for img in images], fps=fps)

In [14]:
# Record one replay of the current policy on the rgb_array environment.
# NOTE(review): this import duplicates the one in the checkpoint-saving cell.
from time import gmtime, strftime
# UTC timestamp (second resolution) that becomes part of the video file name.
curr_time = strftime("%Y.%m.%d-%H.%M.%S", gmtime())
record_video(eval_env, tetris_policy, f'./replay_conv_{curr_time}.mp4')

