In [3]:
# Install a pinned gym release.
# NOTE(review): presumably pinned because the code below uses the older API
# (env.reset() returning only the observation, env.step() returning a 4-tuple)
# - confirm before upgrading.
! pip install gym==0.15.4

Collecting gym==0.15.4
  Downloading gym-0.15.4.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m571.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting pyglet<=1.3.2,>=1.2.0 (from gym==0.15.4)
  Downloading pyglet-1.3.2-py2.py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m705.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting cloudpickle~=1.2.0 (from gym==0.15.4)
  Downloading cloudpickle-1.2.2-py2.py3-none-any.whl (25 kB)
Collecting opencv-python (from gym==0.15.4)
  Downloading opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting future (from pyglet<=1.3.2,>=1.2.0->gym==0.15.4)
  Downloading future-0.18.3.tar.gz (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.9/840.9 kB[0m [31m810.9 kB/s[0m eta [36m0

In [4]:
import gym
import torch

In [5]:
# Create the CartPole-v0 environment that the training loop below interacts with.
env = gym.make('CartPole-v0')

In [6]:
# Length of the observation vector and number of discrete actions; these are
# used as the policy network's input and output sizes when the MLP is built.
number_observation_features = env.observation_space.shape[0]
number_actions = env.action_space.n

In [7]:
number_observation_features

4

In [8]:
number_actions

2

In [9]:
class MLP(torch.nn.Module):
    """Fully connected network with ReLU activations between the layers.

    Args:
        input_size: Number of input features.
        hidden_sizes: Sequence (list, tuple, ...) holding the width of each
            hidden layer. May be empty, which yields a single linear layer.
        output_size: Number of output units. No activation is applied after
            the final linear layer.
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        super(MLP, self).__init__()
        # list(...) accepts any sequence of widths (e.g. a tuple), not only a list.
        sizes = [input_size] + list(hidden_sizes) + [output_size]
        last_linear_index = len(sizes) - 2

        layers = []
        for i in range(len(sizes) - 1):
            layers.append(torch.nn.Linear(sizes[i], sizes[i + 1]))
            # ReLU goes between linear layers only, never after the last one.
            if i < last_linear_index:
                layers.append(torch.nn.ReLU())
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to ``x`` and return the result."""
        return self.model(x)

In [10]:
# Seed PyTorch's global RNG so the network's initial weights are the same on
# every run. The environment's own randomness is not seeded here, so episodes
# can still differ between runs.
SEED = 0
torch.manual_seed(SEED)

# Policy network: observation features -> hidden layers -> one output per action.
HIDDEN_SIZES = [16, 8]
model = MLP(number_observation_features, HIDDEN_SIZES, number_actions)

LEARNING_RATE = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Number of training epochs; each epoch performs one optimizer step.
num_epochs = 100

In [11]:

from torch.distributions.categorical import Categorical

def get_policy(model, observation):
    """Build the action distribution the model assigns to an observation.

    The observation is converted to a float32 tensor and passed through
    ``model``; the outputs are used as the logits of a ``Categorical``
    distribution, which is returned.
    """
    # Categorical takes care of normalizing the logits.
    return Categorical(
        logits=model(torch.as_tensor(observation, dtype=torch.float32))
    )

In [12]:
def get_action(policy):
    """Sample an action from ``policy``.

    Returns:
        A tuple ``(action_int, log_probability_action)``: the sampled action
        as a plain ``int`` and the log probability of that action as a tensor.
    """
    sampled = policy.sample()

    # Kept as a tensor: the log probability feeds into the loss later on.
    log_probability_action = policy.log_prob(sampled)

    # The action itself is handed back as a plain Python int.
    return int(sampled.item()), log_probability_action

In [13]:
def calculate_loss(epoch_log_probability_actions, epoch_action_rewards):
    """Return the negated sum of the log probabilities weighted by the rewards."""
    weighted_log_probabilities = epoch_log_probability_actions * epoch_action_rewards
    total = weighted_log_probabilities.sum()
    return -total

In [14]:
import numpy as np
import torch.nn as nn
from torch.optim import Adam, Optimizer

def train_one_epoch(env,model,optimizer,max_timesteps,episode_timesteps):
    """Collect episodes with the current policy, then apply one gradient step.

    New episodes are started until more than ``max_timesteps`` environment
    steps have been taken in total. The budget is only checked between
    episodes, so the last episode always runs to its end. Every step of an
    episode is weighted by that episode's total reward in the loss.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, done, info)``.
        model: Policy network passed to ``get_policy``.
        optimizer: Optimizer that is stepped after the backward pass.
        max_timesteps: Step budget after which no new episode is started.
        episode_timesteps: Maximum number of steps in a single episode.

    Returns:
        Mean total reward per episode in this epoch, as a float.

    Raises:
        ValueError: If ``episode_timesteps`` is less than 1, because no step
            could ever be taken and the epoch would never end.
    """
    if episode_timesteps < 1:
        raise ValueError("episode_timesteps must be at least 1")

    epoch_total_timesteps = 0

    # Returns from each episode (to keep track of progress)
    epoch_returns: list[float] = []

    # Action log probabilities and rewards per step (for calculating loss)
    epoch_log_probability_actions = []
    epoch_action_rewards = []

    # Keep starting episodes until the step budget has been exceeded
    while epoch_total_timesteps <= max_timesteps:

        # Running total of this episode's rewards, and steps taken in it
        episode_reward: float = 0
        episode_steps = 0

        # Reset the environment and get a fresh observation
        observation = env.reset()

        # Loop through timesteps until the episode is done (or the max is hit)
        for _timestep in range(episode_timesteps):
            epoch_total_timesteps += 1
            episode_steps += 1

            # Get the policy and act
            policy = get_policy(model, observation)
            action, log_probability_action = get_action(policy)
            observation, reward, done, _info = env.step(action)

            episode_reward += reward
            epoch_log_probability_actions.append(log_probability_action)

            # Truthiness rather than `is True`, so numpy booleans also end
            # the episode.
            if done:
                break

        # One reward entry per step taken, whether the episode ended by
        # itself or was cut off by the step limit. This keeps the rewards
        # aligned with the log probabilities collected above.
        epoch_action_rewards.extend([episode_reward] * episode_steps)

        epoch_returns.append(episode_reward)

    # Calculate the policy gradient, and use it to step the weights & biases
    epoch_loss = calculate_loss(
        torch.stack(epoch_log_probability_actions),
        torch.as_tensor(epoch_action_rewards, dtype=torch.float32),
    )

    epoch_loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    return float(np.mean(epoch_returns))

In [15]:
# Train for num_epochs epochs and report the average return every tenth epoch.
for epoch in range(num_epochs):
    average_return = train_one_epoch(env, model, optimizer, 5000, 200)
    epoch_number = epoch + 1
    if epoch_number % 10 != 0:
        continue
    print('epoch: %3d \t return: %.3f' % (epoch_number, average_return))

epoch:  10 	 return: 23.806
epoch:  20 	 return: 24.335
epoch:  30 	 return: 26.010
epoch:  40 	 return: 28.680
epoch:  50 	 return: 27.656
epoch:  60 	 return: 32.096
epoch:  70 	 return: 35.338
epoch:  80 	 return: 39.148
epoch:  90 	 return: 44.786
epoch: 100 	 return: 49.554


In [16]:
# Mount Google Drive at /content/drive.
# NOTE(review): relies on the google.colab package, so this cell presumably
# only runs inside Google Colab - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# model_scripted = torch.jit.script(model) # Export to TorchScript
# model_scripted.save('/content/drive/MyDrive/model.pt') # Save

In [15]:
import torch

# Load a TorchScript model from ./model.pt. This rebinds the global `model`,
# so later cells use the loaded network instead of the MLP trained above.
# NOTE(review): the file must already exist in the working directory; the save
# step earlier in the notebook is commented out and targets a different path.
model = torch.jit.load('./model.pt')
# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
model.eval()

RecursiveScriptModule(
  original_name=MLP
  (model): RecursiveScriptModule(
    original_name=Sequential
    (0): RecursiveScriptModule(original_name=Linear)
    (1): RecursiveScriptModule(original_name=ReLU)
    (2): RecursiveScriptModule(original_name=Linear)
    (3): RecursiveScriptModule(original_name=ReLU)
    (4): RecursiveScriptModule(original_name=Linear)
  )
)

In [16]:
# Install the packages used for off-screen rendering and video recording.
# NOTE(review): the sudo command prompts for a password when run from the
# notebook (see the captured output), so xvfb may have to be installed from a
# terminal instead.
!pip install gym pyvirtualdisplay imageio
!sudo apt-get install xvfb

Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Collecting imageio
  Downloading imageio-2.33.1-py3-none-any.whl.metadata (4.9 kB)
Downloading imageio-2.33.1-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.3/313.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: pyvirtualdisplay, imageio
Successfully installed imageio-2.33.1 pyvirtualdisplay-3.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[sudo] password for asus: 


In [17]:
# NOTE(review): presumably required by the environment's renderer - confirm.
!pip install PyOpenGL

Collecting PyOpenGL
  Downloading PyOpenGL-3.1.7-py3-none-any.whl.metadata (3.2 kB)
Downloading PyOpenGL-3.1.7-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: PyOpenGL
Successfully installed PyOpenGL-3.1.7

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
# NOTE(review): gym is installed without a version pin here, unlike the
# gym==0.15.4 install at the top of the notebook - confirm this is intended.
!pip install gym pyvirtualdisplay

from pyvirtualdisplay import Display
import gym
import numpy as np
import cv2

# Start a non-visible 1400x900 virtual display for rendering
display = Display(visible=0, size=(1400, 900))
display.start()

# Create Gym environment (this rebinds the global `env`)
env = gym.make('CartPole-v0')

# Define a function to record frames
def record_frames(env, policy_model=None):
    """Run one episode and return the frame rendered before each step.

    Args:
        env: Environment to roll out. It is reset at the start and stepped
            until it reports ``done``.
        policy_model: Network used to choose actions. Defaults to the
            notebook-level ``model`` when omitted.

    Returns:
        List with one ``env.render(mode='rgb_array')`` result per step taken.
    """
    if policy_model is None:
        policy_model = model

    frames = []
    obs = env.reset()
    done = False
    # Actions are only sampled here, never trained on, so gradient tracking
    # is switched off for the whole rollout.
    with torch.no_grad():
        while not done:
            frames.append(env.render(mode='rgb_array'))
            action, _ = get_action(get_policy(policy_model, obs))
            obs, _, done, _ = env.step(action)
    return frames

# Record frames for each episode, always releasing the environment and the
# virtual display afterwards, even if recording fails part-way.
num_episodes = 10
all_frames = []
try:
    for episode_index in range(num_episodes):
        all_frames.extend(record_frames(env))
finally:
    # Close the environment
    env.close()

    # Stop virtual display
    display.stop()

# Convert frames to a video
height, width, _ = all_frames[0].shape
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter('video.avi', fourcc, 30.0, (width, height))
if not out.isOpened():
    # Without this check a missing codec would silently produce no video.
    raise RuntimeError("Could not open video.avi for writing")
try:
    for frame in all_frames:
        # The frames are RGB but OpenCV writes BGR; convert so the colours
        # in the video are not swapped.
        out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
finally:
    out.release()

print("Video saved as video.avi")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Video saved as video.avi


In [27]:
import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder
import os
from pyvirtualdisplay import Display
from collections.abc import Iterable  # Import Iterable from collections.abc


def record_video(env_name, video_name):
    """Record one episode of the notebook-level ``model`` to an MP4 file.

    Args:
        env_name: Environment id passed to ``gym.make``.
        video_name: Output file name without extension; the video is written
            to ``./<video_name>.mp4``.

    The recorder and the environment are closed even if the rollout fails.
    """
    env = gym.make(env_name)
    try:
        video_recorder = VideoRecorder(env, './' + video_name + '.mp4', enabled=True)
        try:
            obs = env.reset()
            done = False
            # Inference only, so gradient tracking is switched off.
            with torch.no_grad():
                while not done:
                    # capture_frame() renders the environment itself, so no
                    # separate env.render() call is needed.
                    video_recorder.capture_frame()
                    action, _ = get_action(get_policy(model, obs))
                    obs, _, done, _ = env.step(action)
        finally:
            video_recorder.close()
    finally:
        env.close()

# Record one CartPole-v0 episode to ./cartpole_video.mp4
record_video('CartPole-v0', 'cartpole_video')