In [1]:
import torch
import torchvision
from torch.utils.data import DataLoader
from torch import nn

import os
import cv2
from tqdm.notebook import trange

import gym
from gym import spaces

from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.dqn.policies import CnnPolicy, DQNPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.preprocessing import get_flattened_obs_dim, is_image_space
from typing import Any, Callable, Dict, List, NamedTuple, Tuple, Union, Optional, Type

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
import sklearn as sk
from sklearn import datasets
# from stable_baselines3.common.policies import register_policy
# register_policy("myPolicy", CnnPolicy)

In [11]:
# Type alias for a callable that maps a float to a float; used below to
# annotate the `lr_schedule` argument of CustomPolicy.
Schedule = Callable[[float], float]

class CustomCNN(BaseFeaturesExtractor):
    """
    Convolutional feature extractor derived from the DQN Nature CNN:
        Mnih, Volodymyr, et al.
        "Human-level control through deep reinforcement learning."
        Nature 518.7540 (2015): 529-533.

    Compared with the three-convolution stack of that paper, this variant
    inserts a 2x2 max-pool after the second convolution and a dropout layer
    (p=0.4) after the third convolution.

    :param observation_space: Image observation space; the channel count is
        read from ``observation_space.shape[0]`` (channels-first layout).
    :param features_dim: Number of features extracted.
        This corresponds to the number of units of the final linear layer.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super().__init__(observation_space, features_dim)
        # Images are expected as CxHxW (channels first); any re-ordering is
        # left to preprocessing or an environment wrapper.
        assert is_image_space(observation_space), (
            "You should use NatureCNN "
            f"only with images not with {observation_space}\n"
            "(you are probably using `CnnPolicy` instead of `MlpPolicy`)\n"
            "If you are using a custom environment,\n"
            "please check it using our env checker:\n"
            "https://stable-baselines3.readthedocs.io/en/master/common/env_checker.html"
        )
        in_channels = observation_space.shape[0]

        # An earlier variant of this stack used Dropout(p=0.2) and Dropout(p=0.3)
        # after the first two ReLUs and had no max-pooling layer.
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.Dropout(p=0.4),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*conv_layers)

        # Push one sampled observation (with a leading batch axis) through the
        # convolutional stack to discover the flattened feature size.
        with torch.no_grad():
            probe = torch.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(probe).shape[1]

        projection = [nn.Linear(n_flatten, features_dim), nn.ReLU()]
        self.linear = nn.Sequential(*projection)

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Run the convolutional stack, then project to ``features_dim`` features."""
        flat_features = self.cnn(observations)
        return self.linear(flat_features)

# policy_kwargs = dict(
#     features_extractor_class=CustomCNN,
#     features_extractor_kwargs=dict(features_dim=64),
# )
# model = PPO("CnnPolicy", "BreakoutNoFrameskip-v4", policy_kwargs=policy_kwargs, verbose=1)

class CustomPolicy(DQNPolicy):
    """
    DQN policy for image observations whose default features extractor is
    ``CustomCNN``. All arguments are forwarded unchanged to ``DQNPolicy``.

    :param observation_space: Observation space
    :param action_space: Action space
    :param lr_schedule: Learning rate schedule (could be constant)
    :param net_arch: The specification of the policy and value networks.
    :param activation_fn: Activation function
    :param features_extractor_class: Features extractor to use
        (``CustomCNN`` by default).
    :param features_extractor_kwargs: Keyword arguments
        to pass to the features extractor.
    :param normalize_images: Whether to normalize images or not,
         dividing by 255.0 (True by default)
    :param optimizer_class: The optimizer to use,
        ``torch.optim.Adam`` by default
    :param optimizer_kwargs: Additional keyword arguments,
        excluding the learning rate, to pass to the optimizer
    """

    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        lr_schedule: Schedule,
        net_arch: Optional[List[int]] = None,
        activation_fn: Type[nn.Module] = nn.ReLU,
        features_extractor_class: Type[BaseFeaturesExtractor] = CustomCNN,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: Type[torch.optim.Optimizer] = torch.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        # Only the default of `features_extractor_class` differs from the
        # parent; everything else is a straight pass-through.
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            lr_schedule=lr_schedule,
            net_arch=net_arch,
            activation_fn=activation_fn,
            features_extractor_class=features_extractor_class,
            features_extractor_kwargs=features_extractor_kwargs,
            normalize_images=normalize_images,
            optimizer_class=optimizer_class,
            optimizer_kwargs=optimizer_kwargs,
        )

In [12]:
# Define the gym environment.
class USPSGym(gym.Env):
    """Gym environment that walks through a labelled image dataset.

    Each step presents one sample as the observation; the action is the
    predicted digit (0-9) and the reward is 1 for a correct label, else 0.
    An episode ends when the sample index wraps back to the first sample.

    :param dataset: ``(X, y)`` pair; ``X[i]`` is the i-th sample, ``y[i]`` its label.
    :param width: Size of the first observation axis.
    :param height: Size of the second observation axis.
    :param channels: Size of the last observation axis.
    """

    def __init__(self, dataset, width=128, height=128, channels=1):
        # Samples and their labels.
        self.X, self.y = dataset

        # Cursor into the dataset; advanced by step().
        self.idx = 0

        # One discrete action per digit 0-9.
        self.action_space = spaces.Discrete(10)

        # Observations are declared as uint8 canvases of shape (width, height, channels).
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(width, height, channels), dtype=np.uint8
        )

    def _obs(self):
        """Return the sample at the current index, upscaled to the canvas size if it is smaller."""
        width, height, channels = self.observation_space.shape
        frame = self.X[self.idx]

        # Samples already at least as large as the canvas are returned untouched.
        too_small = frame.shape[0] < width or frame.shape[1] < height
        if not too_small:
            return frame

        # NOTE(review): the resized frame is float32 while the observation space
        # declares uint8 -- confirm how downstream consumers handle this.
        # NOTE(review): cv2.resize takes dsize as (cols, rows); with width == height
        # the reshape below is a pure channel-axis add -- verify for non-square canvases.
        as_float = np.array(frame).astype(np.float32)
        enlarged = cv2.resize(as_float, (width, height), interpolation=cv2.INTER_CUBIC)
        return enlarged.reshape(width, height, channels)

    def step(self, action):
        """Score `action` against the current label and advance to the next sample.

        :return: ``(observation, reward, done, info)`` where ``done`` is True
            exactly when the index has wrapped around to 0.
        """
        # One point for a correct label, nothing otherwise.
        if action == self.y[self.idx]:
            reward = 1
        else:
            reward = 0

        # Advance the cursor, wrapping to the start after the last sample.
        following = self.idx + 1
        if following >= len(self.X):
            following = 0
        self.idx = following

        finished = self.idx == 0
        return self._obs(), reward, finished, {}

    def reset(self):
        """Rewind to the first sample and return its observation."""
        self.idx = 0
        return self._obs()

    def render(self, action='', mode='human', close=False):
        """Plot the current observation, titled with its label (and `action`, if given)."""
        width, height = self.observation_space.shape[:2]
        fig, ax = plt.subplots(1)
        ax.imshow(self._obs().reshape(width, height), cmap='Greys')

        # Title is "<action>-<label>" when an action is supplied, else just the label.
        label = self.y[self.idx]
        if action != '':
            title = '{}-{}'.format(action, label)
        else:
            title = label
        ax.set_title(title)
        plt.show()

In [4]:
from sklearn.datasets import load_svmlight_file
from dqn import *
import numpy as np
from stable_baselines3 import DQN


# Load the training split (svmlight format) and convert the features to a dense matrix.
train = load_svmlight_file('data/mmd_mnist_torch')
x, y = train
x = x.todense()

# Reorder the training samples so that their labels are in ascending order.
sortind = np.argsort(y)
x = x[sortind, :]
y = y[sortind]

# Load the test split the same way; it is left in file order.
test = load_svmlight_file('data/mmd_mnist_torch.t')
testx, testy = test
testx = testx.todense()

In [13]:
#@title Complete data 7291
# Wrap a single USPSGym instance (64x64 canvas, one channel) in a vectorized environment.
env = DummyVecEnv([lambda: USPSGym(dataset=(x, y), width=64, height=64, channels=1)])

# Keep the first two observation dimensions; evaluation frames are resized to them.
width, height = env.observation_space.shape[:2]

In [14]:
#@title complete 60000 - create model
def create_model(pretrained=False, save_model=True, epochs=2):
    """Return a DQN agent, either loaded from disk or freshly trained.

    The checkpoint file name is derived from `epochs`. Training uses the
    module-level `env` and runs for `len(x) * epochs` timesteps, i.e. `epochs`
    passes over the module-level training set `x`.

    :param pretrained: If True, load and return the saved checkpoint instead of training.
    :param save_model: If True, save the newly trained agent to the checkpoint file.
    :param epochs: Number of passes over the training data; also part of the file name.
    :return: The loaded or trained DQN model.
    """
    checkpoint = "dqn_cnn_mnist_{}_custom_2_tarun.zip".format(epochs)

    if not pretrained:
        # Build a DQN agent that uses CustomPolicy (and thus CustomCNN) and train it.
        agent = DQN(CustomPolicy, env, verbose=1)
        n_steps = len(x) * epochs
        agent.learn(total_timesteps=n_steps)

        # Persist the trained agent when requested.
        if save_model:
            agent.save(checkpoint)

        return agent

    # Pretrained path: just load the checkpoint for this epoch count.
    return DQN.load(checkpoint)

In [15]:
# #@title eval
# # Evaluate the model by counting the total rewards attained on the test dataset.
# total_rewards = 0
# pred_y_test = []
# count = 0

# for idx in trange(len(testx)):
#     # Generate an evaluation observation frame.
#     obs = cv2.resize(np.array(testx[idx]).astype(np.float32), (width, height), interpolation = cv2.INTER_CUBIC)
#     obs = obs.reshape(width, height, 1)
    
#     # Predict an action based on the observation.
#     action, _states = inter_model.predict(obs)
#     pred_y_test.append(action)


#     # Score the prediction.
#     if (action.size > 1):
#         # print(action)
#         count += 1
#         pass
#     else:
#         reward = 1 if action == testy[idx] else 0
#         total_rewards += reward

# print('Accuracy: {:.2f}%'.format(total_rewards / (len(testy) - count) * 100.0))

In [16]:
epochs = [1, 5, 10, 25, 50, 75, 100, 667]
# for epoch in epochs[:1]:
for epoch in [17]:
    print(epoch, '\n')
    inter_model = create_model(pretrained=False, save_model=True, epochs=epoch)

    #@title eval
    # Evaluate the model by counting the total rewards attained on the test dataset.
    total_rewards = 0
    pred_y_test = []
    count = 0  # predictions that could not be scored (more than one action returned)

    for idx in trange(len(testx)):
        # Generate an evaluation observation frame the same way the environment does.
        obs = cv2.resize(np.array(testx[idx]).astype(np.float32), (width, height), interpolation=cv2.INTER_CUBIC)
        obs = obs.reshape(width, height, 1)

        # Predict an action based on the observation.
        # deterministic=True is required for a fair evaluation: with the default
        # (False) DQN keeps applying epsilon-greedy exploration and returns a
        # random action for a fraction of the samples, understating accuracy.
        action, _states = inter_model.predict(obs, deterministic=True)
        pred_y_test.append(action)

        # Score the prediction; skip anything that is not a single action.
        if action.size > 1:
            count += 1
        else:
            reward = 1 if action == testy[idx] else 0
            total_rewards += reward

    # Guard the division so an empty or fully skipped test set does not crash the cell.
    scored = len(testy) - count
    if scored > 0:
        print('Accuracy: {:.2f}%'.format(total_rewards / scored * 100.0))
    else:
        print('Accuracy: n/a (no predictions could be scored)')

17 

Using cpu device
Wrapping the env in a VecTransposeImage.


KeyboardInterrupt: 

In [None]:
# from glob import glob
# import numpy as np

# protos = sorted(glob('protos/*'))
# critics = sorted(glob('critics/*'))
# combined = zip(protos, critics)
# combined

In [None]:
# for p, c in combined:
#     m = p.split('_')[-1].split('.')[0]
#     npro = np.load(p)
#     ncri = np.load(c)
#     c = np.concatenate((npro, ncri))
#     np.save('combined/combined_{}'.format(m), c)