## MnistGym
In this OpenAI Gym environment, handwritten digits (0 through 9) are shown to a reinforcement learning agent on a square canvas (128x128 px by default; the runs in this notebook use 64x64 px). At each step the agent picks a discrete action (0-9) and receives a reward of 1 if the action matches the digit shown, and 0 otherwise.

In [None]:
#! git clone https://github.com/iRyanBell/dqn_cnn_mnist_gym.git

In [None]:
pip install stable-baselines3

Collecting stable-baselines3
[?25l  Downloading https://files.pythonhosted.org/packages/76/7c/ec89fd9a51c2ff640f150479069be817136c02f02349b5dd27a6e3bb8b3d/stable_baselines3-0.10.0-py3-none-any.whl (145kB)
[K     |██▎                             | 10kB 22.7MB/s eta 0:00:01[K     |████▌                           | 20kB 14.5MB/s eta 0:00:01[K     |██████▊                         | 30kB 12.7MB/s eta 0:00:01[K     |█████████                       | 40kB 12.2MB/s eta 0:00:01[K     |███████████▎                    | 51kB 8.2MB/s eta 0:00:01[K     |█████████████▌                  | 61kB 8.4MB/s eta 0:00:01[K     |███████████████▊                | 71kB 8.7MB/s eta 0:00:01[K     |██████████████████              | 81kB 9.0MB/s eta 0:00:01[K     |████████████████████▏           | 92kB 9.2MB/s eta 0:00:01[K     |██████████████████████▌         | 102kB 7.5MB/s eta 0:00:01[K     |████████████████████████▊       | 112kB 7.5MB/s eta 0:00:01[K     |███████████████████████████   

In [None]:
import torch
import torchvision
from torch.utils.data import DataLoader

import os
import cv2
from tqdm.notebook import trange

import gym
from gym import spaces

from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.dqn.policies import CnnPolicy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import zipfile
import random
import csv
import sklearn as sk
from sklearn import datasets


In [None]:
# MNIST training split, downloaded under /files/ when missing; every sample
# passes through a ToTensor transform.
train_dataset = torchvision.datasets.MNIST(
    root='/files/',
    train=True,
    download=True,
    transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor()]),
)

# MNIST held-out split, configured the same way.
test_dataset = torchvision.datasets.MNIST(
    root='/files/',
    train=False,
    download=True,
    transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor()]),
)

In [None]:
#@title Writing to CSV
# with open("mnist.csv","w+") as my_csv:
#     csvWriter = csv.writer(my_csv,delimiter=',')
#     csvWriter.writerows([p for p in zip(np.array(train_dataset.data), np.array(train_dataset.targets))])
# with open("mnist.t.csv","w+") as my_csv:
#     csvWriter = csv.writer(my_csv,delimiter=',')
#     csvWriter.writerows([p for p in zip(np.array(test_dataset.data), np.array(test_dataset.targets))])

In [None]:
# Loader over the training split. Shuffling is explicitly disabled, so samples
# come back in dataset order despite the variable's name.
dataLoader_shuffled = DataLoader(train_dataset, shuffle=False)

In [None]:
# randomlist = []
# for i in range(0,10000):
#   n = random.randint(1,59999)
#   randomlist.append(n)
# shuffled_data = torch.utils.data.Subset(train_dataset, randomlist)
# Materialise the training split as tensors: train_x holds the squeezed images,
# train_y the labels. Each sample is fetched once (indexing the dataset runs
# its transform), and the split size comes from the dataset itself.
sample_list_x = []
sample_list_y = []
for i in range(len(train_dataset)):
  image, label = train_dataset[i]
  sample_list_x.append(np.array(torch.squeeze(image)))
  sample_list_y.append(label)

# Stack into one contiguous array first; building a tensor directly from a
# Python list of arrays converts element by element and is very slow.
train_x = torch.from_numpy(np.stack(sample_list_x))
train_y = torch.tensor(sample_list_y)

In [None]:
# randomlist = []
# for i in range(0,10000):
#   n = random.randint(1,59999)
#   randomlist.append(n)
# shuffled_data = torch.utils.data.Subset(train_dataset, randomlist)
# Materialise the test split as tensors: test_x holds the squeezed images,
# test_y the labels. Each sample is fetched once (indexing the dataset runs
# its transform), and the split size comes from the dataset itself.
sample_list_x1 = []
sample_list_y1 = []
for i in range(len(test_dataset)):
  image, label = test_dataset[i]
  sample_list_x1.append(np.array(torch.squeeze(image)))
  sample_list_y1.append(label)

# Stack into one contiguous array first; building a tensor directly from a
# Python list of arrays converts element by element and is very slow.
test_x = torch.from_numpy(np.stack(sample_list_x1))
test_y = torch.tensor(sample_list_y1)

In [None]:
# Define the gym environment.
class MnistGym(gym.Env):
    """Gym environment that shows labelled digit images one at a time.

    Each step the agent submits a digit (0-9) for the current image, earns a
    reward of 1 if it equals the label (0 otherwise), and the environment
    advances to the next sample. An episode is one pass over the dataset.

    Args:
        dataset: Pair ``(X, y)`` of indexable images and labels.
        width: Length of the first observation axis.
        height: Length of the second observation axis.
        channels: Length of the trailing channel axis.

    Pixel values are expected on the 0-255 scale declared by the observation
    space. Data scaled to [0, 1] should be multiplied by 255 before it is
    passed in, otherwise observations collapse to 0/1 after the uint8 cast.
    """

    def __init__(self, dataset,width=128, height=128, channels=1):
        # Images and their labels, indexed in parallel.
        self.X, self.y = dataset

        # Reset the state index, used to step through dataset.
        self.idx = 0

        # Digits 0-9 are valid actions.
        self.action_space = spaces.Discrete(10)

        # Observations are uint8 frames of shape (width, height, channels).
        self.observation_space = spaces.Box(low=0, high=255, shape=(width, height, channels), dtype=np.uint8)

    def _obs(self):
        """Return the current sample as an array matching ``observation_space``."""
        dim0, dim1, channels = self.observation_space.shape
        frame = np.array(self.X[self.idx]).astype(np.float32)

        if frame.shape[0] < dim0 or frame.shape[1] < dim1:
            # Enlarge frames that are smaller than the canvas. cv2 takes dsize
            # as (columns, rows) and returns an array of shape (rows, columns),
            # so the axes are swapped here to obtain (dim0, dim1).
            frame = cv2.resize(frame, (dim1, dim0), interpolation=cv2.INTER_CUBIC)
        elif frame.shape[:2] != (dim0, dim1):
            # Shrink oversized frames so they also fit the declared space.
            frame = cv2.resize(frame, (dim1, dim0), interpolation=cv2.INTER_AREA)

        # Always add the channel axis, including when no resize was needed.
        frame = frame.reshape(dim0, dim1, channels)

        # Cubic interpolation can overshoot the valid range; clip before the
        # cast so values do not wrap around in uint8.
        frame = np.clip(frame, self.observation_space.low, self.observation_space.high)
        return np.rint(frame).astype(self.observation_space.dtype)

    def step(self, action):
        """Score ``action`` against the current label and advance one sample.

        Returns:
            Tuple ``(observation, reward, done, info)``. ``done`` is True when
            the index has wrapped back to the first sample.
        """
        # The agent earns 1 point for a correct label.
        reward = 1 if action == self.y[self.idx] else 0

        # The state index increments at each step then wraps around at the end of the dataset.
        self.idx = self.idx + 1 if self.idx < len(self.X) - 1 else 0

        # The observation returned is the NEXT sample, not the one just scored.
        return self._obs(), reward, self.idx == 0, {}

    def reset(self):
        """Rewind to the first sample and return its observation."""
        self.idx = 0
        return self._obs()

    def render(self, action='', mode='human', close=False):
        """Plot the current observation, titled with its label.

        If ``action`` is supplied the title reads ``<action>-<label>``.
        The reshape assumes a single-channel observation.
        """
        dim0, dim1 = self.observation_space.shape[0], self.observation_space.shape[1]
        fig, ax = plt.subplots(1)
        ax.imshow(self._obs().reshape(dim0, dim1), cmap='Greys')

        # Label with the correct value and action if supplied.
        title = '{}-{}'.format(action, self.y[self.idx]) if action != '' else self.y[self.idx]
        ax.set_title(title)
        plt.show()

In [None]:
#@title Complete data 60k
# Wrap a single MnistGym over the raw MNIST training tensors in a vectorized environment.
env = DummyVecEnv([
    lambda: MnistGym(
        dataset=(train_dataset.data, train_dataset.targets),
        width=64,
        height=64,
        channels=1,
    )
])

# The first two observation dimensions are reused to build evaluation frames.
width, height = env.observation_space.shape[:2]

In [None]:
#@title complete 60 k - create model
def create_model(pretrained=False, save_model=True, epochs=2):
    """Load or train a DQN agent with a CnnPolicy on the global ``env``.

    Args:
        pretrained: If True, load "dqn_cnn_mnist.zip" and return it without training.
        save_model: If True, save the freshly trained agent to "dqn_cnn_mnist.zip".
        epochs: Multiplier on ``len(train_dataset.data)`` giving the number of training timesteps.

    Returns:
        The loaded or newly trained DQN model.
    """
    checkpoint = "dqn_cnn_mnist.zip"

    # Short-circuit: hand back the stored agent untouched.
    if pretrained:
        return DQN.load(checkpoint)

    # Build a new agent and step it through the training data `epochs` times.
    agent = DQN(CnnPolicy, env, verbose=1)
    n_timesteps = len(train_dataset.data) * epochs
    agent.learn(total_timesteps=n_timesteps)

    # Persist the result only on request.
    if save_model:
        agent.save(checkpoint)

    return agent

# Train a new agent for two passes over the data; no checkpoint is written.
inter_model = create_model(
    pretrained=False,
    save_model=False,
    epochs=2,
)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [None]:
#@title 60k - eval
# Evaluate the model by counting the total rewards attained on the full training dataset.
total_rewards = 0
# NOTE: despite its name, this list holds predictions for train_dataset (a later cell reuses the name).
pred_y_test = []

for idx in trange(len(train_dataset.data)):
    # Generate an evaluation observation frame. cv2 takes dsize as (columns, rows)
    # and returns shape (rows, columns), hence the swapped order before the reshape.
    obs = cv2.resize(np.array(train_dataset.data[idx]).astype(np.float32), (height, width), interpolation = cv2.INTER_CUBIC)
    obs = obs.reshape(width, height, 1)

    # Predict greedily: without deterministic=True, DQN.predict still takes
    # random exploration actions, which would distort the reported accuracy.
    action, _states = inter_model.predict(obs, deterministic=True)
    pred_y_test.append(action)

    # Score the prediction.
    reward = 1 if action == train_dataset.targets[idx] else 0
    total_rewards += reward

print('Accuracy: {:.2f}%'.format(total_rewards / len(train_dataset.data) * 100.0))

In [None]:
# Rebuild the vectorized environment over only the first 10,000 training samples.
env = DummyVecEnv([
    lambda: MnistGym(
        dataset=(train_x[:10000], train_y[:10000]),
        width=64,
        height=64,
        channels=1,
    )
])

# The first two observation dimensions are reused to build evaluation frames.
width, height = env.observation_space.shape[:2]

In [None]:
# Mount Google Drive at /content/drive (this import is only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work inside the Drive folder so the relative file names used by later cells
# (model checkpoints, CSV exports) resolve there. NOTE(review): hard-coded path.
os.chdir("/content/drive/MyDrive/dqn")
#os.chdir("/content/drive/MyDrive/dqn_mnist")

In [None]:
# Display the current working directory to confirm the chdir above took effect.
os.getcwd()

'/content/drive/MyDrive/dqn'

In [None]:
#@title 10 k - create model
def create_model(pretrained=False, save_model=True, epochs=2):
    """Load or train a DQN agent with a CnnPolicy on the global ``env``.

    This redefinition replaces any earlier ``create_model`` in the kernel; it
    sizes training from the first 10,000 rows of ``train_x``.

    Args:
        pretrained: If True, load "dqn_cnn_mnist.zip" and return it without training.
        save_model: If True, save the freshly trained agent to "dqn_cnn_mnist.zip".
        epochs: Multiplier on ``len(train_x[:10000])`` giving the number of training timesteps.

    Returns:
        The loaded or newly trained DQN model.
    """
    checkpoint = "dqn_cnn_mnist.zip"

    # Short-circuit: hand back the stored agent untouched.
    if pretrained:
        return DQN.load(checkpoint)

    # Build a new agent and step it through the subset `epochs` times.
    agent = DQN(CnnPolicy, env, verbose=1)
    n_timesteps = len(train_x[:10000]) * epochs
    agent.learn(total_timesteps=n_timesteps)

    # Persist the result only on request.
    if save_model:
        agent.save(checkpoint)

    return agent

# Train a new agent for two passes over the 10k subset and write the checkpoint.
inter_model = create_model(
    pretrained=False,
    save_model=True,
    epochs=2,
)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [None]:
# Evaluate the model by counting the total rewards attained on the train dataset.
total_rewards = 0
pred_y_train = []

for idx in trange(len(train_x)):
    # Generate an evaluation observation frame. cv2 takes dsize as (columns, rows)
    # and returns shape (rows, columns), hence the swapped order before the reshape.
    obs = cv2.resize(np.array(train_x[idx]).astype(np.float32), (height, width), interpolation = cv2.INTER_CUBIC)
    obs = obs.reshape(width, height, 1)

    # Predict greedily: without deterministic=True, DQN.predict still takes
    # random exploration actions, which would distort the reported accuracy.
    action, _states = inter_model.predict(obs, deterministic=True)
    pred_y_train.append(action)

    # Score the prediction.
    reward = 1 if action == train_y[idx] else 0
    total_rewards += reward

print('Accuracy: {:.2f}%'.format(total_rewards / len(train_x) * 100.0))

HBox(children=(FloatProgress(value=0.0, max=60000.0), HTML(value='')))


Accuracy: 9.44%


In [None]:
# One row per training sample: (image array, true label, predicted label).
# newline='' lets the csv module control line endings itself, as its
# documentation requires; the file is only written, so plain "w" is enough.
with open("train.csv", "w", newline='') as my_csv:
    csvWriter = csv.writer(my_csv,delimiter=',')
    csvWriter.writerows(zip(np.array(train_x), np.array(train_y), pred_y_train))


In [None]:
# Evaluate the model by counting the total rewards attained on the test dataset.
total_rewards = 0
pred_y_test = []

for idx in trange(len(test_x)):
    # Generate an evaluation observation frame. cv2 takes dsize as (columns, rows)
    # and returns shape (rows, columns), hence the swapped order before the reshape.
    obs = cv2.resize(np.array(test_x[idx]).astype(np.float32), (height, width), interpolation = cv2.INTER_CUBIC)
    obs = obs.reshape(width, height, 1)

    # Predict greedily: without deterministic=True, DQN.predict still takes
    # random exploration actions, which would distort the reported accuracy.
    action, _states = inter_model.predict(obs, deterministic=True)
    pred_y_test.append(action)

    # Score the prediction.
    reward = 1 if action == test_y[idx] else 0
    total_rewards += reward

print('Accuracy: {:.2f}%'.format(total_rewards / len(test_x) * 100.0))

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


Accuracy: 9.62%


In [None]:
# One row per test sample: (image array, true label, predicted label).
# newline='' lets the csv module control line endings itself, as its
# documentation requires; the file is only written, so plain "w" is enough.
with open("test.csv", "w", newline='') as my_csv:
    csvWriter = csv.writer(my_csv,delimiter=',')
    csvWriter.writerows(zip(np.array(test_x), np.array(test_y), pred_y_test))


In [None]:
#@title 50k shuffle trial
# Draw 50,000 distinct training indices in random order. random.sample avoids
# the duplicates that repeated randint calls produce, and range(len(...))
# keeps index 0 eligible.
randomlist_50000 = random.sample(range(len(train_dataset)), 50000)
shuffled_data_50000 = torch.utils.data.Subset(train_dataset, randomlist_50000)

# Materialise the subset as tensors, fetching each sample once.
sample_list_x_50000 = []
sample_list_y_50000 = []
for i in range(len(shuffled_data_50000)):
  image, label = shuffled_data_50000[i]
  sample_list_x_50000.append(np.array(torch.squeeze(image)))
  sample_list_y_50000.append(label)

# Stack first: building a tensor from a Python list of arrays is very slow.
train_x_50000 = torch.from_numpy(np.stack(sample_list_x_50000))
train_y_50000 = torch.tensor(sample_list_y_50000)

In [None]:
# Vectorized environment over the remaining training samples (rows 10,000 to 59,999).
env_further = DummyVecEnv([
    lambda: MnistGym(
        dataset=(train_x[10000:60000], train_y[10000:60000]),
        width=64,
        height=64,
        channels=1,
    )
])

# The first two observation dimensions are reused to build evaluation frames.
width, height = env_further.observation_space.shape[:2]

In [None]:
def continue_training(pretrained=True, save_model=True, epochs=2):
    """Train a DQN agent on the global ``env_further`` and optionally save it.

    Args:
        pretrained: If True, resume from the "dqn_cnn_mnist.zip" checkpoint.
            If False, start from a fresh DQN agent with a CnnPolicy
            (previously this path failed because ``model`` was never bound).
        save_model: If True, save the result to "dqn_cnn_mnist_further.zip".
        epochs: Multiplier on ``len(train_x[10000:60000])`` giving the number
            of training timesteps.

    Returns:
        The trained DQN model.
    """
    model_name = "dqn_cnn_mnist.zip"

    if pretrained:
        # Resume from the stored agent and point it at the new environment.
        model = DQN.load(model_name)
        model.set_env(env_further)
    else:
        # No checkpoint requested: train a new agent on the same environment.
        model = DQN(CnnPolicy, env_further, verbose=1)

    model.learn(total_timesteps=len(train_x[10000:60000]) * epochs)

    # Save the new model if the flag is set.
    if save_model:
        model.save("dqn_cnn_mnist_further.zip")

    return model

# Resume from the saved checkpoint, train for two passes, and save the result.
model = continue_training(
    pretrained=True,
    save_model=True,
    epochs=2,
)

Wrapping the env in a VecTransposeImage.


In [None]:
# Evaluate the further-trained model by counting the total rewards attained on the train dataset.
total_rewards = 0
pred_y_train_50000 = []

for idx in trange(len(train_x)):
    # Generate an evaluation observation frame. cv2 takes dsize as (columns, rows)
    # and returns shape (rows, columns), hence the swapped order before the reshape.
    obs = cv2.resize(np.array(train_x[idx]).astype(np.float32), (height, width), interpolation = cv2.INTER_CUBIC)
    obs = obs.reshape(width, height, 1)

    # Predict greedily: without deterministic=True, DQN.predict still takes
    # random exploration actions, which would distort the reported accuracy.
    action, _states = model.predict(obs, deterministic=True)
    pred_y_train_50000.append(action)

    # Score the prediction.
    reward = 1 if action == train_y[idx] else 0
    total_rewards += reward

print('Accuracy: {:.2f}%'.format(total_rewards / len(train_x) * 100.0))

HBox(children=(FloatProgress(value=0.0, max=60000.0), HTML(value='')))


Accuracy: 75.44%


In [None]:
# One row per training sample: (image array, true label, predicted label).
# newline='' lets the csv module control line endings itself, as its
# documentation requires; the file is only written, so plain "w" is enough.
with open("train_50000.csv", "w", newline='') as my_csv:
    csvWriter = csv.writer(my_csv,delimiter=',')
    csvWriter.writerows(zip(np.array(train_x), np.array(train_y), pred_y_train_50000))


In [None]:
# Evaluate the further-trained model by counting the total rewards attained on the test dataset.
total_rewards = 0
pred_y_test_50000 = []

for idx in trange(len(test_x)):
    # Generate an evaluation observation frame. cv2 takes dsize as (columns, rows)
    # and returns shape (rows, columns), hence the swapped order before the reshape.
    obs = cv2.resize(np.array(test_x[idx]).astype(np.float32), (height, width), interpolation = cv2.INTER_CUBIC)
    obs = obs.reshape(width, height, 1)

    # Predict greedily: without deterministic=True, DQN.predict still takes
    # random exploration actions, which would distort the reported accuracy.
    action, _states = model.predict(obs, deterministic=True)
    pred_y_test_50000.append(action)

    # Score the prediction.
    reward = 1 if action == test_y[idx] else 0
    total_rewards += reward

print('Accuracy: {:.2f}%'.format(total_rewards / len(test_x) * 100.0))

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


Accuracy: 76.32%


In [None]:
# One row per test sample: (image array, true label, predicted label).
# newline='' lets the csv module control line endings itself, as its
# documentation requires; the file is only written, so plain "w" is enough.
with open("test_50000.csv", "w", newline='') as my_csv:
    csvWriter = csv.writer(my_csv,delimiter=',')
    csvWriter.writerows(zip(np.array(test_x), np.array(test_y), pred_y_test_50000))


In [None]:
def save_svmlight_data(data, labels, data_filename, data_folder = ''):
    """Write samples to a text file in svmlight-style ``label idx:value ...`` lines.

    Args:
        data: Iterable of array-like samples; each is flattened before writing.
        labels: Indexable of integer labels, parallel to ``data``.
        data_filename: Name of the output file.
        data_folder: Prefix concatenated directly in front of ``data_filename``
            (include a trailing separator when passing a directory).

    Each line is the label followed by 1-based ``index:value`` pairs for every
    flattened value, space-separated, with a space before the newline.
    """
    # The context manager guarantees the file is flushed and closed.
    with open(data_folder+data_filename,'w') as file:
        for i,x in enumerate(data):
            values = np.ravel(np.array(x))

            label = '%i'%(labels[i])
            # Feature count follows the sample instead of assuming 784 values.
            pairs = ['%i:%f'%(j+1,value) for j,value in enumerate(values)]

            sep_line = [label]
            sep_line.extend(pairs)
            sep_line.append('\n')

            file.write(' '.join(sep_line))
        
# Export the training images and labels to the file "mnist_data".
save_svmlight_data(data=train_x, labels=train_y, data_filename="mnist_data")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



['1', '1:0.000000', '2:0.000000', '3:0.000000', '4:0.000000', '5:0.000000', '6:0.000000', '7:0.000000', '8:0.000000', '9:0.000000', '10:0.000000', '11:0.000000', '12:0.000000', '13:0.000000', '14:0.000000', '15:0.000000', '16:0.000000', '17:0.000000', '18:0.000000', '19:0.000000', '20:0.000000', '21:0.000000', '22:0.000000', '23:0.000000', '24:0.000000', '25:0.000000', '26:0.000000', '27:0.000000', '28:0.000000', '29:0.000000', '30:0.000000', '31:0.000000', '32:0.000000', '33:0.000000', '34:0.000000', '35:0.000000', '36:0.000000', '37:0.000000', '38:0.000000', '39:0.000000', '40:0.000000', '41:0.000000', '42:0.000000', '43:0.000000', '44:0.000000', '45:0.000000', '46:0.000000', '47:0.000000', '48:0.000000', '49:0.000000', '50:0.000000', '51:0.000000', '52:0.000000', '53:0.000000', '54:0.000000', '55:0.000000', '56:0.000000', '57:0.000000', '58:0.000000', '59:0.000000', '60:0.000000', '61:0.000000', '62:0.000000', '63:0.000000', '64:0.000000', '65:0.000000', '66:0.000000', '67:0.000000'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



['4', '1:0.000000', '2:0.000000', '3:0.000000', '4:0.000000', '5:0.000000', '6:0.000000', '7:0.000000', '8:0.000000', '9:0.000000', '10:0.000000', '11:0.000000', '12:0.000000', '13:0.000000', '14:0.000000', '15:0.000000', '16:0.000000', '17:0.000000', '18:0.000000', '19:0.000000', '20:0.000000', '21:0.000000', '22:0.000000', '23:0.000000', '24:0.000000', '25:0.000000', '26:0.000000', '27:0.000000', '28:0.000000', '29:0.000000', '30:0.000000', '31:0.000000', '32:0.000000', '33:0.000000', '34:0.000000', '35:0.000000', '36:0.000000', '37:0.000000', '38:0.000000', '39:0.000000', '40:0.000000', '41:0.000000', '42:0.000000', '43:0.000000', '44:0.000000', '45:0.000000', '46:0.000000', '47:0.000000', '48:0.000000', '49:0.000000', '50:0.000000', '51:0.000000', '52:0.000000', '53:0.000000', '54:0.000000', '55:0.000000', '56:0.000000', '57:0.000000', '58:0.000000', '59:0.000000', '60:0.000000', '61:0.000000', '62:0.000000', '63:0.000000', '64:0.000000', '65:0.000000', '66:0.000000', '67:0.000000'

In [None]:
def save_svmlight_data(data, labels, data_filename, data_folder = ''):
    """Write samples to a text file in svmlight-style ``label idx:value ...`` lines.

    Args:
        data: Iterable of array-like samples; each is flattened before writing.
        labels: Indexable of integer labels, parallel to ``data``.
        data_filename: Name of the output file.
        data_folder: Prefix concatenated directly in front of ``data_filename``
            (include a trailing separator when passing a directory).

    Each line is the label followed by 1-based ``index:value`` pairs for every
    flattened value, space-separated, with a space before the newline.
    """
    # The context manager guarantees the file is flushed and closed.
    with open(data_folder+data_filename,'w') as file:
        for i,x in enumerate(data):
            values = np.ravel(np.array(x))

            label = '%i'%(labels[i])
            # Feature count follows the sample instead of assuming 784 values.
            pairs = ['%i:%f'%(j+1,value) for j,value in enumerate(values)]

            sep_line = [label]
            sep_line.extend(pairs)
            sep_line.append('\n')
            # The per-row debug print was removed: echoing every row floods
            # the notebook output.

            file.write(' '.join(sep_line))
        
# Export the test images and labels to the file "mnist_data.t".
save_svmlight_data(data=test_x, labels=test_y, data_filename="mnist_data.t")

In [None]:
# Switch the working directory to the unzipped MMD-critic-master folder on Drive.
# NOTE(review): hard-coded, user-specific path.
os.chdir(r"/content/drive/MyDrive/IDRL/Shipra/MMD-critic-master.zip (Unzipped Files)/MMD-critic-master")

In [None]:
# ! python Helper.py
# ! python classify.py
# ! python data.py
# ! python mmd.py
# ! python run_digits.py

[7. 6. 5. ... 4. 1. 2.]
Running Kernel type : local 
0 68
1 57
2 40
3 36
4 36
5 33
6 37
7 36
8 30
9 37
number of images being printed 410
counter 1 
 begin 0, end 40
Traceback (most recent call last):
  File "run_digits.py", line 224, in <module>
    main(data_prefix, output_prefix, gamma, m, alltestm, kernel_type, do_output_pics)
  File "run_digits.py", line 187, in main
    os.path.join(data_prefix, 'mnistt.t'))
  File "run_digits.py", line 118, in run
    plotfigs2(digitsdat.X[selected[sortedindx], :], selectedy[sortedindx], outfig)
  File "run_digits.py", line 77, in plotfigs2
    plt.savefig(file , dpi=2000)
  File "/usr/local/lib/python3.6/dist-packages/matplotlib/pyplot.py", line 723, in savefig
    res = fig.savefig(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/matplotlib/figure.py", line 2203, in savefig
    self.canvas.print_figure(fname, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/matplotlib/backend_bases.py", line 2126, in print_figure
    **k