In [135]:
import gym
from gym import wrappers
import math, os, random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from collections import namedtuple
from collections import deque
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as T

import ppaquette_gym_super_mario


checkpoint_dir = './checkpoints/'
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"
use_cuda = torch.cuda.is_available()

env = gym.make('ppaquette/SuperMarioBros-1-1-v0')
env = wrappers.Monitor(env, 'gym-results', force=True)

train = True
retrain = False

In [136]:
input_size = np.array([env.observation_space.shape[0], env.observation_space.shape[1], 15])
output_size = 13

dis = 0.9
REPLAY_MEMORY = 20000


In [137]:
def ddqn_replay_train(mainDQN, targetDQN, train_batch, l_rate):
    x_stack = np.empty(0).reshape(0, mainDQN.input_size[0]*mainDQN.input_size[1]*mainDQN.input_size[2])
    y_stack = np.empty(0).reshape(0, mainDQN.output_size)
    action_stack = np.empty(0).reshape(0, 60)
    for state, action_seq, action_next_seq, action , reward, next_state, done in train_batch:
        print(state.shape)
        Q = mainDQN.predict(state, action_seq)
        print('state',state.shape)
        print('action',action_seq[0].shape)
        if done:
            Q[0, action] = reward
        else:
            print('next_state',next_state.shape)
            print('next-action',action_next_seq[0].shape)
            Q[0, action] = reward + dis * targetDQN.predict(next_state, action_next_seq).max(1)[0]
            
            
        if state is None:
            print("None State, ", action, ", ", reward, ", ", next_state,", ", done)
        else:
            y_stack = np.vstack([y_stack, Q])
            x_stack = np.vstack([x_stack, state.reshape(-1, mainDQN.Input_size[0]*mainDQN.input_size[1]*mainDQN.input_size[2])])
            action_stack = np.vstack([action_stack, np.reshape(action_seq, (-1, 60))])
    print(x_stack.shape)
    
    Qpred = mainDQN.predict(x_stack, action_seq)
    loss = F.mse_loss(Qpred, y_stack)
    optm = optim.Adam(mainDQN.parameters())
    optm.zero_grad()
    loss.backward()
    optm.step()
    
    return loss


def bot_play(mainDQN, env=env):
    start = env.reset()
    reward_sum = 0
    while True:
        if state in None or state.size ==1:
            output = random.randint(0, output_size-1)
            action = OutputToAction3(output)
            print("random action:", output)
        else:
            output = np.argmax(mainDQN.predict(state))
            action = OutputToAction3(output)
            print("predicted action:", output)
        for n in range(len(action)):
            state, reward, done, info = env.step(action[n])
            if done:
                break
        reward_sum += reward
        if done:
            print("Total score", reward_sum)
            break
            
def OutputToAction3(output):
    actions={ # A: jump , B: run
        '0' :[[0,0,0,0,0,0], 'Nope'],
        '1' :[[1,0,0,0,0,0], 'Up'],
        '2' :[[0,0,1,0,0,0], 'Down'],
        '3' :[[0,1,0,0,0,0], 'Left'],
        '4' :[[0,1,0,0,1,0], 'Left + A'],
        '5' :[[0,1,0,0,0,1], 'Left + B'],
        '6' :[[0,1,0,0,1,1], 'Left + A + B'],
        '7' :[[0,0,0,1,0,0], 'Right'],
        '8' :[[0,0,0,1,1,0], 'Right + A'],
        '9' :[[0,0,0,1,0,1], 'Right + B'],
        '10':[[0,0,0,1,1,1], 'Right + A + B'],
        '11':[[0,0,0,0,1,0], 'A'],
        '12':[[0,0,0,0,1,1], 'A + B']
    }
    return [np.array([actions[str(output)][0]]*2), actions[str(output)][1]]
    

In [138]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as T

class DQN(nn.Module):
    def __init__ (self, input_size, output_size):
        super(DQN, self).__init__()
        
        self.input_size = input_size
        self.output_size = output_size
        
        self.model1 = nn.Sequential(
            nn.Conv2d(15,64,3,1),
            nn.ReLU(),
            nn.Conv2d(64,64,3,2),
            nn.ReLU(),
            
            nn.Conv2d(64,128,3,1),
            nn.ReLU(),
            nn.Conv2d(128,128,3,1),
            nn.ReLU(),
            nn.Conv2d(128,128,3,2),
            
            nn.Conv2d(128,256,3,1),
            nn.ReLU(),
            nn.Conv2d(256,256,3,1),
            nn.ReLU(), 
            nn.Conv2d(256,256,3,2),
            
            nn.Conv2d(256,512,3,1),
            nn.ReLU(),
            nn.Conv2d(512,512,3,1),
            nn.ReLU(),
            nn.Conv2d(512,512,3,2),
            
            nn.Conv2d(512,512,3,1),
            nn.ReLU(),
            nn.Conv2d(512,512,3,1),
            nn.ReLU(),
            nn.Conv2d(512,512,3,2),
            
            nn.Conv2d(512,100,[2,3],1),
            nn.ReLU()
            
        )
        self.model2 = nn.Sequential(
            nn.Linear(160,160),
            nn.ReLU(),
            nn.Linear(160,50),
            nn.ReLU(),
            nn.Linear(50,self.output_size)
        )
    def predict(self, x, y):
        #x = np.ascontiguousarray(x, dtype=np.float32)
        x = np.transpose(x, (2,0,1))
        x = np.reshape(x, [1, int(self.input_size[2]), int(self.input_size[0]), int(self.input_size[1])])
        x = Variable(torch.Tensor(x)).type(torch.FloatTensor)
        y = Variable(torch.from_numpy(np.array(y))).type(torch.FloatTensor)
        x = self.model1(x)
        x = x.view(x.size(0), 100)
        y = y.view(-1, 60)
        z = torch.cat((x,y),1)
        return self.model2(z)
    
   

In [139]:
def main():
    if train:
        init_episode = 1
        max_episode = 10000
        replay_buffer = deque()
        state_buffer = deque()
        next_state_buffer = deque()
        output_buffer = deque()
        
        mainDQN = DQN(input_size, output_size)
        targetDQN = DQN(input_size, output_size)
        targetDQN.load_state_dict(mainDQN.state_dict())
        #if use_cuda:
        #    mainDQN.cuda()
        #    targetDQN.cuda()
        
        if retrain:
            print('how to restore in PyTorch?')
        
        for episode in range(init_episode, max_episode):
            e = 1.0 / (episode/500 + 1)
            print ("episode:", episode, ", epsilon: ", e)
            done = False
            step_count = 0
            state = env.reset()
            score = 0
            distance = 0
            prev_output = -1
            repeat = 0
            
            while not done:
                if np.random.rand(1) < e or state is None or state.size == 1 or step_count <=10:
                    output = random.randint(0, output_size -1)
                    action,action_name = OutputToAction3(output)
                    print("random action:", action_name)
                
                else:
                    predicted = mainDQN.predict(acc_state, output_seq)
                    output = np.argmax(predicted)
                    action,action_name = OutputToAction3(output)
                    print("output:",action_name, "predicted:", predicted)
                    
                for n in range(len(action)):
                    next_state, reward, done, info = env.step(action[n])
                    if done:
                        print('%dth:' %n)
                        break
                
                print('reward: ', reward)
                state_buffer.append(next_state)
                output_buffer.append(action)
                
                prev_distance = distance
                distance = info['distance']
                got_distance = distance - prev_distance
                
                past_score = score
                score = info['score']
                got_score = score - past_score
                
                time = info['time']
                
                reward = got_score/50 + got_distance/30
                
                if reward>0:
                    print("reward:", reward)
                if done:
                    reward -= 1.0
                    if distance>=3000:
                        reward = 1
                    print("last reward: ", reward)
                
                if step_count>=10:
                    acc_state = [state_buffer[-2-k] for k in range(5)]
                    state_buffer.popleft()
                    acc_state = np.reshape(acc_state, input_size[:3])
                    acc_next_state = [state_buffer[-1-k] for k in range(5)]
                    acc_next_state = np.reshape(acc_next_state, input_size[:3])
                    
                    output_seq = [output_buffer[-2-k] for k in range(5)]
                    output_next_seq = [output_buffer[-1-k] for k in range(5)]
                    output_buffer.popleft()
                    
                    replay_buffer.append((acc_state, output_seq, output_next_seq, output, reward, acc_next_state, done))
                    
                    if replay_buffer[-1][6]:
                        for k in range(1, 5):
                            replay_buffer[-1-k] = tuple(
                                replay_buffer[-1-k][0:4] + (-pow(0.9,k),) + replay_buffer[-1-k][5:]) # ??????
                    if replay_buffer[-1][4] > 2.0 and replay_buffer[-1][6] == False:
                        for k in range(1, 5):
                            replay_buffer[-1-k] = tuple(
                                replay_buffer[-1-k][0:4] + (pow(0.9,k),) + replay_buffer[-1-k][5:])
                    
                    if len(replay_buffer) > REPLAY_MEMORY:
                        replay_buffer.popleft()
                    acc_state = acc_next_state
                    
                state = next_state
                step_count += 1
                if step_count > 100000:
                    break
                    
                if (episode+1) % 1 == 0:
                    for _ in range(50):
                        if len(replay_buffer) >= 10:
                            sample_idx = random.sample(range(0, len(replay_buffer)), 10)
                            minibatch = []
                            for i in sample_idx:
                                minibatch.append(replay_buffer[i])
                            
                            l_rate = (1e-5 - 1e-4)*(1/max_episode)*episode + 1e-4
                            loss, _ = ddqn_replay_train(mainDQN, targetDQN, minibatch, l_rate=l_rate)
                            
                            print("Loss: %.3f,  l_rate: %.6f" %(loss, l_rate))
                
                if (episode+1) % 2 == 0:
                    targetDQN.load_state_dict(mainDQN.state_dict())
                    print('weights copied')
                
                if (episode+1) % 100 == 0:
                    
                    pass # How to save the weights in PyTorch
            
            env2 = wrappers.Monitor(env, 'gym-results', force=True)
            for i in range(200):
                bot_play(mainDQN, env=env2)
            env2.close()
    else:
        mainDQN = dqn.DQN(input_size, output_size)
        targetDQN = dqn.DQN(input_size, output_size)
        for i in range(200):
            bot_play(mainDQN, env=env)
        env.close()
        
if __name__ == "__main__":
    main()
        
                        
                    

episode: 1 , epsilon:  0.998003992015968
random action: Left + B
reward:  0
reward: 1.3
weights copied
random action: Right + A + B
reward:  -1
weights copied
random action: Left + A + B
reward:  -1
weights copied
random action: Up
reward:  -1
weights copied
random action: Left
reward:  -2
weights copied
random action: Left + A + B
reward:  -2
weights copied
random action: Left + A + B
reward:  -2
weights copied
random action: Left + B
reward:  -3
weights copied
random action: Right + A
reward:  -3
weights copied
random action: Right + B
reward:  -2
weights copied
random action: A + B
reward:  -2
weights copied
random action: Left + B
reward:  -1
weights copied
random action: Right
reward:  0
weights copied
random action: Nope
reward:  1
reward: 0.06666666666666667
weights copied
random action: Up
reward:  0
weights copied
random action: Left + A + B
reward:  0
weights copied
random action: Up
reward:  -1
weights copied
random action: Right + A
reward:  -1
weights copied
random action:

ValueError: all the input arrays must have same number of dimensions

In [134]:
%debug

> [0;32m/usr/local/lib/python3.5/dist-packages/numpy/core/shape_base.py[0m(234)[0;36mvstack[0;34m()[0m
[0;32m    232 [0;31m[0;34m[0m[0m
[0m[0;32m    233 [0;31m    """
[0m[0;32m--> 234 [0;31m    [0;32mreturn[0m [0m_nx[0m[0;34m.[0m[0mconcatenate[0m[0;34m([0m[0;34m[[0m[0matleast_2d[0m[0;34m([0m[0m_m[0m[0;34m)[0m [0;32mfor[0m [0m_m[0m [0;32min[0m [0mtup[0m[0;34m][0m[0;34m,[0m [0;36m0[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m    235 [0;31m[0;34m[0m[0m
[0m[0;32m    236 [0;31m[0;32mdef[0m [0mhstack[0m[0;34m([0m[0mtup[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[0m
ipdb> u
> [0;32m<ipython-input-131-d42228413344>[0m(21)[0;36mddqn_replay_train[0;34m()[0m
[0;32m     19 [0;31m            [0mprint[0m[0;34m([0m[0;34m"None State, "[0m[0;34m,[0m [0maction[0m[0;34m,[0m [0;34m", "[0m[0;34m,[0m [0mreward[0m[0;34m,[0m [0;34m", "[0m[0;34m,[0m [0mnext_state[0m[0;34m,[0m[0;34m", "[0m[0;34m,[0m [0mdone[0m