In [1]:
import gym
import numpy as np

from keras import layers
from keras.models import Model
from keras import backend as K
from keras import utils as np_utils
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout,  Flatten
from keras.optimizers import Adam
from keras import regularizers
from keras.layers import Conv2D, MaxPooling2D

Using TensorFlow backend.


In [2]:
# Summed R+G+B value of the pixel colour that gets blanked out below.
mspacman_color = 210 + 164 + 74


def preprocess_observation(obs):
    """Turn a raw colour frame into a small signed greyscale image.

    The frame is cropped to rows 33..208 and subsampled by two along both
    spatial axes, its colour channels are summed, every pixel whose summed
    value equals ``mspacman_color`` is set to 0, and the result is mapped
    to int8 via ``sum // 3 - 128``.

    Args:
        obs: Frame indexed as (row, column, channel).
            NOTE(review): presumably a 210x160x3 uint8 Atari frame, which
            is what the final reshape to 88x80 requires - confirm.

    Returns:
        np.ndarray of dtype int8 and shape (88, 80, 1).
    """
    cropped = obs[33:209:2, ::2]
    # Summing the channels allocates a new array, so the caller's frame is
    # never modified by the in-place masking below.
    grey = np.sum(cropped, axis=2)
    np.putmask(grey, grey == mspacman_color, 0)
    shifted = grey // 3 - 128
    return shifted.astype(np.int8).reshape(88, 80, 1)

In [3]:
def compute_discounted_R(R, discount_rate=0.99):
    """Compute standardized discounted returns for one episode.

    For every step ``t`` the return is ``R[t] + discount_rate * return[t+1]``,
    accumulated from the last step backwards. The returns are then shifted
    to zero mean and scaled to unit standard deviation.

    Args:
        R: Sequence (list or array) of per-step rewards.
        discount_rate: Multiplier applied to the future return at each step.

    Returns:
        np.ndarray of dtype float32 with the same shape as ``R``. An empty
        input yields an empty array; if all returns are identical the
        result is all zeros (only the mean is removed).
    """
    rewards = np.asarray(R, dtype=np.float64)
    discounted_r = np.zeros(rewards.shape, dtype=np.float32)
    if discounted_r.size == 0:
        return discounted_r

    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * discount_rate + rewards[t]
        discounted_r[t] = running_add

    # Standardize as (x - mean) / std. The previous expression
    # `x -= mean / std` only subtracted a single scalar ratio.
    discounted_r -= discounted_r.mean()
    std = discounted_r.std()
    # Skip the division when the returns are (numerically) constant so the
    # result stays finite instead of turning into NaN/inf.
    if std > 1e-8:
        discounted_r /= std

    return discounted_r

In [4]:
class Agent(object):
    """Policy-gradient agent with a small convolutional softmax policy.

    The network maps an image-shaped state to a probability for each of
    ``output_dim`` actions. ``fit`` performs one Adam update on the loss
    ``mean(-log pi(a|s) * discounted_reward)`` over a whole episode.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=[32, 32]):
        """Build the policy network and its training function.

        Args:
            input_dim: Shape of a single state, passed as ``input_shape``
                to the first convolution.
            output_dim: Number of actions (width of the softmax layer).
            hidden_dims: Accepted for interface compatibility; the current
                convolutional architecture does not read it (and never
                mutates the default list).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.__build_network(input_dim, output_dim, hidden_dims)
        self.__build_train_fn()

    def __build_network(self, input_dim, output_dim, hidden_dims=[32, 32]):
        """Create ``self.model``: two 3x3 convolutions, a dense layer and a softmax head.

        ``hidden_dims`` is not used by this architecture.
        """
        self.model = Sequential()
        self.model.add(Conv2D(15, (3, 3), activation='sigmoid', input_shape=input_dim))
        self.model.add(Conv2D(15, (3, 3), activation='relu'))
        self.model.add(Flatten())
        self.model.add(Dense(100, activation='relu'))
        self.model.add(Dense(output_dim, activation='softmax'))

    def __build_train_fn(self):
        """Create ``self.train_fn``, a backend function that applies one Adam step.

        The function takes ``[states, one-hot actions, discounted rewards]``
        and returns nothing; it is called only for its weight updates.
        """
        action_prob_placeholder = self.model.output
        action_onehot_placeholder = K.placeholder(shape=(None, self.output_dim),
                                                  name="action_onehot")
        discount_reward_placeholder = K.placeholder(shape=(None,),
                                                    name="discount_reward")

        # Probability the policy assigned to the action that was taken.
        action_prob = K.sum(action_prob_placeholder * action_onehot_placeholder, axis=1)
        # Floor the probability before the log: a softmax output that
        # underflows to 0 would otherwise give -inf and NaN gradients.
        log_action_prob = K.log(K.clip(action_prob, K.epsilon(), 1.0))

        loss = K.mean(-log_action_prob * discount_reward_placeholder)

        adam = optimizers.Adam()
        updates = adam.get_updates(params=self.model.trainable_weights,
                                   loss=loss)

        self.train_fn = K.function(inputs=[self.model.input,
                                           action_onehot_placeholder,
                                           discount_reward_placeholder],
                                   outputs=[],
                                   updates=updates)

    def get_action(self, state):
        """Sample an action index from the policy for a single state.

        Args:
            state: One state without a batch axis (shape ``input_dim``).

        Returns:
            Integer action index in ``[0, output_dim)``.
        """
        batch = np.expand_dims(state, axis=0)
        action_prob = np.squeeze(self.model.predict(batch)).astype(np.float64)
        # Renormalize in float64 so float32 rounding in the softmax output
        # cannot trip np.random.choice's "probabilities do not sum to 1" check.
        action_prob /= action_prob.sum()
        return np.random.choice(np.arange(self.output_dim), p=action_prob)

    def fit(self, S, A, R):
        """Run one policy-gradient update from a finished episode.

        Args:
            S: Batch of states, one per step.
            A: Integer action taken at each step.
            R: Reward received at each step; discounted by
                ``compute_discounted_R`` before use.
        """
        action_onehot = np_utils.to_categorical(A, num_classes=self.output_dim)
        discount_reward = compute_discounted_R(R)

        self.train_fn([S, action_onehot, discount_reward])

    def save(self, path="modelFile.h5"):
        """Write the model weights to ``path`` (default ``modelFile.h5``)."""
        self.model.save_weights(path)

    def load(self, path="modelFile.h5"):
        """Load model weights from ``path`` (default ``modelFile.h5``)."""
        self.model.load_weights(path)

In [5]:
def run_episode(env, agent):
    """Play one episode with the agent's policy, then train on it.

    (1) Play until the game is done
    (2) The agent chooses each action according to its policy, given the
        element-wise maximum of the current and previous preprocessed frame
        (just the current frame on the first step)
    (3) When the episode is done, the agent is fitted on the recorded
        states, actions and rewards

    The state recorded for training is exactly the array that was passed to
    ``agent.get_action``, so the update is computed on the inputs the
    policy actually acted on.

    Args:
        env (gym.env): Gym environment whose ``step`` returns
            ``(observation, reward, done, info)``
        agent (Agent): Game Playing Agent

    Returns:
        total_reward: sum of the rewards earned during the whole episode
    """
    S = []
    A = []
    R = []
    total_reward = 0
    prev_frame = None
    done = False

    obs = env.reset()

    while not done:
        frame = preprocess_observation(obs)
        # Merge with the previous raw frame; keep that frame separately so
        # the merge is never applied to an already merged state.
        if prev_frame is None:
            state = frame
        else:
            state = np.maximum(frame, prev_frame)

        a = agent.get_action(state)
        obs, r, done, info = env.step(a)
        total_reward += r

        S.append(state)
        A.append(a)
        R.append(r)
        prev_frame = frame

    agent.fit(np.array(S), np.array(A), np.array(R))

    return total_reward


In [None]:
# Create the Pong environment and the agent that will play it.
env = gym.make("Pong-v0")
# Same shape as the (88, 80, 1) array returned by preprocess_observation.
input_dim = [88, 80, 1]
# One softmax output per discrete action exposed by the environment.
output_dim = env.action_space.n
# The third argument is hidden_dims, which Agent accepts but its network
# builder does not read.
agent = Agent(input_dim, output_dim, [16, 16])

In [None]:


# Train until the cell is interrupted, printing "<episode index> <reward>"
# after every episode (indices start at 0).
SAVE_EVERY = 50  # write a weights checkpoint after this many episodes

episode = 0
while True:
    reward = run_episode(env, agent)
    print(episode, reward)
    episode += 1
    # Checkpoint periodically so interrupting the loop does not lose the
    # trained weights.
    if episode % SAVE_EVERY == 0:
        agent.save()
    
    


0 -21.0
1 -21.0
2 -21.0
3 -21.0
4 -21.0
5 -21.0
6 -21.0
7 -21.0
8 -21.0
9 -21.0
10 -21.0
11 -21.0
12 -21.0
13 -21.0
14 -21.0
15 -21.0
16 -21.0
17 -21.0
18 -21.0
19 -21.0
20 -21.0
21 -21.0
22 -21.0
23 -21.0
24 -21.0
25 -21.0
26 -21.0
27 -21.0
28 -21.0
29 -21.0
30 -21.0
31 -21.0
32 -21.0
33 -21.0
34 -21.0
35 -21.0
36 -21.0
37 -21.0
38 -21.0
39 -21.0
40 -21.0
41 -21.0
42 -21.0
43 -21.0
44 -21.0
45 -21.0
46 -21.0
47 -21.0
48 -21.0
49 -21.0
50 -21.0
51 -21.0
52 -21.0
53 -21.0
54 -21.0
55 -21.0
56 -21.0
57 -21.0
58 -21.0
59 -21.0
60 -21.0
61 -21.0
62 -21.0
63 -21.0
64 -21.0
65 -21.0
66 -21.0
67 -21.0
68 -21.0
69 -21.0
70 -21.0
71 -21.0
72 -21.0
73 -21.0
74 -21.0
75 -21.0
76 -21.0
77 -21.0
78 -21.0
79 -21.0
80 -21.0
81 -21.0
82 -21.0
83 -21.0
84 -21.0
85 -21.0
86 -21.0
87 -21.0
88 -21.0
89 -21.0
90 -21.0
91 -21.0
92 -21.0
93 -21.0
94 -21.0
95 -21.0
96 -21.0
97 -21.0
98 -21.0
99 -21.0
100 -21.0
101 -21.0
102 -21.0
103 -21.0
104 -21.0
105 -21.0
106 -21.0
107 -21.0
108 -21.0
109 -21.0
110 -21.0
