In [1]:
# make sure you don't hog all the video memory
import os
import pickle
import time
import tensorflow as tf
from tensorflow.python.client import device_lib
# Create a TF1 session whose GPU memory allocation grows on demand rather
# than being reserved up front (see the "don't hog video memory" note above).
# NOTE(review): no K.set_session(sess) call is visible here — confirm that
# Keras actually uses this session/config.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
# To restrict the process to a single GPU, uncomment the next line
# (presumably it must run before TensorFlow initialises the GPU — verify):
# os.environ["CUDA_VISIBLE_DEVICES"]="2"
###################################

from keras import backend as K
from keras.optimizers import RMSprop, Adam, SGD
from keras.layers import Input, Dense, Dropout, Flatten, ELU, Activation, Conv2D, Lambda
from keras.models import Model, load_model

import gym
import numpy as np
from collections import deque

# import matplotlib
# matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline

from skimage.transform import resize
from skimage.color import rgb2gray


class DQNAgent:
    """Policy-gradient Pong agent built around a small convolutional policy net.

    Despite the class name, no Q-values are learned: ``brain`` outputs a
    softmax over two actions (gym action ids 2 and 3) and is trained by
    minimising ``-log(pi(a|s)) * discounted_reward`` summed over an episode.

    The policy input is the difference between two consecutive preprocessed
    frames, fed with a single channel, so the code path as written only
    supports ``frames == 1``.
    """

    def __init__(self, frames):
        """Create the environment, the policy network and the training function.

        Args:
            frames: number of channels of the network input and number of
                warm-up frames collected after each environment reset.
        """
        self.f_len = frames
        self.memory = deque(maxlen=frames)
        self.target_shape = (80, 80)
        self.state4 = np.zeros([80, 80, self.f_len], dtype='float32')
        self.sum_disc_raw = []
        self.sum_raw_rew = []
        self.init_env()
        self.init_brain()
        # Stored as ``train_fn`` so it does not shadow the ``train`` method.
        self.train_fn = self._build_train_fn()

    def _build_train_fn(self):
        """Build the Keras backend function performing one policy-gradient step."""
        action_prob_placeholder = self.brain.output
        action_onehot_placeholder = K.placeholder(shape=(None, 2))
        discount_reward_placeholder = K.placeholder(shape=(None,))
        # Probability the network assigned to the action that was actually taken.
        action_prob = K.sum(action_prob_placeholder * action_onehot_placeholder, axis=1)
        # Clip away exact zeros so log() cannot yield -inf and poison the weights.
        log_action_prob = K.log(K.clip(action_prob, K.epsilon(), 1.0))
        loss = K.sum(-log_action_prob * discount_reward_placeholder)
        adam = Adam(lr=1e-4)
        updates = adam.get_updates(params=self.brain.trainable_weights, loss=loss)
        return K.function(inputs=[self.brain.input,
                                  action_onehot_placeholder,
                                  discount_reward_placeholder],
                          outputs=[loss],
                          updates=updates)

    def init_brain(self):
        """Build the policy network and store it in ``self.brain``."""
        input_img = Input(shape=(80, 80, self.f_len))
        x = Conv2D(16, (8, 8), activation='relu', padding='valid', strides=(4, 4),
                   kernel_initializer='lecun_uniform')(input_img)
        x = Conv2D(32, (4, 4), activation='relu', padding='valid', strides=(2, 2),
                   kernel_initializer='lecun_uniform')(x)
        x = Flatten()(x)
        x = Dense(128, activation='relu', kernel_initializer='lecun_uniform')(x)
        x = Dense(2, activation='softmax')(x)
        self.brain = Model(input_img, x)
        self.brain.summary()

    def init_env(self):
        """Reset the environment and take ``f_len`` random warm-up steps.

        The gym environment is created on the first call only and merely
        reset afterwards, so repeated calls (one per episode) do not pile up
        emulator instances.
        """
        if getattr(self, 'env', None) is None:
            self.env = gym.make("Pong-v0")
        self.state = self.env.reset()
        self.done = False

        # Take random actions to fill the frame memory.
        for _ in range(self.f_len):
            self.action = np.random.choice([2, 3])
            state, reward, self.done, _ = self.env.step(self.action)
            self.memory.append(self.fi(state))
        # The newest frame is the current observation (identical to the
        # oldest one when f_len == 1); using it avoids a temporal offset.
        self.state4 = self.memory[-1]
        # No earlier frame exists yet: start with a zero frame difference so
        # predict_next_step() can be called before the first step1().
        self.prev_state4 = self.state4

    def step1(self):
        """Apply ``self.action`` to the environment and update the agent state.

        Sets ``prev_state4``, ``state4``, ``reward`` and ``done``.
        """
        self.prev_state4 = self.state4
        new_state, self.reward, self.done, _ = self.env.step(self.action)
        self.state4 = self.fi(new_state)

    def fi(self, I):
        """Preprocess a 210x160x3 uint8 frame into an 80x80 float32 binary mask.

        The frame is cropped to rows 35..194, downsampled by a factor of two
        on both axes, and reduced to its first colour channel. Pixels equal to
        0, 109 or 144 (the two background shades) become 0.0, everything else
        (paddles, ball) becomes 1.0.

        The input frame is not modified, and the float result makes frame
        differences signed (-1/0/+1) instead of wrapping around in uint8.
        """
        frame = I[35:195:2, ::2, 0]  # crop, then keep every second row/column
        foreground = ~np.isin(frame, (0, 109, 144))
        return foreground.astype('float32')

    def predict_next_step(self):
        """Sample the next action from the policy and store it in ``self.action``.

        The network sees the difference between the current and the previous
        preprocessed frame.
        """
        img = (self.state4 - self.prev_state4)[None, :, :, None]
        act = self.brain.predict(img).flatten().tolist()
        # float32 softmax outputs may not sum to exactly 1 once converted,
        # which np.random.choice rejects; recompute the second probability.
        act[1] = 1 - act[0]
        # Output index 0 maps to gym action 2, index 1 to gym action 3.
        self.action = int(np.random.choice([2, 3], p=act))

    def train(self, states_tensor, actions_tensor=None, d_rewards_tensor=None):
        """Run one policy-gradient update and return ``[loss]``.

        Accepts either three separate arrays or, for compatibility with
        existing callers, a single ``[states, actions, d_rewards]`` sequence
        as the first argument.

        Args:
            states_tensor: batch of network inputs, or the 3-item sequence.
            actions_tensor: one-hot actions taken, one row per input.
            d_rewards_tensor: discounted reward for every step.

        Raises:
            TypeError: if exactly one of the two optional arguments is given.
        """
        if actions_tensor is None and d_rewards_tensor is None:
            states_tensor, actions_tensor, d_rewards_tensor = states_tensor
        elif actions_tensor is None or d_rewards_tensor is None:
            raise TypeError(
                "train() needs states, actions and discounted rewards")
        return self.train_fn([states_tensor, actions_tensor, d_rewards_tensor])

# Build the agent with frames=1; the constructor also initialises the
# environment and the policy network.
agent = DQNAgent(1)

def discount_rewards(r, gamma=0.99, step_penalty=0.01):
    """Compute per-step discounted returns for one episode.

    The running return is reset whenever a non-zero reward is encountered
    (walking backwards), so every stretch of steps ending in a non-zero
    reward is discounted independently. The result is not normalised.

    Args:
        r: 1-D sequence (list or array) of per-step rewards.
        gamma: discount factor applied per step.
        step_penalty: constant subtracted from every discounted return.

    Returns:
        A tuple ``(discounted_r, sum_disc_raw)``: the float32 array of
        penalised discounted returns and their sum.
    """
    r = np.asarray(r)
    discounted_r = np.zeros_like(r, dtype='float32')
    running_add = 0
    for t in reversed(range(r.size)):
        if r[t] != 0:
            # A non-zero reward starts a fresh discounting stretch.
            running_add = 0
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    # Cast so the result stays float32 whatever type the penalty has.
    discounted_r = discounted_r - np.float32(step_penalty)
    sum_disc_raw = np.sum(discounted_r)
    return discounted_r, sum_disc_raw

def play1game(agent):
    """Play one full episode with the agent's policy and train on it.

    For every step the function records the input the policy acted on (the
    difference between the current and the previous preprocessed frame), the
    one-hot encoded action and the reward. After the episode it performs a
    single policy-gradient update through ``agent.train``.

    Side effects: resets the agent's environment, updates the network
    weights, and stores the raw rewards in ``agent.sum_raw_rew`` and the
    summed discounted rewards in ``agent.sum_disc_raw``.

    Args:
        agent: a ``DQNAgent`` instance.

    Returns:
        ``(d_rewards, log_state4, log_action, loss)``: discounted rewards,
        the list of recorded policy inputs, the list of one-hot actions and
        the value returned by ``agent.train``.
    """
    log_state4, log_reward, log_action = [], [], []

    for subep in range(1):
        agent.init_env()
        while not agent.done:
            agent.predict_next_step()
            # Capture the frame difference the action was sampled from before
            # step1() overwrites it; the gradient must be taken for the
            # observation the policy actually saw, not the frame that follows.
            policy_input = agent.state4 - agent.prev_state4
            agent.step1()
            log_state4.append(policy_input)
            log_reward.append(agent.reward)
            # One-hot index 0 corresponds to gym action 2, index 1 to action 3.
            log_action.append([1, 0] if agent.action == 2 else [0, 1])
        d_rewards, sum_disc_raw = discount_rewards(np.array(log_reward), gamma=0.99)
        loss = agent.train([np.stack(log_state4, axis=0)[:, :, :, None],
                            np.stack(log_action, axis=0),
                            d_rewards])
    agent.sum_raw_rew = log_reward
    agent.sum_disc_raw = sum_disc_raw
    print('length of episode', len(d_rewards))
    return d_rewards, log_state4, log_action, loss

InternalError: Failed to create session.

In [None]:
# Train for a fixed number of episodes: play and learn from one game per
# iteration, checkpoint the weights every tenth episode, and append one
# tab-separated statistics line per episode to the log file.
for episode in range(15000):
    print('EPISODE', episode)
    d_rewards, log_state4, log_action, loss = play1game(agent)
    # Fraction of steps on which the first action (one-hot index 0) was taken.
    avg_up = np.mean([one_hot[0] for one_hot in log_action])
    if not episode % 10:
        print('saving weights')
        agent.brain.save_weights('PG_4frames_weights.h5')
    log_fields = [
        "episode " + str(episode),
        "loss " + str(loss),
        " raw_d_rw " + str(agent.sum_disc_raw),
        " raw_rw " + str(np.sum(agent.sum_raw_rew)),
        " avg_up " + str(avg_up),
        " game_len " + str(len(d_rewards)),
    ]
    with open("PG_4frames.txt", "a") as log_file:
        log_file.write("\t".join(log_fields) + "\t" + "\n")

EPISODE 0
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80)
state (80, 80) state4 (80, 80