In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import gym

# 簡化版的deep q learning： 僅把q-table換成類神經網路
＃在Reinforcement Learning中我們都利用Q_target（把它當作真實的Q值)來更新神經網路的weights。
公式： Q_target = Q(s) + alpha*( R(s,a) + Q(s_)*Gamma-Q(s) )  
(s_ 代表下一步的狀態，下一步的狀態有很多種可能，我們這裡選擇的s_是能得到最大Q的狀態，這種方法是比較agressive的方法，還有另外一種是SARSA有興趣可以自尋搜尋一下； alpha這邊我們設定為1）
因此公式就變成 Q_target = R(s,a) + max(Q(s_,a))*Gamma  

In [None]:
class QLearning:
    
    def __init__(
        self, 
        n_actions, #動作的維度，例如上下左右就有四維
        n_states, #用來描述狀態的維度，例如馬力歐在平面上就是二維
        gamma = 0.9, #遠見程度
        epsilon = 0.9,  #保守程度，越大就越容易用Q值大小來採取行動；越小則越容易產生隨機行動
        learning_rate = 0.001 #神經網路的更新率
    ):
    
        self.n_actions = n_actions
        self.n_states = n_states
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = learning_rate
        
        tf.reset_default_graph() ## 重新 build graph 需要跑這行
        self.sess = tf.Session() #宣告session
        #輸入current state
        self.state_input = tf.placeholder(shape = [None, self.n_states], 
                                          name = 'input',
                                          dtype = tf.float32)
        #q_target = R(s, action) + Q(s_)*Gamma 
        self.q_target = tf.placeholder(shape = [None, self.n_actions], 
                                       name = 'q_target',
                                       dtype = tf.float32)
        #搭建神經網路
        with tf.variable_scope('Q_table'):
            self.q_eval = self.build_network('net_eval') 
        
        # 管理神經網路的parameters
        self.Qnet_eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Q_table/net_eval')
        
        #計算q_target和q_eval的mse來更新神經網路的參數
        self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        self.train = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss, var_list=self.Qnet_eval_params)
        
        self.sess.run(tf.global_variables_initializer()) #將神經網路初始化
        
            
    def build_network(self, scope): 
        with tf.variable_scope(scope):
            x_h1 = tf.layers.dense(inputs = self.state_input, units = 5, activation = tf.nn.tanh)        
            x_h2 = tf.layers.dense(inputs = x_h1, units = 5, activation = tf.nn.tanh)             
        return tf.layers.dense(inputs = x_h2, units = self.n_actions) #輸出‘不同動作’對應的Q值 
               
 
            
    def choose_action(self, current_state):
        """
        利用epsilon來控制探索的隨機程度，通常探索初期會給比較小的epsilon增加行為的隨機程度，
        然後隨著遊戲的進行慢慢增加epsilon。不過由於這裡的遊戲較簡單，就不做此設定。
        """
        if np.random.uniform() < self.epsilon: 
            #選擇產生估計Q值較大的行動
            q_eval = self.sess.run(self.q_eval, feed_dict={self.state_input: current_state[np.newaxis, :]})
            self.action = np.argmax(q_eval)
        else:
            #採取隨機行動
            self.action = np.random.randint(0, self.n_actions)
        return self.action
    
    def learn(self, current_state, reward, next_state):
        
        #算出實際q值並用此更新神經網路參數
        q_eval = self.sess.run(self.q_eval, feed_dict={self.state_input: current_state[np.newaxis, :]})
        q_eval_next = self.sess.run(self.q_eval, feed_dict={self.state_input: next_state[np.newaxis, :]})
        q_target = q_eval.copy()
        q_target[:, self.action] = reward + self.gamma*q_eval_next.max()
        _, self.cost = self.sess.run([self.train, self.loss], feed_dict={self.state_input: current_state[np.newaxis, :],
                                                                            self.q_target: q_target})
    def model_save(self, model_name):
        
        saver = tf.train.Saver()
        saver.save(self.sess, "saved_models/{}.ckpt".format(model_name))
    
    def model_restore(self, model_name):
        
        saver = tf.train.Saver()
        saver.restore(self.sess, "saved_models/{}.ckpt".format(model_name))
    

# 提示
1. 在/Users/Yourname/anaconda3/lib/python3.6/site-packages/gym/envs底下可以找到Gym AI底下所有遊戲的文件，其中__init__.py定義了呼叫各個遊戲的名稱，例如moutain car你就得用gym.make(‘MountainCar-v0’)，另外和遊戲相關的py檔在envs/classic_control的資料夾內。我們接下來要玩的是離散版本的不是連續版的喔～，另外如果您找不到的話我們也將檔案拉出來放在gym document供大家參考。

2. 在/Users/Yourname/anaconda3/lib/python3.6/site-packages/gym/envs底下可以找到Gym AI底下所有遊戲的文件，其中__init__.py定義了呼叫各個遊戲的名稱，例如moutain car你就得用gym.make(‘MountainCar-v0’)，另外和遊戲相關的py檔在envs/classic_control的資料夾內。我們接下來要玩的是離散版本的不是連續版的喔～，另外如果您找不到的話我們也將檔案拉出來放在gym document供大家參考。

In [None]:
env = gym.make('MountainCar-v0')
"""
執行gym ai的遊戲時也請加下面這兩行
"""
env = env.unwrapped
env.seed(1)

RL = QLearning(n_actions = 3, 
               n_states = 2,
               gamma = 0.99,
               epsilon = 0.9,
               learning_rate = 0.01
               )
reward_record = []
step_record = []
for episode in range(100):
    # initial environment並給出起始的state
    total_reward = 0
    step = 0
    current_state = env.reset()

    while True:

        # 產生環境視窗
        env.render()
        # RL choose action based on current state
        action = RL.choose_action(current_state)

        """
        Gym ai 的遊戲step都會output 4個值，分別為下一狀態、
        獎勵、回合結束與否和info，不過info我們用不到因此不用管
        他

        """
        # RL take action and get next state and reward
        next_state, reward, done, _ = env.step(action)

        position, velocity = next_state
        reward = abs(position + 0.5) + reward

        total_reward+= reward

        RL.learn(current_state, reward, next_state)
        
        step+=1
        # break while loop when end of this episode
        if done:
            print('Episode{} Total Step:{} Total Reward:{}'.format(episode, step, total_reward))
     
            reward_record.append(total_reward)
            step_record.append(step)
            break
        # swap state
        current_state = next_state  
    env.close()

Episode0 Total Step:25918 Total Reward:-21167.473805910824
Episode1 Total Step:24770 Total Reward:-20165.215424436952
Episode2 Total Step:22819 Total Reward:-17925.791477029914
Episode3 Total Step:18366 Total Reward:-14549.468423594855
Episode4 Total Step:19822 Total Reward:-16309.28179291167
Episode5 Total Step:22232 Total Reward:-17372.687209061103
Episode6 Total Step:18697 Total Reward:-13751.687729503503
Episode7 Total Step:22556 Total Reward:-18434.74256329547
Episode8 Total Step:18686 Total Reward:-15042.218447951434
Episode9 Total Step:20863 Total Reward:-16868.16821927978
Episode10 Total Step:20128 Total Reward:-16421.6224800781
Episode11 Total Step:23366 Total Reward:-19130.365145711996
Episode12 Total Step:23997 Total Reward:-19225.82150423933
Episode13 Total Step:16179 Total Reward:-12031.246719531173
Episode14 Total Step:20246 Total Reward:-16062.964810347185
Episode15 Total Step:24704 Total Reward:-19954.269132214187
Episode16 Total Step:20063 Total Reward:-16521.535116319

# 玩個幾回就發現total step都沒下降，不玩了～

In [None]:
reward_result = pd.concat(reward_record)
reward_result.columns = ['Q_Agent']
step_result = pd.concat(step_record)
step_result.columns = ['Q_Agent']