In [3]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import math
from collections import deque

In [4]:
# Scratch cell: a flat vector of the integers 0..7 and, independently,
# the same integers arranged as a 2x2x2 cube.
b = np.arange(0, 8)
a = np.reshape(np.arange(0, 8), (2, 2, 2))
print(b)

[0 1 2 3 4 5 6 7]


In [20]:
'''
Class QSolver initializes all the parameters and runs the agent.
- computeStateActionValues() maps a continuous CartPole observation to one value per action using the
linear model obs * A + b.
- updateEpsilon() and updateAlpha() reduce the values of epsilon and alpha over the episodes, so that the
agent explores less and takes smaller learning steps as training proceeds.
- chooseAction() chooses, with respect to the epsilon value, either a random action or the action that has
the maximum estimated value.
- updateAvalue() and updateBvalue() update A and b with respect to the action that has been taken and its reward.
- plotScores() is a helper method that plots the final reward/episode figure and saves it.
- runEpisodes() runs the episodes and learns how to keep the stick stable.
'''

class QSolver():
    """Q-learning agent that approximates action values with a linear model.

    The action values of an observation ``obs`` are ``obs @ A + b``: ``A`` has
    one column per action and ``b`` one bias per action. Both are trained
    with a semi-gradient step on the squared TD error
    ``0.5 * (Q(s, a) - (r + gamma * max_a' Q(s', a')))**2``.
    """

    def __init__(self, alpha=0.03, epsilon=1, gamma=0.999, time_steps=500,
                 number_of_episodes=1000, position_bucket=2, velocity_bucket=2,
                 angle_bucket=10, ang_vel_bucket=16, env_name='CartPole-v1'):
        """Store the hyperparameters, create the environment and zero the model.

        Args:
            alpha: initial learning rate.
            epsilon: initial exploration probability.
            gamma: discount factor.
            time_steps: maximum number of steps per episode.
            number_of_episodes: maximum number of training episodes.
            position_bucket, velocity_bucket, angle_bucket, ang_vel_bucket:
                sizes of the discretisation table ``Q``. They are stored, but
                no method of this class reads ``Q`` or ``boxes``.
            env_name: Gym environment id. runEpisodes ends an episode as soon
                as the environment reports ``done``, so the environment must
                allow episodes longer than the 400-reward stop criterion.
                NOTE(review): the default assumes CartPole-v1 is CartPole-v0
                with a 500-step limit instead of 200 - confirm against the
                installed gym version.
        """
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma
        self.time_steps = time_steps
        self.number_of_episodes = number_of_episodes
        self.position_bucket = position_bucket
        self.velocity_bucket = velocity_bucket
        self.angle_bucket = angle_bucket
        self.ang_vel_bucket = ang_vel_bucket
        self.env = gym.make(env_name)
        n_actions = self.env.action_space.n
        n_features = self.env.observation_space.shape[0]
        # Kept as attributes for backward compatibility only.
        self.boxes = (self.position_bucket, self.velocity_bucket,
                      self.angle_bucket, self.ang_vel_bucket)
        self.Q = np.zeros(self.boxes + (n_actions,))
        self.scores = []
        # Linear model parameters, both starting at zero.
        self.A = np.zeros((n_features, n_actions))
        self.b = np.zeros((1, n_actions))
        # Most recent TD derivative, kept for inspection.
        self.derivative = 0

    def computeStateActionValues(self, obs):
        """Return ``obs @ A + b``, the action values of ``obs`` (one row)."""
        return np.matmul(obs, self.A) + self.b

    # Equation taken from:
    # https://machinelearningmastery.com/using-learning-rate-schedules-deep-learning-models-python-keras/
    def updateEpsilon(self, t, epsilon):
        """Return ``epsilon`` scaled by ``1 / (1 + 0.0001 * (t + 1))``."""
        return epsilon * 1 / (1 + 0.0001 * (t + 1))

    # Equation taken from:
    # https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1
    def updateAlpha(self, t, alpha):
        """Return ``alpha`` scaled by ``exp(-0.1 * t)``."""
        k = 0.1
        return alpha * math.exp(-k * t)

    def chooseAction(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: action values as returned by computeStateActionValues;
                only the first row is read.
            epsilon: probability of sampling a random action instead.

        Returns:
            A random action from the environment's action space, or the index
            of the largest value in ``state[0]``.
        """
        if np.random.random() <= epsilon:
            return self.env.action_space.sample()
        return np.argmax(state[0])

    def computeDerivativeOfLwrtOut(self, reward, current_action, old_state_action_value,
                                   old_observation, new_observation, done=False):
        """Return dL/d(out), i.e. ``Q(s, a) - target``, and store it on self.

        Args:
            reward: reward received for the transition.
            current_action: index of the action that was taken.
            old_state_action_value: action values of the previous observation.
            old_observation: unused; kept for signature compatibility.
            new_observation: observation reached by the transition.
            done: True when the transition ended the episode. No future
                reward exists then, so the target is the reward alone.
        """
        if done:
            target = reward
        else:
            next_values = self.computeStateActionValues(new_observation)
            target = reward + self.gamma * np.max(next_values[0])
        derivative = old_state_action_value[0][current_action] - target
        self.derivative = derivative
        return derivative

    def updateAvalue(self, reward, current_action, old_state_action_value,
                     old_observation, new_observation, alpha, done=False):
        """Take one gradient step on the column of A that belongs to the action.

        d(out)/dA for that column is the previous observation, so the step is
        ``alpha * dL/d(out) * old_observation``.
        """
        derivative = self.computeDerivativeOfLwrtOut(
            reward, current_action, old_state_action_value,
            old_observation, new_observation, done)
        features = np.asarray(old_observation, dtype=float).reshape(-1)
        self.A[:, current_action] -= alpha * (derivative * features)

    def updateBvalue(self, reward, current_action, old_state_action_value,
                     old_observation, new_observation, alpha, done=False):
        """Take one gradient step on the bias of the action (d(out)/db is 1)."""
        derivative = self.computeDerivativeOfLwrtOut(
            reward, current_action, old_state_action_value,
            old_observation, new_observation, done)
        self.b[:, current_action] -= alpha * derivative

    def plotScores(self):
        """Plot total reward per episode, save it as a PNG and show it."""
        plt.plot(self.scores)
        plt.xlabel('episode')
        plt.ylabel('total reward')
        plt.savefig('RewardOverEpisodes.png')
        plt.show()

    def runEpisodes(self):
        """Train for up to ``number_of_episodes`` episodes, then plot the scores.

        Training stops early after 30 consecutive episodes whose total reward
        is at least 400.
        """
        accepted_score = 400
        required_streak = 30
        streak = 0
        # NOTE(review): each episode feeds the previous result back into the
        # schedules, so the decay compounds. alpha shrinks like
        # exp(-0.1 * (0 + 1 + ... + i)) and is practically zero after a few
        # dozen episodes. Confirm this is intended.
        epsilon = self.epsilon
        alpha = self.alpha
        for i in range(self.number_of_episodes):
            current_observation = self.env.reset()
            current_state = self.computeStateActionValues(current_observation)
            epsilon = self.updateEpsilon(i, epsilon)
            alpha = self.updateAlpha(i, alpha)
            episode_reward = 0
            print("episode: " + str(i))
            for _ in range(self.time_steps):
                action = self.chooseAction(current_state, epsilon)
                new_observation, reward, done, info = self.env.step(action)
                # An episode cut off by the time limit is not a real terminal
                # state, so it still bootstraps from the next observation.
                terminal = done and not info.get('TimeLimit.truncated', False)
                self.updateAvalue(reward, action, current_state,
                                  current_observation, new_observation, alpha, terminal)
                self.updateBvalue(reward, action, current_state,
                                  current_observation, new_observation, alpha, terminal)
                episode_reward += reward
                if done:
                    # Stepping a finished environment is undefined behaviour.
                    break
                # Advance the observation together with its action values, and
                # recompute the values so they reflect the updated A and b.
                current_observation = new_observation
                current_state = self.computeStateActionValues(current_observation)
            self.scores.append(episode_reward)
            if episode_reward >= accepted_score:
                streak += 1
            else:
                streak = 0
            if streak >= required_streak:
                print("converged")
                print(i)
                break
        self.plotScores()
                

In [None]:
if __name__ == "__main__":
    
    # Build an agent with the default hyperparameters of QSolver.__init__.
    agent = QSolver()
    # Train it; runEpisodes prints the episode counter while training and
    # plots the per-episode scores once the episode loop ends.
    agent.runEpisodes()
    # Disabled one-step debug trace:
    #agent.deneme()

episode: 0
episode: 1
episode: 2
episode: 3
episode: 4
episode: 5
episode: 6
episode: 7
episode: 8
episode: 9
episode: 10
episode: 11
episode: 12
episode: 13
episode: 14
episode: 15
episode: 16
episode: 17
episode: 18
episode: 19
episode: 20
episode: 21
episode: 22
episode: 23
episode: 24
episode: 25
episode: 26
episode: 27
episode: 28
episode: 29
episode: 30
episode: 31
episode: 32
episode: 33
episode: 34
episode: 35
episode: 36
episode: 37
episode: 38
episode: 39
episode: 40
episode: 41
episode: 42
episode: 43
episode: 44
episode: 45
episode: 46
episode: 47
episode: 48
episode: 49
episode: 50
episode: 51
episode: 52
episode: 53
episode: 54
episode: 55
episode: 56
episode: 57
episode: 58
episode: 59
episode: 60
episode: 61
episode: 62
episode: 63
episode: 64
episode: 65
episode: 66
episode: 67
episode: 68
episode: 69
episode: 70
episode: 71
episode: 72
episode: 73
episode: 74
episode: 75
episode: 76
episode: 77
episode: 78
episode: 79
episode: 80
episode: 81
episode: 82
episode: 83
ep

In [None]:
    def deneme(self):
        """Debug helper ("deneme" is Turkish for "trial"): trace one update.

        Resets the environment, takes a single step and prints A and b before
        and after the corresponding updates.
        """
        current_observation = self.env.reset()
        current_state = self.computeStateActionValues(current_observation)
        # updateEpsilon needs the value to decay, and the update methods need
        # the learning rate; the stored initial values are used for both.
        epsilon = self.updateEpsilon(0, self.epsilon)
        action = self.chooseAction(current_state, epsilon)
        new_observation, reward, _, _ = self.env.step(action)
        print(" A old")
        print(self.A)
        self.updateAvalue(reward, action, current_state, current_observation, new_observation, self.alpha)
        print(" A new")
        print(self.A)
        print(" b old")
        print(self.b)
        self.updateBvalue(reward, action, current_state, current_observation, new_observation, self.alpha)
        print(" b new")
        print(self.b)