In [1]:
import gym

In [7]:
# Create the MountainCar-v0 environment that the cells below interact with.
env = gym.make("MountainCar-v0")

In [8]:
# Reset the environment; the initial observation it returns is shown as the cell output.
env.reset()

array([-0.40340416,  0.        ])

# Playing with a random strategy

In [11]:
# Baseline: play a fixed number of episodes with uniformly random actions and
# report the right-most position (observation[0]) reached in each episode.
n_random_episodes=20   # episodes to play
max_steps=200          # upper bound on steps per episode
try:
    for e in range(n_random_episodes):
        # Each episode starts from a fresh reset, so no reset is needed before the loop.
        observations=env.reset()
        max_pos=observations[0]
        for t in range(max_steps):
            env.render()
            observations,reward,done,info=env.step(env.action_space.sample())
            max_pos=max(max_pos,observations[0])
            if done:
                break
        print("game over {}/{}: high score:{}".format(e+1,n_random_episodes,max_pos))
finally:
    # Release the render window even if an episode raised.
    env.close()

game over 1/20: high score:-0.4087496643632926
game over 2/20: high score:-0.3300978613732684
game over 3/20: high score:-0.46269613731745507
game over 4/20: high score:-0.4843656601112598
game over 5/20: high score:-0.3977204352197913
game over 6/20: high score:-0.3901900601228825
game over 7/20: high score:-0.47584323408271884
game over 8/20: high score:-0.3717156629297182
game over 9/20: high score:-0.4084105849086924
game over 10/20: high score:-0.3952660301221075
game over 11/20: high score:-0.3221162835422451
game over 12/20: high score:-0.431525420616796
game over 13/20: high score:-0.45194791990148303
game over 14/20: high score:-0.42148519433993115
game over 15/20: high score:-0.43651691236075424
game over 16/20: high score:-0.46806935626579016
game over 17/20: high score:-0.48033068977230436
game over 18/20: high score:-0.35310134483583744
game over 19/20: high score:-0.3271648077956955
game over 20/20: high score:-0.42562464107473424


# Learning Part

In [12]:
import os
import random
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
# The two star imports keep their original relative order (models, then layers)
# so that any overlapping names resolve exactly as before.
from keras.models import *
from keras.layers import *
from keras.optimizers import Adam

Using TensorFlow backend.


# Creating an agent class

In [29]:
class Agent:
    """Epsilon-greedy Q-learning agent backed by a small dense Keras network.

    Transitions are stored in a bounded replay memory; `train` samples a
    minibatch from it and fits the network towards one-step Q-learning
    targets. Exploration is controlled by `epsilon`, which is multiplied by
    `epsilon_decay` once per `train` call until it reaches `epsilon_min`.
    """

    def __init__(self,state_size,action_size):
        """Set up hyper-parameters, an empty replay memory and the Q-network.

        Args:
            state_size: number of values in one observation (network input width).
            action_size: number of discrete actions (network output width).
        """
        self.state_size=state_size
        self.action_size=action_size
        self.memory=deque(maxlen=2000)   # oldest transitions are dropped once full
        self.gamma=0.95                  # discount factor for future rewards
        self.epsilon=1.0                 # start fully exploratory
        self.epsilon_decay=0.90
        self.epsilon_min=0.01
        self.model=self.create_model()

    def create_model(self):
        """Build and compile the Q-network: state -> one Q-value per action."""
        model=Sequential()
        # Layer widths follow the sizes given to the constructor instead of
        # hard-coded 2 / 3, so the agent also works for other spaces.
        model.add(Dense(32,input_shape=(self.state_size,),activation='relu'))
        model.add(Dense(64,activation='relu'))
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer='Adam')
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action for `state` using the epsilon-greedy rule.

        With probability `epsilon` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.

        Args:
            state: observation in the form the model's `predict` expects
                (the visible callers pass an array reshaped to [1, state_size]).

        Returns:
            The chosen action index as a plain int.
        """
        if np.random.rand()<self.epsilon:
            return random.randrange(self.action_size)
        act_values=self.model.predict(state)
        # int() keeps the return type the same on both branches.
        return int(np.argmax(act_values[0]))

    def train(self,batch_size=32):
        """Replay a random minibatch from memory and fit the network on it.

        For every sampled transition the target for the taken action is
        `reward` if the transition is terminal, else
        `reward + gamma * max_a Q(next_state, a)`; the other actions keep
        their current predictions. Does nothing when the memory holds fewer
        than `batch_size` transitions.

        Args:
            batch_size: number of transitions to sample.
        """
        # random.sample raises ValueError when asked for more items than exist.
        if len(self.memory)<batch_size:
            return
        minibatch=random.sample(self.memory,batch_size)
        for state,action,reward,next_state,done in minibatch:
            target=reward
            if not done:
                target=reward+self.gamma*np.amax(self.model.predict(next_state)[0])
            target_f=self.model.predict(state)
            target_f[0][action]=target
            self.model.fit(state,target_f,epochs=1,verbose=0)
        # Decay exploration once per replay, not once per sample; decaying
        # inside the loop shrank epsilon by epsilon_decay**batch_size per call.
        if self.epsilon>self.epsilon_min:
            self.epsilon*=self.epsilon_decay

    def load(self,name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self,name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)


In [39]:
# Training configuration shared by the cells below.
n_episodes = 100                  # number of training episodes
batch_size = 32                   # transitions replayed per training call
state_size, action_size = 2, 3    # observation width and number of discrete actions
output_dir = "mountain_car/"      # prefix for the saved weight files

In [40]:
# Build the learning agent with the configured state and action sizes.
agent = Agent(state_size=state_size, action_size=action_size)


# Training the agent on generated data

* Note that we have reshaped the reward. Instead of the reward returned by gym, we use `(action - 1) * velocity`, so the agent is rewarded for driving forward when the velocity is positive and for driving backwards when the velocity is negative.
* The reward returned by gym is always -1 as long as the car has not reached the flag, which is situated at position +0.5. Building a policy from that reward alone would have required a lot of exploration.

In [41]:
# Train the agent: play n_episodes episodes, store every transition in the
# agent's replay memory, run one replay-training pass after each episode and
# save the weights every 10th episode.
os.makedirs(output_dir,exist_ok=True)  # the weight files below are written into this directory
max_steps=200        # upper bound on steps per episode
goal_position=0.5    # position at which the episode counts as completed
try:
    for e in range(n_episodes):
        state=np.reshape(env.reset(),[1,state_size])
        max_pos=state[0][0]
        reached_goal=False
        for step in range(max_steps):
            env.render()
            action=agent.act(state)
            next_state,reward,done,info=env.step(action)
            next_state=np.reshape(next_state,[1,state_size])
            max_pos=max(max_pos,next_state[0][0])
            # Shaped reward replaces the one returned by the environment:
            # (action-1) is -1/0/+1 and state[0][1] is the second observation
            # component (described as the velocity in the notes above), so the
            # product is positive when the action agrees with its sign.
            reward=(action-1)*state[0][1]
            # NOTE(review): `done` is stored as returned by the environment; if
            # it is also set when a step limit is hit, those transitions are
            # treated as terminal in Agent.train - confirm this is intended.
            agent.remember(state,action,reward,next_state,done)
            state=next_state
            if state[0][0]>=goal_position:
                reached_goal=True
                break
            if done:
                break
        if reached_goal:
            print("Game {}/{}: Completed Successfully".format(e+1,n_episodes))
        else:
            print("Game {}/{}: Unsuccessful attempt, high score: {}".format(e+1,n_episodes,max_pos))
        if len(agent.memory)>batch_size:
            agent.train(batch_size)
        if e%10==0:
            agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5")
finally:
    # Release the render window even if training was interrupted.
    env.close()

Game 1/100: Unsuccessful attempt, high score: -0.2463605590327244
Game 2/100: Unsuccessful attempt, high score: -0.460094356573934
Game 3/100: Unsuccessful attempt, high score: -0.46296546958087514
Game 4/100: Unsuccessful attempt, high score: -0.4918548234129807
Game 5/100: Unsuccessful attempt, high score: -0.5731088618689911
Game 6/100: Unsuccessful attempt, high score: -0.3588716876777703
Game 7/100: Unsuccessful attempt, high score: -0.48005745065901567
Game 8/100: Unsuccessful attempt, high score: -0.4065040319531399
Game 9/100: Unsuccessful attempt, high score: -0.4450758928501298
Game 10/100: Unsuccessful attempt, high score: 0.14496182604197166
Game 11/100: Unsuccessful attempt, high score: -0.484008043394069
Game 12/100: Unsuccessful attempt, high score: -0.40446890258771373
Game 13/100: Unsuccessful attempt, high score: -0.5074997430873608
Game 14/100: Unsuccessful attempt, high score: -0.3457247413046742
Game 15/100: Unsuccessful attempt, high score: -0.48137241762271077
Ga

## The agent is able to complete the game more often as the training progresses. We have successfully implemented an agent which plays the mountain car game for us.