# Open.ai GYM Mountain Car v0 

[hidden_link](https://blog.tanka.la/2018/10/19/solving-curious-case-of-mountaincar-reward-problem-using-openai-gym-keras-tensorflow-in-python/)

In [1]:
import gym
import numpy as np
import keras as K
import random


In [2]:
env = gym.make("MountainCar-v0")
env.reset()
goal_steps = 200
score_requirement = -198
initial_games = 10000

In [3]:
def play_a_random_game_first():
    for step_index in range(goal_steps):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print("Step {}:".format(step_index))
        print("action: {}".format(action))
        print("observation: {}".format(observation))
        print("reward: {}".format(reward))
        print("done: {}".format(done))
        print("info: {}".format(info))
        if done:
            break
    env.reset()
    env.close()

In [4]:
play_a_random_game_first()

Step 0:
action: 0
observation: [-0.4824299  -0.00131779]
reward: -1.0
done: False
info: {}
Step 1:
action: 2
observation: [-0.48305567 -0.00062577]
reward: -1.0
done: False
info: {}
Step 2:
action: 0
observation: [-0.48498477 -0.00192909]
reward: -1.0
done: False
info: {}
Step 3:
action: 1
observation: [-0.48720282 -0.00221805]
reward: -1.0
done: False
info: {}
Step 4:
action: 0
observation: [-0.4906933  -0.00349048]
reward: -1.0
done: False
info: {}
Step 5:
action: 1
observation: [-0.49443017 -0.00373687]
reward: -1.0
done: False
info: {}
Step 6:
action: 0
observation: [-0.49938552 -0.00495536]
reward: -1.0
done: False
info: {}
Step 7:
action: 2
observation: [-0.50352232 -0.0041368 ]
reward: -1.0
done: False
info: {}
Step 8:
action: 1
observation: [-0.5078096  -0.00428728]
reward: -1.0
done: False
info: {}
Step 9:
action: 0
observation: [-0.51321525 -0.00540565]
reward: -1.0
done: False
info: {}
Step 10:
action: 2
observation: [-0.51769876 -0.00448352]
reward: -1.0
done: False
info: {

## Creating data from action steps

In [5]:
def action_data_preparation():
    training_data = []
    scores = []
    
    for index in range(initial_games):
        score = 0
        memory = []
        prev_observation = []
        
        for step_index in range(goal_steps):
            action = random.randrange(0,3)
            observation, reward, done, info = env.step(action)
            
            if len(prev_observation) > 0:
                memory.append([prev_observation, action])
                
            prev_observation = observation
            
            if observation[0] > -0.2:
                reward = 1
            
            score += reward
            if done:
                break
                
        if score >= score_requirement:
            scores.append(score)

            for data in memory:
                if data[1] == 0:
                    output = [1,0,0]
                elif data[1] == 1:
                    output = [0,1,0]
                elif data[1] == 2:
                    output = [0,0,1]
                training_data.append([data[0], output])

        env.reset()
        
    env.close()
    print(scores)
    
    return training_data
            
        

In [6]:
train_data = action_data_preparation()

[-180.0, -174.0, -184.0, -182.0, -198.0, -154.0, -178.0, -176.0, -190.0, -182.0, -184.0, -186.0, -194.0, -192.0, -182.0, -190.0, -186.0, -176.0, -184.0, -188.0, -176.0, -186.0, -168.0, -188.0, -186.0, -180.0, -174.0, -192.0, -178.0, -170.0, -194.0, -188.0, -184.0, -176.0, -180.0, -172.0]


In [7]:
def build_model(input_size, output_size):
    model = K.models.Sequential([
        K.layers.Dense(128, input_dim=input_size, activation='relu'),
        K.layers.Dense(52, activation='relu'),
        K.layers.Dense(output_size, activation='linear'),
    ])
    model.compile(loss='mse', optimizer=K.optimizers.Adam())
    
    return model

In [8]:
def train(training_data):
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))
    Y = np.array([i[1] for i in training_data]).reshape(-1, len(training_data[0][1]))
    
    model = build_model(len(X[0]), len(Y[0]))
    
    model.fit(X, Y, epochs= 10)
    
    return model

In [9]:
trained_model = train(train_data)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
def prepare():
    scores = []
    choices = []
    
    for each in range(100):
        score = 0
        memory = []
        prev_obs = []
        
        for step in range(goal_steps):
            env.render()
            
            if len(prev_obs) == 0:
                action = random.randrange(0, 2)
            
            else:
                action = np.argmax(trained_model.predict(prev_obs.reshape(-1, len(prev_obs)))[0])
                
            choices.append(action)
            
            new_obs, reward, done, info = env.step(action)
            
            prev_obs = new_obs
            
            memory.append([new_obs, action])
            score += reward
            
            if done:
                break
                
    env.reset()
    
    
    scores.append(score)
    env.close()
    print(scores)
    print('Average Score:',sum(scores)/len(scores))
    print('choice 1:{}  choice 0:{} choice 2:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices),choices.count(2)/len(choices)))
    
    

In [13]:
prepare()

[-1.0]
Average Score: -1.0
choice 1:0.26339285714285715  choice 0:0.39285714285714285 choice 2:0.34375
