In [0]:
import gym
import random
import numpy as np
from keras.models     import Sequential
from keras.layers     import Dense
from keras.optimizers import Adam

In [0]:
env = gym.make('CartPole-v1')
env.reset()
goal_steps = 500
score_requirement = 70
intial_games = 20000

In [0]:
def play_a_random_game_first():
    for step_index in range(goal_steps):
        #env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print("Step {}:".format(step_index))
        print("action: {}".format(action))
        print("observation: {}".format(observation))
        print("reward: {}".format(reward))
        print("done: {}".format(done))
        print("info: {}".format(info))
        if done:
            break
    env.reset()

In [0]:
play_a_random_game_first()

Step 0:
action: 0
observation: [-0.02511281 -0.19154075  0.03282834  0.32795913]
reward: 1.0
done: False
info: {}
Step 1:
action: 1
observation: [-0.02894362  0.00309883  0.03938753  0.04580686]
reward: 1.0
done: False
info: {}
Step 2:
action: 1
observation: [-0.02888165  0.1976345   0.04030366 -0.23419339]
reward: 1.0
done: False
info: {}
Step 3:
action: 1
observation: [-0.02492896  0.39215811  0.03561979 -0.51389578]
reward: 1.0
done: False
info: {}
Step 4:
action: 1
observation: [-0.0170858   0.58676077  0.02534188 -0.79514461]
reward: 1.0
done: False
info: {}
Step 5:
action: 0
observation: [-0.00535058  0.39130035  0.00943899 -0.4945984 ]
reward: 1.0
done: False
info: {}
Step 6:
action: 1
observation: [ 2.47542739e-03  5.86287922e-01 -4.52980849e-04 -7.84291706e-01]
reward: 1.0
done: False
info: {}
Step 7:
action: 1
observation: [ 0.01420119  0.7814161  -0.01613881 -1.07711711]
reward: 1.0
done: False
info: {}
Step 8:
action: 0
observation: [ 0.02982951  0.58651101 -0.03768116 -0.7

In [0]:
def model_data_preparation():
    training_data = []
    accepted_scores = []
    for game_index in range(intial_games):
        score = 0
        game_memory = []
        previous_observation = []
        for step_index in range(goal_steps):
            action = random.randrange(0, 2)
            observation, reward, done, info = env.step(action)
            
            #print("Step {}:".format(step_index))
            #print("action: {}".format(action))
            #print("observation: {}".format(observation))
            #print("reward: {}".format(reward))
            #print("done: {}".format(done))
            #print("info: {}".format(info))

            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action])
                
            previous_observation = observation
            score += reward
            if done:
                break
            
        if score >= score_requirement:
            #print("WIIIIIIIN: {}".format(score))
            accepted_scores.append(score)
            for data in game_memory:
                if data[1] == 1:
                    output = [0, 1]
                elif data[1] == 0:
                    output = [1, 0]
                training_data.append([data[0], output])
        
        env.reset()

    print(accepted_scores)
    print(training_data)
    
    return training_data

In [0]:
training_data = model_data_preparation()

[90.0, 102.0, 70.0, 70.0, 127.0, 90.0, 83.0, 71.0, 77.0, 72.0, 105.0, 94.0, 70.0, 72.0, 144.0, 75.0, 71.0, 86.0, 74.0, 81.0, 78.0, 83.0, 75.0, 95.0, 74.0, 71.0, 77.0, 73.0, 91.0, 72.0, 73.0, 92.0, 71.0, 75.0, 72.0, 71.0, 114.0, 72.0, 75.0, 87.0, 85.0, 72.0, 113.0, 184.0, 83.0, 75.0, 93.0, 74.0, 83.0, 70.0, 147.0, 86.0, 73.0, 72.0, 71.0, 102.0, 73.0, 86.0, 77.0, 106.0, 133.0, 72.0, 79.0, 72.0, 109.0, 90.0, 76.0, 70.0, 82.0, 70.0, 81.0, 84.0, 73.0, 89.0, 85.0, 82.0, 75.0, 73.0, 77.0, 89.0, 77.0, 77.0, 89.0, 73.0, 91.0, 80.0, 88.0, 73.0, 81.0, 73.0, 71.0, 80.0, 79.0, 92.0, 70.0, 94.0, 74.0, 71.0, 97.0, 70.0, 86.0, 91.0, 102.0, 84.0, 71.0, 100.0, 73.0, 77.0, 73.0, 73.0, 70.0, 91.0, 71.0, 82.0, 83.0, 70.0, 70.0, 70.0, 91.0, 82.0, 71.0, 89.0, 77.0, 71.0, 80.0, 105.0, 72.0, 76.0, 78.0, 96.0, 70.0, 76.0, 74.0, 72.0, 97.0, 99.0, 79.0, 74.0, 71.0]
[[array([-0.03999962,  0.15791198,  0.01979263, -0.24280225]), [1, 0]], [array([-0.03684138, -0.03748701,  0.01493659,  0.05605749]), [1, 0]], [array(

In [0]:
def build_model(input_size, output_size):
    model = Sequential()
    model.add(Dense(128, input_dim=input_size, activation='relu'))
    model.add(Dense(52, activation='relu'))
    model.add(Dense(output_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam())

    return model

In [0]:
def train_model(training_data):
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))
    y = np.array([i[1] for i in training_data]).reshape(-1, len(training_data[0][1]))
    model = build_model(input_size=len(X[0]), output_size=len(y[0]))
    
    model.fit(X, y, epochs=10)
    return model

In [0]:
trained_model = train_model(training_data)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
scores = []
choices = []
for each_game in range(100):
    score = 0
    prev_obs = np.array([])
    for step_index in range(goal_steps):
        # Décommenter la ligne suivante pour voir le bot en action.
        # env.render()
        if len(prev_obs)==0:
            action = random.randrange(0,2)
        else:
            action = np.argmax(trained_model.predict(prev_obs.reshape(-1, len(prev_obs)))[0])
        
        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        score+=reward
        if done:
            break

    env.reset()
    scores.append(score)

print(scores)
print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))

[351.0, 434.0, 500.0, 500.0, 346.0, 500.0, 500.0, 500.0, 500.0, 183.0, 500.0, 500.0, 500.0, 500.0, 379.0, 500.0, 500.0, 500.0, 500.0, 193.0, 439.0, 500.0, 500.0, 256.0, 342.0, 500.0, 500.0, 448.0, 500.0, 500.0, 429.0, 500.0, 189.0, 500.0, 417.0, 500.0, 500.0, 500.0, 500.0, 500.0, 500.0, 217.0, 366.0, 500.0, 500.0, 214.0, 364.0, 500.0, 435.0, 218.0, 500.0, 500.0, 363.0, 500.0, 430.0, 500.0, 500.0, 500.0, 280.0, 500.0, 500.0, 500.0, 500.0, 417.0, 200.0, 500.0, 500.0, 436.0, 500.0, 500.0, 500.0, 364.0, 500.0, 500.0, 500.0, 500.0, 425.0, 429.0, 500.0, 500.0, 500.0, 500.0, 500.0, 175.0, 192.0, 500.0, 500.0, 193.0, 500.0, 500.0, 500.0, 500.0, 430.0, 500.0, 500.0, 223.0, 500.0, 185.0, 193.0, 500.0]
Average Score: 436.55
choice 1:0.49719390676898406  choice 0:0.5028060932310159
