In [None]:
'''
    File name: gym API FronzenLake8x8.ipynb
    Author: Yue Liang
    Date last modified: 5/16/2021
    Python Version: 3.8
    TensorFlow 2.4
'''

<h1>Environment variables<h1>

In [1]:
import gym
import pandas as pd
import numpy as np
import time
import math
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [2]:
env = gym.make('CartPole-v1')

In [3]:
env.seed(1000)
random.seed(1000)

<h1>Define CNN<h1>

In [4]:
def generate_dataset(reward_min, EPISODES):
    
    start = time.time()
    print('program start...')
    print()
    
    movements_list = []
    dataset_count = 0
    push_left = [1, 0]
    push_right = [0, 1]
    
    
    for episode in range(EPISODES):  
        
        reward_current = 0
        movements_current = []
        env.reset()
        action = env.action_space.sample()
        
        if episode % 50000 == 0: 
            print("Episode: " + str(episode))
        
        while True:
            observation, reward, done, info = env.step(action)
            action = env.action_space.sample()
            movements_current.append(
                np.hstack((observation, push_left if action == 0 else push_right))
            )
            if done:
                break
            reward_current += reward
            
        if reward_current >= reward_min:
            dataset_count = dataset_count + 1
            movements_list.extend(movements_current)
            print('Achieved minimum reward goal, episode #'+ str(episode+1) +' moevements saved!')
            
            
        if dataset_count > 4:
            print()
            print('Recorded 5 movements! ')
            break
        
    dataset = pd.DataFrame(
        movements_list,
        columns=['Cart Position', 
                 'Cart Velocity', 
                 'Pole Angle',
                 'Pole Angular Velocity',
                 'Push cart to the left',
                 'Push cart to the right', 
                ])
        
    dataset['Push cart to the left'] = dataset['Push cart to the left'].astype(int)
    dataset['Push cart to the right'] = dataset['Push cart to the right'].astype(int)
    
    print()
    end = time.time()
    print('program end...')
    print()
    print('time cost: ')
    print(end - start, 'seconds')

    return dataset
    

In [5]:
def CNN(dataset, epoch):
    
    start = time.time()
    print('program start...')
    print()
    
    model = Sequential()
    model.add(Dense(200, input_dim=4, activation='relu'))
    model.add(Dense(200,  activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(100,  activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(100,  activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(50,  activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(2,  activation='sigmoid'))    
    model.compile(optimizer='adam', loss='categorical_crossentropy')   
    model.summary()
    
    model.fit(
        dataset[['Cart Position','Cart Velocity','Pole Angle','Pole Angular Velocity']],
        dataset[['Push cart to the left','Push cart to the right']],
        epochs=epoch
    )
    
    print()
    end = time.time()
    print('program end...')
    print()
    print('time cost: ')
    print(end - start, 'seconds')
    
    return model

In [6]:
def get_result_CNN(model, EPISODES = 100):
    
    start = time.time()
    print('program start...')
    print()
    
    reward_sum = 0
    
    for episode in range (EPISODES):
        reward_current = 0
        observation = env.reset()
    
        while True:
            render = env.render()
            action_prediction = model.predict(observation.reshape(1, 4))
            action = np.argmax(action_prediction)
            observation, reward, done, info = env.step(action)            
            if done:
                reward_current += 1
                break                
            reward_current += 1
        
        reward_sum = reward_sum + reward_current
        print('Episode ' + str(episode+1) + ' score is ' + str(reward_current))
        
        if reward_current > 200: 
            print()
            print('Achieved the goal of 200 frames!')
            break
    
    print()
    end = time.time()
    print('program end...')
    print()
    print('time cost: ')
    print(end - start, 'seconds')

<h1>Define Q Learning<h1>

In [7]:
def generate_state(n):
    state = n/np.array([0.25, 0.25, 0.01, 0.1])+ np.array([15,10,1,10])
    return tuple(state.astype(np.int))

In [8]:
def get_result_QLearning():
    
    start = time.time()
    print('program start...')
    print()
    
    EPISODES = 100000
    epsilon = 1
    total = 0
    pervious = 0
    q_table = np.random.uniform(low=0, high=1, size=([150, 150, 100, 100] + [2]))
    
    for episode in range(EPISODES + 1): 
        done = False
        discrete_state = generate_state(env.reset()) 
        episode_reward = 0 
        if episode % 1000 == 0: 
            print("Episode: " + str(episode))
        while not done: 
            if np.random.random() > epsilon:
                action = np.argmax(q_table[discrete_state])
            else:
                action = np.random.randint(0, env.action_space.n) 
            new_state, reward, done, _ = env.step(action) 
            episode_reward += reward 
            new_discrete_state = generate_state(new_state)
            if episode % 1000 == 0: 
                env.render()
            if not done: 
                max_future_q = np.max(q_table[new_discrete_state])
                current_q = q_table[discrete_state + (action,)]
                new_q = (1 - 0.1) * current_q + 0.1 * (reward + 0.95 * max_future_q)
                q_table[discrete_state + (action,)] = new_q
            discrete_state = new_discrete_state
        if epsilon > 0.05:
            if episode_reward > pervious and episode > 10000:
                epsilon = math.pow(0.9999, episode - 10000)
        total += episode_reward 
        pervious = episode_reward
        if episode % 1000 == 0: 
            mean_reward = total / 1000
            print("Mean Reward: " + str(mean_reward))
            total = 0
            if mean_reward > 200:
                print()
                print('Achieved goal of 200 frames on average!')
                print('Mean Reward: ' + str(mean_reward))
                print('Episode spent: ' + str(episode))
                break
                
    print()
    end = time.time()
    print('program end...')
    print()
    print('time cost: ')
    print(end - start, 'seconds')

<h1>Get CNN result<h1>

In [9]:
print("Generating dataset...")
dataset = generate_dataset(reward_min=145, EPISODES=1000000)

Generating dataset...
program start...

Episode: 0
Achieved minimum reward goal, episode #431 moevements saved!
Episode: 50000
Episode: 100000
Achieved minimum reward goal, episode #126787 moevements saved!
Episode: 150000
Achieved minimum reward goal, episode #193521 moevements saved!
Achieved minimum reward goal, episode #197027 moevements saved!
Episode: 200000
Episode: 250000
Episode: 300000
Episode: 350000
Episode: 400000
Episode: 450000
Achieved minimum reward goal, episode #457557 moevements saved!

Recorded 5 movements! 

program end...

time cost: 
193.77978587150574 seconds


In [10]:
dataset.shape

(821, 6)

In [11]:
dataset

Unnamed: 0,Cart Position,Cart Velocity,Pole Angle,Pole Angular Velocity,Push cart to the left,Push cart to the right
0,0.031205,-0.182193,-0.028476,0.318812,1,0
1,0.027561,-0.376898,-0.022099,0.602380,0,1
2,0.020023,-0.181474,-0.010052,0.302819,1,0
3,0.016393,-0.376451,-0.003995,0.592315,0,1
4,0.008864,-0.181273,0.007851,0.298376,1,0
...,...,...,...,...,...,...
816,0.473483,-0.255660,0.107855,1.100668,1,0
817,0.468369,-0.452023,0.129868,1.425149,1,0
818,0.459329,-0.648489,0.158371,1.755440,1,0
819,0.446359,-0.845013,0.193480,2.092903,0,1


In [12]:
CNN = CNN(dataset, epoch=100)

program start...

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 200)               1000      
_________________________________________________________________
dense_1 (Dense)              (None, 200)               40200     
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 10

In [13]:
get_result_CNN(model=CNN)

program start...

Episode 1 score is 179
Episode 2 score is 201

Achieved the goal of 200 frames!

program end...

time cost: 
16.744426488876343 seconds


In [14]:
env.close()

<h1>Get Q Learning result<h1>

In [15]:
get_result_QLearning()

program start...

Episode: 0
Mean Reward: 0.012
Episode: 1000
Mean Reward: 21.723
Episode: 2000
Mean Reward: 22.636
Episode: 3000
Mean Reward: 22.188
Episode: 4000
Mean Reward: 22.388
Episode: 5000
Mean Reward: 22.077
Episode: 6000
Mean Reward: 22.191
Episode: 7000
Mean Reward: 22.475
Episode: 8000
Mean Reward: 22.418
Episode: 9000
Mean Reward: 21.956
Episode: 10000
Mean Reward: 22.33
Episode: 11000
Mean Reward: 23.523
Episode: 12000
Mean Reward: 25.143
Episode: 13000
Mean Reward: 28.145
Episode: 14000
Mean Reward: 32.441
Episode: 15000
Mean Reward: 40.659
Episode: 16000
Mean Reward: 46.161
Episode: 17000
Mean Reward: 50.416
Episode: 18000
Mean Reward: 65.062
Episode: 19000
Mean Reward: 74.972
Episode: 20000
Mean Reward: 82.804
Episode: 21000
Mean Reward: 90.662
Episode: 22000
Mean Reward: 96.14
Episode: 23000
Mean Reward: 110.94
Episode: 24000
Mean Reward: 122.186
Episode: 25000
Mean Reward: 128.035
Episode: 26000
Mean Reward: 144.989
Episode: 27000
Mean Reward: 150.98
Episode: 28000


In [16]:
env.close()