# Software agents -  Lunar lander v2

### Gediminas Sadaunykas and Vaida Gulbinskaite

***Aim:*** Navigate a Lunar Lander to its landing strip.

***Agent's Algorithm:***  Deep Q learning

***NOTE:*** *Please install OpenAi Gym and keras in order to run this* 

https://discuss.openai.com/t/installing-openai-gym-universe-on-windows/2092

*please install pyglet version 1.2.4 instead of 1.3 (otherwise visuals do not work!!)* 

______________________________________________________________________________________________________________________
***Code testing note:*** *Please note that every episode count was changed to 1 for ease of running (otherwise, this code takes about 3 weeks to run fully)* 
______________________________________________________________________________________________________________________

### Step 1:  Load all necessary libraries

In [2]:
import gym
from keras import  * 
from keras.layers.core import Dense ##thought the above should have imported everything but apparently didn't -_- oh well! 
from keras.optimizers import Adam
from keras import backend as K
import numpy as np
import pyglet
import matplotlib.pyplot as plt
from collections import deque
import random
import six
%matplotlib inline 
import time
import pandas as pd
import scipy as sp
tall_st = time.time()

Using TensorFlow backend.


### Step 2:  Load environment from OpenAi Gym

In [3]:
## Environment used: https://gym.openai.com/envs/LunarLander-v2/
## (4 discrete actions, 8 state variables)
env = gym.make('LunarLander-v2')
# Show the observation space and the action space, one per line.
print(env.observation_space, env.action_space, sep='\n')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Box(8,)
Discrete(4)


### Step 3: Create Agent

##### Term Dictionary

**Code:**   https://keon.io/deep-q-learning/  adapted to solve discrete Lunar lander environment

* **Dense()**  =  basic form of NN layer

* **Sequential()**  =  Creates the foundation of NN layers

In [4]:
## Please note: the initial neural network uses 30 neurons in each hidden layer (ReLU activations by default)

class LLAgent:
    """Deep Q-learning agent with an online network and a separate target network.

    The online network (``DQL_MODEL``) is trained from replayed experience and
    is used to choose actions. The target network (``target_DQL_MODEL``) is a
    copy of the online network, refreshed via ``update_target_DQL_MODEL`` and
    used to evaluate the value of the next state when building training targets.

    Parameters
    ----------
    state_size : number of observation variables fed to the network.
    action_size : number of discrete actions (size of the output layer).
    gamma : discount rate applied to the estimated future reward.
    epsilon : initial exploration rate (probability of a random action).
    epsilon_decay : factor ``epsilon`` is multiplied by after each ``replay``.
    lrn_rate : learning rate of the Adam optimizer.
    neurons_1stlayer, neurons_2ndlayer : widths of the two hidden layers.
    activation_1stlayer, activation_2ndlayer : hidden-layer activations; any
        Keras activation name, or ``'leaky_relu'`` (implemented with a
        ``LeakyReLU`` layer so it also works on Keras versions that do not
        know that name as an activation string).
    """

    def __init__(self, state_size, action_size, gamma, epsilon, epsilon_decay, lrn_rate, neurons_1stlayer=30,
                 neurons_2ndlayer=30, activation_1stlayer='relu', activation_2ndlayer='relu'):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=1000000)  # replay buffer; oldest transitions are dropped first
        self.gamma = gamma  # discount rate (used to calculate the future discounted reward)
        self.epsilon = epsilon  # exploration rate (chance of acting randomly instead of greedily)
        self.epsilon_min = 0.01  # decay stops once epsilon is no longer above this value
        self.epsilon_decay = epsilon_decay  # explore less and less as the agent gets better
        self.lrn_rate = lrn_rate  # learning rate (how much the model learns each iteration)
        # Forward the requested architecture to BOTH networks; they must have
        # identical shapes because weights are copied from one to the other.
        self.DQL_MODEL = self._build_DQL_MODEL(
            neurons_1stlayer, neurons_2ndlayer, activation_1stlayer, activation_2ndlayer)
        self.target_DQL_MODEL = self._build_DQL_MODEL(
            neurons_1stlayer, neurons_2ndlayer, activation_1stlayer, activation_2ndlayer)
        self.update_target_DQL_MODEL()

    def _huber_loss(self, target, prediction):
        """Loss function: mean of ``sqrt(1 + error**2) - 1`` over the last axis.

        This is the pseudo-Huber loss with delta = 1: roughly quadratic for
        small errors and roughly linear for large ones.
        """
        error = prediction - target
        return K.mean(K.sqrt(1 + K.square(error)) - 1, axis=-1)

    @staticmethod
    def _add_hidden_layer(model, neurons, activation, **dense_kwargs):
        """Append one fully connected hidden layer with the given activation to ``model``."""
        if activation == 'leaky_relu':
            # Leaky ReLU is provided by Keras as a layer; add it after a linear
            # Dense layer instead of relying on an activation-name lookup.
            from keras.layers import LeakyReLU
            model.add(Dense(neurons, activation='linear', **dense_kwargs))
            model.add(LeakyReLU())
        else:
            model.add(Dense(neurons, activation=activation, **dense_kwargs))

    def _build_DQL_MODEL(self, neurons_1stlayer=30, neurons_2ndlayer=30, activation_1stlayer='relu',
                         activation_2ndlayer='relu'):
        """Build and compile the Q-network: two hidden layers and a linear output per action."""
        DQL_MODEL = Sequential()
        # First hidden layer; also declares the input size (state_size).
        self._add_hidden_layer(DQL_MODEL, neurons_1stlayer, activation_1stlayer, input_dim=self.state_size)
        # Second hidden layer.
        self._add_hidden_layer(DQL_MODEL, neurons_2ndlayer, activation_2ndlayer)
        # Output layer: one linear unit (estimated value) per action.
        DQL_MODEL.add(Dense(self.action_size, activation='linear'))
        DQL_MODEL.compile(loss=self._huber_loss, optimizer=Adam(lr=self.lrn_rate))
        return DQL_MODEL

    def update_target_DQL_MODEL(self):
        """Copy the weights of ``DQL_MODEL`` into ``target_DQL_MODEL``."""
        self.target_DQL_MODEL.set_weights(self.DQL_MODEL.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to the memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state``.

        With probability ``epsilon`` a random action is returned (exploration);
        otherwise the action with the highest value predicted by ``DQL_MODEL``
        (exploitation). ``state`` is passed to ``predict`` as-is and the first
        row of the prediction is used, i.e. a batch of one state is expected.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.DQL_MODEL.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train ``DQL_MODEL`` on ``batch_size`` transitions sampled from the memory.

        Raises ``ValueError`` (from ``random.sample``) when the memory holds
        fewer than ``batch_size`` transitions.
        """
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            # Start from the current predictions so only the taken action's
            # value is changed by this training step.
            target = self.DQL_MODEL.predict(state)
            if done:
                target[0][action] = reward  # terminal step: no future reward
            else:
                # The online network picks the best next action, the target
                # network supplies that action's value.
                a = self.DQL_MODEL.predict(next_state)[0]
                t = self.target_DQL_MODEL.predict(next_state)[0]
                target[0][action] = reward + self.gamma * t[np.argmax(a)]
            self.DQL_MODEL.fit(state, target, epochs=1, verbose=0)
        # Decay exploration once per replay while it is still above the minimum.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load the weights of ``DQL_MODEL`` from the file ``name``."""
        self.DQL_MODEL.load_weights(name)

    def save(self, name):
        """Save the weights of ``DQL_MODEL`` to the file ``name``."""
        self.DQL_MODEL.save_weights(name)


### Step 4: Train the agent

In [5]:
def run_learning(state_size, action_size,gamma, epsilon, epsilon_decay, lrn_rate,  neurons_1stlayer,
                neurons_2ndlayer, activation_1stlayer, activation_2ndlayer, episodes=1000, max_frames=1000):
    """Train a fresh ``LLAgent`` and report its per-episode rewards.

    Uses the module-level ``env`` (the Gym environment) and ``batch_size``
    (number of transitions replayed after each episode).

    Parameters
    ----------
    state_size, action_size : sizes of the observation vector and action set.
    gamma, epsilon, epsilon_decay, lrn_rate : agent hyperparameters, passed to
        ``LLAgent`` unchanged.
    neurons_1stlayer, neurons_2ndlayer, activation_1stlayer,
    activation_2ndlayer : network architecture, passed to ``LLAgent`` unchanged.
    episodes : number of episodes to play.
    max_frames : an episode is cut off after this many frames so the lander
        cannot just float around for a long while.

    Returns
    -------
    (reward_avg, results) : ``results`` maps the episode index to the total
        reward of that episode; ``reward_avg`` is the mean total reward over
        the last 100 episodes (over all of them when fewer were played, and 0
        when none were played).
    """
    agent = LLAgent(state_size, action_size, gamma, epsilon,epsilon_decay, lrn_rate,neurons_1stlayer,
                neurons_2ndlayer, activation_1stlayer, activation_2ndlayer)
    results = {}
    for e in range(episodes):
        state = np.reshape(env.reset(), [1, state_size])
        frames = 0  # frame counter
        reward_episode = 0  # total reward collected in this episode
        done = False  # becomes True when the environment ends the episode
        while not done and frames < max_frames:
            # env.render()  # optional visualisation
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            frames += 1
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
        # End-of-episode bookkeeping runs whether the episode finished on its
        # own or was cut off, so every episode gets an entry in ``results``.
        agent.update_target_DQL_MODEL()
        results[e] = reward_episode
        print("episode: {}/{},frames:{},reward_episode:{}"
              .format(e, episodes, frames, reward_episode))
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)  # train on a random sample of the memory
    # Score of the run: average episode reward over the last (up to) 100 episodes.
    recent_rewards = [results[i] for i in range(max(0, episodes - 100), episodes)]
    reward_avg = sum(recent_rewards) / len(recent_rewards) if recent_rewards else 0
    return reward_avg, results


In [6]:
# Baseline run: default hyperparameters with the initial 30/30 ReLU network.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 128  # deepmind reference

default_score, default_results = run_learning(
    state_size,
    action_size,
    gamma=0.99,
    epsilon=0.8,
    epsilon_decay=0.999,
    lrn_rate=0.00025,
    neurons_1stlayer=30,
    neurons_2ndlayer=30,
    activation_1stlayer='relu',
    activation_2ndlayer='relu',
    episodes=1,  # the full experiment used episodes=50000
)

episode: 0/1,frames:112,reward_episode:-133.46327640113435


In [7]:
# Turn the {episode: reward} mapping into a one-column ('Score') DataFrame.
s = pd.Series(default_results, name='Score')
df_initial = s.to_frame()
# Optional post-processing (rolling mean to smooth the curve) and export:
#roll = df_initial.rolling(1000, min_periods=100, freq=None, center=False, win_type=None, on=None, axis=0, closed=None)
#test = df_initial.rolling(5000, win_type='triang').mean()
#df_initial.to_csv("/Software agents/Initial_results.csv")

### Step 5. Grid search (Searching for the best parameters)

In [8]:
###Run via google cloud###
# The full grid could not be run because of the computational cost, but as
# much of it as possible was covered.
import itertools

# Environment constants
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 128  # deepmind reference

# Candidate values for each hyperparameter
gamma_grid = [0.1, 0.5, 0.9]
epsilon_grid = [0.1, 0.5, 0.9]
epsilon_decay_grid = [0.99, 0.5, 0.1]
lrn_rate_grid = [0.0001, 0.01]

score_dictionary1 = {}  # (gamma, epsilon, epsilon_decay, lrn_rate) -> score
Grid_time_st = time.time()

# product() visits the combinations in the same order as four nested loops
# (gamma outermost, lrn_rate innermost).
for gamma, epsilon, epsilon_decay, lrn_rate in itertools.product(
        gamma_grid, epsilon_grid, epsilon_decay_grid, lrn_rate_grid):
    Iter_time_st = time.time()  # start of this iteration
    # The initial network structure (30/30, ReLU) is used for this search.
    score, results = run_learning(state_size, action_size, gamma, epsilon, epsilon_decay, lrn_rate,
                                  neurons_1stlayer=30, neurons_2ndlayer=30,
                                  activation_1stlayer='relu', activation_2ndlayer='relu',
                                  episodes=1)  # the full experiment used episodes=20000
    Iter_time_en = time.time()
    Duration_iter = Iter_time_en - Iter_time_st
    #print('Gamma: %s ; Epsilon: %s; Epsilon_decay: %s; Lrn_rate: %s; Score: %s; Duration: %s'
    #     % (gamma, epsilon, epsilon_decay, lrn_rate, score, Duration_iter))
    #print('-'*80)
    score_dictionary1[(gamma, epsilon, epsilon_decay, lrn_rate)] = score
    s = pd.Series(results, name='Score')
    df = pd.DataFrame(s)
    #df.to_csv("/Software Agents/Results_Gamma%s_Epsilon%s_Epsdecay%s_LrnRate%s.csv" % (gamma, epsilon,  epsilon_decay, lrn_rate))
## All results were exported to Excel files, which were later used to create the grid parameter table.
# Calling one model the best over the others would be unfair because the full grid could not be run.

episode: 0/1,frames:63,reward_episode:-266.00610505123325
episode: 0/1,frames:51,reward_episode:-395.71406466871446
episode: 0/1,frames:55,reward_episode:-227.744211491189
episode: 0/1,frames:54,reward_episode:-472.70384547118607
episode: 0/1,frames:99,reward_episode:-421.5338671098605
episode: 0/1,frames:63,reward_episode:-610.1420307269036
episode: 0/1,frames:57,reward_episode:-153.1558271916352
episode: 0/1,frames:69,reward_episode:-210.8326058848172
episode: 0/1,frames:82,reward_episode:-603.8859190058254
episode: 0/1,frames:87,reward_episode:-592.6088803791188
episode: 0/1,frames:76,reward_episode:-431.91237590965727
episode: 0/1,frames:59,reward_episode:-439.8487388462527
episode: 0/1,frames:69,reward_episode:-144.57662967517615
episode: 0/1,frames:89,reward_episode:-133.5989421688223
episode: 0/1,frames:69,reward_episode:-197.37580634866504
episode: 0/1,frames:81,reward_episode:-249.9721585621148
episode: 0/1,frames:102,reward_episode:-461.00446551580757
episode: 0/1,frames:107,

### Step 6. Neural network Grid search (Searching for the optimal number of neurons and best activation function)

##### Change neural net sizes and activation functions

In [9]:
###Run via google cloud###
# Candidate hidden-layer widths and activation functions; both hidden layers
# always get the same width and the same activation.
neurons_layer_grid = [6, 25, 50]
activation_layer_grid = ['sigmoid', 'relu', 'leaky_relu']
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Base hyperparameters, held fixed during this search
gamma = 0.99
epsilon = 0.8
epsilon_decay = 0.999
lrn_rate = 0.00025



score_dictionary_4 = {}  # (neurons_layer, activation_layer) -> score
episodes = {} ## Store the episode information:  Key = Episode,  Value = (episode, score,  epsilon)
if __name__ == "__main__":
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    # Each positional argument is passed exactly once, in the order LLAgent declares them.
    agent = LLAgent(state_size, action_size, gamma, epsilon, epsilon_decay, lrn_rate)
    done = False
    batch_size = 128


for neurons_layer in neurons_layer_grid:
    for activation_layer in activation_layer_grid:
        score, results = run_learning(state_size, action_size, gamma, epsilon, epsilon_decay, lrn_rate,
                                      neurons_layer, neurons_layer, activation_layer, activation_layer,
                                      episodes=1)  # the full experiment used episodes=10000
        # Key the score by the architecture being searched, so each
        # combination keeps its own entry in this search's dictionary.
        score_dictionary_4[(neurons_layer, activation_layer)] = score
        s = pd.Series(results, name='Score')
        df = pd.DataFrame(s)
#        df.to_csv("/Users/vgulbi/Dropbox/Team Lithuania/Software agents/BEST_Results_layer%s_Activation_%s.csv" % (neurons_layer, activation_layer))
## All results were exported to Excel and used to create the grid search table as well as to determine the best model

episode: 0/1,frames:119,reward_episode:-74.00599787266466
episode: 0/1,frames:100,reward_episode:-161.23227653720195
episode: 0/1,frames:69,reward_episode:-466.2225772356683
episode: 0/1,frames:98,reward_episode:-398.47044658822375
episode: 0/1,frames:123,reward_episode:-176.8107512949634
episode: 0/1,frames:57,reward_episode:-322.9454130921479
episode: 0/1,frames:115,reward_episode:-114.75757710301312
episode: 0/1,frames:115,reward_episode:-187.60484029009888
episode: 0/1,frames:87,reward_episode:-555.8039231867073


In [None]:
## Best parameters selected from the grid search table (where the average of the last 100 episodes was the largest)
# Best of grid search
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 128  # deepmind reference

default_score, default_results = run_learning(
    state_size,
    action_size,
    gamma=0.99,
    epsilon=0.8,
    epsilon_decay=0.999,
    lrn_rate=0.00025,
    neurons_1stlayer=50,
    neurons_2ndlayer=50,
    activation_1stlayer='sigmoid',
    activation_2ndlayer='sigmoid',
    episodes=1,  # the full experiment used episodes=50000
)

In [None]:
# Turn the {episode: reward} mapping into a one-column ('Score') DataFrame.
s = pd.Series(default_results, name='Score')
df_initial = s.to_frame()
# Optional post-processing (rolling mean to smooth the curve) and export:
#roll = df_initial.rolling(1000, min_periods=100, freq=None, center=False, win_type=None, on=None, axis=0, closed=None)
#test = df_initial.rolling(5000, win_type='triang').mean()
#df_initial.to_csv("/Software agents/Initial_results.csv")