## SARSA with Optimized Discretization of States

In [1]:
'''
Reference: Soham Gadgil, Yunfeng Xin, and Chengzhe Xu. “Solving the lunar lander problem under uncertainty using
reinforcement learning”. In: 2020 SoutheastCon. Vol. 2. IEEE. 2020, pp. 1–8.
'''

import collections, gym

import numpy as np
import matplotlib.pyplot as plt
# import import_ipynb
# import lunar_lander as lander

# Number of training episodes; passed to both training runs below and used
# as the upper bound of the x-axis when the learning curves are plotted.
num_iter = 10_000




In [2]:

def discrete_states(s):
    """Discretize an 8-component observation into a tuple of integers.

    Components 0-5 are divided by a bin width, truncated toward zero with
    ``int()`` and clipped to a fixed range; components 6 and 7 are only cast
    to ``int``.

    NOTE(review): presumably ``s`` is the LunarLander observation
    (x, y, vx, vy, angle, angular velocity, two leg-contact flags) --
    confirm against the environment in use.

    Args:
        s: Indexable observation with at least 8 numeric entries.

    Returns:
        Tuple of 8 Python ints.
    """
    # (bin width, lowest bin index, highest bin index) for components 0..5
    binning = (
        (0.05, -5, 5),
        (0.1, -1, 5),
        (0.1, -3, 3),
        (0.1, -3, 3),
        (0.1, -3, 3),
        (0.1, -3, 3),
    )

    cells = []
    for idx, (width, lowest, highest) in enumerate(binning):
        bucket = int(s[idx] / width)
        cells.append(min(highest, max(lowest, bucket)))

    # The last two components are passed through as integers.
    cells.append(int(s[6]))
    cells.append(int(s[7]))
    return tuple(cells)


def sa_key(s, a):
    """Return the Q-table key for a state/action pair.

    The key is ``str(s)`` and ``str(a)`` separated by a single space.
    """
    return " ".join(map(str, (s, a)))


def policy_explorer(s, Q, iter, epsilon=None):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    ``epsilon`` is a percentage in ``[0, 100]``: with that probability a
    uniformly random action from ``{0, 1, 2, 3}`` is returned, otherwise the
    action with the largest Q-value for ``s``.

    Args:
        s: Discretized state (anything ``sa_key`` can turn into a key).
        Q: Mapping from ``sa_key(state, action)`` to a Q-value.
        iter: Index of the current training episode; drives the default
            exploration schedule. (The name shadows the builtin but is kept
            so existing callers continue to work.)
        epsilon: Exploration percentage. When omitted, the schedule
            50 -> 10 -> 5 -> 1 -> 0 is applied as ``iter`` passes
            200 / 2000 / 5000 / 7500. When given, it is used as-is.

    Returns:
        The chosen action index (0-3).
    """
    # Fix: an explicitly supplied epsilon used to be overwritten by the
    # schedule as soon as iter > 200, so e.g. epsilon=0 ("pure greedy") was
    # not honoured. The schedule now only applies when epsilon is omitted.
    if epsilon is None:
        if iter > 7500:
            epsilon = 0
        elif iter > 5000:
            epsilon = 1
        elif iter > 2000:
            epsilon = 5
        elif iter > 200:
            epsilon = 10
        else:
            epsilon = 50

    rand = np.random.randint(0, 100)
    if rand >= epsilon:
        # Exploit: greedy action with respect to the current Q estimates.
        Qv = np.array([Q[sa_key(s, action)] for action in [0, 1, 2, 3]])
        return np.argmax(Qv)
    # Explore: uniformly random action.
    return np.random.randint(0, 4)

In [10]:
def sarsa_lander(env, seed=None, render=False, num_iter=50, seg=50):
    """Train a tabular SARSA agent on ``env``.

    Observations are discretized with ``discrete_states``, actions are chosen
    with ``policy_explorer`` and Q-values are stored in a ``defaultdict``
    keyed by ``sa_key(state, action)``.

    Args:
        env: Environment exposing the classic Gym interface used here:
            ``seed``, ``reset`` returning an observation, ``step`` returning
            ``(obs, reward, done, info)`` and ``render``.
        seed: Seed forwarded to ``env.seed``.
        render: When true, render the episodes whose index is a multiple of
            ``seg``.
        num_iter: Number of training episodes.
        seg: Reporting interval, in episodes.

    Returns:
        ``(Q, r_seq)``: the Q-table and the list of average episode rewards,
        one entry per report.
    """
    # Fix: the original called env.seed() and silently ignored ``seed``.
    env.seed(seed)

    Q = collections.defaultdict(float)

    gamma = 0.95  # discount factor
    lr = 0.6      # learning rate; constant, so set once instead of per episode

    r_seq = []
    it_reward = []

    for it in range(num_iter):
        total_reward = 0
        steps = 0

        # reset environment and pick the first action
        s = env.reset()
        ds = discrete_states(s)
        a = policy_explorer(ds, Q, it)

        while True:
            # step and get feedback
            sa = sa_key(ds, a)
            sp, r, done, _ = env.step(a)

            # SARSA is on-policy: the next action is chosen now and is the
            # one actually executed on the following step.
            dsp = discrete_states(sp)
            ap = policy_explorer(dsp, Q, it)

            next_sa = sa_key(dsp, ap)
            if not done:
                Q[sa] += lr * (r + gamma * Q[next_sa] - Q[sa])
            else:
                # terminal transition: no bootstrap term
                Q[sa] += lr * (r - Q[sa])

            ds = dsp
            a = ap
            total_reward += r

            if render and it % seg == 0:
                still_open = env.render()
                # Only an explicit False ends the episode early; the
                # episode's reward is then not recorded.
                if still_open == False: break

            steps += 1
            if done or steps > 1000:
                it_reward.append(total_reward)
                break

        if it % seg == 0:
            # NOTE(review): the first report (it == 0) averages one episode.
            avg_rwd = np.mean(np.array(it_reward))
            print("#It: ", it, " avg reward: ", avg_rwd, " out of ", len(it_reward), " trials")
            it_reward = []
            r_seq.append(avg_rwd)

    return Q, r_seq

# Train the SARSA agent, reporting the average reward every 100 episodes.
env = gym.make("LunarLander-v2")
Q, r_seq = sarsa_lander(env, render=False, num_iter=num_iter, seg=100)

# Learning curve: one point per report, spread evenly over the episode range.
y = np.array(r_seq)
x = np.linspace(0, num_iter, len(y))
plt.figure(figsize=(10, 10))
plt.plot(x, y)
plt.grid()
plt.title('Optimized SARSA')
plt.savefig("Optimized SARSA")
plt.close()

# Keep the raw averages alongside the figure.
np.savetxt("Optimized Sarsa.txt", y)


#It:  0  avg reward:  -123.50767522226923  out of  1  trials
#It:  100  avg reward:  -154.67881791532855  out of  100  trials
#It:  200  avg reward:  -132.5644927010207  out of  100  trials
#It:  300  avg reward:  -119.39376858129397  out of  100  trials
#It:  400  avg reward:  -113.75123054833745  out of  100  trials
#It:  500  avg reward:  -98.37469327941379  out of  100  trials
#It:  600  avg reward:  -108.50034971341366  out of  100  trials
#It:  700  avg reward:  -113.96113863639437  out of  100  trials
#It:  800  avg reward:  -114.75605203256808  out of  100  trials
#It:  900  avg reward:  -112.04838291388563  out of  100  trials
#It:  1000  avg reward:  -94.33880328512008  out of  100  trials
#It:  1100  avg reward:  -113.30820170141833  out of  100  trials
#It:  1200  avg reward:  -108.77540549866767  out of  100  trials
#It:  1300  avg reward:  -94.7367894011091  out of  100  trials
#It:  1400  avg reward:  -114.85590426688753  out of  100  trials
#It:  1500  avg reward:  -85.

In [9]:
def qlearning_lander(env, learningrate=0.01, render=True, num_iter=100, seg=100):
    """Train a tabular Q-learning agent on ``env``.

    Observations are discretized with ``discrete_states``, behaviour actions
    are chosen with ``policy_explorer`` and Q-values are stored in a
    ``defaultdict`` keyed by ``sa_key(state, action)``.

    Args:
        env: Environment exposing the classic Gym interface used here:
            ``reset`` returning an observation, ``step`` returning
            ``(obs, reward, done, info)`` and ``render``.
        learningrate: Step size of the Q update.
        render: When true, render every step.
        num_iter: Number of training episodes.
        seg: Reporting interval, in episodes.

    Returns:
        ``(Q, r_seq)``: the Q-table and the list of average episode rewards,
        one entry per report.
    """
    Q = collections.defaultdict(float)
    gamma = 0.95  # discount factor
    r_seq = []
    it_reward = []

    for it in range(num_iter):
        # initialize variables
        total_reward = 0
        steps = 0

        # reset environment
        s = env.reset()
        ds = discrete_states(s)
        # (The original also selected an action here that was immediately
        # overwritten inside the loop; that dead call has been removed.)

        while True:
            # behaviour policy: epsilon-greedy with the default schedule
            a = policy_explorer(ds, Q, it)
            sa = sa_key(ds, a)
            if render:
                env.render()
            sp, r, done, _ = env.step(a)

            # update corresponding Q
            dsp = discrete_states(sp)
            if not done:
                # Fix: Q-learning bootstraps from the best next-state value.
                # The original obtained the next action through
                # policy_explorer(..., epsilon=0), whose schedule overrode
                # the epsilon for iter > 200, so the target was not greedy.
                best_next = max(Q[sa_key(dsp, action)] for action in (0, 1, 2, 3))
                Q[sa] += learningrate * (r + gamma * best_next - Q[sa])
            else:
                # terminal transition: no bootstrap term
                Q[sa] += learningrate * (r - Q[sa])

            ds = dsp
            total_reward += r
            steps += 1
            if done or steps > 1000:
                it_reward.append(total_reward)
                break

        if it % seg == 0:
            # NOTE(review): the first report (it == 0) averages one episode.
            avg_rwd = np.mean(np.array(it_reward))
            print("#It: ", it, " avg reward: ", avg_rwd, " out of ", len(it_reward), " trials")
            it_reward = []
            r_seq.append(avg_rwd)

    return Q, r_seq

# Fresh environment for the Q-learning run.
env = gym.make("LunarLander-v2")

# Learning rates to train with; only the result of the last one is kept.
alphas = [0.6]
for a in alphas:
    Q, r_seq = qlearning_lander(env, learningrate=a, render=False, num_iter=num_iter, seg=100)

# Learning curve: one point per report, spread evenly over the episode range.
y = np.array(r_seq)
x = np.linspace(0, num_iter, len(y))
plt.figure(figsize=(10, 10))
plt.plot(x, y)
plt.grid()
plt.title('Optimized Discretization Q-Learning')
plt.savefig("Optimized Discretization Q-Learning")
plt.close()

# Keep the raw averages alongside the figure.
np.savetxt("Optimized Q-Learning.txt", y)

#It:  0  avg reward:  -99.36960581276084  out of  1  trials
#It:  100  avg reward:  -121.8613190458557  out of  100  trials
#It:  200  avg reward:  -105.53823648786073  out of  100  trials
#It:  300  avg reward:  -99.69452819856653  out of  100  trials
#It:  400  avg reward:  -97.94527293860774  out of  100  trials
#It:  500  avg reward:  -95.46232142394841  out of  100  trials
#It:  600  avg reward:  -117.08708416159566  out of  100  trials
#It:  700  avg reward:  -101.33547019958104  out of  100  trials
#It:  800  avg reward:  -97.81620456782031  out of  100  trials
#It:  900  avg reward:  -110.0181480177172  out of  100  trials
#It:  1000  avg reward:  -87.3408086567052  out of  100  trials
#It:  1100  avg reward:  -95.33641048011064  out of  100  trials
#It:  1200  avg reward:  -100.48655821225995  out of  100  trials
#It:  1300  avg reward:  -122.668858203247  out of  100  trials
#It:  1400  avg reward:  -102.5334333182834  out of  100  trials
#It:  1500  avg reward:  -110.3779979