## SARSA with 10 value discretization

In [29]:
import collections, gym
import numpy as np
import matplotlib.pyplot as plt

In [30]:
def discrete_states(observation):

    ds_vector = np.zeros(6)
    ds=-1
    dis_number = 10
    
    
    #Discretize x
    if observation[0] <= -1:
        ds_vector[0] = -1
    elif observation[0] >= 1:
        ds_vector[0] = 1
    else:
        for i in range(dis_number):
            if -1 <= observation[0] <= (-1 + i*2/dis_number):
                ds_vector[0] = -1 + i*2/dis_number
                break


    #Discretize y
    if observation[1] <= -1:
        ds_vector[1] = -1
    elif observation[1] >= 1:
        ds_vector[1] = 1

    else:
        for i in range(dis_number):
            if -1 <= observation[1] <= (-1 + i*2/dis_number):
                ds_vector[1] = -1 + i*2/dis_number 
                break
                
                
    #Discretize Vx
    if observation[2] <= -1.5:
        ds_vector[2] = -1.5
    elif observation[2] >= 1.5:
        ds_vector[2] = 1.5

    else:
        for i in range(dis_number):
            if -1.5 <= observation[2] <= (-1.5 + i*3/dis_number):
                ds_vector[2] = -1.5 + i*3/dis_number 
                break    
    
    
    #Discretize Vy
    if observation[3] <= -1.5:
        ds_vector[3] = -1.5
    elif observation[3] >= 1.5:
        ds_vector[3] = 1.5

    else:
        for i in range(dis_number):
            if -1.5 <= observation[3] <= (-1.5 + i*3/dis_number):
                ds_vector[3] = -1.5 + i*3/dis_number 
                break        
    
    #Discretize theta
    if observation[4] <= -2:
        ds_vector[4] = -2
    elif observation[4] >= 2:
        ds_vector[4] = 2

    else:
        for i in range(dis_number):
            if -2 <= observation[4] <= (-2 + i*4/dis_number):
                ds_vector[4] = -2 + i*4/dis_number 
                break    
    
    
    #Discretize V_theta
    if observation[5] <= -6:
        ds_vector[5] = -6
    elif observation[5] >= 6:
        ds_vector[5] = 6

    else:
        for i in range(dis_number):
            if -6 <= observation[5] <= (-6 + i*12/dis_number):
                ds_vector[5] = -6 + i*12/dis_number 
                break    
                
    ds_vector[6] = observation[6]
    ds_vector[7] = observation[7]
    return ds_vector



def sa_key(s, a):
    return str(s) + " " + str(a)


def policy_explorer(s, Q, epsilon=20):
    rand = np.random.randint(0, 100)

    if rand >= epsilon:
        Qv = np.array([ Q[sa_key(s, action)] for action in [0, 1, 2, 3]])
        return np.argmax(Qv)
    else:
        return np.random.randint(0, 4)


In [32]:
def sarsa_lander(env, seed=None, render=False, num_iter=50, seg=50):
    env.seed(42)

    Q = collections.defaultdict(float)

    gamma = 0.95

    r_seq = []
    it_reward = []

    for it in range(num_iter):
        # initialize variables
        total_reward = 0
        steps = 0

        lr = 0.6

        # reset environment
        s = env.reset()

        ds = discrete_states(s)
        a = policy_explorer(ds, Q, it)
        # start Sarsa
        while True:
            sa = sa_key(ds, a)
            if render:
                env.render()
            sp, r, done, info = env.step(a)
            # update corresponding Q
            dsp = discrete_states(sp)
            ap = policy_explorer(dsp, Q, it)
            next_sa = sa_key(dsp, ap)
            if not done:
                Q[sa] += lr*(r + gamma * Q[next_sa] - Q[sa])
            else:
                Q[sa] += lr*(r - Q[sa])
            ds = dsp
            a = ap
            total_reward += r
            steps += 1
            if done or steps > 1000:
                it_reward.append(total_reward)
                break
        if it % seg == 0:
            avg_rwd = np.mean(np.array(it_reward))
            print("#It: ", it, " avg reward: ", avg_rwd, " out of ", len(it_reward), " trials")
            it_reward = []
            r_seq.append(avg_rwd)

    return Q, r_seq

num_iter = 10000

env = gym.make("LunarLander-v2")
Q, r_seq = sarsa_lander(env, render=False, num_iter=num_iter, seg=100)


y = np.array(r_seq)
x = np.linspace(0, num_iter, y.shape[0])
plt.figure(figsize=[10,10])
plt.plot(x, y)
plt.title('10 value discretized SARSA')
plt.savefig("10 value discretized SARSA")
plt.close()

np.savetxt("10 v sarsa.txt", y)

#It:  0  avg reward:  -270.76288673481804  out of  1  trials
#It:  100  avg reward:  -151.3316559044723  out of  100  trials
#It:  200  avg reward:  -201.0847388572972  out of  100  trials
#It:  300  avg reward:  -167.51017130558554  out of  100  trials
#It:  400  avg reward:  -183.28240188606392  out of  100  trials
#It:  500  avg reward:  -180.57487581029562  out of  100  trials
#It:  600  avg reward:  -181.07601001970994  out of  100  trials
#It:  700  avg reward:  -188.65748215688754  out of  100  trials
#It:  800  avg reward:  -176.55047523177464  out of  100  trials
#It:  900  avg reward:  -173.50428412735414  out of  100  trials
#It:  1000  avg reward:  -182.63355617796734  out of  100  trials
#It:  1100  avg reward:  -181.74209219686963  out of  100  trials
#It:  1200  avg reward:  -202.9611884873548  out of  100  trials
#It:  1300  avg reward:  -195.339948063424  out of  100  trials
#It:  1400  avg reward:  -197.19200395417704  out of  100  trials
#It:  1500  avg reward:  -174

## Q-Learning with 10-State Discretization

In [28]:
def qlearning_lander(env, learningrate=0.01, render=True, num_iter=100, seg=100):
    Q = collections.defaultdict(float)
    gamma = 0.95
    r_seq = []
    it_reward = []

    for it in range(num_iter):
        # initialize variables
        total_reward = 0
        steps = 0

        # reset environment
        s = env.reset()

        ds = discrete_states(s)
        a = policy_explorer(ds, Q) #initial action is e-greedy
        # start Sarsa
        while True:
            # discrete state ds initialized either before loop or at end of previous step
            a = policy_explorer(ds, Q) #initial action is e-greedy
            sa = sa_key(ds, a)
            if render:
                env.render()
            sp, r, done, info = env.step(a)
            # update corresponding Q
            dsp = discrete_states(sp)
            a_max = policy_explorer(dsp, Q, epsilon=0) #now select action as pure-greedy
            next_sa = sa_key(dsp, a_max)
            if not done:
                Q[sa] += learningrate*(r + gamma * Q[next_sa] - Q[sa])
            else:
                Q[sa] += learningrate*(r - Q[sa])
            ds = dsp
            total_reward += r
            steps += 1
            if done or steps > 1000:
                it_reward.append(total_reward)
                break
        if it % seg == 0:
            avg_rwd = np.mean(np.array(it_reward))
            print("#It: ", it, " avg reward: ", avg_rwd, " out of ", len(it_reward), " trials")
            it_reward = []
            r_seq.append(avg_rwd)

    return Q, r_seq

num_iter = 10000
env = gym.make("LunarLander-v2")

alphas = [0.6]
for a in alphas:
    Q, r_seq = qlearning_lander(env, learningrate=a, render=False, num_iter=num_iter, seg=100)
    
y = np.array(r_seq)
x = np.linspace(0, num_iter, y.shape[0])
plt.figure(figsize=[10,10])
plt.plot(x, y)
plt.grid()
plt.title(f'10 value discretized Q-Learning')
plt.savefig(f"10 value discretized Q-Learning")
plt.close()

np.savetxt("10 v Q-Learning.txt", y)

#It:  0  avg reward:  -126.28280264053126  out of  1  trials
#It:  100  avg reward:  -159.06212688537084  out of  100  trials
#It:  200  avg reward:  -145.1948299140876  out of  100  trials
#It:  300  avg reward:  -138.0241256207595  out of  100  trials
#It:  400  avg reward:  -151.7131809690659  out of  100  trials
#It:  500  avg reward:  -117.07075064272806  out of  100  trials
#It:  600  avg reward:  -112.74028272597319  out of  100  trials
#It:  700  avg reward:  -127.2454259802638  out of  100  trials
#It:  800  avg reward:  -104.72805764074965  out of  100  trials
#It:  900  avg reward:  -123.98609797971591  out of  100  trials
#It:  1000  avg reward:  -115.77207069122045  out of  100  trials
#It:  1100  avg reward:  -128.10005793801017  out of  100  trials
#It:  1200  avg reward:  -125.10144070557935  out of  100  trials
#It:  1300  avg reward:  -143.5302315308204  out of  100  trials
#It:  1400  avg reward:  -137.40102339788177  out of  100  trials
#It:  1500  avg reward:  -112