# Reinforcement Learning Assignment - Youssef IRHBOULA

## Importing libraries

In [1]:
import numpy as np 
from tqdm import tqdm
import plotly.graph_objects as go
import os, sys
import gymnasium as gym
import time
import text_flappy_bird_gym
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from collections import defaultdict

## Creating a first agent: SARSA agent

In [21]:
class AgentSARSA():
    """Tabular epsilon-greedy SARSA agent for a Gymnasium-style environment.

    Observations are 2-tuples that are mapped to integer row indices of the
    Q-table through ``map_dict``; the Q-table has one column per action.

    Parameters
    ----------
    epsilon : float
        Probability of taking a random (exploratory) action in ``agent_step``.
    alpha : float
        Step size of the SARSA update.
    gamma : float
        Discount factor applied to the bootstrapped next state-action value.
    env :
        Environment exposing ``observation_space`` (indexable, each entry
        having ``start`` and ``n``), ``action_space`` (``n`` and ``sample()``),
        ``reset()`` and ``step(action)``.
    """

    def __init__(self, epsilon, alpha, gamma, env):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.env = env

    def map_space(self):
        """Build ``self.map_dict``: observation tuple -> Q-table row index."""
        first = self.env.observation_space[0]
        second = self.env.observation_space[1]
        # NOTE(review): the bounds treat `.n` as an exclusive upper bound and
        # extend the second dimension one step below `.start`; presumably this
        # matches the observations TextFlappyBird emits -- confirm.
        pairs = (
            (i, j)
            for i in range(first.start, first.n)
            for j in range(second.start - 1, second.n)
        )
        self.map_dict = {pair: index for index, pair in enumerate(pairs)}

    def agent_init(self):
        """Create the state mapping and reset the Q-table to zeros."""
        self.map_space()
        self.Q = np.zeros((len(self.map_dict), self.env.action_space.n))

    @staticmethod
    def argmax(q_values):
        """Return the index of the largest value, breaking ties at random."""
        ties = np.argwhere(q_values == np.amax(q_values)).flatten()
        return np.random.choice(ties)

    def agent_step(self, obs):
        """Pick an epsilon-greedy action for the state index ``obs``.

        Greedy ties are broken at random so that an all-zero Q-table does not
        systematically favour action 0.
        """
        if np.random.rand() > self.epsilon:
            return self.argmax(self.Q[obs, :])
        return self.env.action_space.sample()

    def policy(self, obs):
        """Return the greedy action for the raw observation tuple ``obs``."""
        obs = self.map_dict[obs]
        return np.argmax(self.Q[obs, :])

    def Q_update(self, new_obs, new_action, reward, done=False):
        """Apply one SARSA update to ``Q[last_state, last_action]``.

        Parameters
        ----------
        new_obs, new_action :
            State index and action reached after the last transition.
        reward :
            Reward received on that transition.
        done : bool, optional
            When True the transition ended the episode, so nothing is
            bootstrapped from the successor state-action value.
        """
        bootstrap = 0.0 if done else self.gamma * self.Q[new_obs, new_action]
        current = self.Q[self.last_state, self.last_action]
        self.Q[self.last_state, self.last_action] = current + self.alpha * (
            reward + bootstrap - current
        )

    def _evaluate(self, n_games, max_score):
        """Play ``n_games`` greedy games; return (mean score, mean reward)."""
        scores = []
        rewards = []
        for _ in range(n_games):
            obs, _ = self.env.reset()
            score = 0
            total_reward = 0
            # Once max_score is reached the game is considered finished.
            while score < max_score:
                action = self.policy(obs)
                obs, reward, terminated, truncated, info = self.env.step(action)
                total_reward += reward
                score = info["score"]
                if terminated or truncated:
                    break
            scores.append(score)
            rewards.append(total_reward)
        return np.mean(scores), np.mean(rewards)

    def train(self, num_iter, history=False, N_=500, max_score=500, N_test=None):
        """Train for ``num_iter`` episodes with on-policy SARSA.

        Parameters
        ----------
        num_iter : int
            Number of training episodes.
        history : bool
            When True, the greedy policy is evaluated every 50 episodes up to
            episode 1000 and every 200 episodes afterwards; results are stored
            in ``self.num_ep``, ``self.S`` (mean score) and ``self.R`` (mean
            reward).
        N_ : int
            Number of evaluation games per checkpoint.
        max_score : int
            Score at which an evaluation game is stopped.
        N_test : int, optional
            Alias of ``N_`` (same keyword as ``AgentMC.train``); overrides
            ``N_`` when given.

        The per-episode training reward is appended to ``self.rewards``.
        """
        if N_test is not None:
            N_ = N_test
        self.agent_init()
        self.rewards = []
        self.S = []
        self.R = []
        self.num_ep = []
        for i in tqdm(range(num_iter)):
            reward_i = 0

            obs, _ = self.env.reset()
            obs = self.map_dict[obs]
            action = self.agent_step(obs)
            self.last_state = obs
            self.last_action = action

            while True:
                obs, reward, terminated, truncated, _ = self.env.step(action)
                obs = self.map_dict[obs]

                action = self.agent_step(obs)
                # Only a true termination zeroes the bootstrap term.
                self.Q_update(obs, action, reward, done=terminated)

                reward_i += reward
                self.last_action = action
                self.last_state = obs
                if terminated or truncated:
                    self.rewards.append(reward_i)
                    break

            checkpoint = (i <= 1000 and i % 50 == 0) or (i > 1000 and i % 200 == 0)
            if history and checkpoint:
                mean_score, mean_reward = self._evaluate(N_, max_score)
                self.num_ep.append(i)
                self.S.append(mean_score)
                self.R.append(mean_reward)

    def plot_reward(self, num_iter=None):
        """Plot the reward collected in each training episode.

        ``num_iter`` is accepted for backward compatibility only; the x axis
        is derived from the number of recorded episode rewards.
        """
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=np.arange(len(self.rewards)), y=self.rewards, name="rewards during Train"))
        fig.update_layout(title="Rewards during Train for SARSA Agent", xaxis_title="Episode Number", yaxis_title="Rewards")
        fig.show()

In [27]:
# Build the Text Flappy Bird environment and train a SARSA agent on it,
# recording periodic greedy-policy evaluations (history=True).
env = gym.make("TextFlappyBird-v0", height=15, width=20, pipe_gap=4)
agentSARSA = AgentSARSA(epsilon=0.9, alpha=0.1, gamma=1, env=env)
agentSARSA.train(num_iter=10000, history=True)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [29:53<00:00,  5.58it/s]


In [28]:
# Mean evaluation score of the SARSA agent at each recorded training episode.
fig = go.Figure()
sarsa_score_trace = go.Scatter(
    x=agentSARSA.num_ep,
    y=agentSARSA.S,
    line_color="royalblue",
)
fig.add_trace(sarsa_score_trace)

## Second agent: Monte Carlo

In [46]:
class AgentMC():
    """Tabular first-visit Monte Carlo control agent (epsilon-greedy).

    Parameters
    ----------
    epsilon : float
        Probability of taking a random (exploratory) action in ``agent_step``.
    batch_size : int
        Maximum number of steps collected per generated episode.
    gamma : float
        Discount factor used when computing returns.
    env :
        Environment exposing ``observation_space`` (indexable, each entry
        having ``start`` and ``n``), ``action_space`` (``n`` and ``sample()``),
        ``reset()`` and ``step(action)``.
    """

    def __init__(self, epsilon, batch_size, gamma, env):
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.env = env
        self.gamma = gamma

    def map_space(self):
        """Build ``self.map_dict``: observation tuple -> table row index."""
        first = self.env.observation_space[0]
        second = self.env.observation_space[1]
        # NOTE(review): the bounds treat `.n` as an exclusive upper bound and
        # extend the second dimension one step below `.start`; presumably this
        # matches the observations TextFlappyBird emits -- confirm.
        pairs = (
            (i, j)
            for i in range(first.start, first.n)
            for j in range(second.start - 1, second.n)
        )
        self.map_dict = {pair: index for index, pair in enumerate(pairs)}

    def agent_init(self):
        """Create the state mapping and zero the Q, return-sum and count tables."""
        self.map_space()
        shape = (len(self.map_dict), self.env.action_space.n)
        self.Q = np.zeros(shape)
        self.R = np.zeros(shape)          # running sum of first-visit returns
        self.N_samples = np.zeros(shape)  # number of first visits

    @staticmethod
    def argmax(q_values):
        """Return the index of the largest value, breaking ties at random."""
        ties = np.argwhere(q_values == np.amax(q_values)).flatten()
        return np.random.choice(ties)

    def policy(self, obs):
        """Return the greedy action for the raw observation tuple ``obs``."""
        obs = self.map_dict[obs]
        return np.argmax(self.Q[obs, :])

    def agent_step(self, obs):
        """Pick an epsilon-greedy action for the raw observation ``obs``.

        Greedy ties are broken at random. The chosen action is also stored in
        ``self.last_action``.
        """
        obs = self.map_dict[obs]
        if np.random.rand() > self.epsilon:
            action = self.argmax(self.Q[obs, :])
        else:
            action = self.env.action_space.sample()
        self.last_action = action
        return action

    def generate_episode(self):
        """Run one episode of at most ``batch_size`` steps.

        Returns
        -------
        (episode, reward_episode)
            ``episode`` is a list of ``(observation, action, reward)`` tuples
            and ``reward_episode`` the undiscounted sum of its rewards.
        """
        episode = []
        obs, _ = self.env.reset()
        reward_episode = 0
        for _ in range(self.batch_size):
            action = self.agent_step(obs)
            new_obs, reward, terminated, truncated, _ = self.env.step(action)

            episode.append((obs, action, reward))
            reward_episode += reward
            if terminated or truncated:
                break
            obs = new_obs
        return episode, reward_episode

    def _update_from_episode(self, episode):
        """First-visit MC update of Q from one ``(obs, action, reward)`` list."""
        # Discounted return from each step, accumulated backwards:
        # G_t = r_t + gamma * G_{t+1}.
        returns = [0.0] * len(episode)
        running = 0.0
        for t in range(len(episode) - 1, -1, -1):
            running = episode[t][2] + self.gamma * running
            returns[t] = running

        seen = set()
        for t, (o, a, _) in enumerate(episode):
            if (o, a) in seen:
                continue  # only the first visit of a pair contributes
            seen.add((o, a))
            state = self.map_dict[o]
            self.R[state, a] += returns[t]
            self.N_samples[state, a] += 1
            self.Q[state, a] = self.R[state, a] / self.N_samples[state, a]

    def _evaluate(self, n_games, max_score):
        """Play ``n_games`` greedy games; return (mean score, mean reward)."""
        scores = []
        rewards = []
        for _ in range(n_games):
            obs, _ = self.env.reset()
            score = 0
            total_reward = 0
            # Once max_score is reached the game is considered finished.
            while score < max_score:
                action = self.policy(obs)
                obs, reward, terminated, truncated, info = self.env.step(action)
                total_reward += reward
                score = info["score"]
                if terminated or truncated:
                    break
            scores.append(score)
            rewards.append(total_reward)
        return np.mean(scores), np.mean(rewards)

    def train(self, num_iterations, history=False, N_test=500, max_score=500):
        """Train with first-visit Monte Carlo control.

        Parameters
        ----------
        num_iterations : int
            Number of episodes to generate and learn from.
        history : bool
            When True, the greedy policy is evaluated every 50 episodes up to
            episode 1000 and every 200 episodes afterwards; results are stored
            in ``self.num_ep``, ``self.S`` (mean score) and ``self.Rew`` (mean
            reward).
        N_test : int
            Number of evaluation games per checkpoint.
        max_score : int
            Score at which an evaluation game is stopped.

        The per-episode training reward is appended to ``self.rewards``.
        """
        self.agent_init()
        self.rewards = []
        self.S = []
        self.Rew = []
        self.num_ep = []
        for j in tqdm(range(num_iterations)):
            episode, reward_episode = self.generate_episode()
            self.rewards.append(reward_episode)
            self._update_from_episode(episode)

            # The checkpoint schedule is driven by the episode index j.
            checkpoint = (j <= 1000 and j % 50 == 0) or (j > 1000 and j % 200 == 0)
            if history and checkpoint:
                mean_score, mean_reward = self._evaluate(N_test, max_score)
                self.num_ep.append(j)
                self.S.append(mean_score)
                self.Rew.append(mean_reward)

    def plot_reward(self):
        """Plot the reward collected in each training episode."""
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=np.arange(len(self.rewards)), y=self.rewards, name="rewards during Train"))
        fig.update_layout(title="Rewards during Train for Monte-Carlo Agent", xaxis_title="Episode Number", yaxis_title="Rewards")
        fig.show()

In [32]:
# Build a fresh Text Flappy Bird environment and train a Monte Carlo agent,
# recording periodic greedy-policy evaluations (history=True).
env = gym.make("TextFlappyBird-v0", height=15, width=20, pipe_gap=4)
agentMC = AgentMC(epsilon=0.1, batch_size=100, gamma=0.5, env=env)
agentMC.train(num_iterations=10000, history=True)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [22:34<00:00,  7.38it/s]


In [34]:
# Mean evaluation score of the Monte Carlo agent at each recorded episode.
fig = go.Figure()
mc_score_trace = go.Scatter(x=agentMC.num_ep, y=agentMC.S)
fig.add_trace(mc_score_trace)

## Influence of parameters

### SARSA agent

#### epsilon

In [39]:
# Sweep the exploration rate of the SARSA agent and compare the evaluation
# score curves. The class defined in this notebook is AgentSARSA, and its
# train() names the number of evaluation games `N_`.
fig = go.Figure()
eps = [0.01, 0.1, 0.3, 0.6, 0.9, 0.99]

for e in eps:
    S = AgentSARSA(e, .1, 1, env)
    S.train(5000, history=True, N_=100, max_score=1000)
    fig.add_trace(go.Scatter(x=S.num_ep, y=S.S, name=f"Epsilon = {e}"))
fig.show()

100%|████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:03<00:00, 1479.46it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:03<00:00, 1436.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:07<00:00, 673.25it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:53<00:00, 43.89it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [05:56<00:00, 14.01it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [04:21<00:00, 19.15it/s]


In [40]:
# Sweep the step size of the SARSA agent and compare the evaluation score
# curves. The class defined in this notebook is AgentSARSA, and its train()
# names the number of evaluation games `N_`.
alpha = [0.01, 0.1, 0.3, 0.6, 0.9, 0.99]
fig = go.Figure()
for a in alpha:
    S = AgentSARSA(.6, a, 1, env)
    S.train(5000, history=True, N_=100, max_score=1000)
    fig.add_trace(go.Scatter(x=S.num_ep, y=S.S, name=f"Alpha = {a}"))
fig.show()

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:14<00:00, 338.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [02:01<00:00, 41.22it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [11:49<00:00,  7.05it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [03:10<00:00, 26.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:03<00:00, 79.12it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:34<00:00, 144.20it/s]


In [42]:
# Sweep the discount factor of the SARSA agent and compare the evaluation
# score curves. The class defined in this notebook is AgentSARSA, and its
# train() names the number of evaluation games `N_`.
gamma = [0.01, 0.1, 0.3, 0.6, 0.9, 0.99]
fig = go.Figure()
for g in gamma:
    S = AgentSARSA(.6, .1, g, env)
    S.train(5000, history=True, N_=100, max_score=1000)
    # Label each curve with its own discount value (the loop variable g).
    fig.add_trace(go.Scatter(x=S.num_ep, y=S.S, name=f"Discount = {g}"))
fig.show()

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:07<00:00, 703.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:15<00:00, 329.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:27<00:00, 181.98it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [06:56<00:00, 12.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [04:04<00:00, 20.47it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [05:47<00:00, 14.40it/s]


### MC agent

In [48]:
# Sweep the exploration rate of the Monte Carlo agent and overlay the
# evaluation score curve obtained for each value.
fig = go.Figure()
eps = [0.01, 0.1, 0.3, 0.6, 0.9, 0.99]

for e in eps:
    MC = AgentMC(epsilon=e, batch_size=100, gamma=1, env=env)
    MC.train(num_iterations=5000, history=True, N_test=100, max_score=1000)
    curve = go.Scatter(x=MC.num_ep, y=MC.S, name=f"Epsilon = {e}")
    fig.add_trace(curve)
fig.show()

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:10<00:00, 475.87it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [02:42<00:00, 30.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [04:52<00:00, 17.06it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 437.02it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2280.00it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2660.45it/s]


In [51]:
# Sweep the discount factor of the Monte Carlo agent and overlay the
# evaluation score curve obtained for each value.
fig = go.Figure()
gamma = [0.01, 0.1, 0.3, 0.6, 0.9, 0.99]

for g in gamma:
    MC = AgentMC(epsilon=0.1, batch_size=100, gamma=g, env=env)
    MC.train(num_iterations=5000, history=True, N_test=100, max_score=1000)
    curve = go.Scatter(x=MC.num_ep, y=MC.S, name=f"Discount = {g}")
    fig.add_trace(curve)
fig.show()

100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [03:07<00:00, 26.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:27<00:00, 181.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:25<00:00, 58.47it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [04:21<00:00, 19.12it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [04:53<00:00, 17.05it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:13<00:00, 67.72it/s]


In [52]:
# Train one Monte Carlo agent and one SARSA agent and overlay their evaluation
# score curves. Both agents are evaluated with the same settings (100 test
# games, games stopped at a score of 1000) so the curves are comparable.
fig = go.Figure()
MC = AgentMC(.1, 100, .9, env)
MC.train(10000, history=True, N_test=100, max_score=1000)
fig.add_trace(go.Scatter(x=MC.num_ep, y=MC.S, name='Monte-Carlo'))
S = AgentSARSA(0.5, 0.1, 0.9, env)
# AgentSARSA.train names the number of evaluation games `N_`.
S.train(10000, history=True, N_=100, max_score=1000)
fig.add_trace(go.Scatter(x=S.num_ep, y=S.S, name='SARSA'))

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [13:32<00:00, 12.30it/s]
100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [16:24<00:00, 10.16it/s]


## State-value functions

In [57]:
def td_prediction(env, agent, ep, gamma, alpha, max_score=1000):
    """Estimate the state-value function of ``agent.policy`` with TD(0).

    Parameters
    ----------
    env :
        Environment whose ``reset()`` returns ``(observation, info)`` and whose
        ``step(action)`` returns a 5-tuple ending with an info dict that
        contains a ``"score"`` entry.
    agent :
        Object exposing ``policy(observation) -> action``.
    ep : int
        Number of episodes to run.
    gamma : float
        Discount factor.
    alpha : float
        TD step size.
    max_score : int, optional
        Score at which an episode is cut off (default 1000).

    Returns
    -------
    collections.defaultdict
        Mapping from observation to its estimated value (0.0 for observations
        that were never updated).
    """
    V = defaultdict(float)
    for _ in tqdm(range(ep)):
        S, _ = env.reset()
        score = 0
        while score < max_score:
            A = agent.policy(S)
            state_S, state_R, state_done, _, state_info = env.step(A)
            # A terminal transition has no future value to bootstrap from.
            next_value = 0.0 if state_done else V[state_S]
            V[S] += alpha * (state_R + gamma * next_value - V[S])
            S = state_S
            score = state_info["score"]
            if state_done:
                break
    return V

In [58]:
# Estimate the state-value function of each trained agent with TD(0) and draw
# the two estimates side by side as heatmaps over the observation grid.
fig = make_subplots(rows=1, cols=2)
agents = [S, MC]
Y, Y_ = env.observation_space[0].start, env.observation_space[0].n
X, X_ = env.observation_space[1].start - 1, env.observation_space[1].n
row_values = np.arange(Y, Y_)
col_values = np.arange(X, X_)
for q, agent in enumerate(agents):
    V = td_prediction(env, agent, ep=100, gamma=0.9, alpha=0.1)
    # One row per first observation component, one column per second one.
    V_mesh = [[V[(i, j)] for j in range(X, X_)] for i in range(Y, Y_)]
    heatmap = go.Heatmap(z=V_mesh, y=row_values, x=col_values)
    fig.add_trace(heatmap, row=1, col=q + 1)

fig.update_layout(
    title='Value-State Function',
    xaxis_title='Y',
    yaxis_title='X',
    coloraxis_colorbar=dict(title='Value'),
    title_x=0.5,
)
fig.show()

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:16<00:00,  6.20it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:24<00:00,  4.04it/s]
