<br>

## Loading libraries

In [None]:
import gym
import wandb
import numpy as np

# Log in to Weights & Biases before any run is created.
# The trailing ';' keeps the call's return value out of the cell output.
wandb.login();

<br>
<br>
<br>

## Setting up wandb environment

In [None]:
# Hyper-parameters of the training run. They are handed to wandb as the run
# config and afterwards read back through `wandb.config`.
PARAMS = dict(
    n_states=40,      # number of bins per observation dimension of the Q-table
    episodes=5000,    # number of training episodes
    initial_lr=1.0,   # learning rate at episode 0
    min_lr=0.005,     # floor for the decayed learning rate
    gamma=0.99,       # discount factor (also reused as the learning-rate decay base)
    epsilon=0.05,     # probability of taking a random action
    seed=0,           # seed for the environment and for numpy
)

# Start the wandb run, attach this notebook file to it, and from here on
# access the hyper-parameters via the run's config object.
wandb.init(project='MountainCarProblem', config=PARAMS, reinit=True)
wandb.save('./1_offline_tabular.ipynb')
PARAMS = wandb.config

<br>
<br>
<br>

## Creating and exploring the environment

In [None]:
# Build the environment. `.unwrapped` yields the base environment without the
# wrappers `gym.make` may add, so only the environment itself decides when an
# episode is done.
env_name = 'MountainCar-v0'
env = gym.make(env_name).unwrapped

# Lower / upper bounds of the observation space (used for discretisation).
env_low, env_high = env.observation_space.low, env.observation_space.high

# Q-table: one axis of `n_states` bins per observation dimension (two here),
# plus one axis over the discrete actions; all values start at zero.
q_table = np.zeros(shape=(PARAMS['n_states'],) * 2 + (env.action_space.n,))

In [None]:
# Discretise the states
def discretization(obs, low=None, high=None, n_bins=None):
    """Map a continuous observation to integer bin indices for the Q-table.

    Each observation dimension is split into `n_bins` equal-width bins
    between `low` and `high`; the returned indices are always valid table
    indices in ``[0, n_bins - 1]``.

    Parameters
    ----------
    obs : array-like
        Continuous observation, one value per dimension.
    low, high : array-like, optional
        Per-dimension bounds. Default to the notebook-level `env_low` /
        `env_high`.
    n_bins : int, optional
        Number of bins per dimension. Defaults to ``PARAMS['n_states']``.

    Returns
    -------
    numpy.ndarray
        Integer bin index per observation dimension.
    """
    low = np.asarray(env_low if low is None else low)
    high = np.asarray(env_high if high is None else high)
    if n_bins is None:
        n_bins = PARAMS['n_states']

    bin_width = (high - low) / n_bins
    indices = ((np.asarray(obs) - low) / bin_width).astype(int)

    # An observation sitting exactly on the upper bound would otherwise map to
    # index `n_bins` (one past the end of the table), and one below the lower
    # bound to a negative index that silently wraps around. Clamp both.
    return np.clip(indices, 0, n_bins - 1)

<img src='tabular.png'>

In [None]:
render = False
env.seed(PARAMS['seed'])
np.random.seed(PARAMS['seed'])

# Episodes at which progress is printed; built once instead of per episode.
report_episodes = set(np.linspace(0, PARAMS['episodes'], 11).astype(int).tolist())

for episode in range(PARAMS['episodes']):

    # [1] Resetting the environment and adjusting the learning rate.
    # The rate decays every 100 episodes, floored at `min_lr`. Note that the
    # decay base is PARAMS['gamma'], i.e. the discount factor is reused here.
    obs = env.reset()
    alpha = max(PARAMS['min_lr'], PARAMS['initial_lr']*(PARAMS['gamma']**(episode//100)))
    wandb.log({'lr': alpha}, step=episode)

    # [2] Initiating the agent
    steps = 0
    while True:

        if render:
            env.render()

        # [3] Discretising the observed state
        pos, vel = discretization(obs)

        # [4] Choosing an action (epsilon-greedy).
        # The random action is drawn from numpy's global RNG rather than
        # `env.action_space.sample()`: the action space keeps its own RNG that
        # the seeding above does not touch, which made runs irreproducible.
        if np.random.uniform(low=0, high=1) < PARAMS['epsilon']:
            a = np.random.randint(env.action_space.n)
        else:
            a = np.argmax(q_table[pos, vel])

        # [5] Taking the action, receiving a reward
        obs, reward, done, _ = env.step(a)
        pos_, vel_ = discretization(obs)

        # [6] Q function update.
        # A terminal transition has no future return, so the target is the
        # reward alone; bootstrapping from the successor bin would leak the
        # value of non-terminal states that share that bin.
        future = 0.0 if done else PARAMS['gamma'] * np.max(q_table[pos_, vel_])
        q_table[pos, vel, a] = (1 - alpha) * q_table[pos, vel, a] + alpha * (reward + future)
        steps += 1

        if done:
            if episode in report_episodes:
                print('Episode:', episode, steps)
            wandb.log({'steps': steps}, step=episode)
            break

# Persist the learned table locally and attach it to the wandb run.
np.save('./q_table', q_table)
wandb.save('./q_table.npy');