tabular_q_learner.py
import random

import numpy as np


# Tabular Q-learning: states and actions are discrete, and Q-values
# are stored in a lookup table indexed by (state, action).
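# The rule implemented in updateModel below is the standard tabular
# Q-learning update (alpha is the learning rate, gamma the discount):
#   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))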
class QLearner(object):
    def __init__(self, state_dim,
                 num_actions,
                 init_exp=0.5,          # initial exploration prob
                 final_exp=0.0,         # final exploration prob
                 anneal_steps=500,      # N steps for annealing exploration
                 alpha=0.2,             # learning rate
                 discount_factor=0.9):  # discount future rewards

        # Q-learning parameters
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.exploration = init_exp
        self.init_exp = init_exp
        self.final_exp = final_exp
        self.anneal_steps = anneal_steps
        self.discount_factor = discount_factor
        self.alpha = alpha

        # counters
        self.train_iteration = 0

        # table of Q-values, randomly initialized in [-1, 1)
        self.qtable = np.random.uniform(low=-1, high=1, size=(state_dim, num_actions))
    # record the starting state and return the greedy action for it
    def initializeState(self, state):
        self.state = state
        self.action = self.qtable[state].argmax()
        return self.action
    # select an action based on an epsilon-greedy strategy
    def eGreedyAction(self, state):
        if self.exploration > random.random():
            action = random.randint(0, self.num_actions - 1)
        else:
            action = self.qtable[state].argmax()
        return action
    # do one Q-learning update for the transition into `state` with `reward`
    def updateModel(self, state, reward):
        action = self.eGreedyAction(state)
        self.train_iteration += 1
        self.annealExploration()

        # Q-learning is off-policy: the target bootstraps off the best
        # Q-value in the next state, not the epsilon-greedy action's value
        target = reward + self.discount_factor * self.qtable[state].max()
        self.qtable[self.state, self.action] = (
            (1 - self.alpha) * self.qtable[self.state, self.action]
            + self.alpha * target
        )

        self.state = state
        self.action = action
        return self.action
    # linearly anneal the exploration rate from init_exp down to final_exp
    def annealExploration(self, strategy='linear'):
        ratio = max((self.anneal_steps - self.train_iteration) / float(self.anneal_steps), 0)
        self.exploration = (self.init_exp - self.final_exp) * ratio + self.final_exp
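

# A minimal usage sketch, not part of the original file: it assumes a
# Gym-style environment with discrete states and actions, and the classic
# gym API ('FrozenLake-v0', reset() returning an observation, and a
# 4-tuple step() return are all illustrative assumptions here).
if __name__ == "__main__":
    import gym

    env = gym.make('FrozenLake-v0')
    learner = QLearner(state_dim=env.observation_space.n,
                       num_actions=env.action_space.n)

    for episode in range(1000):
        state = env.reset()
        action = learner.initializeState(state)
        done = False
        while not done:
            state, reward, done, _ = env.step(action)
            action = learner.updateModel(state, reward)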