In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pylab
# Default size (width, height in inches) for every figure drawn in this notebook.
pylab.rcParams['figure.figsize'] = (15.0, 10.0)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [None]:
import q_learner
from q_learner import QLearner, QNetwork
from labeling_network import FullyConnectedLayer, linear

In [None]:
# The single linear layer that makes up the whole Q-network.
# NOTE(review): the arguments (3, 2) presumably mean 3 inputs -> 2 outputs, which would
# match percept_length=3 and n_actions=2 passed to QLearner — confirm against
# labeling_network.FullyConnectedLayer.
layer = FullyConnectedLayer(3, 2, activation_fn=linear)

In [None]:
# Shared hyper-parameters for the experiment.
MB_SIZE = 20   # minibatch size handed to both QNetwork and QLearner
GAMMA = 0.95   # discount factor; used by QLearner and by the tabular baseline in the training cell
BURN_IN = 20   # number of initial iterations that act randomly and skip network training

In [None]:
# Wrap the single layer in a QNetwork; the minibatch size must agree with the one
# given to QLearner (both use MB_SIZE).
q_function = QNetwork([layer], minibatch_size=MB_SIZE)

In [None]:
# Build the learner around the network.
# NOTE: this rebinds the name `q_learner`, which until this point referred to the
# imported module (`import q_learner`); afterwards the module is only reachable
# through the names imported from it (QLearner, QNetwork).
# percept_length=3 matches the length of the one-hot state vectors in `state_transl`
# and n_actions=2 matches the two entries of `action_transl`.
# NOTE(review): exp_store_size and state_stm are presumably the replay-buffer capacity
# and the number of stacked percepts per state — confirm against q_learner.QLearner.
q_learner = QLearner(q_function,
                    exp_store_size=10000,
                    percept_length=3,
                    n_actions=2,
                    state_stm=1,
                    gamma=GAMMA,
                    minibatch_size=MB_SIZE,
                    prng=np.random.RandomState(12345678))

## MDP test class.

In [None]:
class MDP(object):
    """Minimal finite Markov decision process used to exercise the Q-learner.

    All containers are stored by reference (not copied):

    * ``states`` / ``actions`` -- lists of identifiers.
    * ``transitions`` -- dict mapping ``(state, action)`` to a list of
      ``(next_state, probability)`` pairs.
    * ``rewards`` -- dict mapping ``(state, action, next_state)`` to a reward.
    * ``current_state`` -- the state the process is currently in.
    """

    def __init__(self, states, actions, transitions, rewards, init_state):
        self.states = states
        self.actions = actions
        self.transitions = transitions
        self.rewards = rewards
        self.current_state = init_state

    def step(self, action):
        """Advance the process by taking ``action`` in the current state.

        If no transition is registered for ``(current_state, action)`` the
        state is left unchanged.
        """
        key = (self.current_state, action)
        if key in self.transitions:
            self.current_state = self.sample_state(self.transitions[key])

    def add_state(self, state):
        """Register ``state`` unless it is already known."""
        if state not in self.states:
            self.states.append(state)

    def add_action(self, action):
        """Register ``action`` unless it is already known."""
        if action not in self.actions:
            self.actions.append(action)

    def add_transition(self, f, a, ts):
        """Set the outcome distribution of taking action ``a`` in state ``f``.

        ``ts`` is a list of ``(next_state, probability)`` pairs; any previous
        entry for ``(f, a)`` is overwritten.
        """
        self.transitions[(f, a)] = ts

    def add_reward(self, f, a, t, r):
        """Set the reward ``r`` for moving from ``f`` to ``t`` via action ``a``."""
        self.rewards[(f, a, t)] = r

    def get_reward(self, f, a, t):
        """Return the reward for ``(f, a, t)``, or 0 if none was registered."""
        return self.rewards.get((f, a, t), 0)

    def sample_state(self, ts):
        """Draw a next state from ``ts``, a list of ``(state, probability)`` pairs.

        A single uniform number from ``[0, 1)`` is compared against the running
        cumulative probability. If the probabilities add up to slightly less
        than the drawn number (floating-point round-off), the last listed state
        is returned instead of indexing past the end of the list.

        Raises:
            ValueError: if ``ts`` is empty.
        """
        if not ts:
            raise ValueError('cannot sample from an empty transition list')

        choice = np.random.uniform(0, 1)
        cumulative = 0.0
        for state, probability in ts:
            cumulative += probability
            # Strict '<' so that a draw of exactly 0.0 selects the first state
            # with non-zero probability rather than wrapping to the last one.
            if choice < cumulative:
                return state
        return ts[-1][0]
            
    

## Generate MDPs. 

In [None]:
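# Disabled alternative: a two-state MDP kept for reference. Its state encodings have
# length 2, so enabling it presumably also requires a 2-input layer and percept_length=2.
# states = ['q0', 'q1']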
# actions = ['a0', 'a1']

# mdp = MDP(states, actions, {}, {}, 'q0')


# mdp.add_transition('q0', 'a0', [('q0', 0.8), ('q1', 0.2)])
# mdp.add_transition('q0', 'a1', [('q0', 1.0)])

# mdp.add_transition('q1', 'a0', [('q0', 0.1), ('q1', 0.9)])
# mdp.add_transition('q1', 'a1', [('q0', 1.0)])


# mdp.add_reward('q0', 'a0', 'q0', -1.0)
# mdp.add_reward('q0', 'a1', 'q0',  1.0)
# mdp.add_reward('q1', 'a0', 'q1',  4.0)
# mdp.add_reward('q1', 'a1', 'q0',  5.0)


# state_transl = dict()
# state_transl['q0'] = [1.0, 0.0]
# state_transl['q1'] = [0.0, 1.0]

# action_transl = dict()
# action_transl['a0'] = 0
# action_transl['a1'] = 1

# action_inv_transl = dict((v, k) for k, v in action_transl.iteritems())

In [None]:
# Three-state, two-action MDP on which the Q-learner is trained.
states = ['q0', 'q1', 'q2']
actions = ['a0', 'a1']

# (state, action) -> [(next_state, probability), ...]
transitions = {
    ('q0', 'a0'): [('q0', 0.8), ('q1', 0.2)],
    ('q0', 'a1'): [('q0', 0.8), ('q1', 0.2)],

    ('q1', 'a0'): [('q0', 0.40), ('q1', 0.60)],
    ('q1', 'a1'): [('q0', 0.10), ('q2', 0.90)],

    ('q2', 'a0'): [('q0', 0.30), ('q2', 0.70)],
    ('q2', 'a1'): [('q1', 0.50), ('q2', 0.50)],
}

# (state, action, next_state) -> reward; triples not listed yield a reward of 0.
rewards = {
    ('q0', 'a0', 'q0'): -5.0,
    ('q0', 'a0', 'q1'): -5.0,

    ('q0', 'a1', 'q0'): -5.0,
    ('q0', 'a1', 'q1'): -5.0,

    ('q1', 'a0', 'q0'):  2.0,
    ('q1', 'a0', 'q1'):  2.0,

    ('q1', 'a1', 'q0'):  1.0,
    ('q1', 'a1', 'q2'):  0.0,

    ('q2', 'a0', 'q0'):  1.0,
    ('q2', 'a0', 'q2'): 10.0,

    ('q2', 'a1', 'q1'):  0.0,
    ('q2', 'a1', 'q2'):  4.0,
}

# Every outcome distribution must be a proper probability distribution.
assert all(abs(sum(p for _, p in outcomes) - 1.0) < 1e-9
           for outcomes in transitions.values()), 'transition probabilities must sum to 1'

# MDP keeps references to the two tables, so this is equivalent to registering
# every entry through add_transition / add_reward.
mdp = MDP(states, actions, transitions, rewards, 'q0')


# One-hot percept fed to the Q-network for each state.
state_transl = {
    'q0': [1.0, 0.0, 0.0],
    'q1': [0.0, 1.0, 0.0],
    'q2': [0.0, 0.0, 1.0],
}

# Action name -> integer action id used by the learner, and the inverse mapping.
action_transl = {'a0': 0, 'a1': 1}

# .items() instead of .iteritems() so the cell runs on both Python 2 and Python 3.
action_inv_transl = dict((v, k) for k, v in action_transl.items())

## Train Q-Learner. 

In [None]:
def smooth(a, smoothn=10):
    """Return a moving average of the sequence ``a``.

    For every index ``k`` with ``smoothn <= k < len(a) - smoothn`` the mean of
    the ``2 * smoothn`` samples ``a[k - smoothn : k + smoothn]`` is computed.
    The result is a NumPy array with ``len(a) - 2 * smoothn`` entries, or an
    empty array when ``a`` is too short for a single full window.
    """
    window_means = []
    for centre in range(smoothn, len(a) - smoothn):
        window = a[centre - smoothn:centre + smoothn]
        window_means.append(np.mean(window))
    return np.asarray(window_means)

In [None]:
# --- Experiment configuration ---------------------------------------------------
N_ITERATIONS = 100000            # environment steps
N_TRAIN_ITERATIONS_PER_IT = 4    # network training calls per environment step
Q_NET_LEARNING_RATE = 0.0001     # value passed to q_learner.train_q_function

# Exploration rate: decays linearly from epsilon_start to epsilon_end over the first
# epsilon_decrease_duration iterations, then stays at epsilon_end.
epsilon_start = 1.0
epsilon_end = 0.00
epsilon_decrease_duration = 90000

# --- Logs consumed by the plotting / printing cells -------------------------------
costs = []            # one entry per train_q_function call
q_values = []         # per iteration: flattened network Q-values for all states
total_rewards = [0]   # cumulative reward, starting at 0
action_log = []       # action taken at every iteration

# --- Tabular Q-learning baseline, updated on the same stream of transitions -------
alt_learning_rate = 0.01
alt_qs = {(s, a): 0. for s in states for a in actions}

# NOTE(review): the global NumPy RNG used below (and inside MDP.sample_state) is never
# seeded, so runs are not reproducible even though QLearner receives a seeded prng.

start_time = time.time()
# range/print() below are valid on both Python 2 and Python 3.
for i in range(N_ITERATIONS):
    epsilon = max(epsilon_end,
                  epsilon_start - 1.0*i*(epsilon_start - epsilon_end)/epsilon_decrease_duration)

    last_state = mdp.current_state

    # Epsilon-greedy action selection; always random during burn-in.
    if np.random.uniform(0,1) < epsilon or i < BURN_IN:
        action = np.random.choice(mdp.actions)
    else:
        action_id = q_learner.get_current_best_action()
        action = action_inv_transl[action_id]

    mdp.step(action)

    # Reward earned by the transition last_state --action--> current_state.
    previous_reward = mdp.get_reward(last_state, action, mdp.current_state)

    total_rewards.append(total_rewards[-1] + previous_reward)
    action_log.append(action)

    # Hand the learner the encoding of the state just reached, together with the
    # action id and reward that led to it.
    q_learner.add_observation(state_transl[mdp.current_state],
                              action_transl[action], previous_reward)

    if i >= BURN_IN:
        for j in range(N_TRAIN_ITERATIONS_PER_IT):
            cost = q_learner.train_q_function(Q_NET_LEARNING_RATE)
            costs.append(cost)

    # Evaluate the network's Q-values for every state (logged during burn-in too).
    q_values.append(np.ndarray.flatten(
            np.asarray([q_learner.q_function.get_q_values(state_transl[s]) for s in states])))

    # Tabular update: Q(s,a) <- (1-lr)*Q(s,a) + lr*(r + GAMMA*max_a' Q(s',a')).
    best_next_q = np.max([alt_qs[(mdp.current_state, a)] for a in actions])
    alt_qs[(last_state, action)] = (1. - alt_learning_rate)*alt_qs[(last_state, action)] + \
                                    alt_learning_rate*(previous_reward + GAMMA*best_next_q)

end_time = time.time()

print('Time per 1000 iterations: %f s' % (1000.0*(end_time - start_time) / N_ITERATIONS))
print('Mean cost: %f' % (np.mean(costs)))

## Plot progress. 

In [None]:
# Figure 1: training cost of the Q-network, smoothed with a moving average.
fig, ax = plt.subplots()
ax.set_title('Mean Q-error (smoothed)')
ax.plot(smooth(costs, smoothn=20))
plt.show()


# Figure 2: one curve per (state; action) pair, showing the network's Q-value over time.
# Labels are generated state-major so they line up with the flattened rows of q_values.
fig, ax = plt.subplots()
ax.set_title('All Q-values')
q_labels = np.asarray([s + ';' + a for s in states for a in actions]).flatten()
for curve, curve_label in zip(np.transpose(q_values), q_labels):
    ax.plot(curve, label=curve_label)
ax.legend()
plt.show()


# Figure 3: cumulative reward collected so far.
fig, ax = plt.subplots()
ax.set_title('Total reward')
ax.plot(total_rewards)
plt.show()



In [None]:
# Q-values predicted by the trained network, one line per state.
# Written with print(...) on a single pre-formatted string so the cell produces the
# same output on Python 2 and Python 3.
print('')
for q in states:
    print('%s: %s' % (q, q_learner.q_function.get_q_values(state_transl[q])))

In [None]:
# Q-values of the tabular baseline, for comparison with the network's values.
# Iterating the dict directly replaces the Python-2-only iterkeys(); sorting the keys
# makes the output order deterministic (state-major, like the network print-out).
for k in sorted(alt_qs):
    print('%s : %s' % (k, alt_qs[k]))

In [None]:
# Scratch: a 14x4 array of consecutive integers, used to try out row selection with
# an index list. print(...) form runs on both Python 2 and Python 3.
t = np.arange(4*14).reshape(14,4)
print(t)
print('')
print(t[[2, 3]])

In [None]:
# Scratch: expand each start index into a row of `state_stm` consecutive indices,
# i.e. aug_indices[r, c] == indices[r] + c.
indices = np.array([2, 3, 5, 9, 7])
state_stm = 3
# Broadcasting instead of a module-level list comprehension: on Python 2 the
# comprehension variable `i` would leak into the notebook namespace and overwrite
# the training loop's counter.
aug_indices = indices[:, np.newaxis] + np.arange(state_stm)


In [None]:
# Scratch: take the last (state_stm - exp_counter) rows of `t` followed by its first
# exp_counter rows, as when reading a window that wraps around the start of a buffer.
# np.append without `axis` flattens both pieces into one 1-D array.
# NOTE(review): if a 2-D result (rows kept intact) was intended, axis=0 is needed — confirm.
state_stm = 5
exp_counter = 2

print(np.append(t[exp_counter-state_stm:], t[:exp_counter]))