In [None]:
# Reload imported modules automatically before executing each cell, so edits
# to the local project modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import pylab
# Default figure size (width, height) in inches for every plot in this notebook.
pylab.rcParams['figure.figsize'] = (15.0, 10.0)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [None]:
import q_learner
from q_learner import QLearner, QNetwork
from labeling_network import FullyConnectedLayer, linear

In [None]:
# Single fully connected layer with a linear activation function.
# NOTE(review): the two positional 2s are presumably the input and output
# sizes (matching percept_length=2 and n_actions=2 passed to QLearner) --
# confirm against labeling_network.FullyConnectedLayer.
layer = FullyConnectedLayer(2, 2, activation_fn=linear)

In [None]:
# Minibatch size; passed to both the QNetwork and the QLearner constructors.
MB_SIZE = 20

In [None]:
# Q-function approximator built from the single layer created above.
q_function = QNetwork([layer], minibatch_size=MB_SIZE)

In [None]:
# Q-learning agent wrapping the Q-network.
# NOTE(review): this rebinds the name `q_learner`, shadowing the `q_learner`
# module imported in an earlier cell. Re-running that import cell afterwards
# makes `q_learner` refer to the module again, and the cells below will fail
# until this cell is re-run.
# percept_length=2 and n_actions=2 match the 2-element state vectors and the
# two actions defined further down in this notebook.
# NOTE(review): gamma is presumably the discount factor and state_stm the
# number of past percepts forming a state -- confirm against QLearner.
q_learner = QLearner(q_function,
                    exp_store_size=10000,
                    percept_length=2,
                    n_actions=2,
                    state_stm=1,
                    gamma=0.90,
                    minibatch_size=MB_SIZE,
                    prng=np.random.RandomState(1234))  # fixed seed for the learner's own RNG

## MDP test class

In [None]:
class MDP(object):
    """Small tabular Markov decision process used to exercise the Q-learner.

    Attributes:
        states: list of state identifiers. The list passed to the constructor
            is stored as-is (not copied), so ``add_state`` also mutates the
            caller's list.
        actions: list of action identifiers (stored as-is, not copied).
        transitions: dict mapping ``(state, action)`` to a list of
            ``(next_state, probability)`` pairs.
        rewards: dict mapping ``(state, action, next_state)`` to a reward.
        current_state: the state the process is currently in.
        prng: source of randomness; any object with a ``uniform(low, high)``
            method (e.g. ``np.random.RandomState``).
    """

    def __init__(self, states, actions, transitions, rewards, init_state,
                 prng=None):
        """Create an MDP.

        Args:
            states: list of state identifiers.
            actions: list of action identifiers.
            transitions: dict ``(state, action) -> [(next_state, prob), ...]``.
            rewards: dict ``(state, action, next_state) -> reward``.
            init_state: state the process starts in.
            prng: optional random generator exposing ``uniform(low, high)``.
                Defaults to the global ``np.random`` module, which is what
                this class always used before the parameter existed.
        """
        self.states = states
        self.actions = actions
        self.transitions = transitions
        self.rewards = rewards
        self.current_state = init_state
        self.prng = prng if prng is not None else np.random

    def step(self, action):
        """Apply ``action`` in the current state and move to a sampled successor.

        If no transition is registered for ``(current_state, action)`` the
        state is left unchanged (best-effort behaviour kept from the original).
        """
        key = (self.current_state, action)
        if key in self.transitions:
            self.current_state = self.sample_state(self.transitions[key])

    def add_state(self, state):
        """Register ``state`` unless it is already known."""
        if state not in self.states:
            self.states.append(state)

    def add_action(self, action):
        """Register ``action`` unless it is already known."""
        if action not in self.actions:
            self.actions.append(action)

    def add_transition(self, f, a, ts):
        """Set the successor distribution for taking action ``a`` in state ``f``.

        Args:
            f: source state.
            a: action.
            ts: list of ``(next_state, probability)`` pairs.
        """
        self.transitions[(f, a)] = ts

    def add_reward(self, f, a, t, r):
        """Set the reward ``r`` for moving from ``f`` to ``t`` via action ``a``."""
        self.rewards[(f, a, t)] = r

    def get_reward(self, f, a, t):
        """Return the reward for ``(f, a, t)``, or 0 if none was registered."""
        return self.rewards.get((f, a, t), 0)

    def sample_state(self, ts):
        """Draw a successor state from a list of ``(state, probability)`` pairs.

        A single uniform number in [0, 1) is drawn and matched against the
        cumulative probabilities, so state ``k`` is chosen when the draw falls
        in ``[sum(p[:k]), sum(p[:k+1]))``.

        Args:
            ts: non-empty list of ``(next_state, probability)`` pairs.

        Returns:
            The sampled next state.

        Raises:
            ValueError: if ``ts`` is empty.
        """
        if not ts:
            raise ValueError('cannot sample from an empty transition list')

        choice = self.prng.uniform(0, 1)
        cumulative = 0.0
        for state, probability in ts:
            cumulative += probability
            # Strict comparison: a draw of exactly 0.0 selects the first state
            # with non-zero probability (the previous implementation returned
            # the *last* state in that case).
            if choice < cumulative:
                return state
        # Only reached when the probabilities sum to less than the draw, e.g.
        # because of floating-point rounding; fall back to the last state
        # instead of indexing past the end of the list.
        return ts[-1][0]
            
    

## Generate the MDP

In [None]:
# Two-state, two-action MDP that starts in 'q0'.
states = ['q0', 'q1']
actions = ['a0', 'a1']

mdp = MDP(
    states,
    actions,
    # (state, action) -> [(next_state, probability), ...]
    {
        ('q0', 'a0'): [('q0', 0.8), ('q1', 0.2)],
        ('q0', 'a1'): [('q0', 1.0)],
        ('q1', 'a0'): [('q0', 0.1), ('q1', 0.9)],
        ('q1', 'a1'): [('q0', 1.0)],
    },
    # (state, action, next_state) -> reward; combinations not listed here
    # yield a reward of 0 from MDP.get_reward.
    {
        ('q0', 'a0', 'q0'): -1.0,
        ('q0', 'a1', 'q0'): 1.0,
        ('q1', 'a0', 'q1'): 4.0,
        ('q1', 'a1', 'q0'): 5.0,
    },
    'q0'
)

# One-hot vector for each state, as handed to the Q-learner.
state_transl = {
    'q0': [1.0, 0.0],
    'q1': [0.0, 1.0],
}

# Integer index for each action, as handed to the Q-learner.
action_transl = {
    'a0': 0,
    'a1': 1,
}

## Train Q-Learner. 

In [None]:
def smooth(a, smoothn=10):
    """Return a moving average of ``a``.

    For every index ``k`` in ``range(smoothn, len(a) - smoothn)`` the mean of
    the slice ``a[k - smoothn:k + smoothn]`` (``2 * smoothn`` elements) is
    computed.

    Args:
        a: sequence of numbers.
        smoothn: half-width of the averaging window.

    Returns:
        NumPy array with ``len(a) - 2 * smoothn`` entries (empty when ``a`` is
        too short for a single full window).
    """
    window_means = []
    for centre in range(smoothn, len(a) - smoothn):
        window = a[centre - smoothn:centre + smoothn]
        window_means.append(np.mean(window))
    return np.asarray(window_means)

In [None]:
N_ITERATIONS = 50000
# Seed for the global NumPy RNG, which drives the random action choice below
# (and the MDP's state sampling when it uses the global generator).
# Deliberately different from the 1234 used for the learner's own RandomState
# so that the two random streams are not identical.
EXPLORATION_SEED = 4321

costs = []      # value returned by train_q_function, one entry per iteration
q_values = []   # Q-values of every state, flattened, one entry per iteration

np.random.seed(EXPLORATION_SEED)

start_time = time.time()
for i in range(N_ITERATIONS):
    last_state = mdp.current_state

    # Uniformly random exploration policy.
    action = np.random.choice(mdp.actions)
    mdp.step(action)

    # Reward earned by the transition that was just taken.
    previous_reward = mdp.get_reward(last_state, action, mdp.current_state)

    q_learner.add_observation(state_transl[mdp.current_state],
                              action_transl[action], previous_reward)

    # NOTE(review): 0.01 is presumably the learning rate -- confirm against
    # QLearner.train_q_function.
    cost = q_learner.train_q_function(0.01)
    costs.append(cost)

    # Evaluate the Q-values of all states so their evolution can be plotted.
    q_values.append(np.asarray(
        [q_learner.q_function.get_q_values(state_transl[s]) for s in states]).flatten())

end_time = time.time()

# print(...) with a single pre-formatted string behaves identically on
# Python 2 and Python 3.
print('Time per 1000 iterations: %f s' % (1000.0 * (end_time - start_time) / N_ITERATIONS))
print('Mean cost: %f' % (np.mean(costs)))

## Plot progress. 

In [None]:
# --- Smoothed training cost -------------------------------------------------
smooth_n = 20
smoothed_costs = smooth(costs, smoothn=smooth_n)

fig, ax = plt.subplots()
ax.set_title('Mean Q-error (smoothed)')
ax.set_xlabel('Training iteration')
ax.set_ylabel('Cost')
# smooth() drops smooth_n points at each end of the series, so offset the
# x-axis to keep the curve aligned with the true iteration index.
ax.plot(np.arange(smooth_n, smooth_n + len(smoothed_costs)), smoothed_costs)
plt.show()


# --- Evolution of every Q-value ---------------------------------------------
fig, ax = plt.subplots()
ax.set_title('All Q-values')
ax.set_xlabel('Training iteration')
ax.set_ylabel('Q-value')
# One label per (state, action) pair, in the same order as the flattened
# Q-value rows recorded during training.
# NOTE(review): assumes get_q_values returns one value per action, ordered
# like `actions` -- confirm against QNetwork.get_q_values.
q_labels = [s + ';' + a for s in states for a in actions]
for qs, lbl in zip(np.transpose(q_values), q_labels):
    ax.plot(qs, label=lbl)
ax.legend()
plt.show()

In [None]:
# Final learned Q-values, one line per state.
# Single-argument print(...) calls produce the same output on Python 2 and 3.
print('')
for q in states:
    print('{}: {}'.format(q, q_learner.q_function.get_q_values(state_transl[q])))