<h1>Temporal Difference model</h1>

<h3>Imports</h3>

In [1]:
from matplotlib import pyplot as plt
from random import *
import numpy as np

##from __future__ import division

from bokeh.core.properties import Any, Dict, Instance, String
from bokeh.models import ColumnDataSource, Div, Column,  LayoutDOM
from bokeh.plotting import figure, output_file, show
from bokeh.io import show, output_notebook


<h3>Discretisation</h3>

In [2]:
# Time grid shared by every trial: N samples covering a total duration T.
N = 22     # Number of discrete time steps in one trial
T = 5      # Total duration of one trial (same unit as h)
h = T / N  # Duration of a single time step

<h3>Parameters</h3>

In [3]:
# Hyper-parameters of the temporal-difference learner.
trials = 400   # Number of training trials (iterations of the outer learning loop)
alpha = 0.005  # Learning rate: scales every weight update
gamma = 0.98   # Discount factor applied to the current prediction in the TD term
lamb = 0.9     # Decay factor applied to the eligibility traces at each time step

<h3>Variables initialization</h3>

In [4]:
# Number of stimuli. The original note says to keep k = 1 when no stimulus is
# presented as well.
k = 2

# Per-stimulus quantities: one row of N integer zeros for each of the k stimuli.
# Every row is a separate list object, so rows can be modified independently.
x = [[0] * N for _ in range(k)]        # State vectors of the stimuli, between 0.5 to 2 seconds
w = [[0] * N for _ in range(k)]        # Weight vector of each stimulus
pl = [[0] * N for _ in range(k)]       # Reward prediction contributed by each stimulus
e = [[0] * N for _ in range(k)]        # Eligibility trace of each stimulus
delta_w = [[0] * N for _ in range(k)]  # Weight increments (the last time step is ignored)

# Per-time-step quantities: one integer zero for each of the N time steps.
r = [0] * N      # Reward
P = [0] * N      # Total reward prediction (summed over stimuli)
TD = [0] * N     # Temporal difference
delta = [0] * N  # Prediction error

<h3> Learning progress </h3>

In [5]:
# Trial layout, identical for every trial (hoisted out of the loop so that `s`
# is defined for later cells even when `trials` is 0).
s = [5, 15]       # Onset time step of each stimulus (s[i] belongs to stimulus i)
reward_step = 20  # Time step at which the reward is delivered

for j in range(trials):
    # Per-trial state. The weights `w` are deliberately NOT reset: they carry
    # the learning from one trial to the next.
    x = [[0 for t in range(N)] for i in range(k)]  # Stimulus representations
    r = [0 for t in range(N)]                      # Reward signal
    e = [[0 for t in range(N)] for i in range(k)]  # Eligibility traces
    q = [0 for i in range(k)]                      # Index of the active component of each x[i]
    r[reward_step] = 1

    for t in range(N):
        P[t] = 0
        for i in range(k):
            # Decay the trace and add the representation of the PREVIOUS time
            # step (x[i] is only advanced below). At t = 0 both are all zeros,
            # so no special case is needed.
            e[i] = np.add(np.multiply(e[i], lamb), x[i])

            # Advance the stimulus representation: a single 1 appears at index 0
            # on the onset step and then moves one slot further at each step.
            # Stimuli without an onset in `s` are simply never presented.
            if i < len(s):
                if t == s[i]:
                    x[i][q[i]] = 1
                elif t > s[i] and 1 in x[i]:
                    x[i][q[i]] = 0
                    # Explicit bounds check instead of a bare `except: pass`:
                    # once the marker reaches the last slot it disappears.
                    if q[i] + 1 < len(x[i]):
                        x[i][q[i] + 1] = 1
                        q[i] += 1

            # Reward prediction contributed by stimulus i, using the weights as
            # they were at the start of this time step.
            pl[i] = np.multiply(x[i], w[i])
            P[t] += np.sum(pl[i])

        # Temporal difference between consecutive predictions. TD[0] is never
        # written and keeps its initial value.
        if t >= 1:
            TD[t] = P[t-1] - gamma * P[t]  # <0 when predicts a reward at time step t+1

        # Prediction error: r[t] + gamma * P[t] - P[t-1].
        delta[t] = r[t] - TD[t]

        # Weight update for every stimulus, proportional to its eligibility trace.
        for i in range(k):
            delta_w[i] = np.multiply(alpha * delta[t], e[i])
            w[i] = np.add(w[i], delta_w[i])

<h4>Plots</h4>

In [6]:
# Time axis: one sample per entry of `delta`, spaced h apart. It is built from
# an integer range because np.arange with a non-integer step can return one
# element more or fewer than expected, which would no longer match `delta`.
axisx = np.arange(len(delta)) * h
output_notebook()
# NOTE(review): the title says "TD(0)" while the learning loop uses eligibility
# traces decayed by `lamb`, and the x axis is labelled "Time step" although the
# values are multiples of h. Confirm the intended labels.
p = figure(title="TD(0) Model", x_axis_label="Time step", y_axis_label="Prediction error", y_range=[0, 1.5], plot_width=400, plot_height=300)
# Prediction error of the last trial that was run.
p.line(axisx, delta, line_width=4)
# Red marker: reward delivery (time step 20 in the learning loop).
p.circle(20*h, 1, size=4, line_color="red", fill_color="red")
# Green markers: onset of each simulated stimulus. Slicing to k plots only the
# stimuli that were simulated and copes with `s` holding fewer than k onsets.
for onset in s[:k]:
    p.circle(onset*h, 1, size=4, line_color="#56BA1B", fill_color="#56BA1B")
show(p)