In [6]:
import numpy as np
from math import sqrt

In [116]:
class FreshReplay:
    """Replay buffer that remembers only the most recently added experience."""

    def add(self, x):
        """Store ``x``, replacing whatever experience was held before."""
        self.exp = x

    def sample(self, b, m_e):
        """Return the stored experience wrapped in a one-element list.

        ``b`` and ``m_e`` are accepted so the signature lines up with the
        other replay classes, but neither is read: the result always has
        length one. Calling this before ``add`` raises ``AttributeError``
        because ``exp`` is only created by ``add``.
        """
        latest = self.exp
        return [latest]

In [117]:
class UniformReplay:
    """Replay buffer that keeps every experience and samples them uniformly."""

    def __init__(self):
        # All experiences ever added, in insertion order.
        self.exps = []

    def add(self, x):
        """Append ``x`` to the buffer."""
        self.exps.append(x)

    def sample(self, b, m_e):
        """Draw ``b`` experiences uniformly at random, with replacement.

        ``m_e`` is accepted but never read. Indices come from numpy's global
        random state via ``np.random.randint``.
        """
        pool = self.exps
        picks = np.random.randint(0, len(pool), b)
        return [pool[k] for k in picks]

In [118]:
class PrioritisedReplay:
    """Rank-based prioritised replay buffer.

    Experiences are stored as ``(error, value)`` tuples. On every ``sample``
    call the buffer is sorted by descending error, and the entry at rank
    ``r`` (1-based) is drawn with probability proportional to
    ``(1 / r) ** alpha``.
    """

    # Error given to a freshly added experience, before it has been sampled.
    INITIAL_ERROR = 10

    def __init__(self, alpha=0.7):
        """Create an empty buffer.

        Args:
            alpha: Exponent applied to the rank weights. ``0`` gives uniform
                sampling; larger values favour high-error entries more.
        """
        self.exps = []
        # Honour the caller's value (previously hard-coded to 0.7).
        self.alpha = alpha

    def add(self, x):
        """Append ``x`` with the initial error ``INITIAL_ERROR``."""
        self.exps.append((self.INITIAL_ERROR, x))  # (error, value)

    def sample(self, b, model_estimate):
        """Draw ``b`` values by rank-based priority and refresh their errors.

        Args:
            b: Number of values to draw (with replacement).
            model_estimate: Current model output; the error of each sampled
                entry is reset to ``abs(value - model_estimate)``.

        Returns:
            List of the ``b`` sampled values.
        """
        # Highest error first; ties fall back to comparing the values.
        self.exps.sort(reverse=True)
        n = len(self.exps)

        weights = np.array([(1 / rank) ** self.alpha for rank in range(1, n + 1)])
        weights /= np.sum(weights)

        indices = np.random.choice(n, p=weights, size=b)
        sampled = [self.exps[i][1] for i in indices]

        # Update the stored error of every entry that was drawn.
        for i in indices:
            value = self.exps[i][1]
            self.exps[i] = (abs(value - model_estimate), value)
        return sampled

In [119]:
class Model:
    """Scalar estimator that is nudged toward the mean of each training batch."""

    def __init__(self, lr):
        # The estimate starts at zero; ``lr`` scales every update step.
        self.estimate = 0
        self.lr = lr

    def output(self):
        """Return the current estimate."""
        return self.estimate

    def train(self, xs):
        """Move the estimate by ``lr`` times the mean of ``estimate - x`` over ``xs``.

        An empty ``xs`` raises ``ZeroDivisionError`` because the mean is
        taken over ``len(xs)``.
        """
        summed_error = 0
        for sample in xs:
            summed_error = summed_error + (self.estimate - sample)
        mean_error = summed_error / len(xs)
        self.estimate = self.estimate - self.lr * mean_error

In [152]:
# Target values 1/sqrt(1), 1/sqrt(2), ..., 1/sqrt(25), fed to the replay one at a time.
dataset = [1 / sqrt(k) for k in range(1, 26)]
# Training steps run after each dataset item is added.
timesteps = 10_000
# Additional training steps run once the whole dataset has been added.
t_max_extra = 20 * timesteps

In [153]:
def get_estimates(replay, model_lr=0.01, batch_size=1):
    """Train a fresh ``Model`` from ``replay`` and record its estimate after every step.

    Each item of the module-level ``dataset`` is added to ``replay`` in turn,
    followed by ``timesteps`` training steps; once the dataset is exhausted a
    further ``t_max_extra`` steps are run.

    Args:
        replay: Object exposing ``add(x)`` and ``sample(batch_size, estimate)``.
        model_lr: Learning rate handed to ``Model``.
        batch_size: First argument passed to ``replay.sample``.

    Returns:
        List of estimates: the initial one, then one entry per training step.
    """
    model = Model(model_lr)
    # history[-1] is always the model's latest output; history[0] is untrained.
    history = [model.output()]

    def run_steps(count):
        # One step = sample a batch, train on it, record the new estimate.
        for _ in range(count):
            batch = replay.sample(batch_size, history[-1])
            model.train(batch)
            history.append(model.output())

    for item in dataset:
        replay.add(item)
        run_steps(timesteps)

    run_steps(t_max_extra)
    return history

In [154]:
# Learning rate passed to get_estimates (and on to Model) for every replay strategy.
model_lr = 0.1

In [155]:
# Batch size passed to each replay's sample(); FreshReplay ignores it and returns one item.
batch_size = 50

In [None]:
# Run the same experiment once per replay strategy, each with a brand-new buffer.
# NOTE(review): no random seed is set in the visible cells, so the uniform and
# prioritised curves will differ from run to run.
fresh_estimates = get_estimates(
    replay=FreshReplay(), model_lr=model_lr, batch_size=batch_size
)
uniform_estimates = get_estimates(
    replay=UniformReplay(), model_lr=model_lr, batch_size=batch_size
)
p_estimates = get_estimates(
    replay=PrioritisedReplay(), model_lr=model_lr, batch_size=batch_size
)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
plt.figure(figsize=(50, 25))

# Ground truth: dataset[i] is plotted at the step where item i is added to the replay.
plt.plot([i * timesteps for i in range(len(dataset))], dataset,
         color="r", linewidth=4, label="Ground truth (dataset)")

# One curve per replay strategy; the x axis is the training-step index.
plt.plot(range(len(fresh_estimates)), fresh_estimates,
         color="b", linewidth=2, label="FreshReplay")
plt.plot(range(len(uniform_estimates)), uniform_estimates,
         color="g", linewidth=2, label="UniformReplay")
plt.plot(range(len(p_estimates)), p_estimates,
         color="purple", linewidth=2, label="PrioritisedReplay")

# Large fonts so labels stay legible on the 50x25 inch canvas.
plt.xlabel("Training step", fontsize=32)
plt.ylabel("Model estimate", fontsize=32)
plt.title("Model estimate by replay strategy", fontsize=40)
plt.tick_params(labelsize=24)
plt.legend(fontsize=32)
plt.show()