# In [None]:
import gym
import itertools
import numpy as np
import sklearn.preprocessing
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler

# Discount factor (gamma) applied to the bootstrapped next-state value in the
# TD target computed by Agent.q_learning; 1.0 leaves future rewards undiscounted.
DISCOUNT_FACTOR = 1.0


class Estimator:
    """Action-value approximator holding one ``SGDRegressor`` per action.

    States are passed through a scaler and then a featurizer before being
    handed to the regressors. Both transformers may be supplied explicitly;
    when they are omitted the module-level ``scaler`` and ``featurizer``
    objects are used, which is what the original implementation always did.
    """

    def __init__(self, env, scaler=None, featurizer=None):
        """Create and prime one regressor for every action of ``env``.

        Args:
            env: Environment exposing ``reset()`` and ``action_space.n``.
            scaler: Optional object with ``transform(rows)``; defaults to the
                module-level ``scaler``.
            featurizer: Optional object with ``transform(rows)``; defaults to
                the module-level ``featurizer``.
        """
        self._scaler = scaler
        self._featurizer = featurizer

        # The initial observation is only needed to give every model a first
        # partial_fit call, so reset and featurize once rather than per action.
        initial_features = self.featurize_state(env.reset())

        self.models = []
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate="constant")
            model.partial_fit([initial_features], [0])
            self.models.append(model)

    def featurize_state(self, state):
        """Return the feature vector for a single ``state``.

        The state is wrapped in a one-row batch, scaled, featurized, and the
        single resulting row is returned.
        """
        # Inside this method the bare names refer to the module globals, which
        # serve as the fallback when no transformer was injected.
        active_scaler = self._scaler if self._scaler is not None else scaler
        active_featurizer = (
            self._featurizer if self._featurizer is not None else featurizer
        )
        scaled = active_scaler.transform([state])
        return active_featurizer.transform(scaled)[0]

    def predict(self, s, a=None):
        """Predict action values for state ``s``.

        Args:
            s: Raw (unfeaturized) state.
            a: Optional action index.

        Returns:
            A list with one prediction per model when ``a`` is None,
            otherwise the single prediction of ``self.models[a]``.
        """
        features = self.featurize_state(s)
        if a is None:
            return [model.predict([features])[0] for model in self.models]
        return self.models[a].predict([features])[0]

    def update(self, s, a, y):
        """Run one ``partial_fit`` step of model ``a`` towards target ``y``.

        Args:
            s: Raw (unfeaturized) state.
            a: Index of the action whose model is updated.
            y: Regression target for that state/action pair.
        """
        features = self.featurize_state(s)
        self.models[a].partial_fit([features], [y])

class Agent:
    """Epsilon-greedy Q-learning agent for a Gym-style environment.

    The environment must expose ``action_space.n``, ``reset()`` and a
    ``step(action)`` that returns ``(next_state, reward, done, info)``.
    """

    def __init__(self, env):
        self.env = env

    def policy(self, estimator, state, epsilon=0.0):
        """Sample an action epsilon-greedily from the estimator's values.

        Every action receives probability ``epsilon / n``; the action with the
        highest predicted value receives the remaining ``1 - epsilon`` on top.

        Args:
            estimator: Object whose ``predict(state)`` returns one value per
                action.
            state: Current environment state.
            epsilon: Exploration probability; 0.0 is purely greedy.

        Returns:
            The index of the sampled action.
        """
        num_actions = self.env.action_space.n
        action_probs = np.full(num_actions, epsilon / num_actions, dtype=float)
        action_probs[np.argmax(estimator.predict(state))] += 1.0 - epsilon
        return np.random.choice(num_actions, p=action_probs)

    def q_learning(self, estimator, num_episodes=100, epsilon=0.0,
                   discount_factor=None):
        """Train ``estimator`` with Q-learning for ``num_episodes`` episodes.

        Args:
            estimator: Object providing ``predict(state)`` and
                ``update(state, action, target)``.
            num_episodes: Number of episodes to run.
            epsilon: Exploration probability passed to ``policy``. The default
                of 0.0 keeps the original purely greedy behaviour.
            discount_factor: Discount applied to the best next-state value.
                When None, the module-level ``DISCOUNT_FACTOR`` is used.

        Returns:
            A list with one entry per episode: the zero-based index of the
            step on which the environment reported ``done``.
        """
        if discount_factor is None:
            discount_factor = DISCOUNT_FACTOR

        round_num = []
        for ith in range(1, num_episodes + 1):
            print("\rEpisode {}/{}.".format(ith, num_episodes))
            # Nothing to average before the first episode has finished;
            # np.mean([]) would print nan and emit a RuntimeWarning.
            if round_num:
                print(np.mean(round_num))

            state = self.env.reset()

            for t in itertools.count():
                action = self.policy(estimator, state, epsilon)
                next_state, reward, done, _ = self.env.step(action)

                # NOTE(review): the target bootstraps from next_state even
                # when `done` is True; confirm whether terminal transitions
                # should use the bare reward instead.
                q_values_next = estimator.predict(next_state)
                td_target = reward + discount_factor * np.max(q_values_next)

                estimator.update(state, action, td_target)

                if done:
                    round_num.append(t)
                    break

                state = next_state

        return round_num


# MountainCar observation: [position in [-1.2, 0.6], velocity in [-0.07, 0.07]]
env = gym.envs.make("MountainCar-v0")

# Sample observations to fit the scaler and the RBF feature maps.
observation_examples = np.array(
    [env.observation_space.sample() for _ in range(10000)]
)
scaler = sklearn.preprocessing.StandardScaler().fit(observation_examples)

# Four RBF samplers with different gammas, 100 components each, concatenated.
_rbf_specs = [("rbf1", 5.0), ("rbf2", 2.0), ("rbf3", 1.0), ("rbf4", 0.5)]
featurizer = sklearn.pipeline.FeatureUnion(
    [
        (name, RBFSampler(gamma=gamma, n_components=100))
        for name, gamma in _rbf_specs
    ]
)
featurizer.fit(scaler.transform(observation_examples))

# Train with the default settings of Agent.q_learning.
agent = Agent(env)
estimator = Estimator(env)
round_num = agent.q_learning(estimator)