# In [None]:
import gym
import itertools
import numpy as np
import sklearn.preprocessing
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor


# Discount applied to the best next-state Q-value when forming the TD target
# in Agent.q_learning.
DISCOUNT_FACTOR = 0.9


class Estimator:
    """Action-value approximator holding one SGDRegressor per discrete action.

    ``self.models[a]`` is the regressor that predicts the value of taking
    action ``a`` from a given state.
    """

    def __init__(self, env):
        """Create one regressor for each of ``env.action_space.n`` actions.

        Each regressor receives a single ``partial_fit`` call on a freshly
        reset observation with target 0 before it is stored.
        """

        def _bootstrap_model():
            regressor = SGDRegressor()
            regressor.partial_fit([env.reset()], [0])
            return regressor

        self.models = [_bootstrap_model() for _ in range(env.action_space.n)]

    def predict(self, s, a=None):
        """Return value estimates for state ``s``.

        With ``a`` given, return the scalar estimate from that action's
        model; otherwise return a list with one estimate per action, in
        action order.
        """
        if a is not None:
            return self.models[a].predict([s])[0]
        estimates = []
        for regressor in self.models:
            estimates.append(regressor.predict([s])[0])
        return estimates

    def update(self, s, a, y):
        """Run one ``partial_fit`` step of action ``a``'s model on ``(s, y)``."""
        chosen = self.models[a]
        chosen.partial_fit([s], [y])

class Agent:
    """Epsilon-greedy Q-learning agent bound to a single environment.

    The environment is used through ``reset()``, ``step(action)`` (unpacked
    as a 4-tuple ``(next_state, reward, done, info)``) and
    ``action_space.n``.
    """

    def __init__(self, env):
        self.env = env

    def policy(self, estimator, state, epsilon=0.01):
        """Sample an action epsilon-greedily from the estimator's values.

        Every action gets probability ``epsilon / n``; the action with the
        highest value in ``estimator.predict(state)`` gets an additional
        ``1 - epsilon``.

        Returns:
            The sampled action index.
        """
        n_actions = self.env.action_space.n
        action_probs = np.ones(n_actions, dtype=float) * epsilon / n_actions
        action_probs[np.argmax(estimator.predict(state))] += (1.0 - epsilon)
        return np.random.choice(n_actions, p=action_probs)

    def q_learning(self, estimator, num_episodes=1000, alpha=0.5, verbose=True):
        """Train ``estimator`` with Q-learning on this agent's environment.

        Args:
            estimator: Object providing ``predict(state)`` (one value per
                action) and ``update(state, action, target)``.
            num_episodes: Number of episodes to run.
            alpha: Currently unused by this method; kept so existing callers
                that pass it keep working.
            verbose: When true (the default, matching earlier behavior),
                print the chosen action and resulting state after every step.

        Returns:
            A list with one entry per episode: the zero-based index of the
            step on which the environment reported ``done``.
        """
        # Use the environment given to the constructor rather than relying on
        # a module-level global that happens to be called ``env``.
        env = self.env
        round_num = []
        for ith in range(1, num_episodes + 1):
            if ith % 100 == 0:
                print("\rEpisode {}/{}.".format(ith, num_episodes))
                print(np.mean(round_num))

            state = env.reset()

            for t in itertools.count():
                action = self.policy(estimator, state)
                next_state, reward, done, _ = env.step(action)

                if verbose:
                    print(action, next_state)

                # NOTE(review): the target bootstraps from next_state even
                # when ``done`` is True — confirm whether terminal
                # transitions should use the reward alone.
                q_values_next = estimator.predict(next_state)
                td_target = reward + DISCOUNT_FACTOR * np.max(q_values_next)

                estimator.update(state, action, td_target)

                if done:
                    round_num.append(t)
                    break

                state = next_state

        return round_num



# state = [position in [-1.2, 0.6], velocity in [-0.07, 0.07]]


# Build the MountainCar-v0 environment, then train an Estimator with the
# Agent's Q-learning loop using its default arguments.
env = gym.envs.make("MountainCar-v0")
agent = Agent(env)
estimator = Estimator(env)
# round_num is the list returned by q_learning (one entry per finished episode).
round_num = agent.q_learning(estimator)
            
