In [None]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import sys
import sklearn.pipeline
import sklearn.preprocessing
import env.faenv
import math

import plotting
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import SVR

matplotlib.style.use('ggplot')

In [None]:
# Build the betting environment through gym's registry.
# NOTE: this rebinds the name `env`, which `import env.faenv` in the import
# cell had bound to the `env` package. Re-running the import cell afterwards
# turns `env` back into the package, so re-run this cell as well before using
# the environment (Estimator() and q_learning() both call methods on it).
# Presumably importing env.faenv is what registers 'FA_Env-v0' -- TODO confirm.
env = gym.make('FA_Env-v0')

In [None]:
class Estimator():
    """
    Linear action-value (Q) function approximator.

    A single ``SGDRegressor`` maps a feature vector built from a state
    ``(predictions, odds, cash)`` and an action ``(bet_amount, bet_team)``
    to an estimated return.

    Note: ``__init__`` reads the notebook-level ``env`` variable, so the
    environment must exist before an instance is created.
    """

    # Number of outcomes that can be bet on; fixes the width of the Q-table
    # returned by ``predict`` and of the one-hot team encoding.
    NUM_TEAMS = 3

    def __init__(self):
        self.model = SGDRegressor(learning_rate="constant")
        # An unfitted SGDRegressor cannot predict, so fit once on a dummy
        # sample (initial state, zero bet, target 0) to create the weights.
        self.model.partial_fit([self.feature_extractor(env.reset(), 0, 0)], [0])

    def featurize(self, features):
        """Convert a flat sequence of numeric features into a 1-D float array."""
        return np.array(features, dtype=float)

    def feature_extractor(self, state, bet_amount, bet_team):
        """
        Returns the featurized representation for a (state, action) pair.

        Args:
            state: tuple ``(predictions, odds, cash)`` where ``predictions``
                and ``odds`` are indexable with 0..2.
            bet_amount: stake of the action.
            bet_team: index (0..2) of the outcome the stake is placed on.

        Returns:
            1-D float array laid out as
            ``[bet_amount, cash, odds[0..2], predictions[0..2], one_hot(bet_team)]``.
        """
        predictions, odds, cash = state
        # Without the team in the feature vector every team would get the same
        # Q-value for a given stake, so the choice of team could never be learned.
        team_one_hot = [1.0 if team == bet_team else 0.0
                        for team in range(self.NUM_TEAMS)]
        features = [bet_amount, cash,
                    odds[0], odds[1], odds[2],
                    predictions[0], predictions[1], predictions[2]]
        return self.featurize(features + team_one_hot)

    def predict(self, s):
        """
        Makes value function predictions for every action available in a state.

        Args:
            s: state ``(predictions, odds, cash)`` to make predictions for.

        Returns:
            Array of shape ``(floor(cash + 1), 3)`` where ``pred[amount, team]``
            is the estimated value of betting ``amount`` on ``team``. Every
            row is evaluated, including the one for staking all of the cash.
        """
        _, _, cash = s
        n_amounts = int(math.floor(cash + 1))
        predictions = np.zeros((n_amounts, self.NUM_TEAMS))

        # Row-major (amount, team) order so the flat model output reshapes
        # directly into the Q-table.
        rows = [self.feature_extractor(s, amount, team)
                for amount in range(n_amounts)
                for team in range(self.NUM_TEAMS)]
        if rows:
            # One batched call instead of one model call per action.
            flat = np.asarray(self.model.predict(np.vstack(rows)), dtype=float)
            predictions[:] = flat.reshape(n_amounts, self.NUM_TEAMS)

        return predictions

    def update(self, s, a, y):
        """
        Updates the estimator parameters for a given state and action towards
        the target y.

        Args:
            s: state ``(predictions, odds, cash)``.
            a: action ``(bet_amount, bet_team)``.
            y: scalar regression target (e.g. a TD target).
        """
        bet_amount, bet_team = a
        features = self.feature_extractor(s, bet_amount, bet_team)
        self.model.partial_fit([features], [y])

In [None]:
def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment. States are ``(predictions, odds, cash)``
            tuples and actions are ``(bet_amount, bet_team)`` tuples.
        estimator: Action-Value function estimator exposing ``predict(state)``
            (Q-table indexed ``[bet_amount, bet_team]``) and
            ``update(state, action, target)``.
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor, i.e.
            episode ``i`` (0-based) explores with ``epsilon * epsilon_decay**i``.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    for i_episode in range(num_episodes):

        # Reward of the previous episode, shown in the progress line. There is
        # no previous episode on the first pass, so report 0.0 explicitly
        # instead of letting index -1 wrap around to the last slot.
        last_reward = stats.episode_rewards[i_episode - 1] if i_episode > 0 else 0.0

        # Exploration rate for this episode (constant when epsilon_decay == 1.0)
        episode_epsilon = epsilon * epsilon_decay ** i_episode

        state = env.reset()

        # One step in the environment
        for t in itertools.count():
            _, _, cash = state

            if np.random.rand() < episode_epsilon:
                # Explore: uniform stake in 0..floor(cash) on a uniform team.
                # floor() keeps the range well defined for non-integer cash.
                max_stake = int(math.floor(cash))
                action = (np.random.randint(max_stake + 1), np.random.randint(3))
            else:
                # Exploit: pick a maximal Q-value, breaking ties at random
                q_values = np.asarray(estimator.predict(state))
                best = np.argwhere(q_values == q_values.max())
                chosen = best[np.random.randint(best.shape[0])]
                action = (int(chosen[0]), int(chosen[1]))

            # Take a step
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Q-Learning TD target. A terminal transition has no future
            # return, so the target is the immediate reward only.
            if done:
                td_target = reward
            else:
                q_values_next = np.asarray(estimator.predict(next_state))
                # An empty Q-table means no action is available; treat its value as 0
                bootstrap = np.max(q_values_next) if q_values_next.size else 0.0
                td_target = reward + discount_factor * bootstrap

            # Update the function approximator using our target
            estimator.update(state, action, td_target)

            print("\rStep {} @ Episode {}/{} ({})".format(t, i_episode + 1, num_episodes, last_reward), end="")
            sys.stdout.flush()

            if done:
                break

            state = next_state

    return stats

In [None]:
# Create the Q-function approximator. The constructor calls env.reset() on the
# notebook-level `env` and performs one partial_fit, so the environment cell
# must have been run before this one.
estimator = Estimator()

In [None]:
# Train for 200 episodes with an epsilon-greedy behaviour policy: with
# epsilon=0.3 roughly 30% of the actions are drawn at random for exploration
# and the rest are greedy with respect to the current estimator.
stats = q_learning(env, estimator, num_episodes=200, epsilon=0.3)

In [None]:
# Plot the statistics recorded during training (episode lengths and rewards),
# passing smoothing_window=25 to the project's plotting helper.
plotting.plot_episode_stats(stats, smoothing_window=25)