In [1]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import sys
import sklearn.pipeline
import sklearn.preprocessing
import env.faenv
import math

import plotting
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_approximation import RBFSampler

matplotlib.style.use('ggplot')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
env = gym.make('FA_Env-v0')

LOADING DATA...
BUILDING NETWORK...
Data Dimension:  44
ADDING REGULARIZATION...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               23040     
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               65664     
_________________________________________________________________
activation_2 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
activation_3 (Activation)    (None, 128)               0         
_____________________________________________________________

  Dense(64, W_regularizer=regularizers.l2(0.01)))



_________________________________________________________________
activation_5 (Activation)    (None, 3)                 0         
Total params: 113,667
Trainable params: 113,667
Non-trainable params: 0
_________________________________________________________________
None


In [3]:
class Estimator():
    """
    Value Function approximator. 
    """
    def __init__(self):
        self.model = SGDRegressor(learning_rate="constant")
        self.model.partial_fit([self.feature_extractor(env.reset(), 0, 0)], [0]) # initialize to avoid error
        
        
    def featurize(self, features):
        return np.array(features)
    

    def feature_extractor(self, state, bet_amount, bet_team):
        """
        Returns the featurized representation for a state.
        """        
        match, predictions, odds, cash = state
        features = [bet_amount, cash, odds[0], odds[1], odds[2], predictions[0], predictions[1], predictions[2]]
        return self.featurize(features)
    
    def predict(self, s):
        """
        Makes value function predictions.
        
        Args:
            s: state to make a prediction for
            a: (Optional) action to make a prediction for
            
        Returns
            If an action a is given this returns a single number as the prediction.
            If no action is given this returns a vector or predictions for all actions
            in the environment where pred[i] is the prediction for action i.
        """
        match, predictions, odds, cash = s
        predictions = np.zeros((3, math.floor(cash+1)))
        for bet_amount in np.arange(cash + 1):
            for bet_team in range(3):
                features = self.feature_extractor(s, bet_amount, bet_team)
                prediction = self.model.predict([features])[0]
                predictions[bet_team, int(bet_amount)] = prediction
        
        return predictions

    def update(self, s, a, y):
        """
        Updates the estimator parameters for a given state and action towards
        the target y.
        """
        bet_amount, bet_team = a
        features = self.feature_extractor(s, bet_amount, bet_team)
        self.model.partial_fit([features], [y])

In [4]:
def make_epsilon_greedy_policy(estimator, epsilon, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon.
    
    Args:
        estimator: An estimator that returns q values for a given state
        epsilon: The probability to select a random action . float between 0 and 1.
        nA: Number of actions in the environment.
    
    Returns:
        A function that takes the observation as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.
    
    """
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / (nA[0] * nA[1])
        q_values = estimator.predict(observation)
        best_action = np.argwhere(A.max() == A)[0]
        print(best_action)
        A[best_action[0], best_action[1]] += (1.0 - epsilon)
        return A
    return policy_fn

In [5]:
def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    """
    Q-Learning algorithm for fff-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    
    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance the sample a random action. Float betwen 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor
    
    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))    
    
    for i_episode in range(num_episodes):
        
        # The policy we're following
        bet_amount_space, bet_team_space = env.action_space.spaces
        policy = make_epsilon_greedy_policy(
            estimator, epsilon * epsilon_decay**i_episode, (int(bet_amount_space.n), int(bet_team_space.n)))
        
        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        
        # Reset the environment and pick the first action
        state = env.reset()
        
        # Only used for SARSA, not Q-Learning
        next_action = None
        
        # One step in the environment
        for t in itertools.count():
                        
            # Choose an action to take
            # If we're using SARSA we already decided in the previous step
            if next_action is None:
                action_probs = policy(state)
                print(np.sum(action_probs))
                action = np.random.choice(np.arange(action_probs.size), p=action_probs.ravel())
                action = np.unravel_index(action, dims = action_probs.shape)
                
            else:
                action = next_action
            
            # Take a step
            next_state, reward, done, _ = env.step(action)
    
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            
            # TD Update
            q_values_next = estimator.predict(next_state)
            
            # Use this code for Q-Learning
            # Q-Value TD Target
            td_target = reward + discount_factor * np.max(q_values_next)
            
            # Use this code for SARSA TD Target for on policy-training:
            # next_action_probs = policy(next_state)
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)             
            # td_target = reward + discount_factor * q_values_next[next_action]
            
            # Update the function approximator using our target
            estimator.update(state, action, td_target)
            
            if i_episode % 10 == 0 and t % 10 == 0:
                print("\rStep {} @ Episode {}/{} ({})".format(t, i_episode + 1, num_episodes, last_reward), end="")
                
            if done:
                break
                
            state = next_state
    
    return stats

In [None]:
estimator = Estimator()



In [None]:
# Note: For the Mountain Car we don't actually need an epsilon > 0.0
# because our initial estimate for all states is too "optimistic" which leads
# to the exploration of all states.
stats = q_learning(env, estimator, 100, epsilon=0.0)



[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
Step 0 @ Episode 1/100 (0.0)[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
Step 10 @ Episode 1/100 (0.0)[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))
(0, 0)
[0 0]
1.0
Tuple(Discrete(51), Discrete(3))


In [38]:
# plotting.plot_cost_to_go_mountain_car(env, estimator)
# plotting.plot_episode_stats(stats, smoothing_window=25)