# Q-learning tryouts on Santa's uncertain bags

## States 

A state is characterized by a matrix of size `(N_BAGS, N_TYPES)`. For example, `s[0,:]=[1,0,1,0,0,0,0,0,0]`. The initial state is when the matrix is null or a customly defined. Terminal states are defined by state's score. 

How many state there are? There are at most `N_BAGS * 9^10` states.


## Actions

Action is to add a toy following the list of available toys.


## Rewards

Action reward can be defined by the score of the bag where a toy has been added.


## Q-learning: Off-Policy Temporal Difference Control

In this algorithm we estimate action-value function $Q(s,a)$ as :
$$
Q(S_t,A_t) \leftarrow Q(S_t,A_t) + \alpha \left[ R_{t+1} + \gamma \max_{a} Q(S_{t+1}, a) - Q(S_t,A_t) \right], \, Q(\cal{S}^{+},a)=0
$$

**Algorithm**
<br>
<div style="background-color: #aaaaaa; padding: 10px; width: 75%; border: solid black; border-radius: 5px;">

    Initialize $Q(s, a)$, for all $s \in \cal{S}$, $a \in \cal{A}(s)$, arbitrarily, and $Q(\text{terminal-state}, \cdot) = 0$<br>
    Repeat (for each episode):<br>
    &emsp;Initialize $S$<br>
    &emsp;Choose $A$ from $S$ using policy derived from $Q$ (e.g., $\epsilon$-greedy)<br>
    &emsp;Repeat (for each step of episode):<br>
    &emsp;&emsp;Take action $A$, observe $R$, $S'$<br>
    &emsp;&emsp;$Q(S,A) \leftarrow Q(S,A) + \alpha \left[ R + \gamma \max_{a}Q(S', a) - Q(S,A) \right]$<br>
    &emsp;&emsp;$S \leftarrow S'; \, A \leftarrow A';$<br>
    &emsp;until $S$ is terminal
</div>

In [2]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [3]:
from time import time
from copy import deepcopy
import numpy as np
np.random.seed(2016)

import logging
logging.getLogger().setLevel(logging.DEBUG)

In [4]:
import sys
sys.path.append('../common')
from utils import weight3 as weight_fn, weight_by_index
from utils import bag_weight, score
from utils import MAX_WEIGHT, AVAILABLE_GIFTS, GIFT_TYPES, N_TYPES, N_BAGS

In [26]:
REJECTED_BAGS_THRESHOLD = 0.1
NEGATIVE_REWARD = -1000
POSITIVE_REWARD = 1000

def step_reward(rejected):    
    return 0.0 if rejected < REJECTED_BAGS_THRESHOLD else -rejected*10

In [14]:
initial_state = np.zeros((N_BAGS, N_TYPES), dtype=np.uint8)
alpha = 0.72
goal_weight = MAX_WEIGHT * N_BAGS * alpha

print goal_weight

36000.0


In [15]:
score(initial_state)

0.0

In [11]:
example_action = (0, 0)

In [21]:
def take_action(state, action):
    new_state = state.copy()
    new_state[action[0], action[1]] += 1
    return new_state

def is_available(state, available_gifts, gift_types=GIFT_TYPES):
    sum_gifts = np.sum(np.array(state), axis=0)
    for v, gift_type in zip(sum_gifts, gift_types):
        if available_gifts[gift_type] - v < 0:
            return False
    return True

def update_available_gifts(available_gifts, state, gift_types=GIFT_TYPES):
    sum_gifts = np.sum(np.array(state), axis=0)
    for v, gift_type in zip(sum_gifts, gift_types):
        assert available_gifts[gift_type] - v >= 0, "Found state is not available : {}, {}".format(state, available_gifts)
        available_gifts[gift_type] = available_gifts[gift_type] - v

In [40]:
new_state = take_action(initial_state, example_action)
new_score, rejected = score(new_state, return_rejected=True)
print new_state, new_score, is_available(new_state, AVAILABLE_GIFTS), step_reward(rejected)

[[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]] 1.91990057321 True 0.0
(15, 6)


In [39]:
from collections import defaultdict

def state_to_str(state):
    return state.__str__()

In [45]:
heapq??

In [43]:
import heapq

heap = []

heapq.heappush(heap, [(0, 0), 1])
heapq.heappush(heap, [(0, 1), 10])
heapq.heappush(heap, [(12, 5), 3])


# sorted([[(0, 0), 1], [(0, 1), 10], [(12, 5), 3]], key=itemgetter(1), reverse=True)

In [44]:
print heap

[[(0, 0), 1], [(0, 1), 10], [(12, 5), 3]]


In [37]:
from operator import itemgetter

def get_policy_action(state, action_value_function, epsilon=0.1):
    state_key = state_to_str(state)
    u = np.random.rand()
    if state_key in action_value_function and u > epsilon:
        
        actions_values = action_value_function[state_key]
        actions_values = sorted(actions_values, key=itemgetter(1), reverse=True)
        return actions_values[0][0]    
    else:
        # Arbitrary initialization
        bag_id = np.random.randint(N_BAGS)
        toy_id = np.random.randint(N_TYPES)
        action = (bag_id, toy_id)
        value = np.random.rand()
        action_value_function[state_key].append([action, value])
        return action

In [20]:
def q_learning(goal_weight, 
               available_gifts,
               initial_state=None
               n_episodes=10, alpha=0.75, gamma=0.7, epsilon=0.001, action_value_function=None):
    
    for i in range(n_episodes):

        episode_length = N_BAGS * N_TYPES * 10
        state = np.zeros((N_BAGS, N_TYPES)) if initial_state is None else initial_state        
        action = get_policy_action(state, action_value_function, epsilon=epsilon)
        state_score = score(state)
        is_terminal = state_score > goal_weight and is_available(state, available_gifts)
        while not is_terminal:
            
            episode_length -= 1 
            if episode_length < 0:
                logging.warn('Episode length is reached, but state score is still : %f / %f' % (state_score, goal_weight))
                break
            
            #print "state, action : ", state, action
            current_reward = 0 
            new_state = take_action(state, action)
            new_score, rejected = score(new_state, return_rejected=True)
            
            if not is_available(new_state, available_gifts):                
                current_reward = NEGATIVE_REWARD
                is_terminal = True
            elif new_score >= goal_weight:
                current_reward = POSITIVE_REWARD
                is_terminal = True
            elif new_score < goal_weight:
                current_reward = step_reward(rejected)
            else:
                raise Exception("Unclassified state: {}, score={}".format(new_state, new_score))
                            
            # Update Q(s,a)
            actions_values = 
            v = action_value_function[y, x, action_index]
            nv = np.max(action_value_function[ny, nx, :])
            t = alpha * (current_reward + gamma * nv - v) 
            action_value_function[y, x, action_index] += t            
            
            state = new_state
            action = get_policy_action(state, action_value_function, epsilon=epsilon)                        
                
    return policy, action_value_function

SyntaxError: invalid syntax (<ipython-input-20-a1b52de73bb4>, line 4)