## Step 1: Define States and Actions

The states are: Dead, plus one state per body weight from 30 kg to 120 kg in 10 kg steps (30 kg, 40 kg, ..., 120 kg).

The actions are: eating meat (action 0) and eating vegetables (action 1).

In [1]:
from enum import Enum
class State(Enum):
    """States of the decision process: DEAD plus one state per body weight.

    The member values are consecutive integers starting at 0, so they can be
    used directly as indices into the value and policy arrays, and
    ``State(index)`` maps an array index back to its state.
    """
    DEAD = 0
    W30 = 1    # weight = 30 kg
    W40 = 2
    W50 = 3
    W60 = 4
    W70 = 5
    W80 = 6
    W90 = 7
    W100 = 8
    W110 = 9
    W120 = 10  # weight = 120 kg


# Derived from the enum so the count cannot drift if states are added or removed.
numberOfStates = len(State)

# for actions assume action 0 is eating meat and action 1 is eating vegetables

## Step 2:  Define Health Reward Function

health reward function

In [2]:
def get_health_value(state):
    """Return the health score of a state.

    W60 scores 100.0, W50 scores 50.0, and W70/W80/W90 score 10.0 each.
    Every other state, including DEAD, scores 0.
    """
    health_scores = {
        State.W50: 50.0,
        State.W60: 100.0,
        State.W70: 10.0,
        State.W80: 10.0,
        State.W90: 10.0,
    }
    # States missing from the table (DEAD and the remaining weights) score 0.
    return health_scores.get(state, 0)

social reward function

In [3]:
def get_social_value(state):
    """Return the social score of a state.

    W40 and W50 score 100.0 and W60 scores 50.0. Every other state,
    including DEAD, scores 0.
    """
    if state in (State.W40, State.W50):
        return 100.0
    return 50.0 if state == State.W60 else 0

total reward function

In [4]:
def get_total_reward(state, w1=1, w2=1):
    """Return the weighted sum of the health and social scores of a state.

    w1 scales the health score and w2 scales the social score.
    """
    health_part = w1 * get_health_value(state)
    social_part = w2 * get_social_value(state)
    return health_part + social_part

## Value Iterations (值迭代）
Initialize variables

Feel free to play with these variables with different values

In [5]:
# Settings read as module-level globals by the iteration routines in later cells.
maxIter = 100 # upper bound on the number of iterations each loop may run
epsilon = 1e-5 # convergence threshold: a loop stops once the largest change is not above this
gamma = 0.5  # discount factor applied to the value of the next state
w1 = 1   # weight for health value
w2 = 1   # weight for social value

Find best action to take at each iteration and the reward it generated

The first return value is the action \
The second return value is the updated value

In [6]:
def find_best_action(stateValues, index, w1, w2):
    """Return the greedy action for one state and its backed-up value.

    Args:
        stateValues: current value estimate for every state, indexed by state value.
        index: integer index of the state to update (a valid ``State`` value).
        w1: weight of the health score in the reward.
        w2: weight of the social score in the reward.

    Returns:
        A tuple ``(action, value)`` where action 0 is eating meat (move to
        ``index + 1``) and action 1 is eating vegetables (move to
        ``index - 1``), and ``value`` is the reward of the state that is
        reached plus ``gamma`` times its current value estimate.
    """
    state = State(index)
    if state == State.DEAD:
        # DEAD collects no reward; only its own discounted value is carried over.
        return (0, gamma * stateValues[index])

    def backed_up_value(next_index):
        # Reward for arriving in the next state plus its discounted value.
        return get_total_reward(State(next_index), w1, w2) + gamma * stateValues[next_index]

    # At the two ends of the weight range only one move is considered.
    if state == State.W30:
        return (0, backed_up_value(index + 1))
    if state == State.W120:
        return (1, backed_up_value(index - 1))

    valueCai = backed_up_value(index - 1)  # eat vegetables
    valueRou = backed_up_value(index + 1)  # eat meat
    # Choose by the full backed-up value, not by the immediate reward alone;
    # otherwise the update is a one-step greedy rule rather than a Bellman backup.
    # Ties go to vegetables.
    if valueCai < valueRou:
        return (0, valueRou)
    return (1, valueCai)

Value iterations

In [7]:
import numpy as np

# define error functions
def get_value_diff(oldValues, newValues):
    """Return the largest absolute element-wise difference between two sequences.

    Only the first ``len(oldValues)`` positions are compared. If ``newValues``
    is shorter than ``oldValues``, the sentinel 10000.0 is returned instead.
    Empty input yields 0.0.
    """
    largest = 0.0
    for previous, current in zip(oldValues, newValues):
        largest = max(abs(current - previous), largest)
    if len(newValues) < len(oldValues):
        # Not every old entry has a counterpart: report a large error.
        return 10000.0
    return largest
            

def do_value_iterations():
    """Run value iteration from all-zero values, printing every sweep.

    Each sweep asks find_best_action for the action and new value of every
    state, then prints the sweep number, the new values and the policy.
    The loop stops once the largest value change is no longer above
    ``epsilon`` or after ``maxIter`` sweeps. Nothing is returned.
    """
    values = np.zeros(numberOfStates)
    policy = np.zeros(numberOfStates)
    change = 10000.0  # start above any threshold so the first sweep runs
    sweep = 0
    while change > epsilon and sweep < maxIter:
        sweep += 1
        backed_up = np.zeros(numberOfStates)
        for s in range(numberOfStates):
            best_action, best_value = find_best_action(values, s, w1, w2)
            policy[s] = best_action
            backed_up[s] = best_value
        change = get_value_diff(values, backed_up)
        values = backed_up
        print(sweep)
        print(values)
        print(policy)
        
# Run value iteration with the settings defined earlier (maxIter, epsilon, gamma, w1, w2).
do_value_iterations()
        

1
[  0. 100. 150. 150. 150. 150.  10.  10.  10.   0.   0.]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
2
[  0. 175. 225. 225. 225. 225.  85.  15.  15.   5.   0.]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
3
[  0.  212.5 262.5 262.5 262.5 262.5 122.5  52.5  17.5   7.5   2.5]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
4
[  0.   231.25 281.25 281.25 281.25 281.25 141.25  71.25  36.25   8.75
   3.75]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
5
[  0.    240.625 290.625 290.625 290.625 290.625 150.625  80.625  45.625
  18.125   4.375]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
6
[  0.     245.3125 295.3125 295.3125 295.3125 295.3125 155.3125  85.3125
  50.3125  22.8125   9.0625]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
7
[  0.      247.65625 297.65625 297.65625 297.65625 297.65625 157.65625
  87.65625  52.65625  25.15625  11.40625]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
8
[  0.       248.828125 298.828125 298.828125 298.828125 298.828125
 158.828125  88.828125  53.828125  26.328125  12.578125]
[0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
9
[  0.     

## Policy iterations (策略迭代）

Policy iteration has two main parts:
1. Policy evaluation
2. Policy improvement

`evaluate_value_given_action` is used for policy evaluation, where the action for each state is already fixed by the current policy.


In [8]:
def evaluate_value_given_action(index, action, stateValues, w1, w2):
    """Return the one-step backed-up value of a state under a fixed action.

    Action 0 (meat) moves to ``index + 1``; any other action (vegetables)
    moves to ``index - 1``. DEAD returns its own discounted value. Meat at
    W120 and vegetables at W30 return 0. Otherwise the result is the reward
    of the state that is reached plus ``gamma`` times its current value.
    """
    if action == 0:
        step, edge = 1, State.W120
    else:
        step, edge = -1, State.W30

    current = State(index)
    if current == State.DEAD:
        return stateValues[index] * gamma
    if current == edge:
        # The chosen action would leave the weight range; its value is fixed at 0.
        return 0

    target = index + step
    return get_total_reward(State(target), w1, w2) + stateValues[target] * gamma


policy evaluations

In [25]:
def policy_evaluation(policy, w1, w2):
    """Estimate the state values of a fixed policy by repeated sweeps.

    Starts from all-zero values and, on each sweep, replaces every state's
    value with evaluate_value_given_action for the action the policy
    prescribes. Stops once the largest change is no longer above
    ``epsilon`` or after ``maxIter`` sweeps, and returns the value array.
    """
    values = np.zeros(numberOfStates)
    delta = 10000.0  # start above any threshold so the first sweep runs
    sweeps = 0
    while sweeps < maxIter and delta > epsilon:
        sweeps += 1
        updated = np.zeros(numberOfStates)
        for s in range(numberOfStates):
            updated[s] = evaluate_value_given_action(s, policy[s], values, w1, w2)
        delta = get_value_diff(updated, values)
        values = updated
    return values 

Policy improvement

In [29]:
def policy_improvement(stateValues, policy):
    """Return the policy that is greedy with respect to the given state values.

    DEAD keeps the action it had in ``policy``; W30 is set to 0 (meat) and
    W120 to 1 (vegetables). Every other state takes whichever move has the
    larger reward plus discounted next-state value, with ties going to
    vegetables. Reads the global weights ``w1`` and ``w2`` and ``gamma``.
    """
    count = len(stateValues)
    improved = np.zeros(count)
    for idx in range(count):
        here = State(idx)
        if here == State.DEAD:
            improved[idx] = policy[idx]
            continue
        if here == State.W30:
            improved[idx] = 0
            continue
        if here == State.W120:
            improved[idx] = 1
            continue
        q_vegetables = get_total_reward(State(idx - 1), w1, w2) + gamma * stateValues[idx - 1]
        q_meat = get_total_reward(State(idx + 1), w1, w2) + gamma * stateValues[idx + 1]
        improved[idx] = 0 if q_vegetables < q_meat else 1
    return improved

Main loop that alternates between policy evaluation and policy improvement

In [30]:
def policy_iterations():
    """Run policy iteration, starting from the all-vegetables policy.

    Each pass evaluates the current policy, prints the pass number, the
    resulting state values and the policy that was evaluated, and then
    improves the policy greedily. The loop stops when the policy no longer
    changes by more than ``epsilon`` or after ``maxIter`` passes.
    Nothing is returned.
    """
    err = 10000.0  # start above any threshold so the first pass runs
    iter_i = 0
    policy = np.ones(numberOfStates)
    while err > epsilon and iter_i < maxIter:
        stateValues = policy_evaluation(policy, w1, w2)
        print ("iteration", iter_i)
        print(stateValues)
        newPolicy = policy_improvement(stateValues, policy)
        print(policy)
        err = get_value_diff(policy, newPolicy)
        policy = newPolicy
        # Advance the counter so the maxIter guard can stop the loop and
        # the printed pass number is correct.
        iter_i += 1
        

In [31]:
# Run policy iteration with the settings defined earlier (maxIter, epsilon, gamma, w1, w2).
policy_iterations()

iteration 0
[  0.       0.       0.     100.     200.     250.     135.      77.5
  48.75    24.375   12.1875]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
iteration 0
[  0.         249.99999106 299.99999106 299.99999106 299.99999106
 299.99999106 159.99999106  89.99999106  54.99999106  27.49999106
  13.74999106]
[1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
