### Car Rental

1. 문제: 두 군데 장소에서 자동차 렌트 서비스를 한다고 했을 때 각 장소에 차의 대수를 얼마로 유지하는게 좋을지 푸는 문제이다. 
2. 조건
    - state: A,B 장소 별 자동차 대수
    - action: 자동차를 최대 5대까지 옮길 수 있다. (단 방향으로)
    - reward: 자동차를 렌트해줄 경우 \$10, 자동차 장소를 옮길 때 -\$2
    - 진행 순서: 
        - 자동차를 옮긴다. (action)
        - 자동차 렌트 요청이 온다. 
        - 자동차 return을 받는다. 
3. 학습 목표
    - policy iteration  

고려 사항
- rental 가능한 차의 대수는 최대 10대이다. 각 장소 당.
- 교환은 단 방향으로만 진행된다. 


깨달음
- episodic, terminal이 있는 경우에는 초기값을 `env.reset()`으로 가져오고 `env` 내에 state 저장하면서 `env.step(action)` 으로 이 후 값들에 대해서 가져오기 가능
- episodic이 아니고 DP의 경우에는 table 값이 전부다 주어져있다고 생각하고 진행하니까 table을 구하고 시작해야 함. 

In [1]:
import numpy as np
from math import factorial, exp
from collections import defaultdict
from itertools import count

In [2]:
# 문제에 대한 정보들
MAX_CARS = 20
MAX_MOVE_OF_CARS = 5
# for lambda 
A_RENTAL_REQUEST = 3
B_RENTAL_REQUEST = 4
A_RENTAL_RETURN = 3
B_RENTAL_RETURN = 2
DISCOUNT = 0.9
RENTAL_CREDIT = 10
MOVE_CAR_COST = 2
actions = np.arange(-MAX_MOVE_OF_CARS, MAX_MOVE_OF_CARS + 1)

POISSON_UP_BOUND = 11

In [3]:
poissonBackup = dict()
def poisson(n, lam):
    global poissonBackup
    key = n * 10 + lam
    if key not in poissonBackup.keys():
        poissonBackup[key] = exp(-lam) * pow(lam, n) / factorial(n)
    return poissonBackup[key]

In [4]:
# action: -면 B에서 A로, +면 A에서 B로, action은 지금 state보다 클 수 없다. (뒤에서 구현)

class CarRental:
    def __init__(self, constant_return=True):
        self.P = defaultdict(lambda: [[] for i in range(MAX_MOVE_OF_CARS*2 + 1)])
        self.constant_return = True
        
    def step(self, state, action):
        a, b = state
        if self.P[state][action]:
            return self.P[state][action]
        else:
            for a_rental_request in range(POISSON_UP_BOUND):
                for b_rental_request in range(POISSON_UP_BOUND):
                    # moving cars
                    num_a_cars = int(min((a - action), MAX_CARS))
                    num_b_cars = int(min((b + action), MAX_CARS))

                    # rental request
                    real_a_rental_request = min(a_rental_request, num_a_cars)
                    real_b_rental_request = min(b_rental_request, num_b_cars)

                    # rental request prob (based on # of requested cars)
                    a_rental_request_prob = poisson(a_rental_request, A_RENTAL_REQUEST) 
                    b_rental_request_prob = poisson(b_rental_request, B_RENTAL_REQUEST)

                    prob = a_rental_request_prob * b_rental_request_prob
                    reward = (a_rental_request + b_rental_request) * RENTAL_CREDIT - abs(action) * MOVE_CAR_COST
                    num_a_cars -= real_a_rental_request 
                    num_b_cars -= real_b_rental_request
                    prob_ = prob
                    
                    if self.constant_return:
                        a_rental_return = 3
                        b_rental_return = 2
                        next_a = min(num_a_cars + a_rental_return, MAX_CARS)
                        next_b = min(num_b_cars + b_rental_return, MAX_CARS)                        
                    else:
                        # rental return 
                        for a_rental_return in range(POISSON_UP_BOUND):
                            for b_rental_return in range(POISSON_UP_BOUND):                                                
                                a_rental_return_prob = poisson(a_rental_return, A_RENTAL_RETURN)
                                b_rental_return_prob = poisson(b_rental_return, B_RENTAL_RETURN)
                                next_a = min(num_a_cars + a_rental_return, MAX_CARS)
                                next_b = min(num_b_cars + b_rental_return, MAX_CARS)
                                prob = prob_ * a_rental_return_prob * b_rental_return_prob
                    self.P[state][action].append([prob, [next_a, next_b], reward, False])                                          
            return self.P[state][action]

In [8]:
def policy_evaluation(env, discount_factor=0.9, theta=1e-2):    
    V = np.zeros([MAX_CARS + 1, MAX_CARS + 1])
    policy = np.zeros([MAX_CARS + 1, MAX_CARS + 1])
    while True:
        delta = 0
        for a_s in range(V.shape[0]):
            for b_s in range(V.shape[1]):
                v = V[a_s][b_s]
                for action in actions:
                    if action < 0 and abs(action) <= b_s:
                        cases = env.step((a_s, b_s), action)                        
                    elif action >= 0 and abs(action) <= a_s:
                        cases = env.step((a_s, b_s), action)
                    else:
                        continue
                    for (prob, next_state, reward, done) in cases:
                        V[a_s][b_s] += prob * (reward + discount_factor * V[next_state[0] - 1][next_state[1] - 1])               
                delta = max(delta, np.abs(v - V[a_s][b_s]))
        print(delta)
        if delta < theta:
            break                        
    return V

In [9]:
env = CarRental()
value = policy_evaluation(env)

0,2.54694375092e+12
1,5.4454432225e+14
2,7.86316766063e+16
3,1.0088952383e+19
4,1.24978812158e+21
5,1.53337850233e+23
6,1.87672570347e+25
7,2.29560089196e+27
8,2.80758166943e+29
9,3.43364066332e+31
10,4.19927427212e+33
11,5.13562093294e+35
12,6.28075032496e+37
13,7.68121754961e+39
14,9.39395756926e+41
15,1.14886003317e+44
16,1.40503016436e+46
17,1.71832051375e+48
18,

KeyboardInterrupt: 

In [None]:
def policy_iteration(env, discount_factor)
    # policy improvement
        policy_stable = True
        for a_s in range(V.shape[0]):
            for b_s in range(V.shape[1]):
                v = V[a_s][b_s]
                for action in policy:
                    if action < 0 and abs(action) <= b_s:
                        cases = env.step([a_s, b_s], action)                        
                    elif action >= 0 and abs(action) <= a_s:
                        cases = env.step([a_s, b_s], action)
                    else:
                        continue
                    for (prob, next_state, reward, done) in cases:
                        V[a_s][b_s] += prob * (reward + discount_factor * V[next_state[0] - 1][next_state[1] - 1])   
                policy[]
    return V
    # action range: -5 ~ +5

In [198]:
env = CarRental()
value = policy_iteration(env)

KeyboardInterrupt: 