# Kaitao Yang
#### Deep Learning Scientist @ eBay, Amsterdam
#### Deep Learning Lecturer @ [DL-APPLIED](www.dlapplied.com)
#### [Email](ykaitao@hotmail.com), [LinkedIn](https://www.linkedin.com/in/kaitaoyang/) 
#### Deep Learning Training, May 26, 27 (Saturday and Sunday), 2018, Utrecht, [Sign in ](https://dlapplied.com/deep-learning-training/) now to get <span style="color:blue"> 85% OFF</span>



In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from stock_game import state_set, action_set, num_states, num_actions, \
    initial_state, take_action_by_human, environment_response

In [5]:
print('State set:', state_set) 
print('Action set:', action_set) # 'b', 'h', and 's' means buy, hold, sell, respectively

State set: ['AMZN', 'GOOG', 'FB']
Action set: ['b', 'h', 's']


In [6]:
# Utils
def print_and_log_string(file_handle, string):
    print(string)
    file_handle.write(string)
    


In [7]:
episode = 0

In [20]:
# play by human
print('Please type in anything in the box.')

is_first_episode = (episode == 0)
if is_first_episode:
    log_file_handle = open('playing_history_human.txt', 'w')
else:
    log_file_handle = open('playing_history_human.txt', 'a')
    

s = initial_state()
T = 10 # total steps in each episode
t = 1 # time step
G = 0 # cumulative gain

string_print_log = 'Episode %d starts, Your initial state: %s.\n' % (episode,s)
print_and_log_string(file_handle=log_file_handle, string=string_print_log)

while True:
    
    a = take_action_by_human()
    
    s_, r_ = environment_response(s, a)
    string_print_log = 'Time: %d; \nAction: %s; \nNew state: %s; \nReward: %d k€.\n=======\n' % (t, a, s_, r_)
    print_and_log_string(file_handle=log_file_handle, string=string_print_log)
    
    G += r_ # add the reward to cumulative gain
    t += 1 # increase time step by 1
    s = s_ # replace old state with new state
    
    end_of_episode = (t==T)
    if end_of_episode:
        string_print_log = 'End of episode, your total gain is %d k€ in this episode.\n========================\n' % G
        print_and_log_string(file_handle=log_file_handle, string=string_print_log)
        break

episode += 1 # episode increase by 1
log_file_handle.close() # close log file handle

Please type in anything in the box.
Episode 5 starts, Your initial state: FB.

s
Time: 1; 
Action: s; 
New state: AMZN; 
Reward: 15 k€.

b
Time: 2; 
Action: b; 
New state: GOOG; 
Reward: 27 k€.

b
Time: 3; 
Action: b; 
New state: GOOG; 
Reward: 13 k€.

b
Time: 4; 
Action: b; 
New state: AMZN; 
Reward: -19 k€.

b
Time: 5; 
Action: b; 
New state: FB; 
Reward: -5 k€.

s
Time: 6; 
Action: s; 
New state: AMZN; 
Reward: 11 k€.

b
Time: 7; 
Action: b; 
New state: GOOG; 
Reward: 21 k€.

b
Time: 8; 
Action: b; 
New state: GOOG; 
Reward: 14 k€.

b
Time: 9; 
Action: b; 
New state: GOOG; 
Reward: 22 k€.

End of episode, your total gain is 101 k€ in this episode.



# State-action value

In [14]:
s2i = {state:index for index, state in enumerate(state_set)} # state to index
a2i = {action:index for index, action in enumerate(action_set)} # action to index

### Q-learning
$$
Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \eta [R_{t+1}+\gamma \underset{a}{\mathrm{max}}Q(S_{t+1}, a) - Q(S_t, A_t)]
$$

In [15]:
Q = 0*np.ones(shape=(num_states, num_actions))

eta = 0.001 # learning rate
gamma = 0.1 # discount factor                    
K = 10000 # number of iterations

def target(reward, state_new, gamma):
        
    return reward + gamma*np.max([Q[s2i[state_new], a2i[a]] for a in action_set]) 

def update_Q():
    for s in state_set:
        for a in action_set:
            s_, r_ = environment_response(s, a) # next state, and reward
                        
            delta = target(reward=r_, state_new=s_, gamma=gamma) - Q[s2i[s], a2i[a]]
            Q[s2i[s], a2i[a]] += eta*delta  
            
            
def q_learning():
    Q_old = np.copy(Q)   
    for k in range(K):

        update_Q()

        if k%1000==0:
            print(k, mean_squared_error(Q_old, Q))
            Q_old = np.copy(Q)  

            
q_learning()
Q_df = pd.DataFrame(data=Q, index=state_set, columns=action_set)
Q_df

0 0.00018776973291799315
1000 12.592684147341416
2000 1.7108837556130307
3000 0.7001453469170388
4000 0.11881937104012837
5000 0.10920629860220286
6000 0.07803189747068302
7000 0.03889490187353489
8000 0.06728121582035113
9000 0.12257936929766801


Unnamed: 0,b,h,s
AMZN,7.350263,0.791815,2.752873
GOOG,9.386443,0.810907,7.11033
FB,7.082428,0.824389,8.31585


### SARSA


In [19]:
Q_sarsa = 10*np.ones(shape=(num_states, num_actions))

eta = 0.01 # learning rate
gamma = 0.1 # discount factor 
K = 10000 # number of iterations
epsilon = 0.1 # greedy factor            

def epsilon_greedy(array, epsilon=0.1):
    
    a_greedy = action_set[np.argmax(array)]
    a_random = np.random.choice(action_set, size=None)
    
    return np.random.choice([a_greedy, a_random], size=None, p=[1-epsilon, epsilon])

def sarsa_learning():
    
    s = initial_state()
    Q_sarsa_old = np.copy(Q_sarsa) 
    
    for k in range(K):
        a = epsilon_greedy(array=Q_sarsa[s2i[s],:], epsilon=epsilon)
        s_, r_ = environment_response(s, a) # next state, and reward
        a_ = epsilon_greedy(array=Q_sarsa[s2i[s_],:], epsilon=epsilon)

        delta = [r_ + gamma*Q_sarsa[s2i[s_], a2i[a_]]] - Q_sarsa[s2i[s], a2i[a]]
        Q_sarsa[s2i[s], a2i[a]] += eta*delta 
        s, a = s_, a_

        if k%1000==0:
            print(k, mean_squared_error(Q_sarsa_old, Q_sarsa))
            Q_sarsa_old = np.copy(Q_sarsa) 

sarsa_learning()   
Q_df = pd.DataFrame(data=Q_sarsa, index=state_set, columns=action_set)
Q_df

0 0.006819244192929303
1000 8.012752591141865
2000 1.3479546129496613
3000 1.3042756653840073
4000 2.0474430557792114
5000 0.3515790986806268
6000 0.2502489918968549
7000 0.2821372429632747
8000 0.23469269878046287
9000 0.22528130722269338


Unnamed: 0,b,h,s
AMZN,6.639964,2.703175,4.341922
GOOG,9.313252,2.728952,7.538979
FB,6.952583,4.312501,9.168975


In [None]:
# page 189