In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from invest_game import state_set, action_set, num_states, num_actions, \
    initial_state, take_action_by_human, environment_response

In [2]:
print('State set:', state_set) 
print('Action set:', action_set) # 'b', 'h', and 's' means buy, hold, sell, respectively

State set: ['AMZN', 'GOOG', 'FB']
Action set: ['b', 'h', 's']


In [3]:
def print_and_log_string(file_handle, string):
    print(string)
    file_handle.write(string)
    

In [4]:
episode = 0

In [10]:
print('Please type in anything in the box.')

is_first_episode = (episode == 0)
if is_first_episode:
    log_file_handle = open('playing_history_human.txt', 'w')
else:
    log_file_handle = open('playing_history_human.txt', 'a')
    

s = initial_state()
T = 10 # total steps in each episode
t = 1 # time step
G = 0 # cumulative gain

string_print_log = 'Episode %d starts, Your initial state: %s.\n' % (episode,s)
print_and_log_string(file_handle=log_file_handle, string=string_print_log)

while True:
    
    a = take_action_by_human()
    
    s_, r_ = environment_response(s, a)
    string_print_log = 'Time: %d; \nAction: %s; \nNew state: %s; \nReward: %d k€.\n=======\n' % (t, a, s_, r_)
    print_and_log_string(file_handle=log_file_handle, string=string_print_log)
    
    G += r_ # add the reward to cumulative gain
    t += 1 # increase time step by 1
    s = s_ # replace old state with new state
    
    end_of_episode = (t==T)
    if end_of_episode:
        string_print_log = 'End of episode, your total gain is %d k€ in this episode.\n========================\n' % G
        print_and_log_string(file_handle=log_file_handle, string=string_print_log)
        break

episode += 1 # episode increase by 1
log_file_handle.close() # close log file handle

Please type in anything in the box.
Episode 3 starts, Your initial state: AMZN.

s
Time: 1; 
Action: s; 
New state: FB; 
Reward: 2 k€.

s
Time: 2; 
Action: s; 
New state: AMZN; 
Reward: 13 k€.

s
Time: 3; 
Action: s; 
New state: GOOG; 
Reward: -21 k€.

s
Time: 4; 
Action: s; 
New state: AMZN; 
Reward: 11 k€.

s
Time: 5; 
Action: s; 
New state: FB; 
Reward: 6 k€.

s
Time: 6; 
Action: s; 
New state: AMZN; 
Reward: 12 k€.

s
Time: 7; 
Action: s; 
New state: FB; 
Reward: 1 k€.

s
Time: 8; 
Action: s; 
New state: AMZN; 
Reward: 17 k€.

s
Time: 9; 
Action: s; 
New state: AMZN; 
Reward: 17 k€.

End of episode, your total gain is 60 k€ in this episode.



# State-action value

In [7]:
s2i = {state:index for index, state in enumerate(state_set)} # state to index
a2i = {action:index for index, action in enumerate(action_set)} # action to index

def Q(s, a):
    return Q_MAT[s2i[s], a2i[a]]

def target(reward, state_new, gamma):
        
    return reward + gamma*np.max([Q(state_new, a) for a in action_set]) 



### Q-learning
$$
Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \eta [R_{t+1}+\gamma \underset{a}{\mathrm{max}}Q(S_{t+1}, a) - Q(S_t, A_t)]
$$

In [11]:
Q_MAT = 0*np.ones(shape=(num_states, num_actions))

eta = 0.001 # learning rate
gamma = 0.1 # discount factor          
            
K = 10000 

def update_Q():
    for s in state_set:
        for a in action_set:
            s_, r_ = environment_response(s, a) # next state, and reward
                        
            delta = target(reward=r_, state_new=s_, gamma=gamma) - Q(s, a)
            Q_MAT[s2i[s], a2i[a]] += eta*delta  
            
            

Q_MAT_old = np.copy(Q_MAT)   
for k in range(K):
        
    update_Q()
    
    if k%1000==0:
        print(k, mean_squared_error(Q_MAT_old, Q_MAT))
        Q_MAT_old = np.copy(Q_MAT)  
        
Q_df = pd.DataFrame(data=Q_MAT, index=state_set, columns=action_set)
Q_df

0 0.0001921373589595849
1000 12.132949657955791
2000 1.3837716078744935
3000 0.42910116741852017
4000 0.2342436739173133
5000 0.0640621107768592
6000 0.07612769947884564
7000 0.06422929738968967
8000 0.06355765757065003
9000 0.16153718422624333


Unnamed: 0,b,h,s
AMZN,6.692431,0.758488,2.787916
GOOG,9.296426,0.822899,7.49665
FB,6.19476,0.841189,8.096299


### SARSA


In [12]:
Q_MAT = 10*np.ones(shape=(num_states, num_actions))

eta = 0.01 # learning rate
gamma = 0.1 # discount factor          
epsilon = 0.1 # greedy factor            
K = 100000 

def epsilon_greedy(array, epsilon=0.1):
    i_greedy = np.argmax(array)
    a_greedy = action_set[i_greedy]
    
    a_random = np.random.choice(action_set, size=None)
    
    return np.random.choice([a_greedy, a_random], size=None, p=[1-epsilon, epsilon])
    
s = initial_state()
Q_MAT_old = np.copy(Q_MAT) 
for k in range(K):
    a = epsilon_greedy(array=Q_MAT[s2i[s],:], epsilon=epsilon)
    s_, r_ = environment_response(s, a) # next state, and reward
    a_ = epsilon_greedy(array=Q_MAT[s2i[s_],:], epsilon=epsilon)
    
    delta = [r_ + gamma*Q(s_, a_)] - Q(s, a)
    Q_MAT[s2i[s], a2i[a]] += eta*delta 
    s, a = s_, a_
    
    if k%1000==0:
        print(k, mean_squared_error(Q_MAT_old, Q_MAT))
        Q_MAT_old = np.copy(Q_MAT) 

    
Q_df = pd.DataFrame(data=Q_MAT, index=state_set, columns=action_set)
Q_df

0 1.8526229436447107e-05
1000 7.561883592645782
2000 1.8883256571290374
3000 1.0115330311522712
4000 3.4940038924186534
5000 0.4467531872815534
6000 0.19188370456966466
7000 0.44839125104283895
8000 0.6539100096037614
9000 0.5175693174021309
10000 1.386717360378116
11000 1.6591478478663906
12000 1.2819031099592897
13000 2.563555942256475
14000 1.359401311685213
15000 2.7093281456117215
16000 3.124438448708341
17000 0.4476761067412782
18000 0.25322458793790675
19000 0.35907257712067303
20000 0.2920164074198029
21000 0.24359031560195973
22000 0.2660640609772579
23000 1.0097962107487188
24000 0.3901025684273906
25000 0.6218227806843146
26000 0.3512217868995186
27000 1.3577337631390225
28000 0.3732546406827435
29000 1.3284817276048053
30000 2.5613186224020157
31000 0.39386265587152564
32000 0.428659959085493
33000 0.9222044855600036
34000 0.2580940197395335
35000 1.0384899229618154
36000 0.18989530333920013
37000 0.6242691290988632
38000 0.7120916171583694
39000 0.7815995832378916
40000 0.

Unnamed: 0,b,h,s
AMZN,7.223628,0.706386,2.873535
GOOG,7.83805,0.775124,5.364514
FB,6.238374,0.785687,7.556711


In [13]:
Q_df

Unnamed: 0,b,h,s
AMZN,7.223628,0.706386,2.873535
GOOG,7.83805,0.775124,5.364514
FB,6.238374,0.785687,7.556711
