In [1]:
import torch
import numpy as np

m = torch.Tensor([2.2,4.0],)
print(m)

tensor([2.2000, 4.0000])


In [2]:
## Initialize policy to be evaluated
class Agent():
    def __init__(self, gamma=0.99):
        #discount factor
        self.gamma = gamma
        #estimates of states' values
        self.V = {}
        #state/action spaces
        self.state_spaces = {
            #possible sums of cards
            "sum":[i for i in range(4,22)],
            #possible cards in dealer's hand
            "dealer_show_card":[i+1 for i in range(10)],
            #useable ace?
            "ace_eleven":[False,True],
            # #hit (or stay)?
            # "hit":[0,1]
        }
        #hit (or stay)?
        self.action_space = [0,1]
        #combinations of parameters
        self.states = []
        #returns
        self.rewards = {}
        #has agent visited state before?
        self.states_visited = {}
        #states already encountered/returns already received
        self.memory = []
        
        self.init_vals()
        
    def init_vals(self):
        for total in self.state_spaces["sum"]:
            for card in self.state_spaces["dealer_show_card"]:
                for ace in self.state_spaces["ace_eleven"]:
                    self.V[(total, card, ace)] = 0
                    self.rewards[(total, card, ace)] = []
                    self.states_visited[(total, card, ace)] = False
                    self.states.append((total, card, ace))
                    
    def policy(self, state):
        total, _, _ = state
        #stay if under 21, otherwise hit
        action = 0 if total >= 20 else 1
        return action
        
    def update_V(self):
        for idt, (state, _) in enumerate(self.memory):
            G = 0
            if not self.states_visited[state]:
                self.states_visited[state] = True
                #initialize discount factor, k, for gamma^k
                discount = 1
                
                for t, (_, reward) in enumerate(self.memory[idt:]):
                    G += reward * discount
                    discount *= self.gamma
                    self.rewards[state].append(G)
                    
        for state,_ in self.memory:
            self.V[state] = np.mean(self.rewards[state])
            
        for state in self.states:
            self.states_visited[state] = False
            
        self.memory = []

In [3]:
#main
import gym

In [4]:
episodes = 500000

env = gym.make('Blackjack-v1')
agent = Agent()

#traverse episodes
for i in range(episodes):
    if i % 50000 == 0:
        print("Starting episode {}...".format(i))
        
    #initialize state
    state_null = env.reset()
    done = False
    
    while not done:
        #get action, a, from state
        action = agent.policy(state_null)
        #get reward, q, from state action
        state_prime, reward, done, info = env.step(action)
        #store results
        agent.memory.append((state_null, reward))
        #move on to next state
        state_null = state_prime
        
    #update value function
    agent.update_V()

print(agent.V[(21, 3, True)])
print(agent.V[(4, 1, False)])

Starting episode 0...
Starting episode 50000...
Starting episode 100000...
Starting episode 150000...
Starting episode 200000...
Starting episode 250000...
Starting episode 300000...
Starting episode 350000...
Starting episode 400000...
Starting episode 450000...
0.9755244755244755
-0.19190970832904883
