# Every-visit MC Prediction with the Blackjack Game

In [31]:
import gym 
import pandas as pd
from collections import defaultdict

In [32]:
env = gym.make('Blackjack-v0')

### Defining a Policy

In [41]:
def policy(state):
    """Fixed threshold policy: return action 0 when ``state[0]`` exceeds 19, else action 1.

    NOTE(review): in Gym's Blackjack environment ``state[0]`` is presumably the
    player's current sum and actions 0/1 are presumably stick/hit — confirm
    against the environment documentation.
    """
    leading_value = state[0]
    if leading_value > 19:
        return 0
    return 1

In [42]:
state = env.reset()
print(state)

(17, 10, False)


In [43]:
print(policy(state))

1


### Generating an Episode

In [44]:
num_timestep = 100

In [45]:
def generate_episode(policy):
    """Play one episode in the module-level ``env`` and return its trajectory.

    Actions are chosen by calling ``policy(state)``. The result is a list of
    ``(state, action, reward)`` tuples, one per step taken. The rollout stops
    as soon as ``env.step`` reports ``done``, or after ``num_timestep`` steps.
    """
    trajectory = []

    # start from a freshly reset environment
    current = env.reset()

    # step budget is read once from the module-level num_timestep
    steps_left = num_timestep
    while steps_left > 0:
        steps_left -= 1

        # ask the policy what to do in the current state and apply it
        chosen = policy(current)
        following, reward, finished, _info = env.step(chosen)

        # record the state we acted in, not the state we landed in
        trajectory.append((current, chosen, reward))

        # terminal step: the episode is complete
        if finished:
            return trajectory

        current = following

    return trajectory

In [46]:
generate_episode(policy)

[((20, 10, False), 0, 1.0)]

### Computing the Value Function

In [47]:
# running sum of the returns recorded for each state (missing states start at 0.0)
total_return = defaultdict(float)
# number of returns recorded for each state (missing states start at 0)
N = defaultdict(int)

In [48]:
num_iterations = 500000

In [19]:
check = [((5, 4, False), 1, 0.0), ((7, 4, False), 1, 0.0), ((12, 4, False), 1, 0.0), ((19, 4, False), 1, -1.0)]

In [20]:
x = zip(*check)
print(tuple(x))

(((5, 4, False), (7, 4, False), (12, 4, False), (19, 4, False)), (1, 1, 1, 1), (0.0, 0.0, 0.0, -1.0))


In [21]:
states, actions, rewards = zip(*check)
for t, state in enumerate(states):
    print(t, state)


0 (5, 4, False)
1 (7, 4, False)
2 (12, 4, False)
3 (19, 4, False)


In [22]:
states, actions, rewards = zip(*check)
for t, state in enumerate(states):
    R = (sum(rewards[t:]))
    print(R)
    

-1.0
-1.0
-1.0
-1.0


In [49]:
# Every-visit Monte Carlo: sample num_iterations episodes and, for every
# occurrence of a state, accumulate the return that followed it.
for i in range(num_iterations):

    # roll out one episode under the fixed policy defined earlier
    episode = generate_episode(policy)

    # split the (state, action, reward) triples into three parallel tuples
    states, actions, rewards = zip(*episode)

    for t, state in enumerate(states):

        # undiscounted return from step t: sum of rewards from t to the end
        R = sum(rewards[t:])

        # add this return to the state's running total ...
        total_return[state] += R

        # ... and count the visit (every occurrence is counted)
        N[state] += 1
        

In [50]:
total_return = pd.DataFrame(total_return.items(), columns=['state', 'total_return'])

In [51]:
N = pd.DataFrame(N.items(), columns=['state', 'N'])

In [52]:
df = pd.merge(total_return, N, on='state')

In [53]:
df.head(10)

Unnamed: 0,state,total_return,N
0,"(16, 3, False)",-3346.0,5119
1,"(21, 3, False)",3434.0,3850
2,"(9, 9, False)",-844.0,1606
3,"(19, 9, False)",-3905.0,5517
4,"(21, 9, False)",3632.0,3860
5,"(15, 6, True)",-187.0,538
6,"(13, 6, False)",-2731.0,4681
7,"(20, 6, False)",5237.0,7477
8,"(7, 1, False)",-597.0,990
9,"(11, 1, False)",-498.0,2238


In [54]:
df.tail(10)

Unnamed: 0,state,total_return,N
270,"(5, 4, False)",-253.0,493
271,"(15, 7, True)",-178.0,568
272,"(18, 1, True)",-454.0,812
273,"(14, 5, True)",-174.0,524
274,"(12, 5, True)",-25.0,228
275,"(5, 9, False)",-202.0,445
276,"(4, 7, False)",-103.0,217
277,"(12, 7, True)",-32.0,205
278,"(12, 9, True)",-34.0,233
279,"(12, 1, True)",-72.0,214


In [55]:
df['value'] = df['total_return']/df['N']

In [56]:
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(16, 3, False)",-3346.0,5119,-0.653643
1,"(21, 3, False)",3434.0,3850,0.891948
2,"(9, 9, False)",-844.0,1606,-0.525529
3,"(19, 9, False)",-3905.0,5517,-0.707812
4,"(21, 9, False)",3632.0,3860,0.940933
5,"(15, 6, True)",-187.0,538,-0.347584
6,"(13, 6, False)",-2731.0,4681,-0.583422
7,"(20, 6, False)",5237.0,7477,0.700415
8,"(7, 1, False)",-597.0,990,-0.60303
9,"(11, 1, False)",-498.0,2238,-0.22252


In [59]:
df[df['state']==(21, 9, False)]['value'].values

array([0.94093264])

In [60]:
df[df['state'] == (7, 1, False)]['value'].values

array([-0.6030303])

# First-visit MC Prediction with the Blackjack Game

In [1]:
import gym 
import pandas as pd
from collections import defaultdict

In [2]:
env = gym.make('Blackjack-v0')

### Defining a Policy

In [3]:
def policy(state):
    """Return action 1 while ``state[0]`` is at most 19, and action 0 once it is above 19.

    NOTE(review): ``state[0]`` is presumably the player's sum in Gym's
    Blackjack observation — confirm against the environment documentation.
    """
    return 1 if state[0] <= 19 else 0

In [4]:
state = env.reset()

In [5]:
print(state)

(14, 1, True)


In [6]:
print(policy(state))

1


### Generating an Episode

In [7]:
num_timestep = 100

In [8]:
def generate_episode(policy):
    """Sample a single episode from the module-level ``env`` using ``policy``.

    Returns a list of ``(state, action, reward)`` tuples in the order the
    steps were taken. Sampling ends when ``env.step`` signals ``done`` or
    once ``num_timestep`` steps have been taken, whichever comes first.
    """
    # Reset the environment to obtain the starting state
    state = env.reset()
    episode = []

    for _step in range(num_timestep):

        # Let the policy pick the action, then advance the environment
        action = policy(state)
        successor, reward, terminal, _info = env.step(action)

        # Log the transition against the state the action was taken in
        episode.append((state, action, reward))

        # Nothing more to sample after a terminal step
        if terminal:
            return episode

        state = successor

    return episode

In [9]:
generate_episode(policy)

[((15, 7, False), 1, -1.0)]

### Computing the Value Function

In [10]:
# Running sum of the returns recorded for each state (missing states start at 0.0)
total_return = defaultdict(float)
# Number of returns recorded for each state (missing states start at 0)
N = defaultdict(int)

In [11]:
num_iterations = 10000

In [12]:
# First-visit Monte Carlo: sample num_iterations episodes and, for each state,
# record only the return following its first occurrence in each episode.
for i in range(num_iterations):

    # Roll out one episode under the fixed policy defined earlier
    episode = generate_episode(policy)

    # Split the (state, action, reward) triples obtained from the episode
    states, actions, rewards = zip(*episode)

    # States already encountered earlier in this episode
    seen = set()

    for t, state in enumerate(states):

        # Later occurrences of a state within the same episode are ignored
        if state in seen:
            continue
        seen.add(state)

        # Undiscounted return from step t: sum of rewards from t to the end
        R = sum(rewards[t:])

        # Add this return to the state's running total
        total_return[state] += R

        # Count one more episode in which this state was visited
        N[state] += 1
            
            

In [14]:
total_return = pd.DataFrame(total_return.items(), columns=['state', 'total_return'])

In [15]:
N = pd.DataFrame(N.items(), columns=['state', 'N'])

In [16]:
df = pd.merge(total_return, N, on='state')

In [17]:
df.head(10)

Unnamed: 0,state,total_return,N
0,"(18, 1, False)",-80.0,99
1,"(12, 10, False)",-207.0,352
2,"(14, 10, False)",-254.0,407
3,"(20, 10, False)",224.0,564
4,"(19, 4, True)",-9.0,14
5,"(13, 4, False)",-50.0,100
6,"(17, 10, False)",-333.0,458
7,"(11, 2, False)",-12.0,47
8,"(12, 2, False)",-36.0,82
9,"(19, 2, False)",-99.0,117


In [18]:
df['value'] = df['total_return'] / df['N']

In [19]:
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(18, 1, False)",-80.0,99,-0.808081
1,"(12, 10, False)",-207.0,352,-0.588068
2,"(14, 10, False)",-254.0,407,-0.624079
3,"(20, 10, False)",224.0,564,0.397163
4,"(19, 4, True)",-9.0,14,-0.642857
5,"(13, 4, False)",-50.0,100,-0.5
6,"(17, 10, False)",-333.0,458,-0.727074
7,"(11, 2, False)",-12.0,47,-0.255319
8,"(12, 2, False)",-36.0,82,-0.439024
9,"(19, 2, False)",-99.0,117,-0.846154


In [20]:
df[df['state'] == (20, 10, False)]['value'].values

array([0.39716312])

In [21]:
df[df['value'] > .90]

Unnamed: 0,state,total_return,N,value
43,"(21, 7, True)",51.0,54,0.944444
62,"(21, 8, False)",78.0,84,0.928571
76,"(21, 6, True)",41.0,45,0.911111
92,"(21, 4, False)",69.0,73,0.945205
121,"(21, 5, False)",64.0,70,0.914286
143,"(21, 3, False)",67.0,73,0.917808
150,"(21, 7, False)",58.0,62,0.935484
160,"(21, 8, True)",55.0,58,0.948276
178,"(20, 9, True)",11.0,11,1.0
182,"(21, 9, True)",45.0,49,0.918367


In [22]:
df[df['value'] < -0.5]

Unnamed: 0,state,total_return,N,value
0,"(18, 1, False)",-80.0,99,-0.808081
1,"(12, 10, False)",-207.0,352,-0.588068
2,"(14, 10, False)",-254.0,407,-0.624079
4,"(19, 4, True)",-9.0,14,-0.642857
6,"(17, 10, False)",-333.0,458,-0.727074
...,...,...,...,...
261,"(7, 7, False)",-13.0,15,-0.866667
262,"(17, 7, True)",-10.0,14,-0.714286
268,"(5, 2, False)",-4.0,5,-0.800000
272,"(4, 10, False)",-7.0,13,-0.538462


In [23]:
df[df['state'] == (12, 2, True)]['value'].values

array([-1.])

# Implementing on-Policy Monte Carlo Control

In [1]:
import gym
import pandas as pd
from collections import defaultdict
import random

In [18]:
env = gym.make('Blackjack-v0')

In [19]:
Q = defaultdict(float)

In [20]:
total_return = defaultdict(float)

In [21]:
N = defaultdict(int)

### Define the Epsilon Greedy Policy

In [22]:
def epsilon_greedy_policy(state, Q, epsilon=0.5):
    """Choose an action for ``state`` with an epsilon-greedy rule over ``Q``.

    Args:
        state: the current environment state; used as the first element of
            the ``(state, action)`` keys looked up in ``Q``.
        Q: mapping from ``(state, action)`` to the estimated action value.
            Pairs that are not present are treated as having value 0.0, which
            matches the ``defaultdict(float)`` used for ``Q`` in this file.
        epsilon: probability of exploring with a random action. Defaults to
            0.5, the value that was previously hard-coded.

    Returns:
        A sample from ``env.action_space`` with probability ``epsilon``;
        otherwise the action index in ``range(env.action_space.n)`` with the
        largest Q value (the lowest index wins ties).
    """
    # Explore: draw a uniform sample and compare it against epsilon
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()

    # Exploit: Q.get is used instead of Q[...] so that merely probing an
    # unvisited pair does not insert a 0.0 entry into a defaultdict Q.
    return max(
        range(env.action_space.n),
        key=lambda action: Q.get((state, action), 0.0),
    )

### Generating an Episode

In [23]:
num_timesteps = 100

In [24]:
def generate_episode(Q):
    """Sample one episode from the module-level ``env`` with the epsilon-greedy policy.

    Each action is chosen by ``epsilon_greedy_policy(state, Q)``. Returns a
    list of ``(state, action, reward)`` tuples, one per step taken. Sampling
    stops when ``env.step`` reports ``done`` or after ``num_timesteps`` steps.
    """
    # begin from a freshly reset environment
    state = env.reset()
    episode = []

    for _ in range(num_timesteps):

        # pick the action epsilon-greedily with respect to the current Q
        action = epsilon_greedy_policy(state, Q)

        # apply it and observe the outcome
        next_state, reward, done, _info = env.step(action)

        # record the transition against the state the action was taken in
        episode.append((state, action, reward))

        # a terminal step ends the episode immediately
        if done:
            return episode

        state = next_state

    return episode

### Computing Optimal Policy

In [25]:
num_iterations = 50000

In [26]:
# On-policy first-visit Monte Carlo control: sample num_iterations episodes
# with the epsilon-greedy policy and refresh Q from the observed returns.
for i in range(num_iterations):

    # generate an episode using the current Q estimates
    episode = generate_episode(Q)

    # all (state, action) pairs of the episode, in step order
    all_state_action_pairs = [step[:2] for step in episode]

    # all rewards of the episode, in step order
    rewards = [step[2] for step in episode]

    for t, (state, action, reward) in enumerate(episode):
        pair = (state, action)

        # skip the pair if it already occurred earlier in this episode
        if pair in all_state_action_pairs[:t]:
            continue

        # undiscounted return from step t: sum of rewards from t to the end
        R = sum(rewards[t:])

        # accumulate the return and the visit count for this pair
        total_return[pair] += R
        N[pair] += 1

        # Q is the running average of the returns recorded for the pair
        Q[pair] = total_return[pair] / N[pair]
            
            
    
    

In [27]:
df = pd.DataFrame(Q.items(), columns=['state_action_pair', 'value'])

In [28]:
df.head(11)

Unnamed: 0,state_action_pair,value
0,"((20, 10, False), 1)",-0.863366
1,"((11, 10, False), 0)",-0.655172
2,"((11, 10, False), 1)",-0.173077
3,"((13, 6, False), 0)",-0.126667
4,"((13, 6, False), 1)",-0.478261
5,"((16, 5, False), 0)",-0.106796
6,"((16, 5, False), 1)",-0.516484
7,"((19, 9, False), 0)",0.325581
8,"((19, 9, False), 1)",-0.766667
9,"((17, 10, False), 1)",-0.679487


In [29]:
df[df['state_action_pair'] == ((21, 8, True), 0)]

Unnamed: 0,state_action_pair,value
291,"((21, 8, True), 0)",0.891304


In [30]:
df[df['state_action_pair'] == ((21, 8, True), 1)]

Unnamed: 0,state_action_pair,value
292,"((21, 8, True), 1)",-0.148936
