In [306]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

import gymnasium as gym

In [307]:
def evalutae_model(model, N):
    """Play ``N`` Blackjack episodes greedily with ``model`` and tally the results.

    At every step the action with the largest model output is taken.

    Args:
        model: Callable that maps ``np.array([observation])`` to an object whose
            ``.numpy()`` yields one row of action scores per observation.
        N: Number of episodes to play.

    Returns:
        Tuple ``(rewards, wins)``: the sum of the final reward of every episode
        and the number of episodes whose final reward was positive.
    """
    env = gym.make("Blackjack-v1")
    rewards = 0
    wins = 0
    try:
        for _ in np.arange(N):
            observation, info = env.reset()
            while True:
                p = model(np.array([observation]))
                action = p.numpy()[0].argmax()
                observation, reward, terminated, truncated, info = env.step(action)
                if terminated or truncated:
                    # Only the reward of the step that ends the episode is counted.
                    rewards += reward
                    if reward > 0:
                        wins += 1
                    break
    finally:
        # Release the environment even when the model or the environment raises.
        env.close()
    return (rewards, wins)

def evalutae_random_action(N):
    """Play ``N`` Blackjack episodes with uniformly random actions (baseline).

    Each step draws the action with ``np.random.choice(2)``.

    Args:
        N: Number of episodes to play.

    Returns:
        Tuple ``(rewards, wins)``: the sum of the final reward of every episode
        and the number of episodes whose final reward was positive.
    """
    env = gym.make("Blackjack-v1")
    rewards = 0
    wins = 0
    try:
        for _ in np.arange(N):
            observation, info = env.reset()
            while True:
                action = np.random.choice(2)
                observation, reward, terminated, truncated, info = env.step(action)
                if terminated or truncated:
                    # Only the reward of the step that ends the episode is counted.
                    rewards += reward
                    if reward > 0:
                        wins += 1
                    break
    finally:
        # Release the environment even when the environment raises.
        env.close()
    return (rewards, wins)

def create_model():
    """Build and compile the network used to choose Blackjack actions.

    Architecture: a 3-feature input, two hidden ``Dense`` layers of 32 ReLU
    units each, and a 2-unit softmax output layer. The model is compiled with
    the Adam optimizer and categorical cross-entropy loss.

    Returns:
        The compiled Keras ``Model``.
    """
    observation_in = Input(shape=(3,))

    hidden = observation_in
    for _ in range(2):
        hidden = Dense(32, activation='relu')(hidden)
    action_probs = Dense(2, activation='softmax')(hidden)

    net = Model(observation_in, action_probs)
    # NOTE(review): one_episode produces reward-weighted targets rather than
    # one-hot labels, so the 'accuracy' metric is presumably not meaningful — confirm.
    net.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

def one_episode(model):
    """Play one Blackjack episode, sampling each action from ``model``'s output.

    Args:
        model: Callable that maps ``np.array([observation])`` to an object whose
            ``.numpy()[0]`` is a length-2 vector of action probabilities.

    Returns:
        Tuple ``(record_input, record_p)`` of equal-length lists with one entry
        per step taken. ``record_input[i]`` is the observation seen at step
        ``i``. ``record_p[i]`` is a two-element list that is zero everywhere
        except at the index of the action taken at step ``i``, where it holds
        the reward returned by the step that ended the episode.
    """
    env = gym.make("Blackjack-v1")
    record_input = []
    record_action = []
    try:
        observation, info = env.reset()
        while True:
            record_input.append(observation)

            # Renormalise in float64 instead of patching the last entry in place,
            # so np.random.choice receives probabilities that sum to 1.
            p = np.asarray(model(np.array([observation])).numpy()[0], dtype=np.float64)
            p = p / p.sum()
            action = np.random.choice(2, p=p)
            record_action.append(action)

            observation, reward, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                break
    finally:
        # Release the environment even when the model or the environment raises.
        env.close()

    # Every step of the episode is credited with the episode's final reward.
    record_p = []
    for action in record_action:
        target = [0, 0]
        target[action] = reward
        record_p.append(target)
    return record_input, record_p
    

def one_epoch(model, N_episode):
    """Collect ``N_episode`` episodes with ``one_episode`` and fit ``model`` on them once.

    The per-step observations and targets of all episodes are concatenated and
    passed to a single ``model.fit`` call.

    Args:
        model: Model passed to ``one_episode`` for acting and then fitted.
        N_episode: Number of episodes to collect before fitting.

    Returns:
        None.
    """
    records_input, records_p = [], []
    for _ in np.arange(N_episode):
        record_input, record_p = one_episode(model)
        # extend() grows the lists in place instead of re-copying them each episode.
        records_input.extend(record_input)
        records_p.extend(record_p)
    if not records_input:
        # No steps were collected (e.g. N_episode == 0), so there is nothing to fit on.
        return
    model.fit(np.array(records_input), np.array(records_p))


In [370]:
# Build a fresh, compiled network via create_model(); later cells train and evaluate it.
model = create_model()

In [395]:
# One training round: sample 20 episodes, fit once, then print how many of 100
# fresh starting hands the greedy policy answers with action 1.
# NOTE(review): outputs_of_random_inputs is defined in a later cell, so on a fresh
# kernel that cell has to be run before this one.
for i in np.arange(1):
    one_epoch(model, 20)
    print(outputs_of_random_inputs(model))

2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 28ms/step - accuracy: 0.2140 - loss: -0.1598
28


In [396]:
# Greedy evaluation over 10000 episodes; the result is (total reward, number of wins).
evalutae_model(model, 10000)

(-1207.0, 4046)

In [375]:
# Greedy evaluation over 10000 episodes; the result is (total reward, number of wins).
# NOTE(review): execution count 375 precedes the training cell (395), so this
# output presumably reflects a different state of `model` — confirm or remove.
evalutae_model(model, 10000)

(-1771.0, 3875)

In [292]:
# Greedy evaluation over 10000 episodes; the result is (total reward, number of wins).
# NOTE(review): execution count 292 precedes both the definitions cell (307) and
# the model-creation cell (370), so this output presumably comes from an older
# model and older code — confirm or remove.
evalutae_model(model, 10000)

(-9348.0, 326)

In [362]:
# Greedy evaluation over 1000 episodes; the result is (total reward, number of wins).
# NOTE(review): execution count 362 precedes the model-creation cell (370), so this
# output presumably belongs to a previous model instance — confirm or remove.
evalutae_model(model, 1000)

(-992.0, 4)

In [274]:
# Baseline for comparison: uniformly random actions over 10000 episodes;
# the result is (total reward, number of wins).
evalutae_random_action(10000)

(-3990.0, 2783)

In [37]:
# NOTE(review): looks like a leftover from an earlier model that returned two
# outputs (p, v). create_model builds a single-output network, and `observation`
# is not assigned at notebook level in the visible cells — confirm and remove if stale.
p, v = model(np.array([observation]))

In [351]:
# Inspect the first row of the model's first weight tensor (32 values, per the output below).
model.weights[0][0]

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([-0.15967727, -0.20917146,  0.3257222 ,  0.42648348, -0.29053485,
        0.08969045,  0.56973636,  0.25477907, -0.03239185,  0.52598864,
       -0.18722728, -0.25223535,  0.6074904 , -0.11595401,  0.3891839 ,
        0.27467036,  0.2365367 ,  0.11993187, -0.12269354, -0.17418747,
        0.01585588,  0.30067694,  0.39351732,  0.22168852, -0.23408744,
       -0.28711814,  0.26008224, -0.08931121,  0.4398663 ,  0.60695106,
        0.28662264,  0.6733576 ], dtype=float32)>

In [312]:
def evalutae_model_2(model, N):
    """Verbose variant of ``evalutae_model``: greedy play that prints every step.

    For each step the current observation, the raw model output and the chosen
    action are printed before the action is applied.

    Args:
        model: Callable that maps ``np.array([observation])`` to an object whose
            ``.numpy()`` yields one row of action scores per observation.
        N: Number of episodes to play.

    Returns:
        Tuple ``(rewards, wins)``: the sum of the final reward of every episode
        and the number of episodes whose final reward was positive.
    """
    env = gym.make("Blackjack-v1")
    rewards = 0
    wins = 0
    try:
        for _ in np.arange(N):
            observation, info = env.reset()
            while True:
                p = model(np.array([observation]))
                action = p.numpy()[0].argmax()
                print(observation, p.numpy(), action)
                observation, reward, terminated, truncated, info = env.step(action)
                if terminated or truncated:
                    # Only the reward of the step that ends the episode is counted.
                    rewards += reward
                    if reward > 0:
                        wins += 1
                    break
    finally:
        # Release the environment even when the model or the environment raises.
        env.close()
    return (rewards, wins)

In [315]:
# Trace one greedy episode, printing observation, model output and chosen action per step.
evalutae_model_2(model, 1)

(10, 4, 0) [[0.75036204 0.2496379 ]] 0


(1.0, 1)

In [366]:
def outputs_of_random_inputs(model, n_samples=100):
    """Sum the greedy actions ``model`` picks on freshly reset Blackjack hands.

    With two possible actions (0 and 1) the sum equals the number of starting
    hands for which the model's largest output is at index 1.

    Args:
        model: Callable that maps ``np.array([observation])`` to an object whose
            ``.numpy()`` yields one row of action scores per observation.
        n_samples: Number of starting hands to sample. Defaults to 100.

    Returns:
        The sum of the argmax action indices over all sampled hands.
    """
    s = 0
    # A single environment is reset repeatedly instead of building a new one per sample.
    env = gym.make("Blackjack-v1")
    try:
        for _ in range(n_samples):
            observation, info = env.reset()
            p = model(np.array([observation]))
            s = s + p.numpy()[0].argmax()
    finally:
        # Release the environment even when the model or the environment raises.
        env.close()
    return s

In [371]:
# Count how many of 100 freshly dealt starting hands the greedy policy answers with action 1.
outputs_of_random_inputs(model)

np.int64(100)

---