# In [None]:
import gym
import numpy as np

# Define the feature function
def feature_fn(state, action):
    """Build the feature vector for a state-action pair.

    The vector holds the first four components of ``state`` followed by
    ``action``, five entries in total.

    Args:
        state: Indexable observation; indices 0 through 3 are read.
        action: Value stored as the fifth feature.

    Returns:
        A one-dimensional ``numpy.ndarray`` with five entries.
    """
    components = [state[index] for index in range(4)]
    components.append(action)
    return np.array(components)

# Define the state-value function
def value_fn(state, theta, action=0):
    """Return the linear value estimate ``theta . feature_fn(state, action)``.

    Args:
        state: Observation forwarded to ``feature_fn``.
        theta: Weight vector with one entry per feature.
        action: Action used when building the features. Defaults to 0,
            which reproduces the previous two-argument behaviour, so
            existing callers are unaffected.

    Returns:
        The dot product of ``theta`` and the feature vector.
    """
    features = feature_fn(state, action)
    return np.dot(theta, features)

# Define the policy
def policy_fn(state, theta):
    """Choose the greedy action (0 = left, 1 = right) under ``theta``.

    Each action is scored with its own feature vector. Previously both
    scores came from the same call, so they were always equal and the
    policy returned 1 regardless of ``state`` or ``theta``.

    Args:
        state: Observation forwarded to ``feature_fn``.
        theta: Weight vector with one entry per feature.

    Returns:
        0 if action 0 scores strictly higher than action 1, otherwise 1.
    """
    left_value = np.dot(theta, feature_fn(state, 0))
    right_value = np.dot(theta, feature_fn(state, 1))
    # Ties resolve to action 1, as in the original else-branch.
    return 0 if left_value > right_value else 1

# Run an episode
def run_episode(env, theta):
    """Run one episode with ``policy_fn`` and return the total reward.

    Both Gym calling conventions are accepted: the legacy one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    one used by Gym 0.26+ / Gymnasium (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        theta: Weight vector forwarded to ``policy_fn``.

    Returns:
        Sum of the rewards collected until the episode ends.
    """
    reset_result = env.reset()
    # Newer APIs return (observation, info); legacy ones return the
    # observation itself.
    if (
        isinstance(reset_result, tuple)
        and len(reset_result) == 2
        and isinstance(reset_result[1], dict)
    ):
        state = reset_result[0]
    else:
        state = reset_result

    total_reward = 0
    done = False
    while not done:
        action = policy_fn(state, theta)
        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer API splits the end-of-episode flag in two.
            state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            state, reward, done, _ = step_result
        total_reward += reward
    return total_reward

# Learn the weight vector
def learn_weight(env, num_episodes, learning_rate):
    """Fit the weight vector by regressing episode returns on features.

    For every episode the per-step feature vectors are summed, and
    ``theta`` takes one gradient step that moves ``theta . features``
    toward the episode's total reward.

    The return and the features are collected from the same rollout.
    Previously the return came from one episode and the features from a
    second, independent one, so the target did not match its inputs.

    Both Gym calling conventions are accepted: the legacy one
    (``reset() -> obs``, 4-tuple ``step``) and the Gym 0.26+ / Gymnasium
    one (``reset() -> (obs, info)``, 5-tuple ``step``).

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        num_episodes: Number of episodes, i.e. number of updates.
        learning_rate: Step size applied to each update.

    Returns:
        The learned weight vector, a ``numpy.ndarray`` of length 5.
    """
    theta = np.zeros(5)
    for _ in range(num_episodes):
        total_reward = 0
        features = np.zeros(5)

        reset_result = env.reset()
        # Newer APIs return (observation, info); legacy ones return the
        # observation itself.
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            state = reset_result[0]
        else:
            state = reset_result

        done = False
        while not done:
            action = policy_fn(state, theta)
            features += feature_fn(state, action)
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API splits the end-of-episode flag in two.
                state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                state, reward, done, _ = step_result
            total_reward += reward

        prediction_error = total_reward - np.dot(theta, features)
        theta += learning_rate * prediction_error * features
    return theta

# Run training
# Build the CartPole environment, fit the weights on it, and show them.
env = gym.make("CartPole-v0")
theta = learn_weight(env=env, num_episodes=1000, learning_rate=0.01)
print(theta)
