In [None]:
from agents.agent_random import Player as RandomPlayer
from agents.agent_consider_equity import Player as EquityPlayer
import logging
# from agents.agent_keras_rl_dqn import Player as DQNPlayer

import gym
import numpy as np
import pandas as pd
from docopt import docopt

from gym_env.env import PlayerShell
# from tools.helper import get_config
# from tools.helper import init_logger

In [None]:
class Trainer:
    """Drive an agent through a fixed number of episodes in an environment.

    The agent must provide ``init_agent``, ``init_log``, ``init_episode``,
    ``get_action``, ``update``, ``update_log`` and ``save_log``; the
    environment must provide ``step`` (returning a 4-tuple whose third item is
    the episode-done flag) and ``close``.
    """

    def __init__(self, num_episodes):
        """Remember the episode budget; env and agent are bound by ``train``."""
        self.env = None
        self.agent = None
        self.num_episodes = num_episodes

    def train(self, env, agent):
        """Run ``num_episodes`` episodes and always close the environment.

        Args:
            env: environment stepped with the agent's actions.
            agent: object implementing the agent protocol described on the class.

        Any exception raised by the agent or the environment propagates to the
        caller, but ``env.close()`` is still executed first.
        """
        self.env = env
        self.agent = agent
        try:
            self.agent.init_agent()
            self.agent.init_log()
            for episode in range(self.num_episodes):
                # NOTE(review): the environment is never reset here; presumably
                # agent.init_episode() takes care of that -- confirm.
                self.agent.init_episode()
                done = False
                while not done:
                    action = self.agent.get_action()
                    next_state, reward, done, _ = self.env.step(action)
                    self.agent.update(action, next_state, reward, done)
                self.agent.update_log()
            self.agent.save_log()
        finally:
            # Release the environment even when an episode blows up.
            self.env.close()


# Create the poker environment and seat six players: three random agents,
# two equity-threshold agents and one shell seat named 'pytorch'.
env_name = 'neuron_poker-v0'
env = gym.make(
    env_name,
    initial_stacks=500,
    funds_plot=True,
    render=False,
    use_cpp_montecarlo=False,
)

# (class, constructor kwargs) in seating order; each player is constructed
# immediately before it is added, exactly as in a sequence of add_player calls.
player_specs = [
    (RandomPlayer, {}),
    (RandomPlayer, {}),
    (RandomPlayer, {}),
    (EquityPlayer, {'name': 'equity/50/70', 'min_call_equity': .5, 'min_bet_equity': .7}),
    (EquityPlayer, {'name': 'equity/20/30', 'min_call_equity': .2, 'min_bet_equity': .3}),
    (PlayerShell, {'name': 'pytorch', 'stack_size': 500}),
]
for player_cls, player_kwargs in player_specs:
    env.add_player(player_cls(**player_kwargs))

In [None]:
# Reset the environment once and keep whatever reset() returns in `obs`
# (presumably the initial observation -- confirm against the env implementation).
obs = env.reset()

In [None]:
from collections import deque
import random

class ReplayBuffer(object):
    """Bounded FIFO store of (state, action, reward, next_state, done) tuples.

    Once ``capacity`` transitions are held, appending a new one drops the
    oldest (behaviour of ``deque(maxlen=capacity)``).
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition; both states gain a leading axis of size 1."""
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        States and next states come back concatenated along axis 0 as arrays;
        actions, rewards and done flags come back as tuples.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [None]:
from matplotlib import pyplot as plt
import math

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500


def epsilon_by_frame(frame_idx):
    """Exploration rate for a given frame index.

    Decays exponentially from ``epsilon_start`` (at frame 0) towards
    ``epsilon_final`` with time constant ``epsilon_decay`` frames.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)


# Preview the schedule over the first 10000 frames.
fig, ax = plt.subplots()
ax.plot([epsilon_by_frame(i) for i in range(10000)])
ax.set(xlabel="frame", ylabel="epsilon", title="Exploration schedule");

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    """Three-layer fully connected Q-network with masked epsilon-greedy action selection.

    Args:
        num_inputs: size of the observation vector fed to the first layer.
        num_actions: number of Q-values produced by the last layer.
    """

    def __init__(self, num_inputs, num_actions):
        super().__init__()

        # Only the layers used by forward() are registered, so the optimizer
        # and state_dict carry no dead parameters.
        self.layers = nn.Sequential(
            nn.Linear(num_inputs, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, num_actions)
        )

    def forward(self, x):
        """Return the Q-values for ``x`` (cast to float32 first)."""
        x = x.float()
        return self.layers(x)

    def act(self, state, epsilon, mask):
        """Pick an action epsilon-greedily among the legal ones.

        Args:
            state: observation accepted by ``torch.as_tensor`` (e.g. a numpy array).
            epsilon: probability threshold; a uniform draw above it selects the
                greedy action, otherwise a random legal action is returned.
            mask: array-like with one entry per action. The greedy branch
                treats entries equal to 0 as illegal; the random branch draws
                from entries that are > 0.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if random.random() > epsilon:
            # Use the device the network lives on instead of a notebook global.
            device = next(self.parameters()).device
            # Action selection needs no gradients.
            with torch.no_grad():
                state = torch.as_tensor(state, dtype=torch.float32, device=device)
                q_value = self.forward(state)
                mask = torch.as_tensor(mask, dtype=torch.float32, device=device)
                masked_q_values = q_value.masked_fill(mask == 0, float('-inf'))
                action = torch.argmax(masked_q_values).item()
        else:
            available_actions = np.nonzero(np.asarray(mask) > 0)[0]
            # int() keeps the return type identical to the greedy branch.
            action = int(np.random.choice(available_actions))

        return action
# model = DQN(env.observation_space[0], env.action_space.n)
# model = model.to(device)
# model.act(state, epsilon, mask), mask

In [None]:
# Prefer CUDA when torch can see a GPU; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Q-network sized from the environment, placed on the selected device.
model = DQN(env.observation_space[0], env.action_space.n).to(device)

# Adam with its default hyper-parameters, and a 10000-transition replay memory.
optimizer = optim.Adam(model.parameters())
replay_buffer = ReplayBuffer(10000)

In [None]:
def compute_td_loss(batch_size, device):
    """Sample a minibatch and take one optimisation step on the TD error.

    Reads the module-level ``replay_buffer``, ``model``, ``optimizer`` and
    ``gamma``.

    Args:
        batch_size: number of transitions to draw from ``replay_buffer``.
        device: torch device the batch tensors are created on.

    Returns:
        The scalar mean-squared TD loss tensor for this batch.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = torch.as_tensor(state, dtype=torch.float32, device=device)
    next_state = torch.as_tensor(next_state, dtype=torch.float32, device=device)
    action     = torch.tensor(action, dtype=torch.long, device=device)
    reward     = torch.tensor(reward, dtype=torch.float32, device=device)
    done       = torch.tensor(done, dtype=torch.float32, device=device)

    # Q(s, a) for the actions that were actually taken.
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # The bootstrap target is a constant w.r.t. the parameters, so it is
    # computed without building an autograd graph.
    # NOTE(review): the max runs over all actions, not only the legal ones of
    # next_state -- legality masks are not stored in the replay buffer.
    with torch.no_grad():
        next_q_value     = model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss

In [None]:
# Build a 0/1 float mask over the action space: entry i is 1.0 when i is the
# value of one of the moves currently listed in env.legal_moves, else 0.0.
n_actions = env.action_space.n
values = [move._value_ for move in env.legal_moves]
mask = np.array([idx in values for idx in range(n_actions)], dtype=float)
print(mask, env.legal_moves)

In [None]:
# Training hyper-parameters.
num_frames = 100000
batch_size = 64
gamma      = 0.99

# Bookkeeping: per-update losses, per-episode returns, and step counters.
losses = []
all_rewards = []
episode_reward = 0
step = 0
ep = 0
max_steps = 0

state = env.reset()
import tqdm
for frame_idx in tqdm.tqdm(range(1, num_frames + 1)):
    epsilon = epsilon_by_frame(frame_idx)

    # 0/1 mask over the action space marking the moves that are legal right now.
    values = [move._value_ for move in env.legal_moves]
    mask = np.array([ind in values for ind in range(env.action_space.n)], dtype=float)

    action = model.act(state, epsilon, mask)
    try:
        next_state, reward, done, info = env.step(action)
    except Exception:
        # Show what was attempted, then re-raise with the original traceback.
        print(action, mask)
        raise

    replay_buffer.push(state, action, reward, next_state, float(done))

    state = next_state
    episode_reward += reward
    step += 1
    max_steps = max(step, max_steps)

    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        step = 0
        ep += 1

    # Start learning once the buffer holds more than one batch of transitions.
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size, device)
        losses.append(loss.item())

In [None]:
# Draw one action uniformly at random from the indices where `mask` is positive,
# and display it as the cell output.
legal_flags = mask > 0
available_actions = np.nonzero(legal_flags)[0]
action = np.random.choice(available_actions)
action