In [None]:
import sys

# Put the local mahjong checkout at the front of the import path so the
# `mahjong` and `single_efficiency` imports in the following cells resolve.
# Any earlier copy of the entry is dropped first, so re-running this cell does
# not keep growing sys.path with duplicates.
PROJECT_ROOT = "/Users/xinran.he/GitProjects/mahjong"
sys.path[:] = [PROJECT_ROOT] + [p for p in sys.path if p != PROJECT_ROOT]

In [None]:
from mahjong.shanten import Shanten
import numpy as np
import random
import tensorflow as tf

# Single module-level Shanten instance; the training loop calls
# SHANTEN.calculate_shanten(...) on it before and after every discard/draw.
SHANTEN = Shanten()

In [None]:
from single_efficiency.model import Model
from single_efficiency import utils

In [None]:
# Hyper-parameters handed to Model(PARAMS); "batch_size" is also read by the
# training loop when sampling from the replay memory.
PARAMS = dict(
    # --- model ---
    initializer_gain=1.0,   # used when initializing trainable variables
    hidden_size=16,         # hidden-layer model dimension / input embedding dimension
    num_hidden_layers=2,    # number of layers in the encoder stacks
    num_heads=2,
    filter_size=16,

    # --- reward ---
    gamma=0.9,

    # --- dropout (only used when training) ---
    layer_postprocess_dropout=0.1,
    attention_dropout=0.1,
    relu_dropout=0.1,

    # --- transformer TPU ---
    allow_ffn_pad=True,

    # --- training ---
    batch_size=128,
    learning_rate=0.001,

    optimizer_adam_beta1=0.9,
    optimizer_adam_beta2=0.997,
    optimizer_adam_epsilon=1e-09,
)

In [None]:
class RLStrategy(object):
    """Epsilon-greedy discard policy driven by a model's per-tile predictions.

    Args:
        sess: session object forwarded unchanged to ``model.inference``.
        model: object exposing ``inference(sess, hands)`` that returns one
            score per entry of ``hands``.
        epsilon: exploration threshold; a uniform draw ``<= epsilon`` selects
            a random tile instead of the highest-scoring one.
        is_debug: when true, print every candidate tile with its score before
            the greedy choice is returned.
    """

    def __init__(self, sess, model, epsilon, is_debug=False):
        self.epsilon = epsilon
        self.model = model
        self.sess = sess
        self.is_debug = is_debug

    def discard(self, tiles):
        """Choose the tile to discard from ``tiles``.

        Args:
            tiles: hand in the form accepted by ``utils.tiles34_to_list``.

        Returns:
            One element of ``utils.tiles34_to_list(tiles)``: a random one when
            exploring, otherwise the one with the highest predicted score.
        """
        hands = utils.tiles34_to_list(tiles)
        if np.random.uniform() <= self.epsilon:
            return random.choice(hands)

        # Use the model/session this strategy was constructed with; the
        # notebook-level globals of the same name may be missing or stale.
        predicts = self.model.inference(self.sess, hands)
        if self.is_debug:
            for t, v in zip(hands, predicts):
                print("%s:%.3f" % (utils.TO_GRAPH_LIST[t], v))
        return hands[np.argmax(predicts)]

In [None]:
# Hand records later used as episode starting points; the training loop reads
# element [1] of each loaded entry as the initial hand.
HAND_FILES = ["/Users/xinran.he/GitProjects/mahjong/data/single_hand_efficiency/20180102.txt"]
hands = utils.load_hand(HAND_FILES)
print(len(hands))

In [None]:
# Episode simulation / replay memory
MAX_ROUND = 60               # max discard+draw rounds simulated per episode
MEMORY_SIZE = 1000000        # replay memory capacity; oldest slots are overwritten once full
NEGATIVE_SAMPLE_RATE = 0.2   # probability of storing a transition whose reward is not positive
NUM_STEP_EPOCH = 10000       # steps (one episode + at most one train batch each) per epoch
MIN_NUM_INSTANCES = 1000     # replay memory size required before training starts

# Exploration schedule
INIT_EPSILON = 0.5           # epsilon used during the first epoch
EPSILON_DECAY_PER_EPOCH = 0.9  # epsilon is multiplied by this after every epoch

## Model initialization

In [None]:
# Replay memory (list of transition tuples), the next slot to overwrite once
# the memory is full, and the current exploration rate.
memory, memory_pos = [], 0
epsilon = INIT_EPSILON

# The session and the model are both created while `g` is the default graph;
# the training cell re-enters the same graph before using them.
g = tf.Graph()
with g.as_default():
    sess = tf.Session()
    model = Model(PARAMS)
    model.init(sess)

In [None]:
# Training driver.
# Per epoch: refresh the target network, build an epsilon-greedy strategy with
# the current epsilon, then run NUM_STEP_EPOCH steps. Each step simulates one
# episode (at most MAX_ROUND discard/draw rounds) from a randomly chosen hand,
# stores transitions in the replay memory, and trains on one sampled batch once
# the memory holds at least MIN_NUM_INSTANCES transitions.
# Mutates the notebook-level `memory`, `memory_pos` and `epsilon`.
NUM_EPOCHS = 50
with g.as_default():
    for epoch in xrange(NUM_EPOCHS):
        print "EPOCH: %d" % epoch
        # update target network
        model.update_target_network(sess)
        # epsilon is fixed for the whole epoch; it only changes at the bottom of this loop
        strategy = RLStrategy(sess, model, epsilon)

        for step in xrange(NUM_STEP_EPOCH):
            # random sample hand
            init_hand = random.choice(hands)[1]
            # current_hand / left_tiles are per-tile-kind counters of length utils.NUM_HAIS:
            # the hand starts empty and the remaining pool starts with 4 of each kind.
            current_hand = [0] * utils.NUM_HAIS
            left_tiles = [4] * utils.NUM_HAIS
            for hai in init_hand:
                left_tiles[hai] -= 1
                current_hand[hai] += 1
            # yama: the tiles not in the starting hand, shuffled into draw order
            yama = utils.tiles34_to_list(left_tiles)
            random.shuffle(yama)

            for i in xrange(MAX_ROUND):
                # snapshot of the hand (list form) and its shanten before acting
                state = utils.tiles34_to_list(current_hand)
                shanten = int(SHANTEN.calculate_shanten(current_hand))

                # call epsilon-greedy strategy to find action
                discard = strategy.discard(current_hand)
                # the action is recorded as the position of the discarded tile within `state`
                discard_index = state.index(discard)

                # draw a new tile
                current_hand[discard] -= 1
                new_tile = yama[i]
                current_hand[new_tile] += 1

                # snapshot of the hand and its shanten after discard + draw
                state_plus = utils.tiles34_to_list(current_hand)
                shanten_plus = int(SHANTEN.calculate_shanten(current_hand))

                if utils.is_agari(current_hand):
                    # terminal transition: reward is the hand score scaled down by 100
                    is_terminal = 1
                    reward = utils.get_total_score(current_hand, new_tile) / 100.0
                else:
                    # non-terminal: reward is the drop in shanten (positive when shanten decreased)
                    is_terminal = 0
                    reward = shanten - shanten_plus

                # Always keep positive-reward transitions; keep the rest only with
                # probability NEGATIVE_SAMPLE_RATE.
                if reward > 0 or np.random.uniform() <= NEGATIVE_SAMPLE_RATE:
                    if len(memory) < MEMORY_SIZE:
                        memory.append((state, state_plus, reward, discard_index, is_terminal, shanten, shanten_plus))
                    else:
                        # memory is full: overwrite slots in ring order
                        memory[memory_pos] = (state, state_plus, reward, discard_index, is_terminal, shanten, shanten_plus)
                        memory_pos = (memory_pos + 1) % MEMORY_SIZE

                # agari is end of episode
                if is_terminal > 0:
                    break

            if len(memory) >= MIN_NUM_INSTANCES:
                # sample one batch from replay memory
                batch = random.sample(memory, PARAMS["batch_size"])
                # Only tuple fields 0-4 are fed to the model; the stored shanten
                # values (fields 5 and 6) are not used here.
                state_batch = np.array([b[0] for b in batch], dtype=np.int32)
                state_plus_batch = np.array([b[1] for b in batch], dtype=np.int32)
                reward_batch = np.array([b[2] for b in batch], dtype=np.float32)
                action_batch = np.array([b[3] for b in batch], dtype=np.int32)
                terminal_batch = np.array([b[4] for b in batch], dtype=np.int32)
                loss = model.train(sess, (state_batch, state_plus_batch, reward_batch, action_batch, terminal_batch))
                if step % 100 == 0:
                    print "step %d loss %.3f" % (step, loss)
            else:
                # still filling the replay memory; report its size instead of a loss
                if step % 100 == 0:
                    print "step %d memory size: %d" % (step, len(memory))

        # update epsilon
        epsilon = epsilon * EPSILON_DECAY_PER_EPOCH

In [None]:
def evaluate_strategy(sess, model, hands):
    Qvalues = np.zeros(len(hands))
    rewards = np.zeros(len(hands))
    strategy = RLStrategy(sess, model, 0.01)
    print "Eval:"
    for i, hand in enumerate(hands):
        if i % 50 == 0:
            print i,
        Qvalues[i] = np.max(model.inference(sess, hand))
        
        current_hand = [0] * utils.NUM_HAIS
        left_tiles = [4] * utils.NUM_HAIS
        for hai in hand:
            left_tiles[hai] -= 1
            current_hand[hai] += 1
        yama = utils.tiles34_to_list(left_tiles)
        random.shuffle(yama)
        discount = 1.0
        for r in xrange(MAX_ROUND):
            state = utils.tiles34_to_list(current_hand)
            discard = strategy.discard(current_hand)
            # discard and draw a new tile
            current_hand[discard] -= 1
            new_tile = yama[r]
            current_hand[new_tile] += 1
            
            if utils.is_agari(current_hand):
                rewards[i] = discount * utils.get_total_score(current_hand, new_tile) / 100.0
    print "...Done"
    return np.mean(Qvalues), np.mean(rewards)

# Debug

In [None]:
def show_prediction(sess, hand):
    """Print one ``<tile label>:<prediction>`` line per tile of ``hand``.

    Predictions come from the notebook-level ``model`` (it is not a
    parameter), evaluated with the given ``sess``; labels are looked up in
    ``utils.TO_GRAPH_LIST``.
    """
    scores = model.inference(sess, hand)
    for tile_id, score in zip(hand, scores):
        label = utils.TO_GRAPH_LIST[tile_id]
        print(label + ":" + str(score))

In [None]:
# Spot-check: print the per-tile predictions for the hand stored in field [1]
# of loaded entry 158 (the training loop reads the same [1] field as a
# starting hand). NOTE(review): index 158 looks like an arbitrary sample -- confirm.
show_prediction(sess, hands[158][1])