In [None]:
import sys
sys.path.insert(0, "/Users/xinran.he/GitProjects/mahjong")

In [None]:
import numpy as np
import random
import tensorflow as tf

In [None]:
from single_efficiency.transformer import Transformer
from single_efficiency import utils, game_play, base_strategy

In [None]:
# Hyper-parameters shared by the Transformer network and the optimizer.
# The whole dict is handed to Transformer by Model; Model itself reads
# "learning_rate" and the three "optimizer_adam_*" entries.
PARAMS = {
    # --- Model ---
    # Gain used when initializing trainable variables.
    "initializer_gain": 1.0,
    # Model dimension in the hidden layers (also the input embedding dimension).
    "hidden_size": 16,
    # Number of layers in the encoder stack.
    "num_hidden_layers": 2,
    "num_heads": 2,
    "filter_size": 16,

    # --- Dropout (only used when training) ---
    "layer_postprocess_dropout": 0.1,
    "attention_dropout": 0.1,
    "relu_dropout": 0.1,

    # --- Transformer TPU option ---
    "allow_ffn_pad": True,

    # --- Training ---
    "learning_rate": 0.001,
    "learning_rate_warmup_steps": 16000,
    "optimizer_adam_beta1": 0.9,
    "optimizer_adam_beta2": 0.997,
    "optimizer_adam_epsilon": 1e-09,
}

In [None]:
class Model(object):
    """Transformer-based regressor trained with a mean-squared-error loss.

    Constructing an instance adds the input placeholders, the Transformer
    forward pass, the loss and the training op to whichever TensorFlow graph
    is the default graph at construction time.
    """

    def __init__(self, params):
        """Build placeholders, predictions, loss and the training op.

        Args:
            params: dict of hyper-parameters. It is passed unchanged to
                ``Transformer`` and must also contain "learning_rate",
                "optimizer_adam_beta1", "optimizer_adam_beta2" and
                "optimizer_adam_epsilon", which configure the optimizer.
        """
        self.params = params

        # int32 feature rows with 13 columns each; the batch size is left open.
        self.input_features = tf.placeholder(dtype=tf.int32, shape=(None, 13))
        # Regression targets. The shape is fully unspecified: `(None)` is the
        # plain value `None`, not the 1-tuple `(None,)`.
        self.input_values = tf.placeholder(dtype=tf.float32, shape=None)

        # NOTE(review): the first argument is presumably a "training" flag;
        # confirm against Transformer's signature.
        self.transformer = Transformer(False, self.params)
        self.predictions = self.transformer(self.input_features)

        # NOTE(review): RLStrategy indexes predictions as [:, 0], which
        # suggests a 2-D output, while the fed targets appear to be 1-D.
        # Confirm the two do not broadcast to (batch, batch) in this loss.
        self.loss = tf.losses.mean_squared_error(self.input_values, self.predictions)

        adam = tf.contrib.opt.LazyAdamOptimizer(
            self.params["learning_rate"],
            beta1=self.params["optimizer_adam_beta1"],
            beta2=self.params["optimizer_adam_beta2"],
            epsilon=self.params["optimizer_adam_epsilon"])

        # get_global_step() only looks up an existing global step and yields
        # None when the graph has none; no step counter is created here.
        step = tf.train.get_global_step()
        grads_and_vars = adam.compute_gradients(
            self.loss, tf.trainable_variables(), colocate_gradients_with_ops=True)
        apply_op = adam.apply_gradients(grads_and_vars, global_step=step, name="train")

        # Run any ops registered under UPDATE_OPS together with the optimizer step.
        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        self.train_op = tf.group(apply_op, pending_updates)

    def init(self, sess):
        """Create the global-variables initializer op and run it in ``sess``."""
        sess.run(tf.global_variables_initializer())

    def train(self, sess, features, values):
        """Run one optimization step on a batch.

        Args:
            sess: session used to run the graph.
            features: batch fed to the ``input_features`` placeholder.
            values: regression targets fed to the ``input_values`` placeholder.

        Returns:
            The loss evaluated in the same ``sess.run`` call as the update.
        """
        feed = {self.input_features: features, self.input_values: values}
        loss_value, _ = sess.run([self.loss, self.train_op], feed_dict=feed)
        return loss_value

    def inference(self, sess, features):
        """Evaluate and return the predictions for a batch of features."""
        feed = {self.input_features: features}
        return sess.run(self.predictions, feed_dict=feed)

In [None]:
# Build the model and its training ops inside a dedicated graph, then open a
# session on that graph and initialize the variables.
g = tf.Graph()
with g.as_default():
    model = Model(PARAMS)
    # Inside g.as_default() the default graph is g; it is named explicitly here.
    sess = tf.Session(graph=g)
    model.init(sess)

In [None]:
class RLStrategy(object):
    """Epsilon-greedy discard policy driven by a learned value model."""

    def __init__(self, sess, model, epsilon):
        """Store the policy's collaborators.

        Args:
            sess: session object forwarded to ``model.inference``.
            model: object exposing ``inference(sess, hands)`` that returns a
                2-D array with one row of scores per candidate hand.
            epsilon: probability of discarding a uniformly random tile
                instead of the model's preferred one.
        """
        self.epsilon = epsilon
        self.model = model
        self.sess = sess

    def discard(self, tiles, left_tiles):
        """Pick the tile to discard from the current hand.

        Every tile type present in the hand is tentatively removed to build
        the candidate hands. With probability ``epsilon`` a random candidate
        is chosen; otherwise the candidate whose resulting hand has the
        highest score in column 0 of the model output is chosen.

        Args:
            tiles: mutable sequence of 34 per-tile-type counts. It is modified
                temporarily while candidates are built and restored before
                returning.
            left_tiles: not used by this strategy.

        Returns:
            The index (0-33) of the tile type to discard.
        """
        hands = []
        possible_tiles = []
        # range()/print() are used in the forms that behave identically on
        # Python 2 and Python 3.
        for i in range(34):
            if tiles[i] > 0:
                tiles[i] -= 1
                hands.append(utils.tiles34_to_list(tiles))
                possible_tiles.append(i)
                tiles[i] += 1

        if np.random.uniform() <= self.epsilon:
            return random.choice(possible_tiles)

        # Use the model and session given to the constructor; the previous
        # code read the notebook-level globals `model` and `sess` instead,
        # silently ignoring the constructor arguments.
        predicts = self.model.inference(self.sess, hands)
        scores = predicts[:, 0]
        for t, v in zip(possible_tiles, scores):
            print("%s:%.3f" % (utils.TO_GRAPH_LIST[t], v))
        return possible_tiles[np.argmax(scores)]

In [None]:
# Load the recorded hands. Each entry is later indexed as entry[0] (an index
# into the sampling probabilities) and entry[1] (the hand given to the simulator).
hands = utils.load_hand([
    "/Users/xinran.he/GitProjects/mahjong/data/single_hand_efficiency/20180101.txt",
])

In [None]:
def sample_init_hands(all_hands, probs):
    """Randomly subsample starting hands with a per-bucket keep probability.

    Args:
        all_hands: sequence of entries indexable as ``entry[0]`` (an index
            into ``probs``; presumably the hand's shanten number, confirm
            against ``utils.load_hand``) and ``entry[1]`` (the hand itself).
            The sequence is not modified.
        probs: sequence where ``probs[k]`` is the probability of keeping an
            entry whose ``entry[0] == k``.

    Returns:
        A new list holding ``entry[1]`` of every kept entry, in shuffled order.
    """
    # Shuffle a copy: shuffling `all_hands` itself reordered the caller's list
    # as a hidden side effect, so re-running the cell changed the global data.
    shuffled = list(all_hands)
    random.shuffle(shuffled)
    return [hand[1] for hand in shuffled
            if np.random.uniform() <= probs[hand[0]]]

In [None]:
# Init strategy from GreedyShanten: fit the value model on games played by the
# GreedyShanten baseline strategy.
with g.as_default():
    sampled_hands = sample_init_hands(hands, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
    NUM_INIT = 10000  # number of training steps
    p = 0             # cursor cycling through sampled_hands
    for i in range(NUM_INIT):
        # Each step simulates two games and trains on their combined samples.
        data = []
        for j in range(2):
            p = (p + 1) % len(sampled_hands)
            data.extend(game_play.simulate(sampled_hands[p], base_strategy.GreedyShanten()))
        if len(data) == 0:
            continue
        features = np.array([d[0] for d in data])
        values = np.array([d[1] for d in data])
        # Train on every batch. Previously model.train() was called only
        # inside the logging branch below, so 99% of the simulated games were
        # discarded without a gradient step.
        step_loss = model.train(sess, features, values)
        if i % 100 == 0:
            # The loss is built with tf.losses.mean_squared_error, whose
            # default reduction already averages over the batch, so it is
            # reported as-is rather than divided by the batch size again.
            print("Step: %d avg loss: %.3f" % (i, step_loss))

In [None]:
# Show the first sampled starting hand.
print(sampled_hands[0])

In [None]:
# Show the first sampled starting hand.
print(sampled_hands[0])

In [None]:
# Show the first entry of the loaded hand list.
print(hands[0])

In [None]:
# Play one game from the first sampled hand with the learned policy.
# epsilon=0.0 means a random discard happens only if the uniform draw is
# exactly 0.0, so the model's choice is used.
data = game_play.simulate(
    sampled_hands[0],
    RLStrategy(sess, model, 0.0),
    True,  # NOTE(review): presumably a verbose/debug flag; confirm in game_play.simulate
)

In [None]:
# Show what the simulation returned.
print(data)