In [32]:
#Modern Neural Network, adapted from https://github.com/Newmu/Theano-Tutorials

import numpy as np
import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype = theano.config.floatX)

def rectify(X):
    return T.maximum(X, 0.)

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))


class MLP(object):
    
    #Simplified RMSprop
    def RMSprop(self, cost, weights, a = 0.001, rho = 0.9, eps = 1e-6):
        grads = T.grad(cost = cost, wrt = weights)
        updates = []
        
        for w, g in zip(weights, grads):
            acc = theano.shared(w.get_value()*0.)
            acc_new = rho * acc + (1 - rho) * g**2
            grad_scale = T.sqrt(acc_new + eps)
            g = g/grad_scale
            updates.append((acc, acc_new))
            updates.append((p, p - a*g))
        
        return updates    
    
    #Random dropout for noise
    def dropout(self, X, P = 0.):
        if P > 0:
            X *= srng.binomial(X.shape, p = (1-P), dtype = theano.config.floatX)
            X /= (1-P)
        
        return X
        
    #Two hidden layer model
    def model(self, X, w_h1, w_h2, w_out, P_drop_i, P_drop_h):
        X = dropout(X, P_drop_i)
        h1 = rectify(T.dot(X, w_h1))
        
        h1 = dropout(h1, P_drop_h)
        h2 = rectify(T.dot(h1, w_h2))
        
        h2 = dropout(h2, P_drop_h)
        py_x = T.dot(h2, w_out)
        
        return h1, h2, py_x
    
    def __init__(self, n_in, n_h1, n_h2, n_out, lr, P_drop_i, P_drop_h):
        self.X = T.fmatrix()
        self.Y = T.fmatrix()
        
        #Init weights
        self.w_h1 = init_weights((n_in, n_h1))
        self.w_h2 = init_weights((n_h1, n_h2))
        self.w_out = init_weights((n_h2, n_out))
    
        #Init model
        self.noise_h, self.noise_h2, self.noise_py_x = \
        self.model(self.X, self.w_h1, self.w_h2, self.w_out, P_drop_i, P_drop_h)
        self.h1, self.h2, self.py_x = \
        self.model(self.X, self.w_h, self.w_h2, self.w_o, 0., 0.)
        self.y_x_class = T.argmax(self.py_x, axis=1)
        self.y_x_value = T.max(self.py_x, axis=1)
        
        
        #Cost and Updates
        self.cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
        self.weights = [self.w_h, self.w_h2, self.w_out]
        self.updates = self.RMSprop(self.cost, self.weights, lr=self.lr)
        
        #Compile to Theano functions
        self.train = theano.function(inputs = [self.X, self.Y],
                                     outputs = self.cost,
                                     updates = self.updates,
                                     allow_input_downcast = True)
        self.predict_value = theano.function(inputs = [self.X],
                                             outputs = self.y_x_value, 
                                             allow_input_downcast = True)
        self.predict_class = theano.function(inputs = [self.X],
                                             outputs = self.y_x_class,
                                             allow_input_downcast = True)




In [None]:
#Initialization



In [19]:
#Deep Q Learning
import random

def state_to_input(state, valid_tile):
    state_input = np.flatten(state)
    tile_input = np.zeros((19,))
    tile_input[valid_tile] = 1
    state_input = np.append(state_input, tile_input)
    return state_input

def output_to_action(output, valid_tile):
    tile_idx = output/100
    output = output % 100
    row = output/10
    col = output%10
    return (row, col), (valid_tile[tile_idx])

class DeepQ(object):
    
    def __init__(self, n_in, n_h1, n_h2, n_out, lr, P_drop_i, P_drop_h):
        self.game = Game()
        self.nn = MLP(n_in, n_h1, n_h2, n_out, lr, P_drop_i, P_drop_h)
        self.exp = []

    def learn(self, n_epoch, exp_len, time_len, eps, gamma):
        for e in xrange(n_epoch):
            
            #Start epoch
            score, state, valid_tile, terminal = self.game.start_game()
            
            for t in xrange(time_len):
                
                #Epsilon greedy
                if(random.random() < self.eps):
                    policy_output = random.randint(0,300)
                    a_t, tile_t = output_to_action(rand_policy_output, valid_tile)
                    
                else:
                    policy_output = self.nn.predict_class(state_to_input(state, valid_tile))
                    a_t, tile_t = output_to_action(on_policy_output, valid_tile)
                    
                score_t, state_t, valid_tile_t, terminal = self.game.do_action(a_t, tile_t)
                reward = (score_t - score)
                
                #Store Experience
                self.exp.append({s:state, 
                                 vtl:valid_tile, 
                                 at:a_t, 
                                 tl:tile, 
                                 rd:reward,
                                 sp:state_t, 
                                 vtlp:valid_tile_t, 
                                 tm:terminal})
                score = score_t, state = state_t, valid_tile = valid_tile
                
                #Experience Replay
                trX = [], trY = []
                if(len(self.exp) > self.exp_len):
                    startX = random.randint(0, exp_len - batch_size)
                    for xp in xrange(startX, startX + batch_size):
                        if(xp[tm] = True):
                            Q = reward
                        else:
                            Q = reward + \
                                gamma * self.nn.predict_value(state_to_input(xp[sp], xp[vtlp]))
                                trX.append(state_to_input(xp[s], xp[vtl]))
                                trY.apppend([????])
                
                trX = np.asarray(trX)
                trY = np.asarray(trY)
                self.nn.train(trX, trY)
                
                if(terminal == True):
                    break
    
    def play(self):
        pass
        

SyntaxError: invalid syntax (<ipython-input-19-f7eac9ab7032>, line 37)