In [94]:
#Modern Neural Network, adapted from https://github.com/Newmu/Theano-Tutorials

import numpy as np
import theano
import random
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype = theano.config.floatX)

def rectify(X):
    return T.maximum(X, 0.)

def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))


class MLP(object):
    
    #Simplified RMSprop
    def RMSprop(self, cost, weights, lr = 0.001, rho = 0.9, eps = 1e-6):
        grads = T.grad(cost = cost, wrt = weights)
        updates = []
        
        for w, g in zip(weights, grads):
            acc = theano.shared(w.get_value()*0.)
            acc_new = rho * acc + (1 - rho) * g**2
            grad_scale = T.sqrt(acc_new + eps)
            g = g/grad_scale
            updates.append((acc, acc_new))
            updates.append((w, w - lr*g))
        
        return updates    
    
    #Random dropout for noise
    def dropout(self, X, P = 0.):
        if P > 0:
            X *= srng.binomial(X.shape, p = (1-P), dtype = theano.config.floatX)
            X /= (1-P)
        
        return X
        
    #Two hidden layer model
    def model(self, X, w_h1, w_h2, w_out, P_drop_i, P_drop_h):
        X = self.dropout(X, P_drop_i)
        h1 = rectify(T.dot(X, w_h1))
        
        h1 = self.dropout(h1, P_drop_h)
        h2 = rectify(T.dot(h1, w_h2))
        
        h2 = self.dropout(h2, P_drop_h)
        py_x = T.dot(h2, w_out)
        
        return h1, h2, py_x
    
    def __init__(self, n_in, n_h1, n_h2, n_out, lr, P_drop_i, P_drop_h):
        self.X = T.fmatrix()
        self.Y = T.fmatrix()
        
        #Init weights
        self.w_h1 = init_weights((n_in, n_h1))
        self.w_h2 = init_weights((n_h1, n_h2))
        self.w_out = init_weights((n_h2, n_out))
        print "Initializing Weights"
        #Init model
        self.noise_h, self.noise_h2, self.noise_py_x = \
        self.model(self.X, self.w_h1, self.w_h2, self.w_out, P_drop_i, P_drop_h)
        self.h1, self.h2, self.py_x = \
        self.model(self.X, self.w_h1, self.w_h2, self.w_out, 0., 0.)
        self.y_x_class = T.argmax(self.py_x, axis=1)
        self.y_x_value = T.max(self.py_x, axis=1)
        print "Initializing Model"
        
        #Cost and Updates
        self.cost = T.mean(T.nnet.categorical_crossentropy(self.noise_py_x, self.Y))
        self.weights = [self.w_h1, self.w_h2, self.w_out]
        self.updates = self.RMSprop(self.cost, self.weights, lr=lr)
        print "Cost and Updates Done!"
        
        #Compile to Theano functions
        self.train = theano.function(inputs = [self.X, self.Y],
                                     outputs = self.cost,
                                     updates = self.updates,
                                     allow_input_downcast = True)
        self.predict = theano.function(inputs = [self.X],
                                             outputs = [self.y_x_class, 
                                                        self.y_x_value,
                                                        self.py_x], 
                                             allow_input_downcast = True)
        print "Traing and Predicting Function Complete!"
        
        



In [95]:
#1010!
import random

class Game(object):
    def __init__(self):
        self.state = [[0 for i in range(10)] for j in range(10)]
        self.tiles = [[(0,0), (0,1), (0,2), (1,0), (1,1), (1,2), (2,0), (2,1), (2,2)],
                      [(0,0), (1,0), (2,0), (2,1), (2,2)],
                      [(0,0), (0,1), (0,2)],
                      [(0,0), (1,0), (2,0)],
                      [(0,0)],
                      [(0,0), (0,1)],
                      [(0,0), (1,0)],
                      [(0,0), (0,1), (1,0), (1,1)],
                      [(0,0), (0,1), (0,2), (1,0), (2,0)],
                      [(0,0), (0,1), (0,2), (1,2), (2,2)],
                      [(0,2), (1,2), (2,0), (2,1), (2,2)],
                      [(0,0), (0,1), (0,2), (0,3)],
                      [(0,0), (1,0), (2,0), (3,0)],
                      [(0,0), (1,0), (2,0), (3,0), (4,0)],
                      [(0,1), (1,0), (1,1)],
                      [(0,0), (0,1), (1,0)],
                      [(0,0), (0,1), (1,1)],
                      [(0,0), (1,0), (1,1)],
                      [(0,0), (0,1), (0,2), (0,3), (0,4)]]
        self.combo_score = [0 for i in range(10)]
        self.score = 0
        self.terminal = False
        self.valid_tile = [random.randint(0, 18) for i in range(3)]
        #Real Scores XD
        for i in xrange(1, 10):
            self.combo_score[i] = self.combo_score[i-1] + 10*i
        
    #Validate action
    def valid_action(self, action, tile):
        row, col = action
        valid_flag = True
        for dr, dc in self.tiles[tile]:
            if(row + dr > 9 or col + dc > 9 or self.state[row + dr][col + dc] == 1):
                valid_flag = False
                break

        return valid_flag
            
    #Clear valid lines
    def update_state(self, clear_row, clear_col):
        for row in clear_row:
            for col in range(10):
                self.state[row][col] = 0
        
        for col in clear_col:
            for row in range(10):
                self.state[row][col] = 0
    
    #Calculate and clear lines
    def check_score(self):
        
        clear_lines = 0
        clear_row = []
        clear_col = []
        
        for row in xrange(10):
            row_flag = True
            for col in xrange(10):
                if(self.state[row][col] == 0):
                    row_flag = False
                    break
                
            if(row_flag == True):
                clear_row.append(row)
                clear_lines += 1
        
        for col in xrange(10):
            col_flag = True
            for row in xrange(10):
                if(self.state[row][col] == 0):
                    col_flag = False
                    break
            
            if(col_flag == True):
                clear_col.append(col)
                clear_lines += 1
        
        self.update_state(clear_row, clear_col)
        
        return self.combo_score[clear_lines]
        
    #Add the tile on specified coordinate, returns score
    def add_tile(self, action, tile):
        row, col = action
        if(self.valid_action(action, tile) == True):
            for dr, dc in self.tiles[tile]:
                self.state[row + dr][col + dc] = 1;
            
            return len(self.tiles[tile]) + self.check_score()
        
        else:
            return False
       
    def start_game(self):
        #print "Game Starts!"
        self.__init__()
        return self.score, self.state, self.valid_tile, self.terminal
    
    def end_game(self):
        end_flag = True
        for tile in self.valid_tile:
            for row in xrange(10):
                for col in xrange(10):
                    if(self.valid_action((row, col), tile)):
                        end_flag = False
                    
        return end_flag
    
    def do_action(self, action, tile):
        
        if(not tile in self.valid_tile):
            print "Invalid Tile"
        else:
            temp_score = self.add_tile(action, tile)
            if(temp_score == False):
                print "Invalid Action"   
            else:
                self.score += temp_score
                self.valid_tile.remove(tile)
        
        if(self.valid_tile == []):
            self.valid_tile = [random.randint(0, 18) for i in range(3)]
        
        if(self.end_game() == True):
            self.terminal = True
            print "Game Over"
        
        return self.score, self.state, self.valid_tile, self.terminal
    

        




In [140]:
#Deep Q Learning

def state_to_input(state, valid_tile):
    state = np.array(state)
    state_input = state.flatten()
    tile_input = np.zeros((19,))
    for i in valid_tile:
        tile_input[i] += 1
    state_input = np.append(state_input, tile_input)
    return state_input

def output_to_action(output, tiles):
    tile_idx = output/100
    output = output % 100
    act = (output/10, output%10)
    return act, tiles[tile_idx]

class DeepQ(object):
    
    def __init__(self, n_in, n_h1, n_h2, n_out, lr, P_drop_i, P_drop_h):
        self.game = Game()
        self.nn = MLP(n_in, n_h1, n_h2, n_out, lr, P_drop_i, P_drop_h)
        self.exp = []

    def learn(self, n_epoch, exp_len, time_len, eps, gamma):
        for e in xrange(n_epoch):
            
            #Start epoch
            score, state, valid_tile, terminal = self.game.start_game()
            
            for t in xrange(time_len):
                
                #Epsilon greedy
                if(random.random() < eps):
                    policy_output = random.randint(0,300)
                    a_t, tile_t = output_to_action(policy_output, valid_tile)
                    
                else:
                    pred = self.nn.predict([state_to_input(state, valid_tile)])
                    policy_output = pred[0][0]
                    a_t, tile_t = output_to_action(policy_output, valid_tile)
                    
                score_t, state_t, valid_tile_t, terminal = self.game.do_action(a_t, tile_t)
                reward = (score_t - score)
                
                #Store Experience
                self.exp.append({'state':state, 
                                 'valid_tile':valid_tile, 
                                 'policy_output':policy_output,
                                 'reward':reward,
                                 'state_t':state_t,
                                 'valid_tile_t':valid_tile_t,
                                 'terminal':terminal})
                score = score_t, state = state_t, valid_tile = valid_tile
                
                #Experience Replay
                if(len(self.exp) > exp_len):
                    trX = [], trY = []
                    startX = random.randint(0, exp_len - batch_size)
                    for xp in xrange(startX, startX + batch_size):
                        if(xp['terminal'] == True):
                            Q = reward
                        else:
                            pred = self.nn.predict([state_to_input(xp['state_t'],
                                                                   xp['valid_tile_t'])])
                            Q = reward + gamma * pred[1][0]
                        
                        y = pred[2], y[xp[pout]] = Q
                        trX.append(state_to_input(xp[s], xp[vtl]))
                        trY.apppend(y)
                    
                    trX = np.asarray(trX)
                    trY = np.asarray(trY)
                    self.nn.train(trX, trY)
                
                print "Traing Epoch:%d" % e
                if(terminal == True):
                    break
    
    def play(self, epoch):
        
        for e in xrange(epoch):
            score, state, valid_tile, terminal = self.game.start()
            while(True):
                pred = self.nn.predict([state_to_input(state, valid_tile)])
                act, tile = output_to_action(pred[0], valid_tile) 
                score, state, valid_tile, terminal = self.game.do_action(act, tile)
                if(terminal == True):
                    break
            print "Playing epoch:%d" % e
            print "Score:" + score
        
        
        

In [141]:
Learner = DeepQ(119, 128, 128, 300, 0.001, 0, 0)
Learner.learn(1000, 1000, 3000, 0.2, 0.9)


Initializing Weights
Initializing Model
Cost and Updates Done!
Traing and Predicting Function Complete!
Traing Epoch:0


TypeError: 'int' object has no attribute '__getitem__'

In [135]:
a = np.zeros((3, 10))
print a

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


In [136]:
num = T.iscalar()

f1 = num**2
f2 = num+2
a = theano.function(inputs = [num], outputs = [f1, f2])

print a([12])

TypeError: ('Bad input argument to theano function with name "<ipython-input-136-5ef3e982ac70>:5"  at index 0(0-based)', 'Wrong number of dimensions: expected 0, got 1 with shape (1,).')

In [139]:
def f(a, b):
    return (a,1), b[1]

x, y = f(1,[2,3])

print x
print y

(1, 1)
3
