In [36]:
#Modern Neural Network, adapted from https://github.com/Newmu/Theano-Tutorials

import numpy as np
import theano
import random
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype = theano.config.floatX)

def rectify(X):
    return T.maximum(X, 0.)

def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))


class MLP(object):
    
    #Simplified RMSprop
    def RMSprop(self, cost, weights, lr = 0.001, rho = 0.9, eps = 1e-6):
        grads = T.grad(cost = cost, wrt = weights)
        updates = []
        
        for w, g in zip(weights, grads):
            acc = theano.shared(w.get_value()*0.)
            acc_new = rho * acc + (1 - rho) * g**2
            grad_scale = T.sqrt(acc_new + eps)
            g = g/grad_scale
            updates.append((acc, acc_new))
            updates.append((w, w - lr*g))
        
        return updates    
    
    #Random dropout for noise
    def dropout(self, X, P = 0.):
        if P > 0:
            X *= srng.binomial(X.shape, p = (1-P), dtype = theano.config.floatX)
            X /= (1-P)
        
        return X
        
    #Two hidden layer model
    def model(self, X, w_h1, w_h2, w_out, P_drop_i, P_drop_h):
        X = self.dropout(X, P_drop_i)
        h1 = rectify(T.dot(X, w_h1))
        
        h1 = self.dropout(h1, P_drop_h)
        h2 = rectify(T.dot(h1, w_h2))
        
        h2 = self.dropout(h2, P_drop_h)
        py_x = T.dot(h2, w_out)
        
        return h1, h2, py_x
    
    def __init__(self, n_in, n_h1, n_h2, n_out, lr, P_drop_i, P_drop_h):
        self.X = T.fmatrix()
        self.Y = T.fmatrix()
        
        #Init weights
        self.w_h1 = init_weights((n_in, n_h1))
        self.w_h2 = init_weights((n_h1, n_h2))
        self.w_out = init_weights((n_h2, n_out))
        print "Initializing Weights"
        #Init model
        self.noise_h, self.noise_h2, self.noise_py_x = \
        self.model(self.X, self.w_h1, self.w_h2, self.w_out, P_drop_i, P_drop_h)
        self.h1, self.h2, self.py_x = \
        self.model(self.X, self.w_h1, self.w_h2, self.w_out, 0., 0.)
        self.y_x_class = T.argmax(self.py_x, axis=1)
        self.y_x_value = T.max(self.py_x, axis=1)
        print "Initializing Model"
        
        #Cost and Updates
        self.cost = T.mean(T.nnet.categorical_crossentropy(self.noise_py_x, self.Y))
        self.weights = [self.w_h1, self.w_h2, self.w_out]
        self.updates = self.RMSprop(self.cost, self.weights, lr=lr)
        print "Cost and Updates Done!"
        
        #Compile to Theano functions
        self.train = theano.function(inputs = [self.X, self.Y],
                                     outputs = self.cost,
                                     updates = self.updates,
                                     allow_input_downcast = True)
        self.predict = theano.function(inputs = [self.X],
                                             outputs = [self.y_x_class, 
                                                        self.y_x_value,
                                                        self.py_x], 
                                             allow_input_downcast = True)
        print "Traing and Predicting Function Complete!"
        
        



In [49]:
#1010! Game with modifications
import random

class Game(object):
    def __init__(self):
        self.state = [[0 for i in range(10)] for j in range(10)]
        self.tiles = [[(0,0), (0,1), (0,2), (1,0), (1,1), (1,2), (2,0), (2,1), (2,2)],
                      [(0,0), (1,0), (2,0), (2,1), (2,2)],
                      [(0,0), (0,1), (0,2)],
                      [(0,0), (1,0), (2,0)],
                      [(0,0)],
                      [(0,0), (0,1)],
                      [(0,0), (1,0)],
                      [(0,0), (0,1), (1,0), (1,1)],
                      [(0,0), (0,1), (0,2), (1,0), (2,0)],
                      [(0,0), (0,1), (0,2), (1,2), (2,2)],
                      [(0,2), (1,2), (2,0), (2,1), (2,2)],
                      [(0,0), (0,1), (0,2), (0,3)],
                      [(0,0), (1,0), (2,0), (3,0)],
                      [(0,0), (1,0), (2,0), (3,0), (4,0)],
                      [(0,1), (1,0), (1,1)],
                      [(0,0), (0,1), (1,0)],
                      [(0,0), (0,1), (1,1)],
                      [(0,0), (1,0), (1,1)],
                      [(0,0), (0,1), (0,2), (0,3), (0,4)]]
        self.combo_score = [0 for i in range(10)]
        self.score = 0
        self.terminal = False
        self.valid_tile = [random.randint(0, 18) for i in range(3)]
        #Real Scores XD
        for i in xrange(1, 10):
            self.combo_score[i] = self.combo_score[i-1] + 10*i
        
    #Validate action
    def valid_action(self, action, tile):
        row, col = action
        valid_flag = True
        
        if(not tile in self.valid_tile):
            valid_flag = False
            return valid_flag
        
        for dr, dc in self.tiles[tile]:
            if(row + dr > 9 or col + dc > 9 or self.state[row + dr][col + dc] == 1):
                valid_flag = False
                break

        return valid_flag
            
    #Clear valid lines
    def update_state(self, clear_row, clear_col):
        for row in clear_row:
            for col in range(10):
                self.state[row][col] = 0
        
        for col in clear_col:
            for row in range(10):
                self.state[row][col] = 0
    
    #Calculate and clear lines
    def check_score(self):
        
        clear_lines = 0
        clear_row = []
        clear_col = []
        
        for row in xrange(10):
            row_flag = True
            for col in xrange(10):
                if(self.state[row][col] == 0):
                    row_flag = False
                    break
                
            if(row_flag == True):
                clear_row.append(row)
                clear_lines += 1
        
        for col in xrange(10):
            col_flag = True
            for row in xrange(10):
                if(self.state[row][col] == 0):
                    col_flag = False
                    break
            
            if(col_flag == True):
                clear_col.append(col)
                clear_lines += 1
        
        self.update_state(clear_row, clear_col)
        
        return self.combo_score[clear_lines]
        
    #Add the tile on specified coordinate, returns score
    def add_tile(self, action, tile):
        row, col = action
        if(self.valid_action(action, tile) == True):
            for dr, dc in self.tiles[tile]:
                self.state[row + dr][col + dc] = 1;
            
            return len(self.tiles[tile]) + self.check_score()
        
        else:
            return False
       
    def start_game(self):
        #print "Game Starts!"
        self.__init__()
        return self.score, self.state, self.valid_tile, self.terminal
    
    def end_game(self):
        end_flag = True
        for tile in self.valid_tile:
            for row in xrange(10):
                for col in xrange(10):
                    if(self.valid_action((row, col), tile)):
                        end_flag = False
                    
        return end_flag
    
    def do_action(self, action, tile):
        
        if(not tile in self.valid_tile):
            #print "Invalid Tile"
            pass
        else:
            temp_score = self.add_tile(action, tile)
            if(temp_score == False):
                #print "Invalid Action"
                pass
            else:
                self.score += temp_score
                self.valid_tile.remove(tile)
        
        if(self.valid_tile == []):
            self.valid_tile = [random.randint(0, 18) for i in range(3)]
        
        if(self.end_game() == True):
            self.terminal = True
            print "Game Over"
        
        return self.score, self.state, self.valid_tile, self.terminal
    

        




In [55]:
#Deep Q Learning

def state_to_input(state, tiles):
    state = np.array(state)
    state_input = state.flatten()
    tile_input = np.zeros((19,))
    if type(tiles) == list:
        for i in tiles:
            tile_input[i] += 1
    
    state_input = np.append(state_input, tile_input)
    return state_input

def output_to_action(output, tiles):
    tile_idx = output/100
    output = output%100
    act = (output/10, output%10)
    tile = 0
    
    if type(tiles) == list:
        if(tile_idx > len(tiles)-1):
            tile = 19
        else:
            tile = tiles[tile_idx]
    
    return act, tile

class DeepQ(object):
    
    def __init__(self, n_in, n_h1, n_h2, n_out, lr, P_drop_i, P_drop_h):
        self.game = Game()
        self.nn = MLP(n_in, n_h1, n_h2, n_out, lr, P_drop_i, P_drop_h)
        self.exp = []

    def learn(self, n_epoch, exp_len, time_len, eps, gamma):
        for e in xrange(n_epoch):
            
            #Start epoch
            score, state, valid_tile, terminal = self.game.start_game()
            
            for t in xrange(time_len):
                
                #Epsilon greedy
                policy_output = 0
                a_t = (0,0)
                tile_t = 0
                if(random.random() < eps):
                    for policy_t in xrange(300):
                        a_t, tile_t = output_to_action(policy_t, valid_tile)
                        if(self.game.valid_action(a_t, tile_t)):
                            policy_output = policy_t
                            break
                else:
                    pred = self.nn.predict([state_to_input(state, valid_tile)])
                    policy_outputs = sorted(range(len(pred[2][0])), key = lambda k:pred[2][0][k])
                    #Choose Valid Policy with Max Q Value
                    for policy_t in policy_outputs:
                        a_t, tile_t = output_to_action(policy_t, valid_tile)
                        if(self.game.valid_action(a_t, tile_t)):
                            policy_output = policy_t
                            break
                    
                score_t, state_t, valid_tile_t, terminal = self.game.do_action(a_t, tile_t)
                reward = (score_t - score) - 0.5
                
                #Store Experience
                self.exp.append({'state':state, 
                                 'valid_tile':valid_tile, 
                                 'policy_output':policy_output,
                                 'reward':reward,
                                 'state_t':state_t,
                                 'valid_tile_t':valid_tile_t,
                                 'terminal':terminal})
                score = score_t
                state = state_t
                valid_tile = valid_tile_t
                
                #Experience Replay
                if(len(self.exp) > exp_len):
                    trX = []
                    trY = []
                    i = random.randint(0, exp_len-1)
                    if(self.exp[i]['terminal'] == True):
                        Q = reward
                    else:
                        pred = self.nn.predict([state_to_input(self.exp[i]['state_t'],
                                                               self.exp[i]['valid_tile_t'])])
                        Q = reward + gamma * pred[1][0]
                        
                    y = pred[2][0]
                    y[pred[0][0]] = Q
                    del self.exp[i]
                    trX.append(state_to_input(self.exp[i]['state'], self.exp[i]['valid_tile']))
                    trY.append(y)
                    
                    trX = np.asarray(trX)
                    trY = np.asarray(trY)
                    self.nn.train(trX, trY)
                    
                if(terminal == True):
                    break
            
            print "Learning Epoch: %d, Score: %d" % (e, score)
    
    def play(self, epoch):
        
        for e in xrange(epoch):
            score, state, valid_tile, terminal = self.game.start_game()
            while(True):
                pred = self.nn.predict([state_to_input(state, valid_tile)])
                policy_outputs = sorted(range(len(pred[2][0])), key = lambda k:pred[2][0][k])
                policy_output = 0
                for policy_t in policy_outputs:
                    a_t, tile_t = output_to_action(policy_t, valid_tile)
                    if(self.game.valid_action(a_t, tile_t)):
                        policy_output = policy_t
                    
                    act, tile = output_to_action(policy_output, valid_tile)
                
                score, state, valid_tile, terminal = self.game.do_action(act, tile)
                
                if(terminal == True):
                    break
                    
            print "Playing epoch: %d, Score: %d" % (e, score)
            
        
        
        

In [61]:
Learner = DeepQ(119, 128, 128, 300, 0.001, 0, 0)
Learner.learn(10000, 100, 3000, 0.2, 0.9)


Initializing Weights
Initializing Model
Cost and Updates Done!
Traing and Predicting Function Complete!
Game Over
Learning Epoch: 0, Score: 59
Game Over
Learning Epoch: 1, Score: 60
Game Over
Learning Epoch: 2, Score: 111
Game Over
Learning Epoch: 3, Score: 85
Game Over
Learning Epoch: 4, Score: 112
Game Over
Learning Epoch: 5, Score: 109
Game Over
Learning Epoch: 6, Score: 67
Game Over
Learning Epoch: 7, Score: 95
Game Over
Learning Epoch: 8, Score: 120
Game Over
Learning Epoch: 9, Score: 102
Game Over
Learning Epoch: 10, Score: 64
Game Over
Learning Epoch: 11, Score: 81
Game Over
Learning Epoch: 12, Score: 54
Game Over
Learning Epoch: 13, Score: 88
Game Over
Learning Epoch: 14, Score: 105
Game Over
Learning Epoch: 15, Score: 133
Game Over
Learning Epoch: 16, Score: 70
Game Over
Learning Epoch: 17, Score: 106
Game Over
Learning Epoch: 18, Score: 114
Game Over
Learning Epoch: 19, Score: 69
Game Over
Learning Epoch: 20, Score: 78
Game Over
Learning Epoch: 21, Score: 97
Game Over
Learnin

In [67]:
Learner.play(30)


Game Over
Playing epoch: 0, Score: 72
Game Over
Playing epoch: 1, Score: 79
Game Over
Playing epoch: 2, Score: 65
Game Over
Playing epoch: 3, Score: 53
Game Over
Playing epoch: 4, Score: 101
Game Over
Playing epoch: 5, Score: 57
Game Over
Playing epoch: 6, Score: 42
Game Over
Playing epoch: 7, Score: 99
Game Over
Playing epoch: 8, Score: 109
Game Over
Playing epoch: 9, Score: 66
Game Over
Playing epoch: 10, Score: 51
Game Over
Playing epoch: 11, Score: 106
Game Over
Playing epoch: 12, Score: 130
Game Over
Playing epoch: 13, Score: 57
Game Over
Playing epoch: 14, Score: 124
Game Over
Playing epoch: 15, Score: 147
Game Over
Playing epoch: 16, Score: 63
Game Over
Playing epoch: 17, Score: 100
Game Over
Playing epoch: 18, Score: 101
Game Over
Playing epoch: 19, Score: 84
Game Over
Playing epoch: 20, Score: 71
Game Over
Playing epoch: 21, Score: 70
Game Over
Playing epoch: 22, Score: 53
Game Over
Playing epoch: 23, Score: 48
Game Over
Playing epoch: 24, Score: 83
Game Over
Playing epoch: 25

In [63]:
print Learner.nn.w_h1.get_value()
print Learner.nn.w_h2.get_value()
print Learner.nn.w_out.get_value()

[[ 170.23105626  170.23787408  169.47879325 ...,  170.26145278
   169.99255773  170.14727195]
 [ 168.70059408  168.71516383  167.92442074 ...,  168.71264728
   168.47380279  168.6150702 ]
 [ 165.28948737  165.3127802   164.59374226 ...,  165.29741557  165.0913664
   165.22081788]
 ..., 
 [   0.46385151    0.45629235    0.47339729 ...,    0.45140109
     0.46170049    0.48279027]
 [   0.52220536    0.54539745    0.52044466 ...,    0.53635874
     0.52469529    0.51002911]
 [   5.61574315    5.60226692    5.59938584 ...,    5.6174079     5.61128454
     5.63764598]]
[[ -4.96465174e-03   1.95449897e+02   1.95456913e+02 ...,  -1.86564994e-03
    1.25867455e-02   1.95426265e+02]
 [ -4.27570581e-03   1.95460409e+02   1.95483340e+02 ...,   4.74891720e-03
   -2.38405920e-04   1.95408493e+02]
 [ -1.01274978e-02   1.94584705e+02   1.94605857e+02 ...,  -1.69554990e-03
   -1.29047819e-02   1.94589487e+02]
 ..., 
 [ -3.51002477e-04   1.95445722e+02   1.95471666e+02 ...,  -1.24337283e-02
   -4.41608

In [68]:
Learner.learn(10000, 100, 3000, 0.2, 0.9)

Game Over
Learning Epoch: 0, Score: 91
Game Over
Learning Epoch: 1, Score: 114
Game Over
Learning Epoch: 2, Score: 66
Game Over
Learning Epoch: 3, Score: 77
Game Over
Learning Epoch: 4, Score: 109
Game Over
Learning Epoch: 5, Score: 130
Game Over
Learning Epoch: 6, Score: 72
Game Over
Learning Epoch: 7, Score: 93
Game Over
Learning Epoch: 8, Score: 370
Game Over
Learning Epoch: 9, Score: 59
Game Over
Learning Epoch: 10, Score: 90
Game Over
Learning Epoch: 11, Score: 77
Game Over
Learning Epoch: 12, Score: 125
Game Over
Learning Epoch: 13, Score: 71
Game Over
Learning Epoch: 14, Score: 70
Game Over
Learning Epoch: 15, Score: 85
Game Over
Learning Epoch: 16, Score: 83
Game Over
Learning Epoch: 17, Score: 108
Game Over
Learning Epoch: 18, Score: 174
Game Over
Learning Epoch: 19, Score: 98
Game Over
Learning Epoch: 20, Score: 113
Game Over
Learning Epoch: 21, Score: 92
Game Over
Learning Epoch: 22, Score: 176
Game Over
Learning Epoch: 23, Score: 111
Game Over
Learning Epoch: 24, Score: 133

In [69]:
print Learner.nn.w_h1.get_value()
print Learner.nn.w_h2.get_value()
print Learner.nn.w_out.get_value()


[[ 341.5076519   341.51446976  340.75540502 ...,  341.53804853
   341.26915767  341.42386933]
 [ 337.75556163  337.77013141  336.97940411 ...,  337.76761494
   337.52877456  337.67003945]
 [ 332.80108362  332.8243765   332.10535411 ...,  332.80901193
   332.60296681  332.73241582]
 ..., 
 [   0.91656518    0.90900602    0.92611096 ...,    0.90411476
     0.91441416    0.93550394]
 [   1.05013468    1.07332678    1.04837399 ...,    1.06428806
     1.05262461    1.03795844]
 [  11.44379868   11.43032245   11.42744145 ...,   11.44546344
    11.43934009   11.46570152]]
[[ -4.96465174e-03   3.93929453e+02   3.93936468e+02 ...,  -1.86564994e-03
    1.25867455e-02   3.93905822e+02]
 [ -4.27570581e-03   3.93939975e+02   3.93962906e+02 ...,   4.74891720e-03
   -2.38405920e-04   3.93888060e+02]
 [ -1.01274978e-02   3.93064281e+02   3.93085433e+02 ...,  -1.69554990e-03
   -1.29047819e-02   3.93069064e+02]
 ..., 
 [ -3.51002477e-04   3.93925280e+02   3.93951224e+02 ...,  -1.24337283e-02
   -4.4160

In [70]:
Learner.play(30)

Game Over
Playing epoch: 0, Score: 64
Game Over
Playing epoch: 1, Score: 54
Game Over
Playing epoch: 2, Score: 107
Game Over
Playing epoch: 3, Score: 65
Game Over
Playing epoch: 4, Score: 104
Game Over
Playing epoch: 5, Score: 63
Game Over
Playing epoch: 6, Score: 151
Game Over
Playing epoch: 7, Score: 156
Game Over
Playing epoch: 8, Score: 77
Game Over
Playing epoch: 9, Score: 63
Game Over
Playing epoch: 10, Score: 50
Game Over
Playing epoch: 11, Score: 132
Game Over
Playing epoch: 12, Score: 56
Game Over
Playing epoch: 13, Score: 83
Game Over
Playing epoch: 14, Score: 121
Game Over
Playing epoch: 15, Score: 51
Game Over
Playing epoch: 16, Score: 107
Game Over
Playing epoch: 17, Score: 67
Game Over
Playing epoch: 18, Score: 152
Game Over
Playing epoch: 19, Score: 49
Game Over
Playing epoch: 20, Score: 60
Game Over
Playing epoch: 21, Score: 100
Game Over
Playing epoch: 22, Score: 65
Game Over
Playing epoch: 23, Score: 136
Game Over
Playing epoch: 24, Score: 89
Game Over
Playing epoch: 