In [29]:
import numpy as np
import theano
import theanets # autoencoders
from scipy import misc
import math
import os.path

In [2]:
def crop(img, top=30, bottom=-20, left=15, right=-15):
    """Return `img` with its outer margins sliced away.

    The bounds are ordinary slice indices applied to the first two axes:
    rows ``top:bottom`` and columns ``left:right``. Negative values count
    from the end, and ``None`` keeps that edge untouched. The defaults drop
    30 rows from the top, 20 from the bottom and 15 columns from each side.

    Any further axes (e.g. an RGB channel axis) are left as they are.
    """
    return img[top:bottom, left:right]

In [3]:
def distance(c1, c2):
    """Return the Euclidean distance between two RGB colors.

    `c1` and `c2` are any 3-item sequences (tuple, list, numpy pixel).
    A sequence of any other length raises ValueError on unpacking.

    Channels are converted to Python floats before subtracting. Pixels read
    from an image are typically numpy uint8 values, and unsigned arithmetic
    wraps around (e.g. uint8(0) - uint8(255) gives 1), which would make a
    far-away color look close.
    """
    r1, g1, b1 = (float(channel) for channel in c1)
    r2, g2, b2 = (float(channel) for channel in c2)
    return math.sqrt((r1 - r2) ** 2 + (g1 - g2) ** 2 + (b1 - b2) ** 2)

In [4]:
# Reference chart colors mapped to an integer hit-type class.
hit_map = {(255, 255, 255) : 0, # white nothing
           (0, 0, 0) : 0,       # homerun. not defensible
           (255, 0, 0) : 1,     # linedrive, laser
           (0, 255, 0) : 2,     # groundball
           (0, 0, 255) : 3,     # flyball
           (160, 32, 240) : 4}  # blooper

# The reference colors as a list (same order as iterating hit_map).
colors = list(hit_map)

def norm_color(rgb):
    """Map one RGB pixel to the class of the closest reference color.

    An exact match against `hit_map` is returned directly. Otherwise the
    reference color with the smallest `distance` to the pixel is used;
    on a tie the first such color in `colors` wins.

    `rgb` may be a tuple, list or numpy pixel of three channels.
    """
    # Convert numpy scalars (e.g. uint8 image channels) to plain Python
    # numbers so the distance arithmetic cannot wrap around.
    rgb_key = tuple(np.asarray(rgb).tolist())

    if rgb_key in hit_map:
        return hit_map[rgb_key]

    # Only the single nearest color is needed, so take the minimum
    # instead of sorting the whole list.
    nearest = min(colors, key=lambda color: distance(color, rgb_key))
    return hit_map[nearest]

In [44]:
def process_img(img_file, chart_dir='charts/pros/'):
    """Load one chart image and turn it into a flat row of hit-type classes.

    Parameters
    ----------
    img_file : str
        File name of the chart inside `chart_dir`.
    chart_dir : str
        Directory holding the chart images (defaults to 'charts/pros/').

    Returns
    -------
    numpy array of shape (1, 11349) holding one class id per pixel.

    Raises
    ------
    ValueError
        From the final reshape if the shrunken, cropped chart does not
        contain exactly 11349 pixels.
    """
    # NOTE: scipy.misc.imread / imresize were deprecated in SciPy 1.0 and
    # removed in later releases, so this needs an older SciPy to run.
    raw = misc.imread(os.path.join(chart_dir, img_file), mode='RGB')
    # An integer `size` is a percentage: shrink to 15% of the original.
    small = misc.imresize(raw, size=15)
    cropped = crop(small)
    # Replace each RGB pixel with the class of its nearest reference color.
    labels = np.array([[norm_color(rgb) for rgb in row] for row in cropped])
    # 11349 is the fixed input width used by the autoencoders in this notebook.
    return labels.reshape((1, 11349))

In [48]:
# Player name (file name without its extension) -> processed chart row.
# splitext is used instead of slicing off four characters so extensions
# of any length (e.g. ".jpeg") are stripped correctly.
player_map = {os.path.splitext(img_file)[0] : process_img(img_file)
              for img_file in os.listdir('charts/pros/')}
print(player_map)

{'Kris_Bryant': array([[0, 0, 0, ..., 0, 0, 0]]), 'Mike_Trout': array([[0, 0, 0, ..., 0, 0, 0]]), 'Brian_Dozier': array([[0, 0, 0, ..., 0, 0, 0]]), 'Miguel_Cabrera': array([[0, 0, 0, ..., 0, 0, 0]]), 'Manny_Machado': array([[0, 0, 0, ..., 0, 0, 0]]), 'Ian_Kinsler': array([[0, 0, 0, ..., 0, 0, 0]]), 'Mookie_Betts': array([[0, 0, 0, ..., 0, 0, 0]]), 'Justin_Turner': array([[0, 0, 0, ..., 0, 0, 0]]), 'Nolan_Arenado': array([[0, 0, 0, ..., 0, 0, 0]]), 'Adrian_Beltre': array([[0, 0, 0, ..., 0, 0, 0]]), 'Josh_Donaldson': array([[0, 0, 0, ..., 0, 0, 0]]), 'Dustin_Pedroia': array([[0, 0, 0, ..., 0, 0, 0]]), 'Xander_Bogaerts': array([[0, 0, 0, ..., 0, 0, 0]]), 'Jose_Altuve': array([[0, 0, 0, ..., 0, 0, 0]]), 'Paul_Goldschmidt': array([[0, 0, 0, ..., 0, 0, 0]])}


## One Hidden Layer

With only one hidden layer, the dimensions of the data captured by an
autoencoder approximate the results of Principal Component Analysis (PCA). However, an autoencoder behaves very differently when non-linearity is involved, and this case study is very much non-linear. The autoencoder can detect latent factors that PCA will never be able to detect.

In [49]:
def get_non_linear_pca(data, n_hidden=16):
    """Train a one-hidden-layer ReLU autoencoder on `data` and encode it.

    A fresh model is built and trained on every call, using only `data`.

    Parameters
    ----------
    data : array-like
        Samples to train on and encode; the size of its last axis is used
        as the width of the input and output layers.
    n_hidden : int
        Width of the single hidden layer (defaults to 16).

    Returns
    -------
    Whatever the model's `encode(data)` returns.
    """
    # Size the outer layers from the data rather than a hard-coded width.
    n_features = np.shape(data)[-1]
    nonlinear_pca_model = theanets.Autoencoder([n_features, (n_hidden, 'relu'), n_features])
    nonlinear_pca_model.train([data], algo='rmsprop', input_noise=0.1, hidden_l1=.001, sparsity=0.9, num_updates=1000)
    return nonlinear_pca_model.encode(data)

In [51]:
%%capture
# One single-entry dict per player: {player name: encoded features}.
# .items() is used because it works on both Python 2 and Python 3.
pca_map = [{player : get_non_linear_pca(data)}
           for player, data in player_map.items()]

In [52]:
# Call form of print: same output on Python 2, and valid on Python 3.
print(pca_map)

[{'Kris_Bryant': array([[ 19.14107018,  16.77442651,   0.        ,  13.27170813,
         18.54852151,   0.        ,  10.82832315,   0.        ,
          0.        ,  15.41038945,  12.1842489 ,   3.54790086,
         16.0049916 ,   0.        ,   0.        ,   0.        ]])}, {'Mike_Trout': array([[ 20.15692611,  19.77078369,   0.        ,   9.47814634,
         17.95032803,   3.30983958,   0.        ,   0.        ,
         18.08122375,  16.25217339,   4.95630184,  12.4094523 ,
         17.84088484,   0.        ,   0.        ,   0.        ]])}, {'Brian_Dozier': array([[ 23.90217854,  18.69523635,   0.        ,   0.        ,
         18.80582489,   0.        ,   0.        ,   0.        ,
         15.31437425,  18.61607451,  21.00674342,  11.74648036,
         20.99122025,   0.        ,   0.        ,   0.        ]])}, {'Miguel_Cabrera': array([[ 17.57309957,  17.02422026,   0.        ,  13.53140452,
         17.41240587,  17.25003005,   0.        ,   0.        ,
          0.        ,  1

## Deeper Jeeper Creepers

More hidden layers are where the "black box" mystique of deep learners comes into play. How they arrive at the deep features, or what these features even mean, is at best a deep, dark mystery. After some devoted research we can make heads or tails of it. We could even build a more sophisticated deep learner to figure it out, but then again, we won't know how it reached its conclusions. This is the "devil in the details" dilemma: they are always one step ahead of us.

In [13]:
%%capture
# Autoencoder with two 16-unit ReLU hidden layers between the 11349-wide
# input and output layers, trained with the same settings as the
# one-hidden-layer model.
# NOTE(review): `mb` is not defined in any visible cell. It is presumably
# one (1, 11349) row from player_map; confirm and define it explicitly so
# the notebook survives Restart & Run All.
deep_model = theanets.Autoencoder([11349, (16,'relu'), (16,'relu'), 11349])
deep_model.train([mb], algo='rmsprop', input_noise=0.1, hidden_l1=.001, sparsity=0.9, num_updates=1000)

(OrderedDict([('loss', 0.0057253627638351101),
              ('err', 0.00055274567694036487)]),
 OrderedDict([('loss', 0.005441380249301425),
              ('err', 0.00014149329847880525)]))

In [14]:
# Encode `mb` with the deep model and show the resulting feature vector.
# NOTE(review): `mb` is not defined in any visible cell — confirm its source.
deep_features = deep_model.encode(mb)
print(deep_features)

[[ 14.23102389  11.36744099   0.           0.           0.          18.55435917
    2.14325471   0.           9.25160875   0.           0.           0.
    0.          34.6439092    7.7917971    0.        ]]
