In [1]:
import numpy as np
import theano
import theanets # autoencoders
from sklearn.neighbors import KNeighborsClassifier
from scipy import misc
import math
import os.path

In [2]:
def crop(img):
    """Trim the fixed chart margins from an image array.

    Drops 30 rows from the top, 20 rows from the bottom and 15 columns
    from each side. `img` must support two-axis slicing (e.g. a numpy
    array); any trailing axes are left untouched.
    """
    row_window = slice(30, -20)
    col_window = slice(15, -15)
    return img[row_window, col_window]

In [3]:
def distance(c1, c2):
    """Return the Euclidean distance between two RGB colours.

    Parameters
    ----------
    c1, c2 : sequences of exactly three numeric channels (r, g, b).
        Tuples of ints and numpy pixel rows are both accepted.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If either colour does not have exactly three channels.
    """
    # Channels are converted to float before subtracting: image pixels are
    # usually numpy uint8 values, and unsigned subtraction wraps around
    # (e.g. 10 - 210 becomes 56) instead of going negative.
    r1, g1, b1 = (float(channel) for channel in c1)
    r2, g2, b2 = (float(channel) for channel in c2)
    return math.sqrt((r1 - r2) ** 2 + (g1 - g2) ** 2 + (b1 - b2) ** 2)

In [4]:
# Chart colour (r, g, b) -> hit-type code.
hit_map = {
    (255, 255, 255): 0,  # white nothing
    (0, 0, 0): 0,        # homerun. not defensible
    (255, 0, 0): 1,      # linedrive, laser
    (0, 255, 0): 2,      # groundball
    (0, 0, 255): 3,      # flyball
    (160, 32, 240): 4,   # blooper
}

colors = hit_map.keys()


def norm_color(rgb):
    """Map a pixel to the hit-type code of its reference colour.

    An exact match against `hit_map` is used when available; otherwise the
    pixel takes the code of the reference colour with the smallest
    `distance` to it (the first such colour in iteration order on ties).
    """
    key = tuple(rgb)
    if key not in hit_map:
        key = min(colors, key=lambda candidate: distance(candidate, rgb))
    return hit_map[key]

In [5]:
def process_img(img_file, directory='charts/pros'):
    """Load a spray-chart image and return it as one row of hit-type codes.

    Parameters
    ----------
    img_file : str
        File name of the chart inside `directory`.
    directory : str, optional
        Folder holding the chart. Defaults to 'charts/pros', so existing
        single-argument calls behave as before; pass another folder
        (e.g. 'charts/prospects') to reuse the same pipeline there.

    Returns
    -------
    numpy array of shape (1, 11349) holding the `norm_color` code of every
    pixel left after resizing and cropping.

    Raises
    ------
    ValueError
        From `reshape`, if the resized and cropped chart does not contain
        exactly 11349 pixels.
    """
    img = misc.imread(os.path.join(directory, img_file), mode='RGB')
    # Per the scipy.misc.imresize docs an int `size` is a percentage of the
    # current size.
    img = misc.imresize(img, size=15)
    img = crop(img)
    codes = np.array([[norm_color(rgb) for rgb in row] for row in img])
    # 11349 is the input width the autoencoders in this notebook are built with.
    return codes.reshape((1, 11349))

In [6]:
# Player name (file name without its extension) -> processed chart row.
# splitext copes with extensions of any length, and hidden directory entries
# (names starting with '.') are skipped so they are never passed to the
# image loader.
player_map = {os.path.splitext(img_file)[0]: process_img(img_file)
              for img_file in os.listdir('charts/pros/')
              if not img_file.startswith('.')}
print(player_map)

{'Kris_Bryant': array([[0, 0, 0, ..., 0, 0, 0]]), 'Mike_Trout': array([[0, 0, 0, ..., 0, 0, 0]]), 'Brian_Dozier': array([[0, 0, 0, ..., 0, 0, 0]]), 'Miguel_Cabrera': array([[0, 0, 0, ..., 0, 0, 0]]), 'Manny_Machado': array([[0, 0, 0, ..., 0, 0, 0]]), 'Ian_Kinsler': array([[0, 0, 0, ..., 0, 0, 0]]), 'Mookie_Betts': array([[0, 0, 0, ..., 0, 0, 0]]), 'Justin_Turner': array([[0, 0, 0, ..., 0, 0, 0]]), 'Nolan_Arenado': array([[0, 0, 0, ..., 0, 0, 0]]), 'Adrian_Beltre': array([[0, 0, 0, ..., 0, 0, 0]]), 'Josh_Donaldson': array([[0, 0, 0, ..., 0, 0, 0]]), 'Dustin_Pedroia': array([[0, 0, 0, ..., 0, 0, 0]]), 'Xander_Bogaerts': array([[0, 0, 0, ..., 0, 0, 0]]), 'Jose_Altuve': array([[0, 0, 0, ..., 0, 0, 0]]), 'Paul_Goldschmidt': array([[0, 0, 0, ..., 0, 0, 0]])}


In [7]:
# Integer class id -> player name, numbered in player_map's iteration order.
player_class = dict(enumerate(player_map))
print(player_class)

{0: 'Kris_Bryant', 1: 'Mike_Trout', 2: 'Brian_Dozier', 3: 'Miguel_Cabrera', 4: 'Manny_Machado', 5: 'Ian_Kinsler', 6: 'Mookie_Betts', 7: 'Justin_Turner', 8: 'Nolan_Arenado', 9: 'Adrian_Beltre', 10: 'Josh_Donaldson', 11: 'Dustin_Pedroia', 12: 'Xander_Bogaerts', 13: 'Jose_Altuve', 14: 'Paul_Goldschmidt'}


## One Hidden Layer

For an autoencoder with only one hidden layer, the dimensions in the data captured by the
autoencoder model approximate the results of Principal Component Analysis (PCA). However, an autoencoder behaves much differently when a non-linearity is involved, and this case study is very much non-linear. The autoencoder can detect latent factors that PCA will never be able to detect.

In [8]:
def get_non_linear_pca(data):
    """Train a one-hidden-layer autoencoder on `data` and return its encoding.

    Parameters
    ----------
    data : 2-D numpy array, one sample per row.
        The network's input and output width is taken from `data.shape[1]`
        (11349 for the charts produced in this notebook).

    Returns
    -------
    The result of `encode(data)` on the trained model.

    Notes
    -----
    A new model is constructed and trained on every call, using only the
    `data` passed in.
    NOTE(review): encodings returned by separate calls therefore come from
    separately trained networks and may not share a latent space - confirm
    that comparing them across calls is intended.
    """
    n_inputs = data.shape[1]
    nonlinear_pca_model = theanets.Autoencoder([n_inputs, (16, 'relu'), n_inputs])
    nonlinear_pca_model.train([data], algo='rmsprop', input_noise=0.1,
                              hidden_l1=.001, sparsity=0.9, num_updates=1000)
    return nonlinear_pca_model.encode(data)

In [9]:
%%capture
# Player name -> encoding returned by get_non_linear_pca for that player's chart.
pca_map = dict(
    (name, get_non_linear_pca(chart))
    for name, chart in player_map.items()
)

In [10]:
# Show the encoding stored for every player.
print(pca_map)

{'Kris_Bryant': array([[ 19.14107018,  16.77442651,   0.        ,  13.27170813,
         18.54852151,   0.        ,  10.82832315,   0.        ,
          0.        ,  15.41038945,  12.1842489 ,   3.54790086,
         16.0049916 ,   0.        ,   0.        ,   0.        ]]), 'Mike_Trout': array([[ 20.15692611,  19.77078369,   0.        ,   9.47814634,
         17.95032803,   3.30983958,   0.        ,   0.        ,
         18.08122375,  16.25217339,   4.95630184,  12.4094523 ,
         17.84088484,   0.        ,   0.        ,   0.        ]]), 'Brian_Dozier': array([[ 23.90217854,  18.69523635,   0.        ,   0.        ,
         18.80582489,   0.        ,   0.        ,   0.        ,
         15.31437425,  18.61607451,  21.00674342,  11.74648036,
         20.99122025,   0.        ,   0.        ,   0.        ]]), 'Miguel_Cabrera': array([[ 17.57309957,  17.02422026,   0.        ,  13.53140452,
         17.41240587,  17.25003005,   0.        ,   0.        ,
          0.        ,  18.15800

In [11]:
%%capture
# Same load -> resize -> crop -> colour-code -> flatten steps as process_img,
# applied to the prospect's chart.
buxton_pixels = crop(
    misc.imresize(
        misc.imread('charts/prospects/Byron_Buxton.png', mode='RGB'),
        size=15,
    )
)
BB = np.array(
    [[norm_color(rgb) for rgb in row] for row in buxton_pixels]
).reshape((1, 11349))
Byron_Buxton = get_non_linear_pca(BB)

In [12]:
# Show the encoding computed for the Buxton chart.
print(Byron_Buxton)

[[ 13.42706246  16.4506212    0.          13.31800703   9.55219076   0.
    0.           0.           0.          15.91550319   6.1697937    0.
   15.40650427  14.36542254  16.05562855   0.        ]]


Nonlinear PCA is telling us that the much-touted prospect Byron Buxton is most similar to Ian Kinsler when making on-base hits. In 2016, Ian Kinsler had a great season, statistically. By WAR, wRC+ and wOBA, it was (at least) one of the best three seasons of his career, while qualifying for a batting title. I'm not familiar with Ian Kinsler, but scouting reports tell of above-average power with pop in his bat, and he is known for extra-base lasers (doubles/triples).

In [13]:
# Nearest pro to Byron Buxton in the nonlinear-PCA encoding.
# Each feature row is looked up through its own label, so X and y stay
# aligned regardless of how the two dicts happen to order their entries.
y = sorted(player_class)
X = [pca_map[player_class[label]][0] for label in y]
knn = KNeighborsClassifier(n_neighbors=1, weights='distance', algorithm='auto')
knn.fit(X, y)
player_key = knn.predict(Byron_Buxton)
print(player_class[player_key[0]])

Ian_Kinsler


Yeah, I would agree with this comparison. They are about the same age and are MVP candidates for their respective leagues (NL for Bryant and AL for Altuve). IMO their offensive skills are very similar.

In [14]:
# Nearest *other* pro to Kris Bryant in the nonlinear-PCA encoding.
Kris_Bryant = pca_map['Kris_Bryant']
temp_pca = pca_map.copy()
del temp_pca['Kris_Bryant']

# Remove Bryant's class id by looking it up by name rather than assuming
# that it is 0.
temp_class = player_class.copy()
bryant_key = next(label for label, name in player_class.items()
                  if name == 'Kris_Bryant')
del temp_class[bryant_key]

# Each feature row is looked up through its own label, so X and y stay
# aligned regardless of how the two dicts happen to order their entries.
y = sorted(temp_class)
X = [temp_pca[temp_class[label]][0] for label in y]
knn = KNeighborsClassifier(n_neighbors=1, weights='distance', algorithm='auto')
knn.fit(X, y)
player_key = knn.predict(Kris_Bryant)
print(player_class[player_key[0]])

Jose_Altuve


## Deeper Jeeper Creepers

More hidden layers, where the "black box" mystique of deep learners comes into play. How they get the deep features, or what these features even mean, is at best a deep, dark mystery. After some devoted research we can make heads or tails of it. We could even make a more sophisticated deep learner to figure it out, but then again, we won't know how it got its conclusions. This is the "devil in the details" dilemma: they are always one step ahead of us.

In [15]:
def get_deep_features(data):
    """Train a two-hidden-layer autoencoder on `data` and return its encoding.

    Parameters
    ----------
    data : 2-D numpy array, one sample per row.
        The network's input and output width is taken from `data.shape[1]`
        (11349 for the charts produced in this notebook).

    Returns
    -------
    The result of `encode(data)` on the trained model.

    Notes
    -----
    A new model is constructed and trained on every call, using only the
    `data` passed in.
    NOTE(review): encodings returned by separate calls therefore come from
    separately trained networks and may not share a latent space - confirm
    that comparing them across calls is intended.
    """
    n_inputs = data.shape[1]
    deep_model = theanets.Autoencoder(
        [n_inputs, (16, 'relu'), (16, 'relu'), n_inputs])
    deep_model.train([data], algo='rmsprop', input_noise=0.1,
                     hidden_l1=.001, sparsity=0.9, num_updates=1000)
    return deep_model.encode(data)

In [16]:
%%capture
# Player name -> encoding returned by get_deep_features for that player's chart.
deep_map = dict(
    (name, get_deep_features(chart))
    for name, chart in player_map.items()
)

In [17]:
# Show the deep encoding stored for every player.
print(deep_map)

{'Kris_Bryant': array([[  0.        ,  22.41459586,   7.80589698,   0.        ,
          6.30032257,  14.83792958,  20.07810345,   3.79362483,
         13.13476474,   0.        ,   0.        ,   0.        ,
         13.57549222,   0.        ,   0.07717846,  13.78577566]]), 'Mike_Trout': array([[  0.        ,  13.14458909,   0.        ,   0.        ,
          0.        ,  19.7680581 ,   4.62430988,   0.        ,
          8.75754141,   0.        ,   0.        ,   6.34357668,
          0.        ,  25.73185754,   8.59881697,   0.        ]]), 'Brian_Dozier': array([[  0.00000000e+00,   2.39187424e+01,   9.66325230e+00,
          0.00000000e+00,   0.00000000e+00,   1.92130786e+01,
          1.00585593e+01,   0.00000000e+00,   1.90009040e+00,
          4.91911364e+00,   0.00000000e+00,   4.96696719e+00,
          2.12475791e-02,   0.00000000e+00,   7.58874095e+00,
          1.18489112e+01]]), 'Miguel_Cabrera': array([[  0.        ,   5.90910364,   0.        ,   8.38895705,
         12.237

In [18]:
%%capture
# Re-encode the Buxton chart (BB) with the deep autoencoder. This rebinds
# Byron_Buxton, replacing the value previously assigned from
# get_non_linear_pca(BB).
Byron_Buxton = get_deep_features(BB)

In [19]:
# Show the deep encoding computed for the Buxton chart.
print(Byron_Buxton)

[[  6.76782517   9.65330925   0.           0.           3.87504114
   15.06333756   5.51005267   0.          12.11597067   0.           0.
    0.           0.          24.28893543   8.79757084   0.        ]]


I didn't like the nonlinear PCA comparison of Byron Buxton to Ian Kinsler. I was hoping deep features would give **Mike Trout** as his best comparison, and they did. This is because what you would mostly hear and read from experts is that he most resembles Trout at this stage of his career.

In [20]:
# Nearest pro to Byron Buxton in the deep-feature encoding.
# Each feature row is looked up through its own label, so X and y stay
# aligned regardless of how the two dicts happen to order their entries.
y = sorted(player_class)
X = [deep_map[player_class[label]][0] for label in y]
knn = KNeighborsClassifier(n_neighbors=1, weights='distance', algorithm='auto')
knn.fit(X, y)
player_key = knn.predict(Byron_Buxton)
print(player_class[player_key[0]])

Mike_Trout


I like the nonlinear PCA comparison of Kris Bryant to Jose Altuve. However, they are both in the early stages of their careers, so they will likely diverge into completely different players eventually. It would be easy to argue that Kris Bryant will evolve into a **Paul Goldschmidt** type of hitter, if not better.

In [21]:
# Nearest *other* pro to Kris Bryant in the deep-feature encoding.
Kris_Bryant = deep_map['Kris_Bryant']
temp_deep = deep_map.copy()
del temp_deep['Kris_Bryant']

# Labels are derived here from player_class (every id except Bryant's), and
# each feature row is looked up through its own label, so X and y stay
# aligned regardless of dict ordering and no state from an earlier cell's
# temporary variables is needed.
y = sorted(label for label, name in player_class.items()
           if name != 'Kris_Bryant')
X = [temp_deep[player_class[label]][0] for label in y]
knn = KNeighborsClassifier(n_neighbors=1, weights='distance', algorithm='auto')
knn.fit(X, y)
player_key = knn.predict(Kris_Bryant)
print(player_class[player_key[0]])

Paul_Goldschmidt
